Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/mux.c | 2
-rw-r--r--  fs/9p/vfs_inode.c | 12
-rw-r--r--  fs/9p/vfs_super.c | 7
-rw-r--r--  fs/Kconfig | 25
-rw-r--r--  fs/affs/super.c | 12
-rw-r--r--  fs/afs/cell.c | 3
-rw-r--r--  fs/afs/kafsasyncd.c | 9
-rw-r--r--  fs/afs/mntpt.c | 2
-rw-r--r--  fs/afs/server.c | 6
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/afs/super.h | 2
-rw-r--r--  fs/afs/vlocation.c | 6
-rw-r--r--  fs/afs/vnode.c | 3
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/autofs4/expire.c | 9
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_misc.c | 3
-rw-r--r--  fs/buffer.c | 3
-rw-r--r--  fs/cifs/cifsfs.c | 6
-rw-r--r--  fs/coda/psdev.c | 2
-rw-r--r--  fs/coda/upcall.c | 2
-rw-r--r--  fs/compat.c | 16
-rw-r--r--  fs/compat_ioctl.c | 34
-rw-r--r--  fs/configfs/dir.c | 6
-rw-r--r--  fs/configfs/mount.c | 2
-rw-r--r--  fs/dcache.c | 7
-rw-r--r--  fs/debugfs/inode.c | 2
-rw-r--r--  fs/dquot.c | 4
-rw-r--r--  fs/eventpoll.c | 17
-rw-r--r--  fs/exec.c | 147
-rw-r--r--  fs/ext2/Makefile | 2
-rw-r--r--  fs/ext2/balloc.c | 22
-rw-r--r--  fs/ext2/bitmap.c | 32
-rw-r--r--  fs/ext2/dir.c | 3
-rw-r--r--  fs/ext2/fsync.c | 2
-rw-r--r--  fs/ext2/ialloc.c | 3
-rw-r--r--  fs/ext2/super.c | 3
-rw-r--r--  fs/ext3/balloc.c | 242
-rw-r--r--  fs/ext3/ialloc.c | 10
-rw-r--r--  fs/ext3/inode.c | 57
-rw-r--r--  fs/ext3/ioctl.c | 2
-rw-r--r--  fs/ext3/namei.c | 4
-rw-r--r--  fs/ext3/resize.c | 81
-rw-r--r--  fs/ext3/super.c | 52
-rw-r--r--  fs/ext3/xattr.c | 27
-rw-r--r--  fs/freevxfs/vxfs.h | 4
-rw-r--r--  fs/freevxfs/vxfs_fshead.c | 12
-rw-r--r--  fs/fuse/Makefile | 2
-rw-r--r--  fs/fuse/control.c | 218
-rw-r--r--  fs/fuse/dev.c | 418
-rw-r--r--  fs/fuse/dir.c | 56
-rw-r--r--  fs/fuse/file.c | 206
-rw-r--r--  fs/fuse/fuse_i.h | 135
-rw-r--r--  fs/fuse/inode.c | 183
-rw-r--r--  fs/jbd/journal.c | 3
-rw-r--r--  fs/jbd/recovery.c | 1
-rw-r--r--  fs/jffs2/erase.c | 15
-rw-r--r--  fs/jffs2/nodemgmt.c | 3
-rw-r--r--  fs/jffs2/summary.c | 2
-rw-r--r--  fs/jffs2/wbuf.c | 3
-rw-r--r--  fs/jfs/jfs_extent.c | 8
-rw-r--r--  fs/libfs.c | 14
-rw-r--r--  fs/lockd/clntlock.c | 39
-rw-r--r--  fs/lockd/clntproc.c | 14
-rw-r--r--  fs/lockd/host.c | 9
-rw-r--r--  fs/namei.c | 6
-rw-r--r--  fs/namespace.c | 134
-rw-r--r--  fs/nfs/Makefile | 8
-rw-r--r--  fs/nfs/callback.c | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/dir.c | 18
-rw-r--r--  fs/nfs/direct.c | 4
-rw-r--r--  fs/nfs/file.c | 30
-rw-r--r--  fs/nfs/idmap.c | 1
-rw-r--r--  fs/nfs/inode.c | 1313
-rw-r--r--  fs/nfs/internal.h | 186
-rw-r--r--  fs/nfs/namespace.c | 229
-rw-r--r--  fs/nfs/nfs2xdr.c | 6
-rw-r--r--  fs/nfs/nfs3acl.c | 11
-rw-r--r--  fs/nfs/nfs3proc.c | 5
-rw-r--r--  fs/nfs/nfs3xdr.c | 6
-rw-r--r--  fs/nfs/nfs4_fs.h | 4
-rw-r--r--  fs/nfs/nfs4namespace.c | 201
-rw-r--r--  fs/nfs/nfs4proc.c | 111
-rw-r--r--  fs/nfs/nfs4xdr.c | 218
-rw-r--r--  fs/nfs/pagelist.c | 49
-rw-r--r--  fs/nfs/proc.c | 5
-rw-r--r--  fs/nfs/read.c | 120
-rw-r--r--  fs/nfs/super.c | 1537
-rw-r--r--  fs/nfs/symlink.c | 13
-rw-r--r--  fs/nfs/sysctl.c | 10
-rw-r--r--  fs/nfs/write.c | 47
-rw-r--r--  fs/nfsd/nfs4state.c | 5
-rw-r--r--  fs/nfsd/nfscache.c | 3
-rw-r--r--  fs/ntfs/file.c | 26
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 15
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 63
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 33
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 30
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 103
-rw-r--r--  fs/ocfs2/dlm/dlmfs.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 73
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 448
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 593
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 74
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 13
-rw-r--r--  fs/ocfs2/dlm/userdlm.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/journal.c | 5
-rw-r--r--  fs/ocfs2/vote.c | 8
-rw-r--r--  fs/open.c | 2
-rw-r--r--  fs/openpromfs/inode.c | 1154
-rw-r--r--  fs/pnode.c | 9
-rw-r--r--  fs/proc/base.c | 1086
-rw-r--r--  fs/proc/inode.c | 11
-rw-r--r--  fs/proc/internal.h | 22
-rw-r--r--  fs/proc/task_mmu.c | 140
-rw-r--r--  fs/proc/task_nommu.c | 21
-rw-r--r--  fs/reiserfs/file.c | 8
-rw-r--r--  fs/reiserfs/journal.c | 6
-rw-r--r--  fs/select.c | 7
-rw-r--r--  fs/smbfs/request.c | 6
-rw-r--r--  fs/smbfs/smbiod.c | 29
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/sysfs/dir.c | 10
-rw-r--r--  fs/ufs/balloc.c | 448
-rw-r--r--  fs/ufs/cylinder.c | 49
-rw-r--r--  fs/ufs/dir.c | 1000
-rw-r--r--  fs/ufs/file.c | 21
-rw-r--r--  fs/ufs/ialloc.c | 63
-rw-r--r--  fs/ufs/inode.c | 370
-rw-r--r--  fs/ufs/namei.c | 84
-rw-r--r--  fs/ufs/super.c | 429
-rw-r--r--  fs/ufs/truncate.c | 104
-rw-r--r--  fs/ufs/util.c | 48
-rw-r--r--  fs/ufs/util.h | 107
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 3
-rw-r--r--  fs/xfs/xfs_mount.c | 18
141 files changed, 7771 insertions(+), 5516 deletions(-)
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index f4407eb276c7..12e1baa4508d 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -712,7 +712,7 @@ static void v9fs_read_work(void *a)
  * v9fs_send_request - send 9P request
  * The function can sleep until the request is scheduled for sending.
  * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent succesfully. Can return errors
+ * a guarantee that the request is sent successfully. Can return errors
  * that can be retrieved by PTR_ERR macros.
  *
  * @m: mux data
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2cb87ba4b1c1..5c6bdf82146c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -530,9 +530,6 @@ error:
         if (vfid)
                 v9fs_fid_destroy(vfid);
 
-        if (inode)
-                iput(inode);
-
         return err;
 }
 
@@ -1054,6 +1051,9 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
         int ret;
         char *link = __getname();
 
+        if (unlikely(!link))
+                return -ENOMEM;
+
         if (buflen > PATH_MAX)
                 buflen = PATH_MAX;
 
@@ -1171,9 +1171,6 @@ error:
         if (vfid)
                 v9fs_fid_destroy(vfid);
 
-        if (inode)
-                iput(inode);
-
         return err;
 
 }
@@ -1227,6 +1224,9 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
         }
 
         name = __getname();
+        if (unlikely(!name))
+                return -ENOMEM;
+
         sprintf(name, "%d\n", oldfid->fid);
         retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name);
         __putname(name);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 872943004e59..8b15bb22caca 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -256,11 +256,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static void
-v9fs_umount_begin(struct super_block *sb)
+v9fs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info;
 
-        v9fs_session_cancel(v9ses);
+        if (flags & MNT_FORCE)
+                v9fs_session_cancel(v9ses);
 }
 
 static struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 467f7ae5f092..00aa3d5c5a83 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -776,7 +776,8 @@ endmenu
 menu "Pseudo filesystems"
 
 config PROC_FS
-        bool "/proc file system support"
+        bool "/proc file system support" if EMBEDDED
+        default y
         help
           This is a virtual file system providing information about the status
           of the system. "Virtual" means that it doesn't take up any space on
@@ -1370,11 +1371,19 @@ config UFS_FS
 
 config UFS_FS_WRITE
         bool "UFS file system write support (DANGEROUS)"
-        depends on UFS_FS && EXPERIMENTAL && BROKEN
+        depends on UFS_FS && EXPERIMENTAL
         help
           Say Y here if you want to try writing to UFS partitions. This is
           experimental, so you should back up your UFS partitions beforehand.
 
+config UFS_DEBUG
+        bool "UFS debugging"
+        depends on UFS_FS
+        help
+          If you are experiencing any problems with the UFS filesystem, say
+          Y here. This will result in _many_ additional debugging messages to be
+          written to the system log.
+
 endmenu
 
 menu "Network File Systems"
@@ -1481,7 +1490,12 @@ config NFSD
         select LOCKD
         select SUNRPC
         select EXPORTFS
-        select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL
+        select NFSD_V2_ACL if NFSD_V3_ACL
+        select NFS_ACL_SUPPORT if NFSD_V2_ACL
+        select NFSD_TCP if NFSD_V4
+        select CRYPTO_MD5 if NFSD_V4
+        select CRYPTO if NFSD_V4
+        select FS_POSIX_ACL if NFSD_V4
         help
           If you want your Linux box to act as an NFS *server*, so that other
           computers on your local network which support NFS can access certain
@@ -1519,7 +1533,6 @@ config NFSD_V3
 config NFSD_V3_ACL
         bool "Provide server support for the NFSv3 ACL protocol extension"
         depends on NFSD_V3
-        select NFSD_V2_ACL
         help
           Implement the NFSv3 ACL protocol extension for manipulating POSIX
           Access Control Lists on exported file systems. NFS clients should
@@ -1529,10 +1542,6 @@ config NFSD_V3_ACL
 config NFSD_V4
         bool "Provide NFSv4 server support (EXPERIMENTAL)"
         depends on NFSD_V3 && EXPERIMENTAL
-        select NFSD_TCP
-        select CRYPTO_MD5
-        select CRYPTO
-        select FS_POSIX_ACL
         help
           If you would like to include the NFSv4 server as well as the NFSv2
           and NFSv3 servers, say Y here. This feature is experimental, and
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 8765cba35bb9..5200f4938df0 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -271,6 +271,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
         int reserved;
         unsigned long mount_flags;
         int tmp_flags; /* fix remount prototype... */
+        u8 sig[4];
 
         pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
 
@@ -370,8 +371,9 @@ got_root:
                 printk(KERN_ERR "AFFS: Cannot read boot block\n");
                 goto out_error;
         }
-        chksum = be32_to_cpu(*(__be32 *)boot_bh->b_data);
+        memcpy(sig, boot_bh->b_data, 4);
         brelse(boot_bh);
+        chksum = be32_to_cpu(*(__be32 *)sig);
 
         /* Dircache filesystems are compatible with non-dircache ones
          * when reading. As long as they aren't supported, writing is
@@ -420,11 +422,11 @@ got_root:
         }
 
         if (mount_flags & SF_VERBOSE) {
-                chksum = cpu_to_be32(chksum);
-                printk(KERN_NOTICE "AFFS: Mounting volume \"%*s\": Type=%.3s\\%c, Blocksize=%d\n",
-                        AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0],
+                u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
+                printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+                        len > 31 ? 31 : len,
                         AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
-                        (char *)&chksum,((char *)&chksum)[3] + '0',blocksize);
+                        sig, sig[3] + '0', blocksize);
         }
 
         sb->s_flags |= MS_NODEV | MS_NOSUID;
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 009a9ae88d61..bfc1fd22d5b1 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer,
 
         /* we found it in the graveyard - resurrect it */
  found_dead_server:
-        list_del(&server->link);
-        list_add_tail(&server->link, &cell->sv_list);
+        list_move_tail(&server->link, &cell->sv_list);
         afs_get_server(server);
         afs_kafstimod_del_timer(&server->timeout);
         spin_unlock(&cell->sv_gylock);
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index 7ac07d0d47b9..f09a794f248e 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -136,8 +136,7 @@ static int kafsasyncd(void *arg)
                 if (!list_empty(&kafsasyncd_async_attnq)) {
                         op = list_entry(kafsasyncd_async_attnq.next,
                                         struct afs_async_op, link);
-                        list_del(&op->link);
-                        list_add_tail(&op->link,
+                        list_move_tail(&op->link,
                                       &kafsasyncd_async_busyq);
                 }
 
@@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op)
         init_waitqueue_entry(&op->waiter, kafsasyncd_task);
         add_wait_queue(&op->call->waitq, &op->waiter);
 
-        list_del(&op->link);
-        list_add_tail(&op->link, &kafsasyncd_async_busyq);
+        list_move_tail(&op->link, &kafsasyncd_async_busyq);
 
         spin_unlock(&kafsasyncd_async_lock);
 
@@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op)
 
         spin_lock(&kafsasyncd_async_lock);
 
-        list_del(&op->link);
-        list_add_tail(&op->link, &kafsasyncd_async_attnq);
+        list_move_tail(&op->link, &kafsasyncd_async_attnq);
 
         spin_unlock(&kafsasyncd_async_lock);
 
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b5cf9e1205ad..99785a79d043 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -203,7 +203,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
         /* try and do the mount */
         kdebug("--- attempting mount %s -o %s ---", devname, options);
-        mnt = do_kern_mount("afs", 0, devname, options);
+        mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
         kdebug("--- mount result %p ---", mnt);
 
         free_page((unsigned long) devname);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 62b093aa41c6..22afaae1a4ce 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
  resurrect_server:
         _debug("resurrecting server");
 
-        list_del(&zombie->link);
-        list_add_tail(&zombie->link, &cell->sv_list);
+        list_move_tail(&zombie->link, &cell->sv_list);
         afs_get_server(zombie);
         afs_kafstimod_del_timer(&zombie->timeout);
         spin_unlock(&cell->sv_gylock);
@@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server)
         }
 
         spin_lock(&cell->sv_gylock);
-        list_del(&server->link);
-        list_add_tail(&server->link, &cell->sv_graveyard);
+        list_move_tail(&server->link, &cell->sv_graveyard);
 
         /* time out in 10 secs */
         afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 82468df0ba54..67d1f5c819ec 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb);
 
 static void afs_destroy_inode(struct inode *inode);
 
-static struct file_system_type afs_fs_type = {
+struct file_system_type afs_fs_type = {
         .owner = THIS_MODULE,
         .name = "afs",
         .get_sb = afs_get_sb,
diff --git a/fs/afs/super.h b/fs/afs/super.h
index ac11362f4e95..32de8cc6fae8 100644
--- a/fs/afs/super.h
+++ b/fs/afs/super.h
@@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
         return sb->s_fs_info;
 }
 
+extern struct file_system_type afs_fs_type;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index eced20618ecc..331f730a1fb3 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell,
         /* found in the graveyard - resurrect */
         _debug("found in graveyard");
         atomic_inc(&vlocation->usage);
-        list_del(&vlocation->link);
-        list_add_tail(&vlocation->link, &cell->vl_list);
+        list_move_tail(&vlocation->link, &cell->vl_list);
         spin_unlock(&cell->vl_gylock);
 
         afs_kafstimod_del_timer(&vlocation->timeout);
@@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation)
         }
 
         /* move to graveyard queue */
-        list_del(&vlocation->link);
-        list_add_tail(&vlocation->link,&cell->vl_graveyard);
+        list_move_tail(&vlocation->link,&cell->vl_graveyard);
 
         /* remove from pending timeout queue (refcounted if actually being
          * updated) */
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 9867fef3261d..cf62da5d7825 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
                            vnode->cb_expiry * HZ);
 
         spin_lock(&afs_cb_hash_lock);
-        list_del(&vnode->cb_hash_link);
-        list_add_tail(&vnode->cb_hash_link,
+        list_move_tail(&vnode->cb_hash_link,
                       &afs_cb_hash(server, &vnode->fid));
         spin_unlock(&afs_cb_hash_lock);
 
diff --git a/fs/aio.c b/fs/aio.c
index 8c34a62df7d7..950630187acc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb)
  * invoked both for initial i/o submission and
  * subsequent retries via the aio_kick_handler.
  * Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reaquired
+ * already held. The lock is released and reacquired
  * as needed during processing.
  *
  * Calls the iocb retry method (already setup for the
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b8ce02607d66..8dbd44f10e9d 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -174,6 +174,12 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
                 struct autofs_info *ino = autofs4_dentry_ino(p);
                 unsigned int ino_count = atomic_read(&ino->count);
 
+                /*
+                 * Clean stale dentries below that have not been
+                 * invalidated after a mount fail during lookup
+                 */
+                d_invalidate(p);
+
                 /* allow for dget above and top is already dgot */
                 if (p == top)
                         ino_count += 2;
@@ -370,8 +376,7 @@ next:
                 DPRINTK("returning %p %.*s",
                         expired, (int)expired->d_name.len, expired->d_name.name);
                 spin_lock(&dcache_lock);
-                list_del(&expired->d_parent->d_subdirs);
-                list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
+                list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
                 spin_unlock(&dcache_lock);
                 return expired;
         }
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b1c902e319c1..c94d52eafd1b 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm,
         }
 
         /* OK, This is the point of no return */
-        set_personality(PER_LINUX);
+        set_personality(PER_LINUX_32BIT);
         }
 
         /*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 07a4996cca3f..34ebbc191e46 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -55,6 +55,7 @@ typedef struct {
 } Node;
 
 static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
 
@@ -637,7 +638,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
         if (!inode)
                 goto out2;
 
-        err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count);
+        err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
         if (err) {
                 iput(inode);
                 inode = NULL;
diff --git a/fs/buffer.c b/fs/buffer.c
index 373bb6292bdc..f23bb647db47 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -564,7 +564,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
         char b[BDEVNAME_SIZE];
         unsigned long flags;
@@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(cont_prepare_write);
-EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f2e285457bee..c28ede599946 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -403,12 +403,14 @@ static struct quotactl_ops cifs_quotactl_ops = {
 #endif
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-static void cifs_umount_begin(struct super_block * sblock)
+static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 {
         struct cifs_sb_info *cifs_sb;
         struct cifsTconInfo * tcon;
 
-        cifs_sb = CIFS_SB(sblock);
+        if (!(flags & MNT_FORCE))
+                return;
+        cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
         if(cifs_sb == NULL)
                 return;
 
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 6c6771db36da..7caee8d8ea3b 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
         /* If request was not a signal, enqueue and don't free */
         if (!(req->uc_flags & REQ_ASYNC)) {
                 req->uc_flags |= REQ_READ;
-                list_add(&(req->uc_chain), vcp->vc_processing.prev);
+                list_add_tail(&(req->uc_chain), &vcp->vc_processing);
                 goto out;
         }
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b040eba13a7d..a5b5e631ba61 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi,
         ((union inputArgs *)buffer)->ih.unique = req->uc_unique;
 
         /* Append msg to pending queue and poke Venus. */
-        list_add(&(req->uc_chain), vcommp->vc_pending.prev);
+        list_add_tail(&(req->uc_chain), &vcommp->vc_pending);
 
         wake_up_interruptible(&vcommp->vc_waitq);
         /* We can be interrupted while we wait for Venus to process
diff --git a/fs/compat.c b/fs/compat.c
index 7e7e5bc4f3cf..e31e9cf96647 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -55,6 +55,20 @@
 
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
 
+int compat_log = 1;
+
+int compat_printk(const char *fmt, ...)
+{
+        va_list ap;
+        int ret;
+        if (!compat_log)
+                return 0;
+        va_start(ap, fmt);
+        ret = vprintk(fmt, ap);
+        va_end(ap);
+        return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
         sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
         if (!isprint(buf[1]))
                 sprintf(buf, "%02x", buf[1]);
-        printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+        compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
                         "cmd(%08x){%s} arg(%08x) on %s\n",
                         current->comm, current->pid,
                         (int)fd, (unsigned int)cmd, buf,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d2c38875ab29..d8ecfedef189 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -80,6 +80,7 @@
 #include <net/bluetooth/rfcomm.h>
 
 #include <linux/capi.h>
+#include <linux/gigaset_dev.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
@@ -205,38 +206,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
         return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
 
-struct compat_dmx_event {
-        dmx_event_t event;
-        compat_time_t timeStamp;
-        union
-        {
-                dmx_scrambling_status_t scrambling;
-        } u;
-};
-
-static int do_dmx_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct dmx_event kevent;
-        mm_segment_t old_fs = get_fs();
-        int err;
-
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
-        set_fs(old_fs);
-
-        if (!err) {
-                struct compat_dmx_event __user *up = compat_ptr(arg);
-
-                err = put_user(kevent.event, &up->event);
-                err |= put_user(kevent.timeStamp, &up->timeStamp);
-                err |= put_user(kevent.u.scrambling, &up->u.scrambling);
-                if (err)
-                        err = -EFAULT;
-        }
-
-        return err;
-}
-
 struct compat_video_event {
         int32_t type;
         compat_time_t timestamp;
@@ -2964,7 +2933,6 @@ HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata)
 #endif
 
 /* dvb */
-HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event)
 HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
 HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
 HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5f952187fc53..207f8006fd6c 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
                         /* fallthrough */
                 default:
                         if (filp->f_pos == 2) {
-                                list_del(q);
-                                list_add(q, &parent_sd->s_children);
+                                list_move(q, &parent_sd->s_children);
                         }
                         for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
                                 struct configfs_dirent *next;
@@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
                                          dt_type(next)) < 0)
                                         return 0;
 
-                                list_del(q);
-                                list_add(q, p);
+                                list_move(q, p);
                                 p = q;
                                 filp->f_pos++;
                         }
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 94dab7bdd851..3e5fe843e1df 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = {
 
 int configfs_pin_fs(void)
 {
-        return simple_pin_fs("configfs", &configfs_mount,
+        return simple_pin_fs(&configfs_fs_type, &configfs_mount,
                              &configfs_mnt_count);
 }
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 313b54b2b8f2..48b44a714b35 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -406,7 +406,7 @@ static void prune_dcache(int count, struct super_block *sb)
                 cond_resched_lock(&dcache_lock);
 
                 tmp = dentry_unused.prev;
-                if (unlikely(sb)) {
+                if (sb) {
                         /* Try to find a dentry for this sb, but don't try
                          * too hard, if they aren't near the tail they will
                          * be moved down again soon
@@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb)
                 dentry = list_entry(tmp, struct dentry, d_lru);
                 if (dentry->d_sb != sb)
                         continue;
-                list_del(tmp);
-                list_add(tmp, &dentry_unused);
+                list_move(tmp, &dentry_unused);
         }
 
         /*
@@ -638,7 +637,7 @@ resume:
                  * of the unused list for prune_dcache
                  */
                 if (!atomic_read(&dentry->d_count)) {
-                        list_add(&dentry->d_lru, dentry_unused.prev);
+                        list_add_tail(&dentry->d_lru, &dentry_unused);
                         dentry_stat.nr_unused++;
                         found++;
                 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 440128ebef3b..6fa1e04f8415 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -199,7 +199,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
         pr_debug("debugfs: creating file '%s'\n",name);
 
-        error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count);
+        error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
         if (error)
                 goto exit;
 
diff --git a/fs/dquot.c b/fs/dquot.c
index 81d87a413c68..0122a279106a 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block
 /* Add a dquot to the tail of the free list */
 static inline void put_dquot_last(struct dquot *dquot)
 {
-        list_add(&dquot->dq_free, free_dquots.prev);
+        list_add_tail(&dquot->dq_free, &free_dquots);
         dqstats.free_dquots++;
 }
 
@@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot)
 {
         /* We add to the back of inuse list so we don't have to restart
          * when traversing this list and we block */
-        list_add(&dquot->dq_inuse, inuse_list.prev);
+        list_add_tail(&dquot->dq_inuse, &inuse_list);
         dqstats.allocated_dquots++;
 }
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 08e7e6a555ca..9c677bbd0b08 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
  * fs/eventpoll.c ( Efficent event polling implementation )
- * Copyright (C) 2001,...,2003 Davide Libenzi
+ * Copyright (C) 2001,...,2006 Davide Libenzi
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
                 /* Notify waiting tasks that events are available */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
         }
@@ -1083,7 +1083,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 
                 /* Notify waiting tasks that events are available */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                         TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
         }
@@ -1260,7 +1261,8 @@ is_linked:
                  * wait list.
                  */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                         TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
 
@@ -1444,7 +1446,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
                  * wait list.
                  */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                         TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
         }
@@ -1516,7 +1519,7 @@ retry:
                  * ep_poll_callback() when events will become available.
                  */
                 init_waitqueue_entry(&wait, current);
-                add_wait_queue(&ep->wq, &wait);
+                __add_wait_queue(&ep->wq, &wait);
 
                 for (;;) {
                         /*
@@ -1536,7 +1539,7 @@ retry:
                         jtimeout = schedule_timeout(jtimeout);
                         write_lock_irqsave(&ep->lock, flags);
                 }
-                remove_wait_queue(&ep->wq, &wait);
+                __remove_wait_queue(&ep->wq, &wait);
 
                 set_current_state(TASK_RUNNING);
         }
diff --git a/fs/exec.c b/fs/exec.c
index 0b88bf646143..c8494f513eaf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk)
          * and to assume its PID:
          */
         if (!thread_group_leader(current)) {
-                struct dentry *proc_dentry1, *proc_dentry2;
-
                 /*
                  * Wait for the thread group leader to be a zombie.
                  * It should already be zombie at this point, most
@@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk)
                  */
                 current->start_time = leader->start_time;
 
-                spin_lock(&leader->proc_lock);
-                spin_lock(&current->proc_lock);
-                proc_dentry1 = proc_pid_unhash(current);
-                proc_dentry2 = proc_pid_unhash(leader);
                 write_lock_irq(&tasklist_lock);
 
                 BUG_ON(leader->tgid != current->tgid);
@@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk)
                 attach_pid(current, PIDTYPE_PID, current->pid);
                 attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
                 attach_pid(current, PIDTYPE_SID, current->signal->session);
-                list_add_tail_rcu(&current->tasks, &init_task.tasks);
+                list_replace_rcu(&leader->tasks, &current->tasks);
 
                 current->group_leader = current;
                 leader->group_leader = current;
@@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk)
                 /* Reduce leader to a thread */
                 detach_pid(leader, PIDTYPE_PGID);
                 detach_pid(leader, PIDTYPE_SID);
-                list_del_init(&leader->tasks);
 
                 current->exit_signal = SIGCHLD;
 
@@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk)
                 leader->exit_state = EXIT_DEAD;
 
                 write_unlock_irq(&tasklist_lock);
-                spin_unlock(&leader->proc_lock);
-                spin_unlock(&current->proc_lock);
-                proc_pid_flush(proc_dentry1);
-                proc_pid_flush(proc_dentry2);
         }
 
         /*
@@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr)
         *out_ptr = 0;
 }
 
-static void zap_threads (struct mm_struct *mm)
+static void zap_process(struct task_struct *start)
 {
-        struct task_struct *g, *p;
-        struct task_struct *tsk = current;
-        struct completion *vfork_done = tsk->vfork_done;
-        int traced = 0;
+        struct task_struct *t;
 
-        /*
-         * Make sure nobody is waiting for us to release the VM,
-         * otherwise we can deadlock when we wait on each other
-         */
-        if (vfork_done) {
-                tsk->vfork_done = NULL;
-                complete(vfork_done);
-        }
+        start->signal->flags = SIGNAL_GROUP_EXIT;
+        start->signal->group_stop_count = 0;
 
-        read_lock(&tasklist_lock);
-        do_each_thread(g,p)
-                if (mm == p->mm && p != tsk) {
-                        force_sig_specific(SIGKILL, p);
-                        mm->core_waiters++;
-                        if (unlikely(p->ptrace) &&
-                            unlikely(p->parent->mm == mm))
-                                traced = 1;
+        t = start;
+        do {
+                if (t != current && t->mm) {
+                        t->mm->core_waiters++;
+                        sigaddset(&t->pending.signal, SIGKILL);
+                        signal_wake_up(t, 1);
                 }
-        while_each_thread(g,p);
+        } while ((t = next_thread(t)) != start);
+}
 
-        read_unlock(&tasklist_lock);
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+                                int exit_code)
+{
+        struct task_struct *g, *p;
+        unsigned long flags;
+        int err = -EAGAIN;
+
+        spin_lock_irq(&tsk->sighand->siglock);
+        if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+                tsk->signal->group_exit_code = exit_code;
+                zap_process(tsk);
+                err = 0;
+        }
+        spin_unlock_irq(&tsk->sighand->siglock);
+        if (err)
+                return err;
 
-        if (unlikely(traced)) {
-                /*
-                 * We are zapping a thread and the thread it ptraces.
-                 * If the tracee went into a ptrace stop for exit tracing,
-                 * we could deadlock since the tracer is waiting for this
-                 * coredump to finish. Detach them so they can both die.
-                 */
-                write_lock_irq(&tasklist_lock);
-                do_each_thread(g,p) {
-                        if (mm == p->mm && p != tsk &&
-                            p->ptrace && p->parent->mm == mm) {
-                                __ptrace_detach(p, 0);
+        if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+                goto done;
+
+        rcu_read_lock();
+        for_each_process(g) {
+                if (g == tsk->group_leader)
+                        continue;
+
+                p = g;
+                do {
+                        if (p->mm) {
+                                if (p->mm == mm) {
+                                        /*
+                                         * p->sighand can't disappear, but
+                                         * may be changed by de_thread()
+                                         */
+                                        lock_task_sighand(p, &flags);
+                                        zap_process(p);
+                                        unlock_task_sighand(p, &flags);
+                                }
+                                break;
                         }
-                } while_each_thread(g,p);
-                write_unlock_irq(&tasklist_lock);
+                } while ((p = next_thread(p)) != g);
         }
+        rcu_read_unlock();
+done:
+        return mm->core_waiters;
 }
 
-static void coredump_wait(struct mm_struct *mm)
+static int coredump_wait(int exit_code)
 {
-        DECLARE_COMPLETION(startup_done);
+        struct task_struct *tsk = current;
+        struct mm_struct *mm = tsk->mm;
+        struct completion startup_done;
+        struct completion *vfork_done;
         int core_waiters;
 
+        init_completion(&mm->core_done);
+        init_completion(&startup_done);
         mm->core_startup_done = &startup_done;
 
-        zap_threads(mm);
-        core_waiters = mm->core_waiters;
+        core_waiters = zap_threads(tsk, mm, exit_code);
         up_write(&mm->mmap_sem);
 
+        if (unlikely(core_waiters < 0))
+                goto fail;
+
+        /*
+         * Make sure nobody is waiting for us to release the VM,
+         * otherwise we can deadlock when we wait on each other
+         */
+        vfork_done = tsk->vfork_done;
+        if (vfork_done) {
+                tsk->vfork_done = NULL;
+                complete(vfork_done);
+        }
+
         if (core_waiters)
                 wait_for_completion(&startup_done);
+fail:
         BUG_ON(mm->core_waiters);
+        return core_waiters;
 }
 
 int do_coredump(long signr, int exit_code, struct pt_regs * regs)
@@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
         }
         mm->dumpable = 0;
 
-        retval = -EAGAIN;
-        spin_lock_irq(&current->sighand->siglock);
-        if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
-                current->signal->flags = SIGNAL_GROUP_EXIT;
-                current->signal->group_exit_code = exit_code;
-                current->signal->group_stop_count = 0;
-                retval = 0;
-        }
-        spin_unlock_irq(&current->sighand->siglock);
-        if (retval) {
-                up_write(&mm->mmap_sem);
+        retval = coredump_wait(exit_code);
+        if (retval < 0)
                 goto fail;
-        }
-
-        init_completion(&mm->core_done);
-        coredump_wait(mm);
 
         /*
          * Clear any false indication of pending signals that might
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index c5d02da73bc3..e0b2b43c1fdb 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
           ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2c00953d4b0b..433a213a8bd9 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -521,6 +521,26 @@ io_error:
         goto out_release;
 }
 
+#ifdef EXT2FS_DEBUG
+
+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+
+unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
+{
+        unsigned int i;
+        unsigned long sum = 0;
+
+        if (!map)
+                return (0);
+        for (i = 0; i < numchars; i++)
+                sum += nibblemap[map->b_data[i] & 0xf] +
+                        nibblemap[(map->b_data[i] >> 4) & 0xf];
+        return (sum);
+}
+
+#endif /* EXT2FS_DEBUG */
+
+/* Superblock must be locked */
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
         struct ext2_group_desc * desc;
@@ -530,7 +550,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
         unsigned long bitmap_count, x;
         struct ext2_super_block *es;
 
-        lock_super (sb);
         es = EXT2_SB(sb)->s_es;
         desc_count = 0;
         bitmap_count = 0;
@@ -554,7 +573,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
         printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
                 (long)le32_to_cpu(es->s_free_blocks_count),
                 desc_count, bitmap_count);
-        unlock_super (sb);
         return bitmap_count;
 #else
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c
deleted file mode 100644
index e9983a0dd396..000000000000
--- a/fs/ext2/bitmap.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * linux/fs/ext2/bitmap.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#ifdef EXT2FS_DEBUG
-
-#include <linux/buffer_head.h>
-
-#include "ext2.h"
-
-static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-
-unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
-{
-        unsigned int i;
-        unsigned long sum = 0;
-
-        if (!map)
-                return (0);
-        for (i = 0; i < numchars; i++)
-                sum += nibblemap[map->b_data[i] & 0xf] +
-                        nibblemap[(map->b_data[i] >> 4) & 0xf];
-        return (sum);
-}
-
-#endif /* EXT2FS_DEBUG */
-
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3c1c9aaaca6b..92ea8265d7d5 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -399,8 +399,7 @@ ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
         de = ext2_find_entry (dir, dentry, &page);
         if (de) {
                 res = le32_to_cpu(de->inode);
-                kunmap(page);
-                page_cache_release(page);
+                ext2_put_page(page);
         }
         return res;
 }
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
index c9c2e5ffa48e..7806b9e8155b 100644
--- a/fs/ext2/fsync.c
+++ b/fs/ext2/fsync.c
@@ -24,7 +24,7 @@
 
 #include "ext2.h"
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h> /* for fsync_inode_buffers() */
+#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
 
 
 /*
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index e52765219e16..308c252568c6 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -638,6 +638,7 @@ fail:
         return ERR_PTR(err);
 }
 
+/* Superblock must be locked */
 unsigned long ext2_count_free_inodes (struct super_block * sb)
 {
         struct ext2_group_desc *desc;
@@ -649,7 +650,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
         unsigned long bitmap_count = 0;
         struct buffer_head *bitmap_bh = NULL;
 
-        lock_super (sb);
         es = EXT2_SB(sb)->s_es;
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
                 unsigned x;
@@ -672,7 +672,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
         printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
                 percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter),
                 desc_count, bitmap_count);
-        unlock_super(sb);
         return desc_count;
 #else
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ee4ba759581e..d4233b2e6436 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -854,7 +854,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
         }
         if (!ext2_check_descriptors (sb)) {
                 printk ("EXT2-fs: group descriptors corrupted!\n");
-                db_count = i;
                 goto failed_mount2;
         }
         sbi->s_gdb_count = db_count;
@@ -1046,6 +1045,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
         unsigned long overhead;
         int i;
 
+        lock_super(sb);
         if (test_opt (sb, MINIX_DF))
                 overhead = 0;
         else {
@@ -1086,6 +1086,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
         buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count);
         buf->f_ffree = ext2_count_free_inodes (sb);
         buf->f_namelen = EXT2_NAME_LEN;
+        unlock_super(sb);
         return 0;
 }
 
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 77927d6938f6..96172e89ddc3 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -163,20 +163,19 @@ restart:
 #endif
 
 static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
                         unsigned int group, struct super_block * sb)
 {
-        unsigned long group_first_block, group_last_block;
+        ext3_fsblk_t group_first_block, group_last_block;
 
-        group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-                                group * EXT3_BLOCKS_PER_GROUP(sb);
+        group_first_block = ext3_group_first_block_no(sb, group);
         group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
         if ((rsv->_rsv_start > group_last_block) ||
             (rsv->_rsv_end < group_first_block))
                 return 0;
-        if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
-                || (goal + group_first_block > rsv->_rsv_end)))
+        if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+                || (grp_goal + group_first_block > rsv->_rsv_end)))
                 return 0;
         return 1;
 }
@@ -187,7 +186,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
  * Returns NULL if there are no windows or if all windows start after the goal.
  */
 static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, unsigned long goal)
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 {
         struct rb_node *n = root->rb_node;
         struct ext3_reserve_window_node *rsv;
@@ -223,7 +222,7 @@ void ext3_rsv_window_add(struct super_block *sb,
 {
         struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
         struct rb_node *node = &rsv->rsv_node;
-        unsigned int start = rsv->rsv_start;
+        ext3_fsblk_t start = rsv->rsv_start;
 
         struct rb_node ** p = &root->rb_node;
         struct rb_node * parent = NULL;
@@ -310,20 +309,20 @@ void ext3_discard_reservation(struct inode *inode)
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
-                         unsigned long block, unsigned long count,
-                         int *pdquot_freed_blocks)
+                         ext3_fsblk_t block, unsigned long count,
+                         unsigned long *pdquot_freed_blocks)
 {
         struct buffer_head *bitmap_bh = NULL;
         struct buffer_head *gd_bh;
         unsigned long block_group;
-        unsigned long bit;
+        ext3_grpblk_t bit;
         unsigned long i;
         unsigned long overflow;
         struct ext3_group_desc * desc;
         struct ext3_super_block * es;
         struct ext3_sb_info *sbi;
         int err = 0, ret;
-        unsigned group_freed;
+        ext3_grpblk_t group_freed;
 
         *pdquot_freed_blocks = 0;
         sbi = EXT3_SB(sb);
@@ -333,7 +332,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
333 block + count > le32_to_cpu(es->s_blocks_count)) { 332 block + count > le32_to_cpu(es->s_blocks_count)) {
334 ext3_error (sb, "ext3_free_blocks", 333 ext3_error (sb, "ext3_free_blocks",
335 "Freeing blocks not in datazone - " 334 "Freeing blocks not in datazone - "
336 "block = %lu, count = %lu", block, count); 335 "block = "E3FSBLK", count = %lu", block, count);
337 goto error_return; 336 goto error_return;
338 } 337 }
339 338
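
The E3FSBLK token spliced into the format strings above and below is a printk format macro from the same series; roughly:

/*
 * Assumed from the same series: a printk format for ext3_fsblk_t.
 * Adjacent string literals are concatenated by the preprocessor, so
 * "block = "E3FSBLK", count = %lu" expands to
 * "block = %lu, count = %lu"; widening ext3_fsblk_t later means
 * changing one macro instead of every format string.
 */
#define E3FSBLK "%lu"
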
@@ -369,7 +368,7 @@ do_more:
 		      sbi->s_itb_per_group))
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks in system zones - "
-			    "Block = %lu, count = %lu",
+			    "Block = "E3FSBLK", count = %lu",
 			    block, count);
 
 	/*
@@ -453,7 +452,8 @@ do_more:
 				bit + i, bitmap_bh->b_data)) {
 			jbd_unlock_bh_state(bitmap_bh);
 			ext3_error(sb, __FUNCTION__,
-				"bit already cleared for block %lu", block + i);
+				"bit already cleared for block "E3FSBLK,
+				 block + i);
 			jbd_lock_bh_state(bitmap_bh);
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -493,10 +493,10 @@ error_return:
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks(handle_t *handle, struct inode *inode,
-			unsigned long block, unsigned long count)
+			ext3_fsblk_t block, unsigned long count)
 {
 	struct super_block * sb;
-	int dquot_freed_blocks;
+	unsigned long dquot_freed_blocks;
 
 	sb = inode->i_sb;
 	if (!sb) {
@@ -525,7 +525,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 {
 	int ret;
 	struct journal_head *jh = bh2jh(bh);
@@ -542,11 +542,11 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
 	return ret;
 }
 
-static int
-bitmap_search_next_usable_block(int start, struct buffer_head *bh,
-			int maxblocks)
+static ext3_grpblk_t
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
 {
-	int next;
+	ext3_grpblk_t next;
 	struct journal_head *jh = bh2jh(bh);
 
 	/*
@@ -576,10 +576,11 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh,
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+static ext3_grpblk_t
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
 {
-	int here, next;
+	ext3_grpblk_t here, next;
 	char *p, *r;
 
 	if (start > 0) {
@@ -591,7 +592,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
 		 * next 64-bit boundary is simple..
 		 */
-		int end_goal = (start + 63) & ~63;
+		ext3_grpblk_t end_goal = (start + 63) & ~63;
 		if (end_goal > maxblocks)
 			end_goal = maxblocks;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
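
The "(start + 63) & ~63" above rounds the goal up to the next 64-bit bitmap word, so the zero-bit search that follows scans whole words; a standalone demo of the idiom:

#include <stdio.h>

/* Round n up to the next multiple of 64, as the hunk above does for
 * the bitmap goal; works because 64 is a power of two. */
static int align64(int n)
{
	return (n + 63) & ~63;
}

int main(void)
{
	int samples[] = { 0, 1, 63, 64, 65 };	/* -> 0, 64, 64, 64, 128 */
	for (int i = 0; i < 5; i++)
		printf("%d -> %d\n", samples[i], align64(samples[i]));
	return 0;
}
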
@@ -628,7 +629,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
  * zero (failure).
  */
 static inline int
-claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 {
 	struct journal_head *jh = bh2jh(bh);
 	int ret;
@@ -651,19 +652,18 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
  * new bitmap. In that case we must release write access to the old one via
  * ext3_journal_release_buffer(), else we'll run out of credits.
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-			struct buffer_head *bitmap_bh, int goal,
+			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
 			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
-	int group_first_block, start, end;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t start, end;
 	unsigned long num = 0;
 
 	/* we do allocation within the reservation window if we have a window */
 	if (my_rsv) {
-		group_first_block =
-			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+		group_first_block = ext3_group_first_block_no(sb, group);
 		if (my_rsv->_rsv_start >= group_first_block)
 			start = my_rsv->_rsv_start - group_first_block;
 		else
@@ -673,13 +673,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 		if (end > EXT3_BLOCKS_PER_GROUP(sb))
 			/* reservation window crosses group boundary */
 			end = EXT3_BLOCKS_PER_GROUP(sb);
-		if ((start <= goal) && (goal < end))
-			start = goal;
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
 		else
-			goal = -1;
+			grp_goal = -1;
 	} else {
-		if (goal > 0)
-			start = goal;
+		if (grp_goal > 0)
+			start = grp_goal;
 		else
 			start = 0;
 		end = EXT3_BLOCKS_PER_GROUP(sb);
@@ -688,43 +688,43 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
 
 repeat:
-	if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
-		goal = find_next_usable_block(start, bitmap_bh, end);
-		if (goal < 0)
+	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
 			goto fail_access;
 		if (!my_rsv) {
 			int i;
 
-			for (i = 0; i < 7 && goal > start &&
-					ext3_test_allocatable(goal - 1,
+			for (i = 0; i < 7 && grp_goal > start &&
+					ext3_test_allocatable(grp_goal - 1,
 						bitmap_bh);
-			     i++, goal--)
+			     i++, grp_goal--)
 				;
 		}
 	}
-	start = goal;
+	start = grp_goal;
 
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
 		 */
 		start++;
-		goal++;
+		grp_goal++;
 		if (start >= end)
 			goto fail_access;
 		goto repeat;
 	}
 	num++;
-	goal++;
-	while (num < *count && goal < end
-	       && ext3_test_allocatable(goal, bitmap_bh)
-	       && claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	grp_goal++;
+	while (num < *count && grp_goal < end
+	       && ext3_test_allocatable(grp_goal, bitmap_bh)
+	       && claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		num++;
-		goal++;
+		grp_goal++;
 	}
 	*count = num;
-	return goal - num;
+	return grp_goal - num;
 fail_access:
 	*count = num;
 	return -1;
@@ -766,12 +766,13 @@ fail_access:
 static int find_next_reservable_window(
 				struct ext3_reserve_window_node *search_head,
 				struct ext3_reserve_window_node *my_rsv,
-				struct super_block * sb, int start_block,
-				int last_block)
+				struct super_block * sb,
+				ext3_fsblk_t start_block,
+				ext3_fsblk_t last_block)
 {
 	struct rb_node *next;
 	struct ext3_reserve_window_node *rsv, *prev;
-	int cur;
+	ext3_fsblk_t cur;
 	int size = my_rsv->rsv_goal_size;
 
 	/* TODO: make the start of the reservation window byte-aligned */
@@ -873,10 +874,10 @@ static int find_next_reservable_window(
 *
 *	@rsv: the reservation
 *
-*	@goal: The goal (group-relative). It is where the search for a
+*	@grp_goal: The goal (group-relative). It is where the search for a
 *		free reservable space should start from.
-*		if we have a goal(goal >0 ), then start from there,
-*		no goal(goal = -1), we start from the first block
+*		if we have a grp_goal(grp_goal >0 ), then start from there,
+*		no grp_goal(grp_goal = -1), we start from the first block
 *		of the group.
 *
 *	@sb: the super block
@@ -885,25 +886,24 @@ static int find_next_reservable_window(
 *
 */
 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
-		int goal, struct super_block *sb,
+		ext3_grpblk_t grp_goal, struct super_block *sb,
 		unsigned int group, struct buffer_head *bitmap_bh)
 {
 	struct ext3_reserve_window_node *search_head;
-	int group_first_block, group_end_block, start_block;
-	int first_free_block;
+	ext3_fsblk_t group_first_block, group_end_block, start_block;
+	ext3_grpblk_t first_free_block;
 	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
 	unsigned long size;
 	int ret;
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
-	if (goal < 0)
+	if (grp_goal < 0)
 		start_block = group_first_block;
 	else
-		start_block = goal + group_first_block;
+		start_block = grp_goal + group_first_block;
 
 	size = my_rsv->rsv_goal_size;
 
@@ -1057,14 +1057,15 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 *	sorted double linked list should be fast.
 *
 */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			unsigned int group, struct buffer_head *bitmap_bh,
-			int goal, struct ext3_reserve_window_node * my_rsv,
+			ext3_grpblk_t grp_goal,
+			struct ext3_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
 {
-	unsigned long group_first_block;
-	int ret = 0;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t ret = 0;
 	int fatal;
 	unsigned long num = *count;
 
@@ -1090,17 +1091,16 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	if (my_rsv == NULL ) {
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-						goal, count, NULL);
+						grp_goal, count, NULL);
 		goto out;
 	}
 	/*
-	 * goal is a group relative block number (if there is a goal)
-	 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
	 * first block is a filesystem wide block number
	 * first block is the block number of the first block in this group
	 */
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 
 	/*
	 * Basically we will allocate a new block from inode's reservation
@@ -1119,24 +1119,24 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	while (1) {
 		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
+			!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) {
 			if (my_rsv->rsv_goal_size < *count)
 				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, goal, sb,
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
 							group, bitmap_bh);
 			if (ret < 0)
 				break;			/* failed */
 
-			if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
-				goal = -1;
-		} else if (goal > 0 && (my_rsv->rsv_end-goal+1) < *count)
+			if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count)
 			try_to_extend_reservation(my_rsv, sb,
-					*count-my_rsv->rsv_end + goal - 1);
+					*count-my_rsv->rsv_end + grp_goal - 1);
 
 		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
 		    || (my_rsv->rsv_end < group_first_block))
 			BUG();
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal,
 					   &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
@@ -1164,7 +1164,7 @@ out:
 
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-	int free_blocks, root_blocks;
+	ext3_fsblk_t free_blocks, root_blocks;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
@@ -1200,19 +1200,20 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
 * bitmap, and then for any free bit if that fails.
 * This function also updates quota and i_blocks field.
 */
-int ext3_new_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, unsigned long *count, int *errp)
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
 	int group_no;
 	int goal_group;
-	int ret_block;
+	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block */
+	ext3_fsblk_t ret_block;		/* filesystem-wide allocated block */
 	int bgi;			/* blockgroup iteration index */
-	int target_block;
 	int fatal = 0, err;
 	int performed_allocation = 0;
-	int free_blocks;
+	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -1285,16 +1286,17 @@ retry:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-				bitmap_bh, ret_block, my_rsv, &num, &fatal);
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+				group_no, bitmap_bh, grp_target_blk,
+				my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 
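
The modulo above is one half of mapping a filesystem-wide goal onto a (group, group-relative block) pair; a standalone sketch of both halves, with illustrative geometry rather than values read from a real superblock:

#include <stdio.h>

typedef unsigned long ext3_fsblk_t;
typedef int ext3_grpblk_t;

int main(void)
{
	/* Illustrative: 32768 blocks per group, first data block 1
	 * (typical for 1KB blocks; 4KB filesystems start at 0). */
	const unsigned long blocks_per_group = 32768;
	const unsigned long first_data_block = 1;
	ext3_fsblk_t goal = 100000;	/* filesystem-wide goal block */

	unsigned long group_no = (goal - first_data_block) / blocks_per_group;
	ext3_grpblk_t grp_target_blk =
		(goal - first_data_block) % blocks_per_group;

	/* prints: group 3, group-relative block 1695 */
	printf("group %lu, group-relative block %d\n",
	       group_no, grp_target_blk);
	return 0;
}
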
@@ -1327,11 +1329,15 @@ retry:
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-				bitmap_bh, -1, my_rsv, &num, &fatal);
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+				group_no, bitmap_bh, -1, my_rsv,
+				&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 	/*
@@ -1360,18 +1366,18 @@ allocated:
 	if (fatal)
 		goto out;
 
-	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
-			+ le32_to_cpu(es->s_first_data_block);
+	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
 
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) ||
-	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
 		     EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
 		     EXT3_SB(sb)->s_itb_per_group))
 		ext3_error(sb, "ext3_new_block",
 			    "Allocating block in system zone - "
-			    "blocks from %u, length %lu", target_block, num);
+			    "blocks from "E3FSBLK", length %lu",
+			     ret_block, num);
 
 	performed_allocation = 1;
 
@@ -1380,7 +1386,7 @@ allocated:
 		struct buffer_head *debug_bh;
 
 		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, target_block);
+		debug_bh = sb_find_get_block(sb, ret_block);
 		if (debug_bh) {
 			BUFFER_TRACE(debug_bh, "state when allocated");
 			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1393,24 +1399,21 @@ allocated:
 		int i;
 
 		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(ret_block,
+			if (ext3_test_bit(grp_alloc_blk+i,
 					bh2jh(bitmap_bh)->b_committed_data)) {
 				printk("%s: block was unexpectedly set in "
 					"b_committed_data\n", __FUNCTION__);
 			}
 		}
 	}
-	ext3_debug("found bit %d\n", ret_block);
+	ext3_debug("found bit %d\n", grp_alloc_blk);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	jbd_unlock_bh_state(bitmap_bh);
 #endif
 
-	/* ret_block was blockgroup-relative. Now it becomes fs-relative */
-	ret_block = target_block;
-
 	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
-			    "block(%d) >= blocks count(%d) - "
+			    "block("E3FSBLK") >= blocks count(%d) - "
 			    "block_group = %d, es == %p ", ret_block,
 			    le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto out;
@@ -1421,7 +1424,7 @@ allocated:
 	 * list of some description. We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
@@ -1461,23 +1464,24 @@ out:
 	return 0;
 }
 
-int ext3_new_block(handle_t *handle, struct inode *inode,
-			unsigned long goal, int *errp)
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp)
 {
 	unsigned long count = 1;
 
 	return ext3_new_blocks(handle, inode, goal, &count, errp);
 }
 
-unsigned long ext3_count_free_blocks(struct super_block *sb)
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 {
-	unsigned long desc_count;
+	ext3_fsblk_t desc_count;
 	struct ext3_group_desc *gdp;
 	int i;
 	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
-	unsigned long bitmap_count, x;
+	ext3_fsblk_t bitmap_count;
+	unsigned long x;
 	struct buffer_head *bitmap_bh = NULL;
 
 	es = EXT3_SB(sb)->s_es;
@@ -1502,8 +1506,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
-		le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+	printk("ext3_count_free_blocks: stored = "E3FSBLK
+		", computed = "E3FSBLK", "E3FSBLK"\n",
+		le32_to_cpu(es->s_free_blocks_count),
+		desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
@@ -1520,7 +1526,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 }
 
 static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
+block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 {
 	return ext3_test_bit ((block -
 		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dc826464f313..36546ed36a14 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -262,9 +262,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
 	int freei, avefreei;
-	int freeb, avefreeb;
-	int blocks_per_dir, ndirs;
-	int max_debt, max_dirs, min_blocks, min_inodes;
+	ext3_fsblk_t freeb, avefreeb;
+	ext3_fsblk_t blocks_per_dir;
+	int ndirs;
+	int max_debt, max_dirs, min_inodes;
+	ext3_grpblk_t min_blocks;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
@@ -307,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	min_inodes = avefreei - inodes_per_group / 4;
 	min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
 
-	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
+	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
 	if (max_debt * INODE_COST > inodes_per_group)
 		max_debt = inodes_per_group / INODE_COST;
 	if (max_debt > 255)
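
The new cast on BLOCK_COST is needed because the kernel's max() rejects mismatched argument types at compile time; roughly (a sketch of the classic type-checked form, not copied from this tree):

/* Comparing &_x and &_y makes the compiler warn when x and y have
 * different types, so an int constant next to an ext3_fsblk_t now
 * needs an explicit cast, as in the hunk above. */
#define max(x, y) ({			\
	typeof(x) _x = (x);		\
	typeof(y) _y = (y);		\
	(void) (&_x == &_y);		\
	_x > _y ? _x : _y; })
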
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2edd7eec88fd..0321e1b9034a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
 * still needs to be revoked.
 */
 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, int blocknr)
+			struct buffer_head *bh, ext3_fsblk_t blocknr)
 {
 	int err;
 
@@ -407,13 +407,13 @@ no_block:
 *
 *	Caller must make sure that @ind is valid and will stay that way.
 */
-static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
 	__le32 *p;
-	unsigned long bg_start;
-	unsigned long colour;
+	ext3_fsblk_t bg_start;
+	ext3_grpblk_t colour;
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -429,8 +429,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
 	colour = (current->pid % 16) *
 			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 	return bg_start + colour;
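
The colour term spreads concurrent writers across a block group: each of 16 pid classes begins its search one sixteenth of a group further in, so unrelated processes tend not to contend for the same free run. A standalone sketch with illustrative numbers:

#include <stdio.h>

int main(void)
{
	const unsigned long blocks_per_group = 32768;	/* illustrative */
	unsigned long bg_start = 98305;	/* first block of some group */

	for (int pid = 100; pid < 104; pid++) {
		unsigned long colour =
			(pid % 16) * (blocks_per_group / 16);
		printf("pid %d starts searching at block %lu\n",
		       pid, bg_start + colour);
	}
	return 0;
}
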
@@ -448,7 +447,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 *	stores it in *@goal and returns zero.
 */
 
-static unsigned long ext3_find_goal(struct inode *inode, long block,
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 		Indirect chain[4], Indirect *partial)
 {
 	struct ext3_block_alloc_info *block_i;
@@ -516,13 +515,13 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 *	direct blocks
 */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, int indirect_blks, int blks,
-			unsigned long long new_blocks[4], int *err)
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[4], int *err)
 {
 	int target, i;
 	unsigned long count = 0;
 	int index = 0;
-	unsigned long current_block = 0;
+	ext3_fsblk_t current_block = 0;
 	int ret = 0;
 
 	/*
@@ -592,7 +591,7 @@ failed_out:
 *	as described above and return 0.
 */
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, unsigned long goal,
+			int indirect_blks, int *blks, ext3_fsblk_t goal,
 			int *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
@@ -600,8 +599,8 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 	int err = 0;
 	struct buffer_head *bh;
 	int num;
-	unsigned long long new_blocks[4];
-	unsigned long long current_block;
+	ext3_fsblk_t new_blocks[4];
+	ext3_fsblk_t current_block;
 
 	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
 				*blks, new_blocks, &err);
@@ -688,7 +687,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	int i;
 	int err = 0;
 	struct ext3_block_alloc_info *block_i;
-	unsigned long current_block;
+	ext3_fsblk_t current_block;
 
 	block_i = EXT3_I(inode)->i_block_alloc_info;
 	/*
@@ -795,13 +794,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
-	unsigned long goal;
+	ext3_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	int count = 0;
-	unsigned long first_block = 0;
+	ext3_fsblk_t first_block = 0;
 
 
 	J_ASSERT(handle != NULL || create == 0);
@@ -819,7 +818,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
-			unsigned long blk;
+			ext3_fsblk_t blk;
 
 			if (!verify_chain(chain, partial)) {
 				/*
@@ -1759,7 +1758,7 @@ void ext3_set_aops(struct inode *inode)
 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
-	unsigned long index = from >> PAGE_CACHE_SHIFT;
+	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
 	struct inode *inode = mapping->host;
@@ -1960,7 +1959,7 @@ no_top:
 * than `count' because there can be holes in there.
 */
 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, unsigned long block_to_free,
+		struct buffer_head *bh, ext3_fsblk_t block_to_free,
 		unsigned long count, __le32 *first, __le32 *last)
 {
 	__le32 *p;
@@ -2022,12 +2021,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 			   struct buffer_head *this_bh,
 			   __le32 *first, __le32 *last)
 {
-	unsigned long block_to_free = 0;    /* Starting block # of a run */
+	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
 	unsigned long count = 0;	    /* Number of blocks in the run */
 	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
					       corresponding to
					       block_to_free */
-	unsigned long nr;		    /* Current block # */
+	ext3_fsblk_t nr;		    /* Current block # */
 	__le32 *p;			    /* Pointer into inode/ind
					       for current block */
 	int err;
@@ -2089,7 +2088,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			       struct buffer_head *parent_bh,
 			       __le32 *first, __le32 *last, int depth)
 {
-	unsigned long nr;
+	ext3_fsblk_t nr;
 	__le32 *p;
 
 	if (is_handle_aborted(handle))
@@ -2113,7 +2112,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			 */
 			if (!bh) {
 				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%ld, block=%ld",
+					   "Read failure, inode=%ld, block="E3FSBLK,
 					   inode->i_ino, nr);
 				continue;
 			}
@@ -2394,11 +2393,12 @@ out_stop:
 	ext3_journal_stop(handle);
 }
 
-static unsigned long ext3_get_inode_block(struct super_block *sb,
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
 		unsigned long ino, struct ext3_iloc *iloc)
 {
 	unsigned long desc, group_desc, block_group;
-	unsigned long offset, block;
+	unsigned long offset;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 	struct ext3_group_desc * gdp;
 
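
For orientation, ext3_get_inode_block (body unchanged here) turns an inode number into a disk block in three steps: pick the group, find the byte offset of the inode in that group's table, then convert the offset to a block. A standalone sketch of the arithmetic with illustrative geometry:

#include <stdio.h>

typedef unsigned long ext3_fsblk_t;

int main(void)
{
	/* Illustrative: 4KB blocks, 128-byte inodes, 16384 inodes per
	 * group, this group's inode table starting at block 540. */
	const unsigned long inodes_per_group = 16384;
	const unsigned long inode_size = 128;
	const unsigned long block_size = 4096;
	const ext3_fsblk_t inode_table = 540;
	unsigned long ino = 20000;	/* inodes are numbered from 1 */

	unsigned long block_group = (ino - 1) / inodes_per_group;
	unsigned long offset = ((ino - 1) % inodes_per_group) * inode_size;
	ext3_fsblk_t block = inode_table + offset / block_size;

	/* prints: inode 20000: group 1, block 652, offset 3968 in block */
	printf("inode %lu: group %lu, block %lu, offset %lu in block\n",
	       ino, block_group, block, offset % block_size);
	return 0;
}
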
@@ -2448,7 +2448,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb,
 static int __ext3_get_inode_loc(struct inode *inode,
 				struct ext3_iloc *iloc, int in_mem)
 {
-	unsigned long block;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 
 	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
@@ -2459,7 +2459,8 @@ static int __ext3_get_inode_loc(struct inode *inode,
 	if (!bh) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 				"unable to read inode block - "
-				"inode=%lu, block=%lu", inode->i_ino, block);
+				"inode=%lu, block="E3FSBLK,
+				inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -2540,7 +2541,7 @@ make_io:
 		if (!buffer_uptodate(bh)) {
 			ext3_error(inode->i_sb, "ext3_get_inode_loc",
 				   "unable to read inode block - "
-				   "inode=%lu, block=%lu",
+				   "inode=%lu, block="E3FSBLK,
 				   inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8c22aa9a7fbb..3a6b012d120c 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -204,7 +204,7 @@ flags_err:
 		return 0;
 	}
 	case EXT3_IOC_GROUP_EXTEND: {
-		unsigned long n_blocks_count;
+		ext3_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
 		int err;
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b8f5cd1e540d..d9176dba3698 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1379,7 +1379,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 	int dx_fallback=0;
 #endif
 	unsigned blocksize;
-	unsigned nlen, rlen;
 	u32 block, blocks;
 
 	sb = dir->i_sb;
@@ -1417,8 +1416,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(rlen = blocksize);
-	nlen = 0;
+	de->rec_len = cpu_to_le16(blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 34b39e9a1e5a..dfd811895d8f 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -28,16 +28,16 @@ static int verify_group_input(struct super_block *sb,
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned start = le32_to_cpu(es->s_blocks_count);
-	unsigned end = start + input->blocks_count;
+	ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
+	ext3_fsblk_t end = start + input->blocks_count;
 	unsigned group = input->group;
-	unsigned itend = input->inode_table + sbi->s_itb_per_group;
+	ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
 	unsigned overhead = ext3_bg_has_super(sb, group) ?
 		(1 + ext3_bg_num_gdb(sb, group) +
 		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-	unsigned metaend = start + overhead;
+	ext3_fsblk_t metaend = start + overhead;
 	struct buffer_head *bh = NULL;
-	int free_blocks_count;
+	ext3_grpblk_t free_blocks_count;
 	int err = -EINVAL;
 
 	input->free_blocks_count = free_blocks_count =
@@ -64,7 +64,8 @@ static int verify_group_input(struct super_block *sb,
 		ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
 			     input->blocks_count);
 	else if (!(bh = sb_bread(sb, end - 1)))
-		ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
+		ext3_warning(sb, __FUNCTION__,
+			     "Cannot read last block ("E3FSBLK")",
 			     end - 1);
 	else if (outside(input->block_bitmap, start, end))
 		ext3_warning(sb, __FUNCTION__,
@@ -77,7 +78,7 @@ static int verify_group_input(struct super_block *sb,
 	else if (outside(input->inode_table, start, end) ||
 		 outside(itend - 1, start, end))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table not in group (blocks %u-%u)",
+			     "Inode table not in group (blocks %u-"E3FSBLK")",
 			     input->inode_table, itend - 1);
 	else if (input->inode_bitmap == input->block_bitmap)
 		ext3_warning(sb, __FUNCTION__,
@@ -85,24 +86,27 @@ static int verify_group_input(struct super_block *sb,
 			     input->block_bitmap);
 	else if (inside(input->block_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in inode table (%u-%u)",
+			     "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->block_bitmap, input->inode_table, itend-1);
 	else if (inside(input->inode_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in inode table (%u-%u)",
+			     "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->inode_bitmap, input->inode_table, itend-1);
 	else if (inside(input->block_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in GDT table (%u-%u)",
+			     "Block bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->block_bitmap, start, metaend - 1);
 	else if (inside(input->inode_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in GDT table (%u-%u)",
+			     "Inode bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_bitmap, start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
 		 inside(itend - 1, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table (%u-%u) overlaps GDT table (%u-%u)",
+			     "Inode table (%u-"E3FSBLK") overlaps"
+			     " GDT table ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_table, itend - 1, start, metaend - 1);
 	else
 		err = 0;
@@ -112,7 +116,7 @@ static int verify_group_input(struct super_block *sb,
 }
 
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
-				  unsigned long blk)
+				  ext3_fsblk_t blk)
 {
 	struct buffer_head *bh;
 	int err;
@@ -163,15 +167,14 @@ static int setup_new_group_blocks(struct super_block *sb,
 			struct ext3_new_group_data *input)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long start = input->group * sbi->s_blocks_per_group +
-		le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
 	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
 		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
 	unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
 	struct buffer_head *bh;
 	handle_t *handle;
-	unsigned long block;
-	int bit;
+	ext3_fsblk_t block;
+	ext3_grpblk_t bit;
 	int i;
 	int err = 0, err2;
 
@@ -328,7 +331,7 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
 static int verify_reserved_gdb(struct super_block *sb,
 			       struct buffer_head *primary)
 {
-	const unsigned long blk = primary->b_blocknr;
+	const ext3_fsblk_t blk = primary->b_blocknr;
 	const unsigned long end = EXT3_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
@@ -340,7 +343,8 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %ld missing grp %d (%ld)",
+				     "reserved GDT "E3FSBLK
+				     " missing grp %d ("E3FSBLK")",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -372,7 +376,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	struct super_block *sb = inode->i_sb;
 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
 	unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-	unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
 	struct buffer_head **o_group_desc, **n_group_desc;
 	struct buffer_head *dind;
 	int gdbackups;
@@ -417,7 +421,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	data = (__u32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
 		ext3_warning(sb, __FUNCTION__,
-			     "new group %u GDT block %lu not reserved",
+			     "new group %u GDT block "E3FSBLK" not reserved",
 			     input->group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
@@ -515,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	struct buffer_head **primary;
 	struct buffer_head *dind;
 	struct ext3_iloc iloc;
-	unsigned long blk;
+	ext3_fsblk_t blk;
 	__u32 *data, *end;
 	int gdbackups = 0;
 	int res, i;
@@ -540,7 +544,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	for (res = 0; res < reserved_gdb; res++, blk++) {
 		if (le32_to_cpu(*data) != blk) {
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved block %lu not at offset %ld",
+				     "reserved block "E3FSBLK
+				     " not at offset %ld",
 				     blk, (long)(data - (__u32 *)dind->b_data));
 			err = -EINVAL;
 			goto exit_bh;
@@ -902,15 +907,16 @@ exit_put:
 * GDT blocks are reserved to grow to the desired size.
 */
 int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
-		      unsigned long n_blocks_count)
+		      ext3_fsblk_t n_blocks_count)
 {
-	unsigned long o_blocks_count;
+	ext3_fsblk_t o_blocks_count;
 	unsigned long o_groups_count;
-	unsigned long last;
-	int add;
+	ext3_grpblk_t last;
+	ext3_grpblk_t add;
 	struct buffer_head * bh;
 	handle_t *handle;
-	int err, freed_blocks;
+	int err;
+	unsigned long freed_blocks;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -919,12 +925,22 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	o_groups_count = EXT3_SB(sb)->s_groups_count;
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" to "E3FSBLK" blocks\n",
 		       o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
 		return 0;
 
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+			" too large to resize to %lu blocks safely\n",
+			sb->s_id, n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext3_warning(sb, __FUNCTION__,
+				     "CONFIG_LBD not enabled\n");
+		return -EINVAL;
+	}
+
 	if (n_blocks_count < o_blocks_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "can't shrink FS - resize aborted");
@@ -948,7 +964,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	if (o_blocks_count + add < n_blocks_count)
 		ext3_warning(sb, __FUNCTION__,
-			     "will only finish group (%lu blocks, %u new)",
+			     "will only finish group ("E3FSBLK
+			     " blocks, %u new)",
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
@@ -991,10 +1008,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a60cc6ec130f..b7483360a2db 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -630,7 +630,7 @@ enum {
630 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 630 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
631 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, 631 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
632 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 632 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
633 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, 633 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
634 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 634 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
635 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 635 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
636 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 636 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -666,6 +666,7 @@ static match_table_t tokens = {
666 {Opt_noreservation, "noreservation"}, 666 {Opt_noreservation, "noreservation"},
667 {Opt_noload, "noload"}, 667 {Opt_noload, "noload"},
668 {Opt_nobh, "nobh"}, 668 {Opt_nobh, "nobh"},
669 {Opt_bh, "bh"},
669 {Opt_commit, "commit=%u"}, 670 {Opt_commit, "commit=%u"},
670 {Opt_journal_update, "journal=update"}, 671 {Opt_journal_update, "journal=update"},
671 {Opt_journal_inum, "journal=%u"}, 672 {Opt_journal_inum, "journal=%u"},
@@ -689,14 +690,15 @@ static match_table_t tokens = {
689 {Opt_resize, "resize"}, 690 {Opt_resize, "resize"},
690}; 691};
691 692
692static unsigned long get_sb_block(void **data) 693static ext3_fsblk_t get_sb_block(void **data)
693{ 694{
694 unsigned long sb_block; 695 ext3_fsblk_t sb_block;
695 char *options = (char *) *data; 696 char *options = (char *) *data;
696 697
697 if (!options || strncmp(options, "sb=", 3) != 0) 698 if (!options || strncmp(options, "sb=", 3) != 0)
698 return 1; /* Default location */ 699 return 1; /* Default location */
699 options += 3; 700 options += 3;
701 /* todo: use simple_strtoll with >32bit ext3 */
700 sb_block = simple_strtoul(options, &options, 0); 702 sb_block = simple_strtoul(options, &options, 0);
701 if (*options && *options != ',') { 703 if (*options && *options != ',') {
702 printk("EXT3-fs: Invalid sb specification: %s\n", 704 printk("EXT3-fs: Invalid sb specification: %s\n",
@@ -711,7 +713,7 @@ static unsigned long get_sb_block(void **data)
711 713
712static int parse_options (char *options, struct super_block *sb, 714static int parse_options (char *options, struct super_block *sb,
713 unsigned long *inum, unsigned long *journal_devnum, 715 unsigned long *inum, unsigned long *journal_devnum,
714 unsigned long *n_blocks_count, int is_remount) 716 ext3_fsblk_t *n_blocks_count, int is_remount)
715{ 717{
716 struct ext3_sb_info *sbi = EXT3_SB(sb); 718 struct ext3_sb_info *sbi = EXT3_SB(sb);
717 char * p; 719 char * p;
@@ -1013,6 +1015,9 @@ clear_qf_name:
1013 case Opt_nobh: 1015 case Opt_nobh:
1014 set_opt(sbi->s_mount_opt, NOBH); 1016 set_opt(sbi->s_mount_opt, NOBH);
1015 break; 1017 break;
1018 case Opt_bh:
1019 clear_opt(sbi->s_mount_opt, NOBH);
1020 break;
1016 default: 1021 default:
1017 printk (KERN_ERR 1022 printk (KERN_ERR
1018 "EXT3-fs: Unrecognized mount option \"%s\" " 1023 "EXT3-fs: Unrecognized mount option \"%s\" "
@@ -1128,7 +1133,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1128static int ext3_check_descriptors (struct super_block * sb) 1133static int ext3_check_descriptors (struct super_block * sb)
1129{ 1134{
1130 struct ext3_sb_info *sbi = EXT3_SB(sb); 1135 struct ext3_sb_info *sbi = EXT3_SB(sb);
1131 unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); 1136 ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block);
1132 struct ext3_group_desc * gdp = NULL; 1137 struct ext3_group_desc * gdp = NULL;
1133 int desc_block = 0; 1138 int desc_block = 0;
1134 int i; 1139 int i;
@@ -1315,15 +1320,14 @@ static loff_t ext3_max_size(int bits)
1315 return res; 1320 return res;
1316} 1321}
1317 1322
1318static unsigned long descriptor_loc(struct super_block *sb, 1323static ext3_fsblk_t descriptor_loc(struct super_block *sb,
1319 unsigned long logic_sb_block, 1324 ext3_fsblk_t logic_sb_block,
1320 int nr) 1325 int nr)
1321{ 1326{
1322 struct ext3_sb_info *sbi = EXT3_SB(sb); 1327 struct ext3_sb_info *sbi = EXT3_SB(sb);
1323 unsigned long bg, first_data_block, first_meta_bg; 1328 unsigned long bg, first_meta_bg;
1324 int has_super = 0; 1329 int has_super = 0;
1325 1330
1326 first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1327 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); 1331 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1328 1332
1329 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || 1333 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
@@ -1332,7 +1336,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
1332 bg = sbi->s_desc_per_block * nr; 1336 bg = sbi->s_desc_per_block * nr;
1333 if (ext3_bg_has_super(sb, bg)) 1337 if (ext3_bg_has_super(sb, bg))
1334 has_super = 1; 1338 has_super = 1;
1335 return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); 1339 return (has_super + ext3_group_first_block_no(sb, bg));
1336} 1340}
1337 1341
1338 1342
@@ -1341,9 +1345,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1341 struct buffer_head * bh; 1345 struct buffer_head * bh;
1342 struct ext3_super_block *es = NULL; 1346 struct ext3_super_block *es = NULL;
1343 struct ext3_sb_info *sbi; 1347 struct ext3_sb_info *sbi;
1344 unsigned long block; 1348 ext3_fsblk_t block;
1345 unsigned long sb_block = get_sb_block(&data); 1349 ext3_fsblk_t sb_block = get_sb_block(&data);
1346 unsigned long logic_sb_block; 1350 ext3_fsblk_t logic_sb_block;
1347 unsigned long offset = 0; 1351 unsigned long offset = 0;
1348 unsigned long journal_inum = 0; 1352 unsigned long journal_inum = 0;
1349 unsigned long journal_devnum = 0; 1353 unsigned long journal_devnum = 0;
@@ -1565,6 +1569,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1565 goto failed_mount; 1569 goto failed_mount;
1566 } 1570 }
1567 1571
1572 if (le32_to_cpu(es->s_blocks_count) >
1573 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1574 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
1575 " too large to mount safely\n", sb->s_id);
1576 if (sizeof(sector_t) < 8)
1577 printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
1578 "enabled\n");
1579 goto failed_mount;
1580 }
1581
1568 if (EXT3_BLOCKS_PER_GROUP(sb) == 0) 1582 if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
1569 goto cantfind_ext3; 1583 goto cantfind_ext3;
1570 sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - 1584 sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
@@ -1593,7 +1607,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1593 } 1607 }
1594 } 1608 }
1595 if (!ext3_check_descriptors (sb)) { 1609 if (!ext3_check_descriptors (sb)) {
1596 printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); 1610 printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
1597 goto failed_mount2; 1611 goto failed_mount2;
1598 } 1612 }
1599 sbi->s_gdb_count = db_count; 1613 sbi->s_gdb_count = db_count;
@@ -1830,10 +1844,10 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
1830{ 1844{
1831 struct buffer_head * bh; 1845 struct buffer_head * bh;
1832 journal_t *journal; 1846 journal_t *journal;
1833 int start; 1847 ext3_fsblk_t start;
1834 int len; 1848 ext3_fsblk_t len;
1835 int hblock, blocksize; 1849 int hblock, blocksize;
1836 unsigned long sb_block; 1850 ext3_fsblk_t sb_block;
1837 unsigned long offset; 1851 unsigned long offset;
1838 struct ext3_super_block * es; 1852 struct ext3_super_block * es;
1839 struct block_device *bdev; 1853 struct block_device *bdev;
@@ -2206,7 +2220,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2206{ 2220{
2207 struct ext3_super_block * es; 2221 struct ext3_super_block * es;
2208 struct ext3_sb_info *sbi = EXT3_SB(sb); 2222 struct ext3_sb_info *sbi = EXT3_SB(sb);
2209 unsigned long n_blocks_count = 0; 2223 ext3_fsblk_t n_blocks_count = 0;
2210 unsigned long old_sb_flags; 2224 unsigned long old_sb_flags;
2211 struct ext3_mount_options old_opts; 2225 struct ext3_mount_options old_opts;
2212 int err; 2226 int err;
@@ -2326,7 +2340,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2326 struct super_block *sb = dentry->d_sb; 2340 struct super_block *sb = dentry->d_sb;
2327 struct ext3_sb_info *sbi = EXT3_SB(sb); 2341 struct ext3_sb_info *sbi = EXT3_SB(sb);
2328 struct ext3_super_block *es = sbi->s_es; 2342 struct ext3_super_block *es = sbi->s_es;
2329 unsigned long overhead; 2343 ext3_fsblk_t overhead;
2330 int i; 2344 int i;
2331 2345
2332 if (test_opt (sb, MINIX_DF)) 2346 if (test_opt (sb, MINIX_DF))
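
Most of the super.c churn is mechanical: block numbers move from unsigned long to ext3_fsblk_t so the same code stays correct as the type widens. The one spot the typedef alone cannot fix is flagged by the new todo in get_sb_block(): simple_strtoul() parses into an unsigned long, so on a 32-bit kernel an sb= value above 2^32-1 wraps before it ever reaches the wider type. A user-space sketch of that failure mode, with the uint32_t cast modelling a 32-bit unsigned long (this is ordinary libc code, not the kernel helpers):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main(void)
{
	const char *opt = "8589934592";	/* 2^33: needs more than 32 bits */
	/* the cast models simple_strtoul() accumulating into a 32-bit
	 * unsigned long: the value wraps modulo 2^32, here to 0 */
	uint64_t wrapped = (uint32_t)strtoull(opt, NULL, 0);
	uint64_t full = strtoull(opt, NULL, 0);

	printf("32-bit parse: %llu  64-bit parse: %llu\n",
	       (unsigned long long)wrapped, (unsigned long long)full);
	return 0;
}
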
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e8d60bf6b7df..a44a0562203a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
225 error = -ENODATA; 225 error = -ENODATA;
226 if (!EXT3_I(inode)->i_file_acl) 226 if (!EXT3_I(inode)->i_file_acl)
227 goto cleanup; 227 goto cleanup;
228 ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); 228 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); 229 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
230 if (!bh) 230 if (!bh)
231 goto cleanup; 231 goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext3_xattr_check_block(bh)) { 234 if (ext3_xattr_check_block(bh)) {
235bad_block: ext3_error(inode->i_sb, __FUNCTION__, 235bad_block: ext3_error(inode->i_sb, __FUNCTION__,
236 "inode %ld: bad block %d", inode->i_ino, 236 "inode %ld: bad block "E3FSBLK, inode->i_ino,
237 EXT3_I(inode)->i_file_acl); 237 EXT3_I(inode)->i_file_acl);
238 error = -EIO; 238 error = -EIO;
239 goto cleanup; 239 goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
366 error = 0; 366 error = 0;
367 if (!EXT3_I(inode)->i_file_acl) 367 if (!EXT3_I(inode)->i_file_acl)
368 goto cleanup; 368 goto cleanup;
369 ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); 369 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); 370 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
371 error = -EIO; 371 error = -EIO;
372 if (!bh) 372 if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext3_xattr_check_block(bh)) { 376 if (ext3_xattr_check_block(bh)) {
377 ext3_error(inode->i_sb, __FUNCTION__, 377 ext3_error(inode->i_sb, __FUNCTION__,
378 "inode %ld: bad block %d", inode->i_ino, 378 "inode %ld: bad block "E3FSBLK, inode->i_ino,
379 EXT3_I(inode)->i_file_acl); 379 EXT3_I(inode)->i_file_acl);
380 error = -EIO; 380 error = -EIO;
381 goto cleanup; 381 goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
647 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext3_xattr_check_block(bs->bh)) { 648 if (ext3_xattr_check_block(bs->bh)) {
649 ext3_error(sb, __FUNCTION__, 649 ext3_error(sb, __FUNCTION__,
650 "inode %ld: bad block %d", inode->i_ino, 650 "inode %ld: bad block "E3FSBLK, inode->i_ino,
651 EXT3_I(inode)->i_file_acl); 651 EXT3_I(inode)->i_file_acl);
652 error = -EIO; 652 error = -EIO;
653 goto cleanup; 653 goto cleanup;
@@ -792,11 +792,12 @@ inserted:
792 get_bh(new_bh); 792 get_bh(new_bh);
793 } else { 793 } else {
794 /* We need to allocate a new block */ 794 /* We need to allocate a new block */
795 int goal = le32_to_cpu( 795 ext3_fsblk_t goal = le32_to_cpu(
796 EXT3_SB(sb)->s_es->s_first_data_block) + 796 EXT3_SB(sb)->s_es->s_first_data_block) +
797 EXT3_I(inode)->i_block_group * 797 (ext3_fsblk_t)EXT3_I(inode)->i_block_group *
798 EXT3_BLOCKS_PER_GROUP(sb); 798 EXT3_BLOCKS_PER_GROUP(sb);
799 int block = ext3_new_block(handle, inode, goal, &error); 799 ext3_fsblk_t block = ext3_new_block(handle, inode,
800 goal, &error);
800 if (error) 801 if (error)
801 goto cleanup; 802 goto cleanup;
802 ea_idebug(inode, "creating block %d", block); 803 ea_idebug(inode, "creating block %d", block);
@@ -847,7 +848,7 @@ cleanup_dquot:
847 848
848bad_block: 849bad_block:
849 ext3_error(inode->i_sb, __FUNCTION__, 850 ext3_error(inode->i_sb, __FUNCTION__,
850 "inode %ld: bad block %d", inode->i_ino, 851 "inode %ld: bad block "E3FSBLK, inode->i_ino,
851 EXT3_I(inode)->i_file_acl); 852 EXT3_I(inode)->i_file_acl);
852 goto cleanup; 853 goto cleanup;
853 854
@@ -1076,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
1076 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); 1077 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
1077 if (!bh) { 1078 if (!bh) {
1078 ext3_error(inode->i_sb, __FUNCTION__, 1079 ext3_error(inode->i_sb, __FUNCTION__,
1079 "inode %ld: block %d read error", inode->i_ino, 1080 "inode %ld: block "E3FSBLK" read error", inode->i_ino,
1080 EXT3_I(inode)->i_file_acl); 1081 EXT3_I(inode)->i_file_acl);
1081 goto cleanup; 1082 goto cleanup;
1082 } 1083 }
1083 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || 1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
1084 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1085 ext3_error(inode->i_sb, __FUNCTION__, 1086 ext3_error(inode->i_sb, __FUNCTION__,
1086 "inode %ld: bad block %d", inode->i_ino, 1087 "inode %ld: bad block "E3FSBLK, inode->i_ino,
1087 EXT3_I(inode)->i_file_acl); 1088 EXT3_I(inode)->i_file_acl);
1088 goto cleanup; 1089 goto cleanup;
1089 } 1090 }
@@ -1210,11 +1211,11 @@ again:
1210 bh = sb_bread(inode->i_sb, ce->e_block); 1211 bh = sb_bread(inode->i_sb, ce->e_block);
1211 if (!bh) { 1212 if (!bh) {
1212 ext3_error(inode->i_sb, __FUNCTION__, 1213 ext3_error(inode->i_sb, __FUNCTION__,
1213 "inode %ld: block %ld read error", 1214 "inode %ld: block %lu read error",
1214 inode->i_ino, (unsigned long) ce->e_block); 1215 inode->i_ino, (unsigned long) ce->e_block);
1215 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1216 EXT3_XATTR_REFCOUNT_MAX) { 1217 EXT3_XATTR_REFCOUNT_MAX) {
1217 ea_idebug(inode, "block %ld refcount %d>=%d", 1218 ea_idebug(inode, "block %lu refcount %d>=%d",
1218 (unsigned long) ce->e_block, 1219 (unsigned long) ce->e_block,
1219 le32_to_cpu(BHDR(bh)->h_refcount), 1220 le32_to_cpu(BHDR(bh)->h_refcount),
1220 EXT3_XATTR_REFCOUNT_MAX); 1221 EXT3_XATTR_REFCOUNT_MAX);
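
The xattr change is more than printf polish: the allocation goal and the newly allocated block were previously held in int, so on a filesystem with more than 2^31 blocks any block number in the upper half no longer fits, and the (ext3_fsblk_t) cast ensures the group multiply is carried out in the wide type rather than narrowed first. A self-contained illustration follows; the values are made up, and the out-of-range conversion to int is implementation-defined in C (shown here as it behaves on common two's-complement targets):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t block_group = 70000;		/* plausible on a huge fs */
	uint64_t blocks_per_group = 32768;	/* 4 KiB block size */

	/* 70000 * 32768 = 2293760000 > INT_MAX: does not fit in int */
	int goal_as_int = (int)(block_group * blocks_per_group);
	uint64_t goal_wide = (uint64_t)block_group * blocks_per_group;

	printf("int goal: %d  wide goal: %llu\n",
	       goal_as_int, (unsigned long long)goal_wide);
	return 0;
}
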
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index 583bd78086d8..d35979a58743 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -159,11 +159,11 @@ struct vxfs_sb {
159 * In core superblock filesystem private data for VxFS. 159 * In core superblock filesystem private data for VxFS.
160 */ 160 */
161struct vxfs_sb_info { 161struct vxfs_sb_info {
162 struct vxfs_sb *vsi_raw; /* raw (on disk) supeblock */ 162 struct vxfs_sb *vsi_raw; /* raw (on disk) superblock */
163 struct buffer_head *vsi_bp; /* buffer for raw superblock*/ 163 struct buffer_head *vsi_bp; /* buffer for raw superblock*/
164 struct inode *vsi_fship; /* fileset header inode */ 164 struct inode *vsi_fship; /* fileset header inode */
165 struct inode *vsi_ilist; /* inode list inode */ 165 struct inode *vsi_ilist; /* inode list inode */
166 struct inode *vsi_stilist; /* structual inode list inode */ 166 struct inode *vsi_stilist; /* structural inode list inode */
167 u_long vsi_iext; /* initial inode list */ 167 u_long vsi_iext; /* initial inode list */
168 ino_t vsi_fshino; /* fileset header inode */ 168 ino_t vsi_fshino; /* fileset header inode */
169 daddr_t vsi_oltext; /* OLT extent */ 169 daddr_t vsi_oltext; /* OLT extent */
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index 6dee109aeea4..78948b4b1894 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -112,7 +112,7 @@ vxfs_read_fshead(struct super_block *sbp)
112 112
113 vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); 113 vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
114 if (!vip) { 114 if (!vip) {
115 printk(KERN_ERR "vxfs: unabled to read fsh inode\n"); 115 printk(KERN_ERR "vxfs: unable to read fsh inode\n");
116 return -EINVAL; 116 return -EINVAL;
117 } 117 }
118 if (!VXFS_ISFSH(vip)) { 118 if (!VXFS_ISFSH(vip)) {
@@ -129,13 +129,13 @@ vxfs_read_fshead(struct super_block *sbp)
129 129
130 infp->vsi_fship = vxfs_get_fake_inode(sbp, vip); 130 infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
131 if (!infp->vsi_fship) { 131 if (!infp->vsi_fship) {
132 printk(KERN_ERR "vxfs: unabled to get fsh inode\n"); 132 printk(KERN_ERR "vxfs: unable to get fsh inode\n");
133 goto out_free_fship; 133 goto out_free_fship;
134 } 134 }
135 135
136 sfp = vxfs_getfsh(infp->vsi_fship, 0); 136 sfp = vxfs_getfsh(infp->vsi_fship, 0);
137 if (!sfp) { 137 if (!sfp) {
138 printk(KERN_ERR "vxfs: unabled to get structural fsh\n"); 138 printk(KERN_ERR "vxfs: unable to get structural fsh\n");
139 goto out_iput_fship; 139 goto out_iput_fship;
140 } 140 }
141 141
@@ -145,7 +145,7 @@ vxfs_read_fshead(struct super_block *sbp)
145 145
146 pfp = vxfs_getfsh(infp->vsi_fship, 1); 146 pfp = vxfs_getfsh(infp->vsi_fship, 1);
147 if (!pfp) { 147 if (!pfp) {
148 printk(KERN_ERR "vxfs: unabled to get primary fsh\n"); 148 printk(KERN_ERR "vxfs: unable to get primary fsh\n");
149 goto out_free_sfp; 149 goto out_free_sfp;
150 } 150 }
151 151
@@ -159,7 +159,7 @@ vxfs_read_fshead(struct super_block *sbp)
159 159
160 infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip); 160 infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
161 if (!infp->vsi_stilist) { 161 if (!infp->vsi_stilist) {
162 printk(KERN_ERR "vxfs: unabled to get structual list inode\n"); 162 printk(KERN_ERR "vxfs: unable to get structural list inode\n");
163 kfree(tip); 163 kfree(tip);
164 goto out_free_pfp; 164 goto out_free_pfp;
165 } 165 }
@@ -174,7 +174,7 @@ vxfs_read_fshead(struct super_block *sbp)
174 goto out_iput_stilist; 174 goto out_iput_stilist;
175 infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip); 175 infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
176 if (!infp->vsi_ilist) { 176 if (!infp->vsi_ilist) {
177 printk(KERN_ERR "vxfs: unabled to get inode list inode\n"); 177 printk(KERN_ERR "vxfs: unable to get inode list inode\n");
178 kfree(tip); 178 kfree(tip);
179 goto out_iput_stilist; 179 goto out_iput_stilist;
180 } 180 }
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index c3e1f760cac9..72437065f6ad 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_FUSE_FS) += fuse.o 5obj-$(CONFIG_FUSE_FS) += fuse.o
6 6
7fuse-objs := dev.o dir.o file.o inode.o 7fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
new file mode 100644
index 000000000000..a3bce3a77253
--- /dev/null
+++ b/fs/fuse/control.c
@@ -0,0 +1,218 @@
1/*
2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
4
5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING.
7*/
8
9#include "fuse_i.h"
10
11#include <linux/init.h>
12#include <linux/module.h>
13
14#define FUSE_CTL_SUPER_MAGIC 0x65735543
15
16/*
17 * This is non-NULL when the single instance of the control filesystem
18 * exists. Protected by fuse_mutex
19 */
20static struct super_block *fuse_control_sb;
21
22static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
23{
24 struct fuse_conn *fc;
25 mutex_lock(&fuse_mutex);
26 fc = file->f_dentry->d_inode->u.generic_ip;
27 if (fc)
28 fc = fuse_conn_get(fc);
29 mutex_unlock(&fuse_mutex);
30 return fc;
31}
32
33static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
34 size_t count, loff_t *ppos)
35{
36 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
37 if (fc) {
38 fuse_abort_conn(fc);
39 fuse_conn_put(fc);
40 }
41 return count;
42}
43
44static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
45 size_t len, loff_t *ppos)
46{
47 char tmp[32];
48 size_t size;
49
50 if (!*ppos) {
51 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
52 if (!fc)
53 return 0;
54
55 file->private_data = (void *)(long)atomic_read(&fc->num_waiting);
56 fuse_conn_put(fc);
57 }
58 size = sprintf(tmp, "%ld\n", (long)file->private_data);
59 return simple_read_from_buffer(buf, len, ppos, tmp, size);
60}
61
62static const struct file_operations fuse_ctl_abort_ops = {
63 .open = nonseekable_open,
64 .write = fuse_conn_abort_write,
65};
66
67static const struct file_operations fuse_ctl_waiting_ops = {
68 .open = nonseekable_open,
69 .read = fuse_conn_waiting_read,
70};
71
72static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
73 struct fuse_conn *fc,
74 const char *name,
75 int mode, int nlink,
76 struct inode_operations *iop,
77 const struct file_operations *fop)
78{
79 struct dentry *dentry;
80 struct inode *inode;
81
82 BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
83 dentry = d_alloc_name(parent, name);
84 if (!dentry)
85 return NULL;
86
87 fc->ctl_dentry[fc->ctl_ndents++] = dentry;
88 inode = new_inode(fuse_control_sb);
89 if (!inode)
90 return NULL;
91
92 inode->i_mode = mode;
93 inode->i_uid = fc->user_id;
94 inode->i_gid = fc->group_id;
95 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
96 /* setting ->i_op to NULL is not allowed */
97 if (iop)
98 inode->i_op = iop;
99 inode->i_fop = fop;
100 inode->i_nlink = nlink;
101 inode->u.generic_ip = fc;
102 d_add(dentry, inode);
103 return dentry;
104}
105
106/*
107 * Add a connection to the control filesystem (if it exists). Caller
108 * must host fuse_mutex
109 */
110int fuse_ctl_add_conn(struct fuse_conn *fc)
111{
112 struct dentry *parent;
113 char name[32];
114
115 if (!fuse_control_sb)
116 return 0;
117
118 parent = fuse_control_sb->s_root;
119 parent->d_inode->i_nlink++;
120 sprintf(name, "%llu", (unsigned long long) fc->id);
121 parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
122 &simple_dir_inode_operations,
123 &simple_dir_operations);
124 if (!parent)
125 goto err;
126
127 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
128 NULL, &fuse_ctl_waiting_ops) ||
129 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
130 NULL, &fuse_ctl_abort_ops))
131 goto err;
132
133 return 0;
134
135 err:
136 fuse_ctl_remove_conn(fc);
137 return -ENOMEM;
138}
139
140/*
141 * Remove a connection from the control filesystem (if it exists).
142 * Caller must host fuse_mutex
143 */
144void fuse_ctl_remove_conn(struct fuse_conn *fc)
145{
146 int i;
147
148 if (!fuse_control_sb)
149 return;
150
151 for (i = fc->ctl_ndents - 1; i >= 0; i--) {
152 struct dentry *dentry = fc->ctl_dentry[i];
153 dentry->d_inode->u.generic_ip = NULL;
154 d_drop(dentry);
155 dput(dentry);
156 }
157 fuse_control_sb->s_root->d_inode->i_nlink--;
158}
159
160static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
161{
162 struct tree_descr empty_descr = {""};
163 struct fuse_conn *fc;
164 int err;
165
166 err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
167 if (err)
168 return err;
169
170 mutex_lock(&fuse_mutex);
171 BUG_ON(fuse_control_sb);
172 fuse_control_sb = sb;
173 list_for_each_entry(fc, &fuse_conn_list, entry) {
174 err = fuse_ctl_add_conn(fc);
175 if (err) {
176 fuse_control_sb = NULL;
177 mutex_unlock(&fuse_mutex);
178 return err;
179 }
180 }
181 mutex_unlock(&fuse_mutex);
182
183 return 0;
184}
185
186static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
187 const char *dev_name, void *raw_data,
188 struct vfsmount *mnt)
189{
190 return get_sb_single(fs_type, flags, raw_data,
191 fuse_ctl_fill_super, mnt);
192}
193
194static void fuse_ctl_kill_sb(struct super_block *sb)
195{
196 mutex_lock(&fuse_mutex);
197 fuse_control_sb = NULL;
198 mutex_unlock(&fuse_mutex);
199
200 kill_litter_super(sb);
201}
202
203static struct file_system_type fuse_ctl_fs_type = {
204 .owner = THIS_MODULE,
205 .name = "fusectl",
206 .get_sb = fuse_ctl_get_sb,
207 .kill_sb = fuse_ctl_kill_sb,
208};
209
210int __init fuse_ctl_init(void)
211{
212 return register_filesystem(&fuse_ctl_fs_type);
213}
214
215void fuse_ctl_cleanup(void)
216{
217 unregister_filesystem(&fuse_ctl_fs_type);
218}
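
control.c implements a tiny single-instance filesystem ("fusectl") exposing one directory per connection, named by fc->id, each holding a read-only "waiting" file (requests blocked on the daemon) and a write-only "abort" file (any write tears the connection down, the escape hatch mentioned in fuse_get_req_nofail() below). A hedged user-space sketch of driving it; /sys/fs/fuse/connections is the conventional mount point rather than something this patch mandates, and the connection id 42 is made up:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	/* assumes: mount -t fusectl none /sys/fs/fuse/connections */
	int fd = open("/sys/fs/fuse/connections/42/waiting", O_RDONLY);

	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("waiting requests: %s", buf);
		}
		close(fd);
	}

	/* any write to "abort" ends up in fuse_abort_conn() */
	fd = open("/sys/fs/fuse/connections/42/abort", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);
		close(fd);
	}
	return 0;
}
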
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 104a62dadb94..1e2006caf158 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,6 +34,7 @@ static void fuse_request_init(struct fuse_req *req)
34{ 34{
35 memset(req, 0, sizeof(*req)); 35 memset(req, 0, sizeof(*req));
36 INIT_LIST_HEAD(&req->list); 36 INIT_LIST_HEAD(&req->list);
37 INIT_LIST_HEAD(&req->intr_entry);
37 init_waitqueue_head(&req->waitq); 38 init_waitqueue_head(&req->waitq);
38 atomic_set(&req->count, 1); 39 atomic_set(&req->count, 1);
39} 40}
@@ -64,18 +65,6 @@ static void restore_sigs(sigset_t *oldset)
64 sigprocmask(SIG_SETMASK, oldset, NULL); 65 sigprocmask(SIG_SETMASK, oldset, NULL);
65} 66}
66 67
67/*
68 * Reset request, so that it can be reused
69 *
70 * The caller must be _very_ careful to make sure, that it is holding
71 * the only reference to req
72 */
73void fuse_reset_request(struct fuse_req *req)
74{
75 BUG_ON(atomic_read(&req->count) != 1);
76 fuse_request_init(req);
77}
78
79static void __fuse_get_request(struct fuse_req *req) 68static void __fuse_get_request(struct fuse_req *req)
80{ 69{
81 atomic_inc(&req->count); 70 atomic_inc(&req->count);
@@ -88,6 +77,13 @@ static void __fuse_put_request(struct fuse_req *req)
88 atomic_dec(&req->count); 77 atomic_dec(&req->count);
89} 78}
90 79
80static void fuse_req_init_context(struct fuse_req *req)
81{
82 req->in.h.uid = current->fsuid;
83 req->in.h.gid = current->fsgid;
84 req->in.h.pid = current->pid;
85}
86
91struct fuse_req *fuse_get_req(struct fuse_conn *fc) 87struct fuse_req *fuse_get_req(struct fuse_conn *fc)
92{ 88{
93 struct fuse_req *req; 89 struct fuse_req *req;
@@ -103,14 +99,16 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
103 if (intr) 99 if (intr)
104 goto out; 100 goto out;
105 101
102 err = -ENOTCONN;
103 if (!fc->connected)
104 goto out;
105
106 req = fuse_request_alloc(); 106 req = fuse_request_alloc();
107 err = -ENOMEM; 107 err = -ENOMEM;
108 if (!req) 108 if (!req)
109 goto out; 109 goto out;
110 110
111 req->in.h.uid = current->fsuid; 111 fuse_req_init_context(req);
112 req->in.h.gid = current->fsgid;
113 req->in.h.pid = current->pid;
114 req->waiting = 1; 112 req->waiting = 1;
115 return req; 113 return req;
116 114
@@ -119,142 +117,183 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
119 return ERR_PTR(err); 117 return ERR_PTR(err);
120} 118}
121 119
122void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) 120/*
121 * Return request in fuse_file->reserved_req. However that may
122 * currently be in use. If that is the case, wait for it to become
123 * available.
124 */
125static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
126 struct file *file)
123{ 127{
124 if (atomic_dec_and_test(&req->count)) { 128 struct fuse_req *req = NULL;
125 if (req->waiting) 129 struct fuse_file *ff = file->private_data;
126 atomic_dec(&fc->num_waiting); 130
127 fuse_request_free(req); 131 do {
128 } 132 wait_event(fc->blocked_waitq, ff->reserved_req);
133 spin_lock(&fc->lock);
134 if (ff->reserved_req) {
135 req = ff->reserved_req;
136 ff->reserved_req = NULL;
137 get_file(file);
138 req->stolen_file = file;
139 }
140 spin_unlock(&fc->lock);
141 } while (!req);
142
143 return req;
129} 144}
130 145
131/* 146/*
132 * Called with sbput_sem held for read (request_end) or write 147 * Put stolen request back into fuse_file->reserved_req
133 * (fuse_put_super). By the time fuse_put_super() is finished, all
134 * inodes belonging to background requests must be released, so the
135 * iputs have to be done within the locked region.
136 */ 148 */
137void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) 149static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
138{ 150{
139 iput(req->inode); 151 struct file *file = req->stolen_file;
140 iput(req->inode2); 152 struct fuse_file *ff = file->private_data;
153
141 spin_lock(&fc->lock); 154 spin_lock(&fc->lock);
142 list_del(&req->bg_entry); 155 fuse_request_init(req);
143 if (fc->num_background == FUSE_MAX_BACKGROUND) { 156 BUG_ON(ff->reserved_req);
144 fc->blocked = 0; 157 ff->reserved_req = req;
145 wake_up_all(&fc->blocked_waitq); 158 wake_up(&fc->blocked_waitq);
146 }
147 fc->num_background--;
148 spin_unlock(&fc->lock); 159 spin_unlock(&fc->lock);
160 fput(file);
149} 161}
150 162
151/* 163/*
152 * This function is called when a request is finished. Either a reply 164 * Gets a request for a file operation, always succeeds
153 * has arrived or it was interrupted (and not yet sent) or some error
154 * occurred during communication with userspace, or the device file
155 * was closed. In case of a background request the reference to the
156 * stored objects are released. The requester thread is woken up (if
157 * still waiting), the 'end' callback is called if given, else the
158 * reference to the request is released
159 * 165 *
160 * Releasing extra reference for foreground requests must be done 166 * This is used for sending the FLUSH request, which must get to
161 * within the same locked region as setting state to finished. This 167 * userspace, due to POSIX locks which may need to be unlocked.
162 * is because fuse_reset_request() may be called after request is
163 * finished and it must be the sole possessor. If request is
164 * interrupted and put in the background, it will return with an error
165 * and hence never be reset and reused.
166 * 168 *
167 * Called with fc->lock, unlocks it 169 * If allocation fails due to OOM, use the reserved request in
170 * fuse_file.
171 *
172 * This is very unlikely to deadlock accidentally, since the
173 * filesystem should not have it's own file open. If deadlock is
174 * intentional, it can still be broken by "aborting" the filesystem.
168 */ 175 */
169static void request_end(struct fuse_conn *fc, struct fuse_req *req) 176struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
170{ 177{
171 list_del(&req->list); 178 struct fuse_req *req;
172 req->state = FUSE_REQ_FINISHED;
173 if (!req->background) {
174 spin_unlock(&fc->lock);
175 wake_up(&req->waitq);
176 fuse_put_request(fc, req);
177 } else {
178 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
179 req->end = NULL;
180 spin_unlock(&fc->lock);
181 down_read(&fc->sbput_sem);
182 if (fc->mounted)
183 fuse_release_background(fc, req);
184 up_read(&fc->sbput_sem);
185 179
186 /* fput must go outside sbput_sem, otherwise it can deadlock */ 180 atomic_inc(&fc->num_waiting);
187 if (req->file) 181 wait_event(fc->blocked_waitq, !fc->blocked);
188 fput(req->file); 182 req = fuse_request_alloc();
183 if (!req)
184 req = get_reserved_req(fc, file);
189 185
190 if (end) 186 fuse_req_init_context(req);
191 end(fc, req); 187 req->waiting = 1;
188 return req;
189}
190
191void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
192{
193 if (atomic_dec_and_test(&req->count)) {
194 if (req->waiting)
195 atomic_dec(&fc->num_waiting);
196
197 if (req->stolen_file)
198 put_reserved_req(fc, req);
192 else 199 else
193 fuse_put_request(fc, req); 200 fuse_request_free(req);
194 } 201 }
195} 202}
196 203
197/* 204/*
198 * Unfortunately request interruption not just solves the deadlock 205 * This function is called when a request is finished. Either a reply
199 * problem, it causes problems too. These stem from the fact, that an 206 * has arrived or it was aborted (and not yet sent) or some error
200 * interrupted request is continued to be processed in userspace, 207 * occurred during communication with userspace, or the device file
201 * while all the locks and object references (inode and file) held 208 * was closed. The requester thread is woken up (if still waiting),
202 * during the operation are released. 209 * the 'end' callback is called if given, else the reference to the
203 * 210 * request is released
204 * To release the locks is exactly why there's a need to interrupt the
205 * request, so there's not a lot that can be done about this, except
206 * introduce additional locking in userspace.
207 *
208 * More important is to keep inode and file references until userspace
209 * has replied, otherwise FORGET and RELEASE could be sent while the
210 * inode/file is still used by the filesystem.
211 *
212 * For this reason the concept of "background" request is introduced.
213 * An interrupted request is backgrounded if it has been already sent
214 * to userspace. Backgrounding involves getting an extra reference to
215 * inode(s) or file used in the request, and adding the request to
216 * fc->background list. When a reply is received for a background
217 * request, the object references are released, and the request is
218 * removed from the list. If the filesystem is unmounted while there
219 * are still background requests, the list is walked and references
220 * are released as if a reply was received.
221 * 211 *
222 * There's one more use for a background request. The RELEASE message is 212 * Called with fc->lock, unlocks it
223 * always sent as background, since it doesn't return an error or
224 * data.
225 */ 213 */
226static void background_request(struct fuse_conn *fc, struct fuse_req *req) 214static void request_end(struct fuse_conn *fc, struct fuse_req *req)
227{ 215{
228 req->background = 1; 216 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
229 list_add(&req->bg_entry, &fc->background); 217 req->end = NULL;
230 fc->num_background++; 218 list_del(&req->list);
231 if (fc->num_background == FUSE_MAX_BACKGROUND) 219 list_del(&req->intr_entry);
232 fc->blocked = 1; 220 req->state = FUSE_REQ_FINISHED;
233 if (req->inode) 221 if (req->background) {
234 req->inode = igrab(req->inode); 222 if (fc->num_background == FUSE_MAX_BACKGROUND) {
235 if (req->inode2) 223 fc->blocked = 0;
236 req->inode2 = igrab(req->inode2); 224 wake_up_all(&fc->blocked_waitq);
225 }
226 fc->num_background--;
227 }
228 spin_unlock(&fc->lock);
229 dput(req->dentry);
230 mntput(req->vfsmount);
237 if (req->file) 231 if (req->file)
238 get_file(req->file); 232 fput(req->file);
233 wake_up(&req->waitq);
234 if (end)
235 end(fc, req);
236 else
237 fuse_put_request(fc, req);
239} 238}
240 239
241/* Called with fc->lock held. Releases, and then reacquires it. */ 240static void wait_answer_interruptible(struct fuse_conn *fc,
242static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 241 struct fuse_req *req)
243{ 242{
244 sigset_t oldset; 243 if (signal_pending(current))
244 return;
245 245
246 spin_unlock(&fc->lock); 246 spin_unlock(&fc->lock);
247 block_sigs(&oldset);
248 wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); 247 wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
249 restore_sigs(&oldset);
250 spin_lock(&fc->lock); 248 spin_lock(&fc->lock);
251 if (req->state == FUSE_REQ_FINISHED && !req->interrupted) 249}
252 return; 250
251static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
252{
253 list_add_tail(&req->intr_entry, &fc->interrupts);
254 wake_up(&fc->waitq);
255 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
256}
257
258/* Called with fc->lock held. Releases, and then reacquires it. */
259static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
260{
261 if (!fc->no_interrupt) {
262 /* Any signal may interrupt this */
263 wait_answer_interruptible(fc, req);
264
265 if (req->aborted)
266 goto aborted;
267 if (req->state == FUSE_REQ_FINISHED)
268 return;
253 269
254 if (!req->interrupted) {
255 req->out.h.error = -EINTR;
256 req->interrupted = 1; 270 req->interrupted = 1;
271 if (req->state == FUSE_REQ_SENT)
272 queue_interrupt(fc, req);
273 }
274
275 if (req->force) {
276 spin_unlock(&fc->lock);
277 wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
278 spin_lock(&fc->lock);
279 } else {
280 sigset_t oldset;
281
282 /* Only fatal signals may interrupt this */
283 block_sigs(&oldset);
284 wait_answer_interruptible(fc, req);
285 restore_sigs(&oldset);
257 } 286 }
287
288 if (req->aborted)
289 goto aborted;
290 if (req->state == FUSE_REQ_FINISHED)
291 return;
292
293 req->out.h.error = -EINTR;
294 req->aborted = 1;
295
296 aborted:
258 if (req->locked) { 297 if (req->locked) {
259 /* This is uninterruptible sleep, because data is 298 /* This is uninterruptible sleep, because data is
260 being copied to/from the buffers of req. During 299 being copied to/from the buffers of req. During
@@ -268,8 +307,11 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
268 if (req->state == FUSE_REQ_PENDING) { 307 if (req->state == FUSE_REQ_PENDING) {
269 list_del(&req->list); 308 list_del(&req->list);
270 __fuse_put_request(req); 309 __fuse_put_request(req);
271 } else if (req->state == FUSE_REQ_SENT) 310 } else if (req->state == FUSE_REQ_SENT) {
272 background_request(fc, req); 311 spin_unlock(&fc->lock);
312 wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
313 spin_lock(&fc->lock);
314 }
273} 315}
274 316
275static unsigned len_args(unsigned numargs, struct fuse_arg *args) 317static unsigned len_args(unsigned numargs, struct fuse_arg *args)
@@ -283,13 +325,19 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
283 return nbytes; 325 return nbytes;
284} 326}
285 327
328static u64 fuse_get_unique(struct fuse_conn *fc)
329 {
330 fc->reqctr++;
331 /* zero is special */
332 if (fc->reqctr == 0)
333 fc->reqctr = 1;
334
335 return fc->reqctr;
336}
337
286static void queue_request(struct fuse_conn *fc, struct fuse_req *req) 338static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
287{ 339{
288 fc->reqctr++; 340 req->in.h.unique = fuse_get_unique(fc);
289 /* zero is special */
290 if (fc->reqctr == 0)
291 fc->reqctr = 1;
292 req->in.h.unique = fc->reqctr;
293 req->in.h.len = sizeof(struct fuse_in_header) + 341 req->in.h.len = sizeof(struct fuse_in_header) +
294 len_args(req->in.numargs, (struct fuse_arg *) req->in.args); 342 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
295 list_add_tail(&req->list, &fc->pending); 343 list_add_tail(&req->list, &fc->pending);
@@ -302,9 +350,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
302 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 350 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
303} 351}
304 352
305/*
306 * This can only be interrupted by a SIGKILL
307 */
308void request_send(struct fuse_conn *fc, struct fuse_req *req) 353void request_send(struct fuse_conn *fc, struct fuse_req *req)
309{ 354{
310 req->isreply = 1; 355 req->isreply = 1;
@@ -327,8 +372,12 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
327static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) 372static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
328{ 373{
329 spin_lock(&fc->lock); 374 spin_lock(&fc->lock);
330 background_request(fc, req);
331 if (fc->connected) { 375 if (fc->connected) {
376 req->background = 1;
377 fc->num_background++;
378 if (fc->num_background == FUSE_MAX_BACKGROUND)
379 fc->blocked = 1;
380
332 queue_request(fc, req); 381 queue_request(fc, req);
333 spin_unlock(&fc->lock); 382 spin_unlock(&fc->lock);
334 } else { 383 } else {
@@ -352,14 +401,14 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
352/* 401/*
353 * Lock the request. Up to the next unlock_request() there mustn't be 402 * Lock the request. Up to the next unlock_request() there mustn't be
354 * anything that could cause a page-fault. If the request was already 403 * anything that could cause a page-fault. If the request was already
355 * interrupted bail out. 404 * aborted bail out.
356 */ 405 */
357static int lock_request(struct fuse_conn *fc, struct fuse_req *req) 406static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
358{ 407{
359 int err = 0; 408 int err = 0;
360 if (req) { 409 if (req) {
361 spin_lock(&fc->lock); 410 spin_lock(&fc->lock);
362 if (req->interrupted) 411 if (req->aborted)
363 err = -ENOENT; 412 err = -ENOENT;
364 else 413 else
365 req->locked = 1; 414 req->locked = 1;
@@ -369,7 +418,7 @@ static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
369} 418}
370 419
371/* 420/*
372 * Unlock request. If it was interrupted during being locked, the 421 * Unlock request. If it was aborted during being locked, the
373 * requester thread is currently waiting for it to be unlocked, so 422 * requester thread is currently waiting for it to be unlocked, so
374 * wake it up. 423 * wake it up.
375 */ 424 */
@@ -378,7 +427,7 @@ static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
378 if (req) { 427 if (req) {
379 spin_lock(&fc->lock); 428 spin_lock(&fc->lock);
380 req->locked = 0; 429 req->locked = 0;
381 if (req->interrupted) 430 if (req->aborted)
382 wake_up(&req->waitq); 431 wake_up(&req->waitq);
383 spin_unlock(&fc->lock); 432 spin_unlock(&fc->lock);
384 } 433 }
@@ -557,13 +606,18 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
557 return err; 606 return err;
558} 607}
559 608
609static int request_pending(struct fuse_conn *fc)
610{
611 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
612}
613
560/* Wait until a request is available on the pending list */ 614/* Wait until a request is available on the pending list */
561static void request_wait(struct fuse_conn *fc) 615static void request_wait(struct fuse_conn *fc)
562{ 616{
563 DECLARE_WAITQUEUE(wait, current); 617 DECLARE_WAITQUEUE(wait, current);
564 618
565 add_wait_queue_exclusive(&fc->waitq, &wait); 619 add_wait_queue_exclusive(&fc->waitq, &wait);
566 while (fc->connected && list_empty(&fc->pending)) { 620 while (fc->connected && !request_pending(fc)) {
567 set_current_state(TASK_INTERRUPTIBLE); 621 set_current_state(TASK_INTERRUPTIBLE);
568 if (signal_pending(current)) 622 if (signal_pending(current))
569 break; 623 break;
@@ -577,11 +631,50 @@ static void request_wait(struct fuse_conn *fc)
577} 631}
578 632
579/* 633/*
634 * Transfer an interrupt request to userspace
635 *
636 * Unlike other requests this is assembled on demand, without a need
637 * to allocate a separate fuse_req structure.
638 *
639 * Called with fc->lock held, releases it
640 */
641static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
642 const struct iovec *iov, unsigned long nr_segs)
643{
644 struct fuse_copy_state cs;
645 struct fuse_in_header ih;
646 struct fuse_interrupt_in arg;
647 unsigned reqsize = sizeof(ih) + sizeof(arg);
648 int err;
649
650 list_del_init(&req->intr_entry);
651 req->intr_unique = fuse_get_unique(fc);
652 memset(&ih, 0, sizeof(ih));
653 memset(&arg, 0, sizeof(arg));
654 ih.len = reqsize;
655 ih.opcode = FUSE_INTERRUPT;
656 ih.unique = req->intr_unique;
657 arg.unique = req->in.h.unique;
658
659 spin_unlock(&fc->lock);
660 if (iov_length(iov, nr_segs) < reqsize)
661 return -EINVAL;
662
663 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
664 err = fuse_copy_one(&cs, &ih, sizeof(ih));
665 if (!err)
666 err = fuse_copy_one(&cs, &arg, sizeof(arg));
667 fuse_copy_finish(&cs);
668
669 return err ? err : reqsize;
670}
671
672/*
580 * Read a single request into the userspace filesystem's buffer. This 673 * Read a single request into the userspace filesystem's buffer. This
581 * function waits until a request is available, then removes it from 674 * function waits until a request is available, then removes it from
582 * the pending list and copies request data to userspace buffer. If 675 * the pending list and copies request data to userspace buffer. If
583 * no reply is needed (FORGET) or request has been interrupted or 676 * no reply is needed (FORGET) or request has been aborted or there
584 * there was an error during the copying then it's finished by calling 677 * was an error during the copying then it's finished by calling
585 * request_end(). Otherwise add it to the processing list, and set 678 * request_end(). Otherwise add it to the processing list, and set
586 * the 'sent' flag. 679 * the 'sent' flag.
587 */ 680 */
@@ -601,7 +694,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
601 spin_lock(&fc->lock); 694 spin_lock(&fc->lock);
602 err = -EAGAIN; 695 err = -EAGAIN;
603 if ((file->f_flags & O_NONBLOCK) && fc->connected && 696 if ((file->f_flags & O_NONBLOCK) && fc->connected &&
604 list_empty(&fc->pending)) 697 !request_pending(fc))
605 goto err_unlock; 698 goto err_unlock;
606 699
607 request_wait(fc); 700 request_wait(fc);
@@ -609,9 +702,15 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
609 if (!fc->connected) 702 if (!fc->connected)
610 goto err_unlock; 703 goto err_unlock;
611 err = -ERESTARTSYS; 704 err = -ERESTARTSYS;
612 if (list_empty(&fc->pending)) 705 if (!request_pending(fc))
613 goto err_unlock; 706 goto err_unlock;
614 707
708 if (!list_empty(&fc->interrupts)) {
709 req = list_entry(fc->interrupts.next, struct fuse_req,
710 intr_entry);
711 return fuse_read_interrupt(fc, req, iov, nr_segs);
712 }
713
615 req = list_entry(fc->pending.next, struct fuse_req, list); 714 req = list_entry(fc->pending.next, struct fuse_req, list);
616 req->state = FUSE_REQ_READING; 715 req->state = FUSE_REQ_READING;
617 list_move(&req->list, &fc->io); 716 list_move(&req->list, &fc->io);
@@ -636,10 +735,10 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
636 fuse_copy_finish(&cs); 735 fuse_copy_finish(&cs);
637 spin_lock(&fc->lock); 736 spin_lock(&fc->lock);
638 req->locked = 0; 737 req->locked = 0;
639 if (!err && req->interrupted) 738 if (!err && req->aborted)
640 err = -ENOENT; 739 err = -ENOENT;
641 if (err) { 740 if (err) {
642 if (!req->interrupted) 741 if (!req->aborted)
643 req->out.h.error = -EIO; 742 req->out.h.error = -EIO;
644 request_end(fc, req); 743 request_end(fc, req);
645 return err; 744 return err;
@@ -649,6 +748,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
649 else { 748 else {
650 req->state = FUSE_REQ_SENT; 749 req->state = FUSE_REQ_SENT;
651 list_move_tail(&req->list, &fc->processing); 750 list_move_tail(&req->list, &fc->processing);
751 if (req->interrupted)
752 queue_interrupt(fc, req);
652 spin_unlock(&fc->lock); 753 spin_unlock(&fc->lock);
653 } 754 }
654 return reqsize; 755 return reqsize;
@@ -675,7 +776,7 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
675 list_for_each(entry, &fc->processing) { 776 list_for_each(entry, &fc->processing) {
676 struct fuse_req *req; 777 struct fuse_req *req;
677 req = list_entry(entry, struct fuse_req, list); 778 req = list_entry(entry, struct fuse_req, list);
678 if (req->in.h.unique == unique) 779 if (req->in.h.unique == unique || req->intr_unique == unique)
679 return req; 780 return req;
680 } 781 }
681 return NULL; 782 return NULL;
@@ -741,17 +842,33 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
741 goto err_unlock; 842 goto err_unlock;
742 843
743 req = request_find(fc, oh.unique); 844 req = request_find(fc, oh.unique);
744 err = -EINVAL;
745 if (!req) 845 if (!req)
746 goto err_unlock; 846 goto err_unlock;
747 847
748 if (req->interrupted) { 848 if (req->aborted) {
749 spin_unlock(&fc->lock); 849 spin_unlock(&fc->lock);
750 fuse_copy_finish(&cs); 850 fuse_copy_finish(&cs);
751 spin_lock(&fc->lock); 851 spin_lock(&fc->lock);
752 request_end(fc, req); 852 request_end(fc, req);
753 return -ENOENT; 853 return -ENOENT;
754 } 854 }
855 /* Is it an interrupt reply? */
856 if (req->intr_unique == oh.unique) {
857 err = -EINVAL;
858 if (nbytes != sizeof(struct fuse_out_header))
859 goto err_unlock;
860
861 if (oh.error == -ENOSYS)
862 fc->no_interrupt = 1;
863 else if (oh.error == -EAGAIN)
864 queue_interrupt(fc, req);
865
866 spin_unlock(&fc->lock);
867 fuse_copy_finish(&cs);
868 return nbytes;
869 }
870
871 req->state = FUSE_REQ_WRITING;
755 list_move(&req->list, &fc->io); 872 list_move(&req->list, &fc->io);
756 req->out.h = oh; 873 req->out.h = oh;
757 req->locked = 1; 874 req->locked = 1;
@@ -764,9 +881,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
764 spin_lock(&fc->lock); 881 spin_lock(&fc->lock);
765 req->locked = 0; 882 req->locked = 0;
766 if (!err) { 883 if (!err) {
767 if (req->interrupted) 884 if (req->aborted)
768 err = -ENOENT; 885 err = -ENOENT;
769 } else if (!req->interrupted) 886 } else if (!req->aborted)
770 req->out.h.error = -EIO; 887 req->out.h.error = -EIO;
771 request_end(fc, req); 888 request_end(fc, req);
772 889
@@ -800,7 +917,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
800 spin_lock(&fc->lock); 917 spin_lock(&fc->lock);
801 if (!fc->connected) 918 if (!fc->connected)
802 mask = POLLERR; 919 mask = POLLERR;
803 else if (!list_empty(&fc->pending)) 920 else if (request_pending(fc))
804 mask |= POLLIN | POLLRDNORM; 921 mask |= POLLIN | POLLRDNORM;
805 spin_unlock(&fc->lock); 922 spin_unlock(&fc->lock);
806 923
@@ -826,7 +943,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
826/* 943/*
827 * Abort requests under I/O 944 * Abort requests under I/O
828 * 945 *
829 * The requests are set to interrupted and finished, and the request 946 * The requests are set to aborted and finished, and the request
830 * waiter is woken up. This will make request_wait_answer() wait 947 * waiter is woken up. This will make request_wait_answer() wait
831 * until the request is unlocked and then return. 948 * until the request is unlocked and then return.
832 * 949 *
@@ -841,7 +958,7 @@ static void end_io_requests(struct fuse_conn *fc)
841 list_entry(fc->io.next, struct fuse_req, list); 958 list_entry(fc->io.next, struct fuse_req, list);
842 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 959 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
843 960
844 req->interrupted = 1; 961 req->aborted = 1;
845 req->out.h.error = -ECONNABORTED; 962 req->out.h.error = -ECONNABORTED;
846 req->state = FUSE_REQ_FINISHED; 963 req->state = FUSE_REQ_FINISHED;
847 list_del_init(&req->list); 964 list_del_init(&req->list);
@@ -874,19 +991,20 @@ static void end_io_requests(struct fuse_conn *fc)
874 * onto the pending list is prevented by req->connected being false. 991 * onto the pending list is prevented by req->connected being false.
875 * 992 *
876 * Progression of requests under I/O to the processing list is 993 * Progression of requests under I/O to the processing list is
877 * prevented by the req->interrupted flag being true for these 994 * prevented by the req->aborted flag being true for these requests.
878 * requests. For this reason requests on the io list must be aborted 995 * For this reason requests on the io list must be aborted first.
879 * first.
880 */ 996 */
881void fuse_abort_conn(struct fuse_conn *fc) 997void fuse_abort_conn(struct fuse_conn *fc)
882{ 998{
883 spin_lock(&fc->lock); 999 spin_lock(&fc->lock);
884 if (fc->connected) { 1000 if (fc->connected) {
885 fc->connected = 0; 1001 fc->connected = 0;
1002 fc->blocked = 0;
886 end_io_requests(fc); 1003 end_io_requests(fc);
887 end_requests(fc, &fc->pending); 1004 end_requests(fc, &fc->pending);
888 end_requests(fc, &fc->processing); 1005 end_requests(fc, &fc->processing);
889 wake_up_all(&fc->waitq); 1006 wake_up_all(&fc->waitq);
1007 wake_up_all(&fc->blocked_waitq);
890 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1008 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
891 } 1009 }
892 spin_unlock(&fc->lock); 1010 spin_unlock(&fc->lock);
@@ -902,7 +1020,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
902 end_requests(fc, &fc->processing); 1020 end_requests(fc, &fc->processing);
903 spin_unlock(&fc->lock); 1021 spin_unlock(&fc->lock);
904 fasync_helper(-1, file, 0, &fc->fasync); 1022 fasync_helper(-1, file, 0, &fc->fasync);
905 kobject_put(&fc->kobj); 1023 fuse_conn_put(fc);
906 } 1024 }
907 1025
908 return 0; 1026 return 0;
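
The dev.c rework replaces the old "background on interrupt" scheme with an explicit FUSE_INTERRUPT message: a signalled request stays queued, and the kernel sends a small interrupt request carrying the original's unique id (assembled on demand in fuse_read_interrupt() above). The daemon may answer the interrupt with -ENOSYS (interrupts unsupported, never sent again), with -EAGAIN (original not found yet, so the kernel requeues the interrupt), or simply abort the operation and reply -EINTR to the original request. A compilable toy sketch of the userspace side; the request table and reply path are stand-ins, and only struct fuse_interrupt_in mirrors <linux/fuse.h>:

#include <errno.h>
#include <stdio.h>
#include <stdint.h>

struct fuse_interrupt_in { uint64_t unique; };	/* as in <linux/fuse.h> */

static uint64_t in_flight = 7;	/* toy table: one pending request */

static int find_request(uint64_t unique)
{
	return unique == in_flight;
}

static void reply_error(uint64_t unique, int error)
{
	printf("reply: unique=%llu error=%d\n",
	       (unsigned long long)unique, error);
}

static void handle_interrupt(uint64_t intr_unique,
			     const struct fuse_interrupt_in *arg)
{
	if (!find_request(arg->unique)) {
		/* Original not seen yet; the kernel requeues on -EAGAIN
		 * (the oh.error == -EAGAIN branch in fuse_dev_writev). */
		reply_error(intr_unique, -EAGAIN);
		return;
	}
	/* Abort the operation and answer the *original* request;
	 * a handled interrupt itself gets no further reply. */
	reply_error(arg->unique, -EINTR);
}

int main(void)
{
	struct fuse_interrupt_in arg = { .unique = 7 };

	handle_interrupt(100, &arg);	/* 100: kernel-chosen intr_unique */
	return 0;
}
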
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8d7546e832e8..72a74cde6de8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -79,7 +79,6 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
79{ 79{
80 req->in.h.opcode = FUSE_LOOKUP; 80 req->in.h.opcode = FUSE_LOOKUP;
81 req->in.h.nodeid = get_node_id(dir); 81 req->in.h.nodeid = get_node_id(dir);
82 req->inode = dir;
83 req->in.numargs = 1; 82 req->in.numargs = 1;
84 req->in.args[0].size = entry->d_name.len + 1; 83 req->in.args[0].size = entry->d_name.len + 1;
85 req->in.args[0].value = entry->d_name.name; 84 req->in.args[0].value = entry->d_name.name;
@@ -225,6 +224,20 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
225} 224}
226 225
227/* 226/*
227 * Synchronous release for the case when something goes wrong in CREATE_OPEN
228 */
229static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
230 u64 nodeid, int flags)
231{
232 struct fuse_req *req;
233
234 req = fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
235 req->force = 1;
236 request_send(fc, req);
237 fuse_put_request(fc, req);
238}
239
240/*
228 * Atomic create+open operation 241 * Atomic create+open operation
229 * 242 *
230 * If the filesystem doesn't support this, then fall back to separate 243 * If the filesystem doesn't support this, then fall back to separate
@@ -237,6 +250,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
237 struct inode *inode; 250 struct inode *inode;
238 struct fuse_conn *fc = get_fuse_conn(dir); 251 struct fuse_conn *fc = get_fuse_conn(dir);
239 struct fuse_req *req; 252 struct fuse_req *req;
253 struct fuse_req *forget_req;
240 struct fuse_open_in inarg; 254 struct fuse_open_in inarg;
241 struct fuse_open_out outopen; 255 struct fuse_open_out outopen;
242 struct fuse_entry_out outentry; 256 struct fuse_entry_out outentry;
@@ -247,9 +261,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
247 if (fc->no_create) 261 if (fc->no_create)
248 return -ENOSYS; 262 return -ENOSYS;
249 263
264 forget_req = fuse_get_req(fc);
265 if (IS_ERR(forget_req))
266 return PTR_ERR(forget_req);
267
250 req = fuse_get_req(fc); 268 req = fuse_get_req(fc);
269 err = PTR_ERR(req);
251 if (IS_ERR(req)) 270 if (IS_ERR(req))
252 return PTR_ERR(req); 271 goto out_put_forget_req;
253 272
254 err = -ENOMEM; 273 err = -ENOMEM;
255 ff = fuse_file_alloc(); 274 ff = fuse_file_alloc();
@@ -262,7 +281,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
262 inarg.mode = mode; 281 inarg.mode = mode;
263 req->in.h.opcode = FUSE_CREATE; 282 req->in.h.opcode = FUSE_CREATE;
264 req->in.h.nodeid = get_node_id(dir); 283 req->in.h.nodeid = get_node_id(dir);
265 req->inode = dir;
266 req->in.numargs = 2; 284 req->in.numargs = 2;
267 req->in.args[0].size = sizeof(inarg); 285 req->in.args[0].size = sizeof(inarg);
268 req->in.args[0].value = &inarg; 286 req->in.args[0].value = &inarg;
@@ -285,25 +303,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
285 if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) 303 if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
286 goto out_free_ff; 304 goto out_free_ff;
287 305
306 fuse_put_request(fc, req);
288 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, 307 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
289 &outentry.attr); 308 &outentry.attr);
290 err = -ENOMEM;
291 if (!inode) { 309 if (!inode) {
292 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 310 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
293 ff->fh = outopen.fh; 311 ff->fh = outopen.fh;
294 /* Special release, with inode = NULL, this will 312 fuse_sync_release(fc, ff, outentry.nodeid, flags);
295 trigger a 'forget' request when the release is 313 fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
296 complete */ 314 return -ENOMEM;
297 fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
298 goto out_put_request;
299 } 315 }
300 fuse_put_request(fc, req); 316 fuse_put_request(fc, forget_req);
301 d_instantiate(entry, inode); 317 d_instantiate(entry, inode);
302 fuse_change_timeout(entry, &outentry); 318 fuse_change_timeout(entry, &outentry);
303 file = lookup_instantiate_filp(nd, entry, generic_file_open); 319 file = lookup_instantiate_filp(nd, entry, generic_file_open);
304 if (IS_ERR(file)) { 320 if (IS_ERR(file)) {
305 ff->fh = outopen.fh; 321 ff->fh = outopen.fh;
306 fuse_send_release(fc, ff, outentry.nodeid, inode, flags, 0); 322 fuse_sync_release(fc, ff, outentry.nodeid, flags);
307 return PTR_ERR(file); 323 return PTR_ERR(file);
308 } 324 }
309 fuse_finish_open(inode, file, ff, &outopen); 325 fuse_finish_open(inode, file, ff, &outopen);
@@ -313,6 +329,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
313 fuse_file_free(ff); 329 fuse_file_free(ff);
314 out_put_request: 330 out_put_request:
315 fuse_put_request(fc, req); 331 fuse_put_request(fc, req);
332 out_put_forget_req:
333 fuse_put_request(fc, forget_req);
316 return err; 334 return err;
317} 335}
318 336
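The reworked fuse_create_open() above reserves a second request (forget_req) before anything is sent: if fuse_iget() fails after the server has already created and opened the file, a FORGET must still go out, and at that point an allocation failure would be unrecoverable. A minimal userspace sketch of this reserve-the-cleanup-resource-first pattern (names and the malloc stand-in are illustrative, not kernel code):

#include <stdlib.h>

struct req { const char *what; };

static struct req *get_req(const char *what)
{
        struct req *r = malloc(sizeof(*r));
        if (r)
                r->what = what;
        return r;
}

int create_open(void)
{
        /* Reserve the cleanup request before the operation that may
           need it: once the server has created the node, sending
           FORGET must not be able to fail. */
        struct req *forget_req = get_req("FORGET");
        if (!forget_req)
                return -1;

        struct req *req = get_req("CREATE");
        if (!req) {
                free(forget_req);               /* out_put_forget_req */
                return -1;
        }

        /* ... send CREATE; on a late failure forget_req is ready,
           on success it is simply put back ... */
        free(req);
        free(forget_req);
        return 0;
}

int main(void)
{
        return create_open() ? 1 : 0;
}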
@@ -328,7 +346,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
328 int err; 346 int err;
329 347
330 req->in.h.nodeid = get_node_id(dir); 348 req->in.h.nodeid = get_node_id(dir);
331 req->inode = dir;
332 req->out.numargs = 1; 349 req->out.numargs = 1;
333 req->out.args[0].size = sizeof(outarg); 350 req->out.args[0].size = sizeof(outarg);
334 req->out.args[0].value = &outarg; 351 req->out.args[0].value = &outarg;
@@ -448,7 +465,6 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
448 465
449 req->in.h.opcode = FUSE_UNLINK; 466 req->in.h.opcode = FUSE_UNLINK;
450 req->in.h.nodeid = get_node_id(dir); 467 req->in.h.nodeid = get_node_id(dir);
451 req->inode = dir;
452 req->in.numargs = 1; 468 req->in.numargs = 1;
453 req->in.args[0].size = entry->d_name.len + 1; 469 req->in.args[0].size = entry->d_name.len + 1;
454 req->in.args[0].value = entry->d_name.name; 470 req->in.args[0].value = entry->d_name.name;
@@ -480,7 +496,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
480 496
481 req->in.h.opcode = FUSE_RMDIR; 497 req->in.h.opcode = FUSE_RMDIR;
482 req->in.h.nodeid = get_node_id(dir); 498 req->in.h.nodeid = get_node_id(dir);
483 req->inode = dir;
484 req->in.numargs = 1; 499 req->in.numargs = 1;
485 req->in.args[0].size = entry->d_name.len + 1; 500 req->in.args[0].size = entry->d_name.len + 1;
486 req->in.args[0].value = entry->d_name.name; 501 req->in.args[0].value = entry->d_name.name;
@@ -510,8 +525,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
510 inarg.newdir = get_node_id(newdir); 525 inarg.newdir = get_node_id(newdir);
511 req->in.h.opcode = FUSE_RENAME; 526 req->in.h.opcode = FUSE_RENAME;
512 req->in.h.nodeid = get_node_id(olddir); 527 req->in.h.nodeid = get_node_id(olddir);
513 req->inode = olddir;
514 req->inode2 = newdir;
515 req->in.numargs = 3; 528 req->in.numargs = 3;
516 req->in.args[0].size = sizeof(inarg); 529 req->in.args[0].size = sizeof(inarg);
517 req->in.args[0].value = &inarg; 530 req->in.args[0].value = &inarg;
@@ -558,7 +571,6 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
558 memset(&inarg, 0, sizeof(inarg)); 571 memset(&inarg, 0, sizeof(inarg));
559 inarg.oldnodeid = get_node_id(inode); 572 inarg.oldnodeid = get_node_id(inode);
560 req->in.h.opcode = FUSE_LINK; 573 req->in.h.opcode = FUSE_LINK;
561 req->inode2 = inode;
562 req->in.numargs = 2; 574 req->in.numargs = 2;
563 req->in.args[0].size = sizeof(inarg); 575 req->in.args[0].size = sizeof(inarg);
564 req->in.args[0].value = &inarg; 576 req->in.args[0].value = &inarg;
@@ -587,7 +599,6 @@ int fuse_do_getattr(struct inode *inode)
587 599
588 req->in.h.opcode = FUSE_GETATTR; 600 req->in.h.opcode = FUSE_GETATTR;
589 req->in.h.nodeid = get_node_id(inode); 601 req->in.h.nodeid = get_node_id(inode);
590 req->inode = inode;
591 req->out.numargs = 1; 602 req->out.numargs = 1;
592 req->out.args[0].size = sizeof(arg); 603 req->out.args[0].size = sizeof(arg);
593 req->out.args[0].value = &arg; 604 req->out.args[0].value = &arg;
@@ -679,7 +690,6 @@ static int fuse_access(struct inode *inode, int mask)
679 inarg.mask = mask; 690 inarg.mask = mask;
680 req->in.h.opcode = FUSE_ACCESS; 691 req->in.h.opcode = FUSE_ACCESS;
681 req->in.h.nodeid = get_node_id(inode); 692 req->in.h.nodeid = get_node_id(inode);
682 req->inode = inode;
683 req->in.numargs = 1; 693 req->in.numargs = 1;
684 req->in.args[0].size = sizeof(inarg); 694 req->in.args[0].size = sizeof(inarg);
685 req->in.args[0].value = &inarg; 695 req->in.args[0].value = &inarg;
@@ -820,7 +830,6 @@ static char *read_link(struct dentry *dentry)
820 } 830 }
821 req->in.h.opcode = FUSE_READLINK; 831 req->in.h.opcode = FUSE_READLINK;
822 req->in.h.nodeid = get_node_id(inode); 832 req->in.h.nodeid = get_node_id(inode);
823 req->inode = inode;
824 req->out.argvar = 1; 833 req->out.argvar = 1;
825 req->out.numargs = 1; 834 req->out.numargs = 1;
826 req->out.args[0].size = PAGE_SIZE - 1; 835 req->out.args[0].size = PAGE_SIZE - 1;
@@ -939,7 +948,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
939 iattr_to_fattr(attr, &inarg); 948 iattr_to_fattr(attr, &inarg);
940 req->in.h.opcode = FUSE_SETATTR; 949 req->in.h.opcode = FUSE_SETATTR;
941 req->in.h.nodeid = get_node_id(inode); 950 req->in.h.nodeid = get_node_id(inode);
942 req->inode = inode;
943 req->in.numargs = 1; 951 req->in.numargs = 1;
944 req->in.args[0].size = sizeof(inarg); 952 req->in.args[0].size = sizeof(inarg);
945 req->in.args[0].value = &inarg; 953 req->in.args[0].value = &inarg;
@@ -1002,7 +1010,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1002 inarg.flags = flags; 1010 inarg.flags = flags;
1003 req->in.h.opcode = FUSE_SETXATTR; 1011 req->in.h.opcode = FUSE_SETXATTR;
1004 req->in.h.nodeid = get_node_id(inode); 1012 req->in.h.nodeid = get_node_id(inode);
1005 req->inode = inode;
1006 req->in.numargs = 3; 1013 req->in.numargs = 3;
1007 req->in.args[0].size = sizeof(inarg); 1014 req->in.args[0].size = sizeof(inarg);
1008 req->in.args[0].value = &inarg; 1015 req->in.args[0].value = &inarg;
@@ -1041,7 +1048,6 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
1041 inarg.size = size; 1048 inarg.size = size;
1042 req->in.h.opcode = FUSE_GETXATTR; 1049 req->in.h.opcode = FUSE_GETXATTR;
1043 req->in.h.nodeid = get_node_id(inode); 1050 req->in.h.nodeid = get_node_id(inode);
1044 req->inode = inode;
1045 req->in.numargs = 2; 1051 req->in.numargs = 2;
1046 req->in.args[0].size = sizeof(inarg); 1052 req->in.args[0].size = sizeof(inarg);
1047 req->in.args[0].value = &inarg; 1053 req->in.args[0].value = &inarg;
@@ -1091,7 +1097,6 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
1091 inarg.size = size; 1097 inarg.size = size;
1092 req->in.h.opcode = FUSE_LISTXATTR; 1098 req->in.h.opcode = FUSE_LISTXATTR;
1093 req->in.h.nodeid = get_node_id(inode); 1099 req->in.h.nodeid = get_node_id(inode);
1094 req->inode = inode;
1095 req->in.numargs = 1; 1100 req->in.numargs = 1;
1096 req->in.args[0].size = sizeof(inarg); 1101 req->in.args[0].size = sizeof(inarg);
1097 req->in.args[0].value = &inarg; 1102 req->in.args[0].value = &inarg;
@@ -1135,7 +1140,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1135 1140
1136 req->in.h.opcode = FUSE_REMOVEXATTR; 1141 req->in.h.opcode = FUSE_REMOVEXATTR;
1137 req->in.h.nodeid = get_node_id(inode); 1142 req->in.h.nodeid = get_node_id(inode);
1138 req->inode = inode;
1139 req->in.numargs = 1; 1143 req->in.numargs = 1;
1140 req->in.args[0].size = strlen(name) + 1; 1144 req->in.args[0].size = strlen(name) + 1;
1141 req->in.args[0].value = name; 1145 req->in.args[0].value = name;
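All of the dir.c hunks above delete the req->inode (and req->inode2) assignments. A synchronous request is fully consumed before its caller drops its own references, so these pointers only mattered for backgrounded requests, and those now take proper counted references instead (see the fs/fuse/file.c release path below). A compilable sketch of the difference, with illustrative struct names:

struct inode;
struct vfsmount;
struct dentry;

/* Before: bare pointers, valid only as long as someone else
   happened to hold the objects (illustrative field subset). */
struct fuse_req_old {
        struct inode *inode;
        struct inode *inode2;
};

/* After: a backgrounded request pins what it needs itself, via
   mntget()/dget(), and drops the references when it completes. */
struct fuse_req_new {
        struct vfsmount *vfsmount;
        struct dentry *dentry;
};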
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 087f3b734f40..28aa81eae2cc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -30,7 +30,6 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
30 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 30 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
31 req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 31 req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
32 req->in.h.nodeid = get_node_id(inode); 32 req->in.h.nodeid = get_node_id(inode);
33 req->inode = inode;
34 req->in.numargs = 1; 33 req->in.numargs = 1;
35 req->in.args[0].size = sizeof(inarg); 34 req->in.args[0].size = sizeof(inarg);
36 req->in.args[0].value = &inarg; 35 req->in.args[0].value = &inarg;
@@ -49,8 +48,8 @@ struct fuse_file *fuse_file_alloc(void)
49 struct fuse_file *ff; 48 struct fuse_file *ff;
50 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 49 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
51 if (ff) { 50 if (ff) {
52 ff->release_req = fuse_request_alloc(); 51 ff->reserved_req = fuse_request_alloc();
53 if (!ff->release_req) { 52 if (!ff->reserved_req) {
54 kfree(ff); 53 kfree(ff);
55 ff = NULL; 54 ff = NULL;
56 } 55 }
@@ -60,7 +59,7 @@ struct fuse_file *fuse_file_alloc(void)
60 59
61void fuse_file_free(struct fuse_file *ff) 60void fuse_file_free(struct fuse_file *ff)
62{ 61{
63 fuse_request_free(ff->release_req); 62 fuse_request_free(ff->reserved_req);
64 kfree(ff); 63 kfree(ff);
65} 64}
66 65
@@ -113,37 +112,22 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
113 return err; 112 return err;
114} 113}
115 114
116/* Special case for failed iget in CREATE */ 115struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
117static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 116 int opcode)
118{ 117{
119 /* If called from end_io_requests(), req has more than one 118 struct fuse_req *req = ff->reserved_req;
120 reference and fuse_reset_request() cannot work */
121 if (fc->connected) {
122 u64 nodeid = req->in.h.nodeid;
123 fuse_reset_request(req);
124 fuse_send_forget(fc, req, nodeid, 1);
125 } else
126 fuse_put_request(fc, req);
127}
128
129void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
130 u64 nodeid, struct inode *inode, int flags, int isdir)
131{
132 struct fuse_req * req = ff->release_req;
133 struct fuse_release_in *inarg = &req->misc.release_in; 119 struct fuse_release_in *inarg = &req->misc.release_in;
134 120
135 inarg->fh = ff->fh; 121 inarg->fh = ff->fh;
136 inarg->flags = flags; 122 inarg->flags = flags;
137 req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; 123 req->in.h.opcode = opcode;
138 req->in.h.nodeid = nodeid; 124 req->in.h.nodeid = nodeid;
139 req->inode = inode;
140 req->in.numargs = 1; 125 req->in.numargs = 1;
141 req->in.args[0].size = sizeof(struct fuse_release_in); 126 req->in.args[0].size = sizeof(struct fuse_release_in);
142 req->in.args[0].value = inarg; 127 req->in.args[0].value = inarg;
143 request_send_background(fc, req);
144 if (!inode)
145 req->end = fuse_release_end;
146 kfree(ff); 128 kfree(ff);
129
130 return req;
147} 131}
148 132
149int fuse_release_common(struct inode *inode, struct file *file, int isdir) 133int fuse_release_common(struct inode *inode, struct file *file, int isdir)
@@ -151,8 +135,15 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
151 struct fuse_file *ff = file->private_data; 135 struct fuse_file *ff = file->private_data;
152 if (ff) { 136 if (ff) {
153 struct fuse_conn *fc = get_fuse_conn(inode); 137 struct fuse_conn *fc = get_fuse_conn(inode);
154 u64 nodeid = get_node_id(inode); 138 struct fuse_req *req;
155 fuse_send_release(fc, ff, nodeid, inode, file->f_flags, isdir); 139
140 req = fuse_release_fill(ff, get_node_id(inode), file->f_flags,
141 isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
142
143 /* Hold vfsmount and dentry until release is finished */
144 req->vfsmount = mntget(file->f_vfsmnt);
145 req->dentry = dget(file->f_dentry);
146 request_send_background(fc, req);
156 } 147 }
157 148
158 /* Return value is ignored by VFS */ 149 /* Return value is ignored by VFS */
@@ -169,6 +160,28 @@ static int fuse_release(struct inode *inode, struct file *file)
169 return fuse_release_common(inode, file, 0); 160 return fuse_release_common(inode, file, 0);
170} 161}
171 162
163/*
164 * Scramble the ID space with XTEA, so that the value of the files_struct
165 * pointer is not exposed to userspace.
166 */
167static u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
168{
169 u32 *k = fc->scramble_key;
170 u64 v = (unsigned long) id;
171 u32 v0 = v;
172 u32 v1 = v >> 32;
173 u32 sum = 0;
174 int i;
175
176 for (i = 0; i < 32; i++) {
177 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
178 sum += 0x9E3779B9;
179 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
180 }
181
182 return (u64) v0 + ((u64) v1 << 32);
183}
184
172static int fuse_flush(struct file *file, fl_owner_t id) 185static int fuse_flush(struct file *file, fl_owner_t id)
173{ 186{
174 struct inode *inode = file->f_dentry->d_inode; 187 struct inode *inode = file->f_dentry->d_inode;
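fuse_lock_owner_id() above is a straight 32-round XTEA encipherment of the 64-bit owner value under the per-connection scramble_key (filled with get_random_bytes() in inode.c below), so the kernel pointer used as fl_owner never reaches userspace in the clear while equal owners still map to equal ids. The same rounds, extracted into a standalone program with made-up key and input values:

#include <stdint.h>
#include <stdio.h>

static uint64_t lock_owner_id(const uint32_t k[4], uint64_t v)
{
        uint32_t v0 = (uint32_t)v;
        uint32_t v1 = (uint32_t)(v >> 32);
        uint32_t sum = 0;
        int i;

        /* 32 XTEA rounds, exactly as in the hunk above. */
        for (i = 0; i < 32; i++) {
                v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
                sum += 0x9E3779B9;
                v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum >> 11 & 3]);
        }
        return (uint64_t)v0 + ((uint64_t)v1 << 32);
}

int main(void)
{
        const uint32_t key[4] = { 0x12345678, 0x9abcdef0,
                                  0x0f1e2d3c, 0x4b5a6978 };
        printf("%016llx\n",
               (unsigned long long)lock_owner_id(key, 0xdeadbeefcafef00dULL));
        return 0;
}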
@@ -184,19 +197,16 @@ static int fuse_flush(struct file *file, fl_owner_t id)
184 if (fc->no_flush) 197 if (fc->no_flush)
185 return 0; 198 return 0;
186 199
187 req = fuse_get_req(fc); 200 req = fuse_get_req_nofail(fc, file);
188 if (IS_ERR(req))
189 return PTR_ERR(req);
190
191 memset(&inarg, 0, sizeof(inarg)); 201 memset(&inarg, 0, sizeof(inarg));
192 inarg.fh = ff->fh; 202 inarg.fh = ff->fh;
203 inarg.lock_owner = fuse_lock_owner_id(fc, id);
193 req->in.h.opcode = FUSE_FLUSH; 204 req->in.h.opcode = FUSE_FLUSH;
194 req->in.h.nodeid = get_node_id(inode); 205 req->in.h.nodeid = get_node_id(inode);
195 req->inode = inode;
196 req->file = file;
197 req->in.numargs = 1; 206 req->in.numargs = 1;
198 req->in.args[0].size = sizeof(inarg); 207 req->in.args[0].size = sizeof(inarg);
199 req->in.args[0].value = &inarg; 208 req->in.args[0].value = &inarg;
209 req->force = 1;
200 request_send(fc, req); 210 request_send(fc, req);
201 err = req->out.h.error; 211 err = req->out.h.error;
202 fuse_put_request(fc, req); 212 fuse_put_request(fc, req);
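fuse_flush() can no longer fail with -ENOMEM or -EINTR: fuse_get_req_nofail() presumably falls back to the request reserved at open time in fuse_file_alloc() (reserved_req, recorded via the new stolen_file field) when normal allocation is impossible, and req->force = 1 asks for the request to be processed even if the caller has a signal pending. A rough userspace sketch of the fallback idea, with all names illustrative:

#include <stdlib.h>

struct req { int from_reserve; };
struct file_priv { struct req reserved; };      /* like fuse_file */

static struct req *get_req_nofail(struct file_priv *ff)
{
        struct req *r = malloc(sizeof(*r));
        if (r) {
                r->from_reserve = 0;
                return r;
        }
        r = &ff->reserved;      /* reserved at open time: cannot fail */
        r->from_reserve = 1;
        return r;
}

static void put_req(struct req *r)
{
        if (!r->from_reserve)
                free(r);        /* the reserved one is reused, not freed */
}

int main(void)
{
        struct file_priv ff;
        put_req(get_req_nofail(&ff));
        return 0;
}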
@@ -232,8 +242,6 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
232 inarg.fsync_flags = datasync ? 1 : 0; 242 inarg.fsync_flags = datasync ? 1 : 0;
233 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; 243 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
234 req->in.h.nodeid = get_node_id(inode); 244 req->in.h.nodeid = get_node_id(inode);
235 req->inode = inode;
236 req->file = file;
237 req->in.numargs = 1; 245 req->in.numargs = 1;
238 req->in.args[0].size = sizeof(inarg); 246 req->in.args[0].size = sizeof(inarg);
239 req->in.args[0].value = &inarg; 247 req->in.args[0].value = &inarg;
@@ -266,8 +274,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
266 inarg->size = count; 274 inarg->size = count;
267 req->in.h.opcode = opcode; 275 req->in.h.opcode = opcode;
268 req->in.h.nodeid = get_node_id(inode); 276 req->in.h.nodeid = get_node_id(inode);
269 req->inode = inode;
270 req->file = file;
271 req->in.numargs = 1; 277 req->in.numargs = 1;
272 req->in.args[0].size = sizeof(struct fuse_read_in); 278 req->in.args[0].size = sizeof(struct fuse_read_in);
273 req->in.args[0].value = inarg; 279 req->in.args[0].value = inarg;
@@ -342,6 +348,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
342 req->out.page_zeroing = 1; 348 req->out.page_zeroing = 1;
343 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 349 fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
344 if (fc->async_read) { 350 if (fc->async_read) {
351 get_file(file);
352 req->file = file;
345 req->end = fuse_readpages_end; 353 req->end = fuse_readpages_end;
346 request_send_background(fc, req); 354 request_send_background(fc, req);
347 } else { 355 } else {
@@ -420,8 +428,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
420 inarg.size = count; 428 inarg.size = count;
421 req->in.h.opcode = FUSE_WRITE; 429 req->in.h.opcode = FUSE_WRITE;
422 req->in.h.nodeid = get_node_id(inode); 430 req->in.h.nodeid = get_node_id(inode);
423 req->inode = inode;
424 req->file = file;
425 req->in.argpages = 1; 431 req->in.argpages = 1;
426 req->in.numargs = 2; 432 req->in.numargs = 2;
427 req->in.args[0].size = sizeof(struct fuse_write_in); 433 req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -619,6 +625,126 @@ static int fuse_set_page_dirty(struct page *page)
619 return 0; 625 return 0;
620} 626}
621 627
628static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
629 struct file_lock *fl)
630{
631 switch (ffl->type) {
632 case F_UNLCK:
633 break;
634
635 case F_RDLCK:
636 case F_WRLCK:
637 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
638 ffl->end < ffl->start)
639 return -EIO;
640
641 fl->fl_start = ffl->start;
642 fl->fl_end = ffl->end;
643 fl->fl_pid = ffl->pid;
644 break;
645
646 default:
647 return -EIO;
648 }
649 fl->fl_type = ffl->type;
650 return 0;
651}
652
653static void fuse_lk_fill(struct fuse_req *req, struct file *file,
654 const struct file_lock *fl, int opcode, pid_t pid)
655{
656 struct inode *inode = file->f_dentry->d_inode;
657 struct fuse_conn *fc = get_fuse_conn(inode);
658 struct fuse_file *ff = file->private_data;
659 struct fuse_lk_in *arg = &req->misc.lk_in;
660
661 arg->fh = ff->fh;
662 arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
663 arg->lk.start = fl->fl_start;
664 arg->lk.end = fl->fl_end;
665 arg->lk.type = fl->fl_type;
666 arg->lk.pid = pid;
667 req->in.h.opcode = opcode;
668 req->in.h.nodeid = get_node_id(inode);
669 req->in.numargs = 1;
670 req->in.args[0].size = sizeof(*arg);
671 req->in.args[0].value = arg;
672}
673
674static int fuse_getlk(struct file *file, struct file_lock *fl)
675{
676 struct inode *inode = file->f_dentry->d_inode;
677 struct fuse_conn *fc = get_fuse_conn(inode);
678 struct fuse_req *req;
679 struct fuse_lk_out outarg;
680 int err;
681
682 req = fuse_get_req(fc);
683 if (IS_ERR(req))
684 return PTR_ERR(req);
685
686 fuse_lk_fill(req, file, fl, FUSE_GETLK, 0);
687 req->out.numargs = 1;
688 req->out.args[0].size = sizeof(outarg);
689 req->out.args[0].value = &outarg;
690 request_send(fc, req);
691 err = req->out.h.error;
692 fuse_put_request(fc, req);
693 if (!err)
694 err = convert_fuse_file_lock(&outarg.lk, fl);
695
696 return err;
697}
698
699static int fuse_setlk(struct file *file, struct file_lock *fl)
700{
701 struct inode *inode = file->f_dentry->d_inode;
702 struct fuse_conn *fc = get_fuse_conn(inode);
703 struct fuse_req *req;
704 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
705 pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
706 int err;
707
708 /* Unlock on close is handled by the flush method */
709 if (fl->fl_flags & FL_CLOSE)
710 return 0;
711
712 req = fuse_get_req(fc);
713 if (IS_ERR(req))
714 return PTR_ERR(req);
715
716 fuse_lk_fill(req, file, fl, opcode, pid);
717 request_send(fc, req);
718 err = req->out.h.error;
719 /* locking is restartable */
720 if (err == -EINTR)
721 err = -ERESTARTSYS;
722 fuse_put_request(fc, req);
723 return err;
724}
725
726static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
727{
728 struct inode *inode = file->f_dentry->d_inode;
729 struct fuse_conn *fc = get_fuse_conn(inode);
730 int err;
731
732 if (cmd == F_GETLK) {
733 if (fc->no_lock) {
734 if (!posix_test_lock(file, fl, fl))
735 fl->fl_type = F_UNLCK;
736 err = 0;
737 } else
738 err = fuse_getlk(file, fl);
739 } else {
740 if (fc->no_lock)
741 err = posix_lock_file_wait(file, fl);
742 else
743 err = fuse_setlk(file, fl);
744 }
745 return err;
746}
747
622static const struct file_operations fuse_file_operations = { 748static const struct file_operations fuse_file_operations = {
623 .llseek = generic_file_llseek, 749 .llseek = generic_file_llseek,
624 .read = generic_file_read, 750 .read = generic_file_read,
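With fuse_file_lock() wired into both file_operations tables below, fcntl() byte-range locks on a FUSE file reach the server as FUSE_GETLK/FUSE_SETLK/FUSE_SETLKW; when the server opted out (fc->no_lock), fuse_file_lock() above falls back to ordinary local POSIX locking. A small userspace exercise (the mount point path is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/fuse/file", O_RDWR);
        if (fd < 0) { perror("open"); return 1; }

        struct flock fl = {
                .l_type = F_WRLCK, .l_whence = SEEK_SET,
                .l_start = 0, .l_len = 0,       /* whole file */
        };
        if (fcntl(fd, F_SETLK, &fl) == -1)      /* -> FUSE_SETLK */
                perror("F_SETLK");

        fl.l_type = F_WRLCK;
        if (fcntl(fd, F_GETLK, &fl) == 0)       /* -> FUSE_GETLK */
                printf("conflict: %s (pid %d)\n",
                       fl.l_type == F_UNLCK ? "none" : "yes",
                       (int)fl.l_pid);

        close(fd);
        return 0;
}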
@@ -628,6 +754,7 @@ static const struct file_operations fuse_file_operations = {
628 .flush = fuse_flush, 754 .flush = fuse_flush,
629 .release = fuse_release, 755 .release = fuse_release,
630 .fsync = fuse_fsync, 756 .fsync = fuse_fsync,
757 .lock = fuse_file_lock,
631 .sendfile = generic_file_sendfile, 758 .sendfile = generic_file_sendfile,
632}; 759};
633 760
@@ -639,6 +766,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
639 .flush = fuse_flush, 766 .flush = fuse_flush,
640 .release = fuse_release, 767 .release = fuse_release,
641 .fsync = fuse_fsync, 768 .fsync = fuse_fsync,
769 .lock = fuse_file_lock,
642 /* no mmap and sendfile */ 770 /* no mmap and sendfile */
643}; 771};
644 772
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0474202cb5dc..0dbf96621841 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -8,12 +8,13 @@
8 8
9#include <linux/fuse.h> 9#include <linux/fuse.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mount.h>
11#include <linux/wait.h> 12#include <linux/wait.h>
12#include <linux/list.h> 13#include <linux/list.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
16#include <asm/semaphore.h> 17#include <linux/mutex.h>
17 18
18/** Max number of pages that can be used in a single read request */ 19/** Max number of pages that can be used in a single read request */
19#define FUSE_MAX_PAGES_PER_REQ 32 20#define FUSE_MAX_PAGES_PER_REQ 32
@@ -24,6 +25,9 @@
24/** It could be as large as PATH_MAX, but would that have any uses? */ 25/** It could be as large as PATH_MAX, but would that have any uses? */
25#define FUSE_NAME_MAX 1024 26#define FUSE_NAME_MAX 1024
26 27
28/** Number of dentries for each connection in the control filesystem */
29#define FUSE_CTL_NUM_DENTRIES 3
30
27/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 31/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
28 module will check permissions based on the file mode. Otherwise no 32 module will check permissions based on the file mode. Otherwise no
29 permission checking is done in the kernel */ 33 permission checking is done in the kernel */
@@ -33,6 +37,11 @@
33 doing the mount will be allowed to access the filesystem */ 37 doing the mount will be allowed to access the filesystem */
34#define FUSE_ALLOW_OTHER (1 << 1) 38#define FUSE_ALLOW_OTHER (1 << 1)
35 39
40/** List of active connections */
41extern struct list_head fuse_conn_list;
42
43/** Global mutex protecting fuse_conn_list and the control filesystem */
44extern struct mutex fuse_mutex;
36 45
37/** FUSE inode */ 46/** FUSE inode */
38struct fuse_inode { 47struct fuse_inode {
@@ -56,7 +65,7 @@ struct fuse_inode {
56/** FUSE specific file data */ 65/** FUSE specific file data */
57struct fuse_file { 66struct fuse_file {
58 /** Request reserved for flush and release */ 67 /** Request reserved for flush and release */
59 struct fuse_req *release_req; 68 struct fuse_req *reserved_req;
60 69
61 /** File handle used by userspace */ 70 /** File handle used by userspace */
62 u64 fh; 71 u64 fh;
@@ -122,6 +131,7 @@ enum fuse_req_state {
122 FUSE_REQ_PENDING, 131 FUSE_REQ_PENDING,
123 FUSE_REQ_READING, 132 FUSE_REQ_READING,
124 FUSE_REQ_SENT, 133 FUSE_REQ_SENT,
134 FUSE_REQ_WRITING,
125 FUSE_REQ_FINISHED 135 FUSE_REQ_FINISHED
126}; 136};
127 137
@@ -135,12 +145,15 @@ struct fuse_req {
135 fuse_conn */ 145 fuse_conn */
136 struct list_head list; 146 struct list_head list;
137 147
138 /** Entry on the background list */ 148 /** Entry on the interrupts list */
139 struct list_head bg_entry; 149 struct list_head intr_entry;
140 150
141 /** refcount */ 151 /** refcount */
142 atomic_t count; 152 atomic_t count;
143 153
154 /** Unique ID for the interrupt request */
155 u64 intr_unique;
156
144 /* 157 /*
145 * The following bitfields are either set once before the 158 * The following bitfields are either set once before the
146 * request is queued or setting/clearing them is protected by 159 * request is queued or setting/clearing them is protected by
@@ -150,12 +163,18 @@ struct fuse_req {
150 /** True if the request has reply */ 163 /** True if the request has reply */
151 unsigned isreply:1; 164 unsigned isreply:1;
152 165
153 /** The request was interrupted */ 166 /** Force sending of the request even if interrupted */
154 unsigned interrupted:1; 167 unsigned force:1;
168
169 /** The request was aborted */
170 unsigned aborted:1;
155 171
156 /** Request is sent in the background */ 172 /** Request is sent in the background */
157 unsigned background:1; 173 unsigned background:1;
158 174
175 /** The request has been interrupted */
176 unsigned interrupted:1;
177
159 /** Data is being copied to/from the request */ 178 /** Data is being copied to/from the request */
160 unsigned locked:1; 179 unsigned locked:1;
161 180
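The new intr_entry/intr_unique fields and the force/aborted/interrupted split support the FUSE_INTERRUPT protocol: when a signal arrives after a request has already been handed to userspace, the kernel queues it on the interrupts list added to fuse_conn further down and (in dev.c code outside this section) emits a FUSE_INTERRUPT message naming the original request, instead of silently backgrounding it. The wire body is just the unique id; as added to <linux/fuse.h> in this series (shown here with stdint types):

#include <stdint.h>

/* Body of a FUSE_INTERRUPT request as seen by a userspace server;
   it names the in-flight request that should be finished early. */
struct fuse_interrupt_in {
        uint64_t unique;        /* unique id of the interrupted request */
};

A server that does not implement it replies -ENOSYS once, which sets the no_interrupt flag added to fuse_conn below and stops further interrupt traffic on that connection.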
@@ -181,6 +200,7 @@ struct fuse_req {
181 struct fuse_init_in init_in; 200 struct fuse_init_in init_in;
182 struct fuse_init_out init_out; 201 struct fuse_init_out init_out;
183 struct fuse_read_in read_in; 202 struct fuse_read_in read_in;
203 struct fuse_lk_in lk_in;
184 } misc; 204 } misc;
185 205
186 /** page vector */ 206 /** page vector */
@@ -192,17 +212,20 @@ struct fuse_req {
192 /** offset of data on first page */ 212 /** offset of data on first page */
193 unsigned page_offset; 213 unsigned page_offset;
194 214
195 /** Inode used in the request */
196 struct inode *inode;
197
198 /** Second inode used in the request (or NULL) */
199 struct inode *inode2;
200
201 /** File used in the request (or NULL) */ 215 /** File used in the request (or NULL) */
202 struct file *file; 216 struct file *file;
203 217
218 /** vfsmount used in release */
219 struct vfsmount *vfsmount;
220
221 /** dentry used in release */
222 struct dentry *dentry;
223
204 /** Request completion callback */ 224 /** Request completion callback */
205 void (*end)(struct fuse_conn *, struct fuse_req *); 225 void (*end)(struct fuse_conn *, struct fuse_req *);
226
227 /** Request is stolen from fuse_file->reserved_req */
228 struct file *stolen_file;
206}; 229};
207 230
208/** 231/**
@@ -216,6 +239,9 @@ struct fuse_conn {
216 /** Lock protecting accesses to members of this structure */ 239 /** Lock protecting accesses to members of this structure */
217 spinlock_t lock; 240 spinlock_t lock;
218 241
242 /** Refcount */
243 atomic_t count;
244
219 /** The user id for this mount */ 245 /** The user id for this mount */
220 uid_t user_id; 246 uid_t user_id;
221 247
@@ -243,13 +269,12 @@ struct fuse_conn {
243 /** The list of requests under I/O */ 269 /** The list of requests under I/O */
244 struct list_head io; 270 struct list_head io;
245 271
246 /** Requests put in the background (RELEASE or any other
247 interrupted request) */
248 struct list_head background;
249
250 /** Number of requests currently in the background */ 272 /** Number of requests currently in the background */
251 unsigned num_background; 273 unsigned num_background;
252 274
275 /** Pending interrupts */
276 struct list_head interrupts;
277
253 /** Flag indicating if connection is blocked. This will be 278 /** Flag indicating if connection is blocked. This will be
254 the case before the INIT reply is received, and if there 279 the case before the INIT reply is received, and if there
255 are too many outstanding background requests */ 280 are too many outstanding background requests */
@@ -258,15 +283,9 @@ struct fuse_conn {
258 /** waitq for blocked connection */ 283 /** waitq for blocked connection */
259 wait_queue_head_t blocked_waitq; 284 wait_queue_head_t blocked_waitq;
260 285
261 /** RW semaphore for exclusion with fuse_put_super() */
262 struct rw_semaphore sbput_sem;
263
264 /** The next unique request id */ 286 /** The next unique request id */
265 u64 reqctr; 287 u64 reqctr;
266 288
267 /** Mount is active */
268 unsigned mounted;
269
270 /** Connection established, cleared on umount, connection 289 /** Connection established, cleared on umount, connection
271 abort and device release */ 290 abort and device release */
272 unsigned connected; 291 unsigned connected;
@@ -305,12 +324,18 @@ struct fuse_conn {
305 /** Is removexattr not implemented by fs? */ 324 /** Is removexattr not implemented by fs? */
306 unsigned no_removexattr : 1; 325 unsigned no_removexattr : 1;
307 326
327 /** Are file locking primitives not implemented by fs? */
328 unsigned no_lock : 1;
329
308 /** Is access not implemented by fs? */ 330 /** Is access not implemented by fs? */
309 unsigned no_access : 1; 331 unsigned no_access : 1;
310 332
311 /** Is create not implemented by fs? */ 333 /** Is create not implemented by fs? */
312 unsigned no_create : 1; 334 unsigned no_create : 1;
313 335
336 /** Is interrupt not implemented by fs? */
337 unsigned no_interrupt : 1;
338
314 /** The number of requests waiting for completion */ 339 /** The number of requests waiting for completion */
315 atomic_t num_waiting; 340 atomic_t num_waiting;
316 341
@@ -320,11 +345,23 @@ struct fuse_conn {
320 /** Backing dev info */ 345 /** Backing dev info */
321 struct backing_dev_info bdi; 346 struct backing_dev_info bdi;
322 347
323 /** kobject */ 348 /** Entry on the fuse_conn_list */
324 struct kobject kobj; 349 struct list_head entry;
350
351 /** Unique ID */
352 u64 id;
353
354 /** Dentries in the control filesystem */
355 struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
356
357 /** number of dentries used in the above array */
358 int ctl_ndents;
325 359
326 /** O_ASYNC requests */ 360 /** O_ASYNC requests */
327 struct fasync_struct *fasync; 361 struct fasync_struct *fasync;
362
363 /** Key for lock owner ID scrambling */
364 u32 scramble_key[4];
328}; 365};
329 366
330static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 367static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -337,11 +374,6 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
337 return get_fuse_conn_super(inode->i_sb); 374 return get_fuse_conn_super(inode->i_sb);
338} 375}
339 376
340static inline struct fuse_conn *get_fuse_conn_kobj(struct kobject *obj)
341{
342 return container_of(obj, struct fuse_conn, kobj);
343}
344
345static inline struct fuse_inode *get_fuse_inode(struct inode *inode) 377static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
346{ 378{
347 return container_of(inode, struct fuse_inode, inode); 379 return container_of(inode, struct fuse_inode, inode);
@@ -383,12 +415,9 @@ void fuse_file_free(struct fuse_file *ff);
383void fuse_finish_open(struct inode *inode, struct file *file, 415void fuse_finish_open(struct inode *inode, struct file *file,
384 struct fuse_file *ff, struct fuse_open_out *outarg); 416 struct fuse_file *ff, struct fuse_open_out *outarg);
385 417
386/** 418/** Fill a RELEASE request and free the fuse_file */
387 * Send a RELEASE request 419struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
388 */ 420 int opcode);
389void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
390 u64 nodeid, struct inode *inode, int flags, int isdir);
391
392/** 421/**
393 * Send RELEASE or RELEASEDIR request 422 * Send RELEASE or RELEASEDIR request
394 */ 423 */
@@ -435,6 +464,9 @@ int fuse_dev_init(void);
435 */ 464 */
436void fuse_dev_cleanup(void); 465void fuse_dev_cleanup(void);
437 466
467int fuse_ctl_init(void);
468void fuse_ctl_cleanup(void);
469
438/** 470/**
439 * Allocate a request 471 * Allocate a request
440 */ 472 */
@@ -446,14 +478,14 @@ struct fuse_req *fuse_request_alloc(void);
446void fuse_request_free(struct fuse_req *req); 478void fuse_request_free(struct fuse_req *req);
447 479
448/** 480/**
449 * Reinitialize a request, the preallocated flag is left unmodified 481 * Get a request, may fail with -ENOMEM
450 */ 482 */
451void fuse_reset_request(struct fuse_req *req); 483struct fuse_req *fuse_get_req(struct fuse_conn *fc);
452 484
453/** 485/**
454 * Reserve a preallocated request 486 * Gets a request for a file operation, always succeeds
455 */ 487 */
456struct fuse_req *fuse_get_req(struct fuse_conn *fc); 488struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
457 489
458/** 490/**
459 * Decrement reference count of a request. If count goes to zero free 491 * Decrement reference count of a request. If count goes to zero free
@@ -476,11 +508,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
476 */ 508 */
477void request_send_background(struct fuse_conn *fc, struct fuse_req *req); 509void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
478 510
479/**
480 * Release inodes and file associated with background request
481 */
482void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
483
484/* Abort all requests */ 511/* Abort all requests */
485void fuse_abort_conn(struct fuse_conn *fc); 512void fuse_abort_conn(struct fuse_conn *fc);
486 513
@@ -493,3 +520,23 @@ int fuse_do_getattr(struct inode *inode);
493 * Invalidate inode attributes 520 * Invalidate inode attributes
494 */ 521 */
495void fuse_invalidate_attr(struct inode *inode); 522void fuse_invalidate_attr(struct inode *inode);
523
524/**
525 * Acquire reference to fuse_conn
526 */
527struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
528
529/**
530 * Release reference to fuse_conn
531 */
532void fuse_conn_put(struct fuse_conn *fc);
533
534/**
535 * Add connection to control filesystem
536 */
537int fuse_ctl_add_conn(struct fuse_conn *fc);
538
539/**
540 * Remove connection from control filesystem
541 */
542void fuse_ctl_remove_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a13c0f529058..dcaaabd3b9c4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -11,25 +11,20 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/mount.h>
15#include <linux/seq_file.h> 14#include <linux/seq_file.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/parser.h> 17#include <linux/parser.h>
19#include <linux/statfs.h> 18#include <linux/statfs.h>
19#include <linux/random.h>
20 20
21MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); 21MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
22MODULE_DESCRIPTION("Filesystem in Userspace"); 22MODULE_DESCRIPTION("Filesystem in Userspace");
23MODULE_LICENSE("GPL"); 23MODULE_LICENSE("GPL");
24 24
25static kmem_cache_t *fuse_inode_cachep; 25static kmem_cache_t *fuse_inode_cachep;
26static struct subsystem connections_subsys; 26struct list_head fuse_conn_list;
27 27DEFINE_MUTEX(fuse_mutex);
28struct fuse_conn_attr {
29 struct attribute attr;
30 ssize_t (*show)(struct fuse_conn *, char *);
31 ssize_t (*store)(struct fuse_conn *, const char *, size_t);
32};
33 28
34#define FUSE_SUPER_MAGIC 0x65735546 29#define FUSE_SUPER_MAGIC 0x65735546
35 30
@@ -104,6 +99,14 @@ static void fuse_clear_inode(struct inode *inode)
104 } 99 }
105} 100}
106 101
102static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
103{
104 if (*flags & MS_MANDLOCK)
105 return -EINVAL;
106
107 return 0;
108}
109
107void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) 110void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
108{ 111{
109 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size) 112 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
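The new fuse_remount_fs() exists only to reject MS_MANDLOCK on remount; fuse_fill_super() below gains the same check at mount time. Mandatory locking cannot be honoured when the authoritative lock state lives in an unprivileged userspace server, so the option is refused outright. For example (illustrative mount point; the call fails with EINVAL):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Remounting a fuse mount with mandatory locking is refused. */
        if (mount(NULL, "/mnt/fuse", NULL, MS_REMOUNT | MS_MANDLOCK, NULL))
                perror("mount");        /* expected: EINVAL */
        return 0;
}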
@@ -195,31 +198,29 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
195 return inode; 198 return inode;
196} 199}
197 200
198static void fuse_umount_begin(struct super_block *sb) 201static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
199{ 202{
200 fuse_abort_conn(get_fuse_conn_super(sb)); 203 if (flags & MNT_FORCE)
204 fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
201} 205}
202 206
203static void fuse_put_super(struct super_block *sb) 207static void fuse_put_super(struct super_block *sb)
204{ 208{
205 struct fuse_conn *fc = get_fuse_conn_super(sb); 209 struct fuse_conn *fc = get_fuse_conn_super(sb);
206 210
207 down_write(&fc->sbput_sem);
208 while (!list_empty(&fc->background))
209 fuse_release_background(fc,
210 list_entry(fc->background.next,
211 struct fuse_req, bg_entry));
212
213 spin_lock(&fc->lock); 211 spin_lock(&fc->lock);
214 fc->mounted = 0;
215 fc->connected = 0; 212 fc->connected = 0;
213 fc->blocked = 0;
216 spin_unlock(&fc->lock); 214 spin_unlock(&fc->lock);
217 up_write(&fc->sbput_sem);
218 /* Flush all readers on this fs */ 215 /* Flush all readers on this fs */
219 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 216 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
220 wake_up_all(&fc->waitq); 217 wake_up_all(&fc->waitq);
221 kobject_del(&fc->kobj); 218 wake_up_all(&fc->blocked_waitq);
222 kobject_put(&fc->kobj); 219 mutex_lock(&fuse_mutex);
220 list_del(&fc->entry);
221 fuse_ctl_remove_conn(fc);
222 mutex_unlock(&fuse_mutex);
223 fuse_conn_put(fc);
223} 224}
224 225
225static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) 226static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
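fuse_umount_begin() now receives the vfsmount and the unmount flags (a VFS interface change of this period) and aborts the connection only for a forced unmount, while fuse_put_super() drops the kobject dance in favour of fuse_mutex plus the new connection refcount. From userspace the abort path is simply (example mount point):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* MNT_FORCE reaches fuse_umount_begin() and aborts the
           connection even if the server is unresponsive. */
        if (umount2("/mnt/fuse", MNT_FORCE))
                perror("umount2");
        return 0;
}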
@@ -369,11 +370,6 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
369 return 0; 370 return 0;
370} 371}
371 372
372static void fuse_conn_release(struct kobject *kobj)
373{
374 kfree(get_fuse_conn_kobj(kobj));
375}
376
377static struct fuse_conn *new_conn(void) 373static struct fuse_conn *new_conn(void)
378{ 374{
379 struct fuse_conn *fc; 375 struct fuse_conn *fc;
@@ -381,24 +377,35 @@ static struct fuse_conn *new_conn(void)
381 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 377 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
382 if (fc) { 378 if (fc) {
383 spin_lock_init(&fc->lock); 379 spin_lock_init(&fc->lock);
380 atomic_set(&fc->count, 1);
384 init_waitqueue_head(&fc->waitq); 381 init_waitqueue_head(&fc->waitq);
385 init_waitqueue_head(&fc->blocked_waitq); 382 init_waitqueue_head(&fc->blocked_waitq);
386 INIT_LIST_HEAD(&fc->pending); 383 INIT_LIST_HEAD(&fc->pending);
387 INIT_LIST_HEAD(&fc->processing); 384 INIT_LIST_HEAD(&fc->processing);
388 INIT_LIST_HEAD(&fc->io); 385 INIT_LIST_HEAD(&fc->io);
389 INIT_LIST_HEAD(&fc->background); 386 INIT_LIST_HEAD(&fc->interrupts);
390 init_rwsem(&fc->sbput_sem);
391 kobj_set_kset_s(fc, connections_subsys);
392 kobject_init(&fc->kobj);
393 atomic_set(&fc->num_waiting, 0); 387 atomic_set(&fc->num_waiting, 0);
394 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 388 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
395 fc->bdi.unplug_io_fn = default_unplug_io_fn; 389 fc->bdi.unplug_io_fn = default_unplug_io_fn;
396 fc->reqctr = 0; 390 fc->reqctr = 0;
397 fc->blocked = 1; 391 fc->blocked = 1;
392 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
398 } 393 }
399 return fc; 394 return fc;
400} 395}
401 396
397void fuse_conn_put(struct fuse_conn *fc)
398{
399 if (atomic_dec_and_test(&fc->count))
400 kfree(fc);
401}
402
403struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
404{
405 atomic_inc(&fc->count);
406 return fc;
407}
408
402static struct inode *get_root_inode(struct super_block *sb, unsigned mode) 409static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
403{ 410{
404 struct fuse_attr attr; 411 struct fuse_attr attr;
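fuse_conn_get()/fuse_conn_put() replace the kobject as the connection's lifetime mechanism: the mount, the opened /dev/fuse file and the control filesystem can each hold a reference, and the last put frees the structure. The same pattern in compilable form, with C11 atomics standing in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdlib.h>

struct conn { atomic_int count; };

static struct conn *conn_get(struct conn *c)
{
        atomic_fetch_add(&c->count, 1);
        return c;
}

static void conn_put(struct conn *c)
{
        if (atomic_fetch_sub(&c->count, 1) == 1)
                free(c);        /* last reference dropped */
}

int main(void)
{
        struct conn *c = malloc(sizeof(*c));
        atomic_init(&c->count, 1);      /* creator holds one ref */
        conn_get(c);                    /* e.g. file->private_data */
        conn_put(c);                    /* fuse_put_super() */
        conn_put(c);                    /* device release */
        return 0;
}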
@@ -414,6 +421,7 @@ static struct super_operations fuse_super_operations = {
414 .destroy_inode = fuse_destroy_inode, 421 .destroy_inode = fuse_destroy_inode,
415 .read_inode = fuse_read_inode, 422 .read_inode = fuse_read_inode,
416 .clear_inode = fuse_clear_inode, 423 .clear_inode = fuse_clear_inode,
424 .remount_fs = fuse_remount_fs,
417 .put_super = fuse_put_super, 425 .put_super = fuse_put_super,
418 .umount_begin = fuse_umount_begin, 426 .umount_begin = fuse_umount_begin,
419 .statfs = fuse_statfs, 427 .statfs = fuse_statfs,
@@ -433,8 +441,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
433 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; 441 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
434 if (arg->flags & FUSE_ASYNC_READ) 442 if (arg->flags & FUSE_ASYNC_READ)
435 fc->async_read = 1; 443 fc->async_read = 1;
436 } else 444 if (!(arg->flags & FUSE_POSIX_LOCKS))
445 fc->no_lock = 1;
446 } else {
437 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 447 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
448 fc->no_lock = 1;
449 }
438 450
439 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); 451 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
440 fc->minor = arg->minor; 452 fc->minor = arg->minor;
@@ -452,7 +464,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
452 arg->major = FUSE_KERNEL_VERSION; 464 arg->major = FUSE_KERNEL_VERSION;
453 arg->minor = FUSE_KERNEL_MINOR_VERSION; 465 arg->minor = FUSE_KERNEL_MINOR_VERSION;
454 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 466 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
455 arg->flags |= FUSE_ASYNC_READ; 467 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
456 req->in.h.opcode = FUSE_INIT; 468 req->in.h.opcode = FUSE_INIT;
457 req->in.numargs = 1; 469 req->in.numargs = 1;
458 req->in.args[0].size = sizeof(*arg); 470 req->in.args[0].size = sizeof(*arg);
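Lock support is negotiated through FUSE_INIT: the kernel now advertises FUSE_POSIX_LOCKS alongside FUSE_ASYNC_READ, and a reply that omits the bit, or that comes from a server too old to send flags at all, sets fc->no_lock so fcntl locks are handled locally. A minimal sketch of the check, with the flag values as defined in <linux/fuse.h>:

#define FUSE_ASYNC_READ  (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)

struct conn { unsigned no_lock : 1; };

static void process_init_flags(struct conn *fc, unsigned flags)
{
        if (!(flags & FUSE_POSIX_LOCKS))
                fc->no_lock = 1;        /* fall back to local locking */
}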
@@ -468,10 +480,9 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
468 request_send_background(fc, req); 480 request_send_background(fc, req);
469} 481}
470 482
471static unsigned long long conn_id(void) 483static u64 conn_id(void)
472{ 484{
473 /* BKL is held for ->get_sb() */ 485 static u64 ctr = 1;
474 static unsigned long long ctr = 1;
475 return ctr++; 486 return ctr++;
476} 487}
477 488
@@ -485,6 +496,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
485 struct fuse_req *init_req; 496 struct fuse_req *init_req;
486 int err; 497 int err;
487 498
499 if (sb->s_flags & MS_MANDLOCK)
500 return -EINVAL;
501
488 if (!parse_fuse_opt((char *) data, &d)) 502 if (!parse_fuse_opt((char *) data, &d))
489 return -EINVAL; 503 return -EINVAL;
490 504
@@ -528,25 +542,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
528 if (!init_req) 542 if (!init_req)
529 goto err_put_root; 543 goto err_put_root;
530 544
531 err = kobject_set_name(&fc->kobj, "%llu", conn_id()); 545 mutex_lock(&fuse_mutex);
532 if (err)
533 goto err_free_req;
534
535 err = kobject_add(&fc->kobj);
536 if (err)
537 goto err_free_req;
538
539 /* Setting file->private_data can't race with other mount()
540 instances, since BKL is held for ->get_sb() */
541 err = -EINVAL; 546 err = -EINVAL;
542 if (file->private_data) 547 if (file->private_data)
543 goto err_kobject_del; 548 goto err_unlock;
544 549
550 fc->id = conn_id();
551 err = fuse_ctl_add_conn(fc);
552 if (err)
553 goto err_unlock;
554
555 list_add_tail(&fc->entry, &fuse_conn_list);
545 sb->s_root = root_dentry; 556 sb->s_root = root_dentry;
546 fc->mounted = 1;
547 fc->connected = 1; 557 fc->connected = 1;
548 kobject_get(&fc->kobj); 558 file->private_data = fuse_conn_get(fc);
549 file->private_data = fc; 559 mutex_unlock(&fuse_mutex);
550 /* 560 /*
551 * atomic_dec_and_test() in fput() provides the necessary 561 * atomic_dec_and_test() in fput() provides the necessary
552 * memory barrier for file->private_data to be visible on all 562 * memory barrier for file->private_data to be visible on all
@@ -558,15 +568,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
558 568
559 return 0; 569 return 0;
560 570
561 err_kobject_del: 571 err_unlock:
562 kobject_del(&fc->kobj); 572 mutex_unlock(&fuse_mutex);
563 err_free_req:
564 fuse_request_free(init_req); 573 fuse_request_free(init_req);
565 err_put_root: 574 err_put_root:
566 dput(root_dentry); 575 dput(root_dentry);
567 err: 576 err:
568 fput(file); 577 fput(file);
569 kobject_put(&fc->kobj); 578 fuse_conn_put(fc);
570 return err; 579 return err;
571} 580}
572 581
@@ -584,68 +593,8 @@ static struct file_system_type fuse_fs_type = {
584 .kill_sb = kill_anon_super, 593 .kill_sb = kill_anon_super,
585}; 594};
586 595
587static ssize_t fuse_conn_waiting_show(struct fuse_conn *fc, char *page)
588{
589 return sprintf(page, "%i\n", atomic_read(&fc->num_waiting));
590}
591
592static ssize_t fuse_conn_abort_store(struct fuse_conn *fc, const char *page,
593 size_t count)
594{
595 fuse_abort_conn(fc);
596 return count;
597}
598
599static struct fuse_conn_attr fuse_conn_waiting =
600 __ATTR(waiting, 0400, fuse_conn_waiting_show, NULL);
601static struct fuse_conn_attr fuse_conn_abort =
602 __ATTR(abort, 0600, NULL, fuse_conn_abort_store);
603
604static struct attribute *fuse_conn_attrs[] = {
605 &fuse_conn_waiting.attr,
606 &fuse_conn_abort.attr,
607 NULL,
608};
609
610static ssize_t fuse_conn_attr_show(struct kobject *kobj,
611 struct attribute *attr,
612 char *page)
613{
614 struct fuse_conn_attr *fca =
615 container_of(attr, struct fuse_conn_attr, attr);
616
617 if (fca->show)
618 return fca->show(get_fuse_conn_kobj(kobj), page);
619 else
620 return -EACCES;
621}
622
623static ssize_t fuse_conn_attr_store(struct kobject *kobj,
624 struct attribute *attr,
625 const char *page, size_t count)
626{
627 struct fuse_conn_attr *fca =
628 container_of(attr, struct fuse_conn_attr, attr);
629
630 if (fca->store)
631 return fca->store(get_fuse_conn_kobj(kobj), page, count);
632 else
633 return -EACCES;
634}
635
636static struct sysfs_ops fuse_conn_sysfs_ops = {
637 .show = &fuse_conn_attr_show,
638 .store = &fuse_conn_attr_store,
639};
640
641static struct kobj_type ktype_fuse_conn = {
642 .release = fuse_conn_release,
643 .sysfs_ops = &fuse_conn_sysfs_ops,
644 .default_attrs = fuse_conn_attrs,
645};
646
647static decl_subsys(fuse, NULL, NULL); 596static decl_subsys(fuse, NULL, NULL);
648static decl_subsys(connections, &ktype_fuse_conn, NULL); 597static decl_subsys(connections, NULL, NULL);
649 598
650static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep, 599static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
651 unsigned long flags) 600 unsigned long flags)
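The per-connection sysfs attributes (waiting, abort) are deleted here; per the diffstat they reappear as files in the new control filesystem implemented in fs/fuse/control.c, created per connection by fuse_ctl_add_conn() (up to FUSE_CTL_NUM_DENTRIES entries each). Assuming the conventional fusectl mount point, the abort knob can be driven like this (the path and connection id are examples; ids come from conn_id() above):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/sys/fs/fuse/connections/1/abort", O_WRONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "1", 1) != 1)     /* any write triggers the abort */
                perror("write");
        close(fd);
        return 0;
}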
@@ -719,6 +668,7 @@ static int __init fuse_init(void)
719 printk("fuse init (API version %i.%i)\n", 668 printk("fuse init (API version %i.%i)\n",
720 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); 669 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
721 670
671 INIT_LIST_HEAD(&fuse_conn_list);
722 res = fuse_fs_init(); 672 res = fuse_fs_init();
723 if (res) 673 if (res)
724 goto err; 674 goto err;
@@ -731,8 +681,14 @@ static int __init fuse_init(void)
731 if (res) 681 if (res)
732 goto err_dev_cleanup; 682 goto err_dev_cleanup;
733 683
684 res = fuse_ctl_init();
685 if (res)
686 goto err_sysfs_cleanup;
687
734 return 0; 688 return 0;
735 689
690 err_sysfs_cleanup:
691 fuse_sysfs_cleanup();
736 err_dev_cleanup: 692 err_dev_cleanup:
737 fuse_dev_cleanup(); 693 fuse_dev_cleanup();
738 err_fs_cleanup: 694 err_fs_cleanup:
@@ -745,6 +701,7 @@ static void __exit fuse_exit(void)
745{ 701{
746 printk(KERN_DEBUG "fuse exit\n"); 702 printk(KERN_DEBUG "fuse exit\n");
747 703
704 fuse_ctl_cleanup();
748 fuse_sysfs_cleanup(); 705 fuse_sysfs_cleanup();
749 fuse_fs_cleanup(); 706 fuse_fs_cleanup();
750 fuse_dev_cleanup(); 707 fuse_dev_cleanup();
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7f96b5cb6781..8c9b28dff119 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -34,6 +34,7 @@
34#include <linux/suspend.h> 34#include <linux/suspend.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/poison.h>
37#include <linux/proc_fs.h> 38#include <linux/proc_fs.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh)
1675{ 1676{
1676#ifdef CONFIG_JBD_DEBUG 1677#ifdef CONFIG_JBD_DEBUG
1677 atomic_dec(&nr_journal_heads); 1678 atomic_dec(&nr_journal_heads);
1678 memset(jh, 0x5b, sizeof(*jh)); 1679 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1679#endif 1680#endif
1680 kmem_cache_free(journal_head_cache, jh); 1681 kmem_cache_free(journal_head_cache, jh);
1681} 1682}
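Replacing the bare 0x5b with JBD_POISON_FREE changes nothing at runtime; the new <linux/poison.h> collects such magic fill bytes so a value seen in a crash dump can be traced back to its owner. The definition is simply:

/* From <linux/poison.h>; the byte value matches the old literal. */
#define JBD_POISON_FREE 0x5b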
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 80d7f53fd0a7..de5bafb4e853 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -531,6 +531,7 @@ static int do_one_pass(journal_t *journal,
531 default: 531 default:
532 jbd_debug(3, "Unrecognised magic %d, end of scan.\n", 532 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
533 blocktype); 533 blocktype);
534 brelse(bh);
534 goto done; 535 goto done;
535 } 536 }
536 } 537 }
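The one-line recovery.c fix plugs a buffer_head leak: the scan loop takes a reference for every block it reads, and the "unrecognised magic" exit jumped to done without dropping it. A self-contained model of the reference discipline (made-up types; the kernel's brelse() decrements bh->b_count and may free the buffer):

#include <stdio.h>

struct buffer_head { int b_count; };

static void brelse(struct buffer_head *bh)
{
        bh->b_count--;          /* models dropping the reference */
}

int main(void)
{
        struct buffer_head bh = { .b_count = 1 };  /* ref from reading */
        int blocktype = -1;                        /* unrecognised magic */

        switch (blocktype) {
        case 1:                 /* recognised: handled, then released */
                brelse(&bh);
                break;
        default:
                brelse(&bh);    /* the fix: drop the ref before bailing */
                goto done;
        }
done:
        printf("b_count = %d (0 means no leak)\n", bh.b_count);
        return bh.b_count != 0;
}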
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 1862e8bc101d..b8886f048eaa 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
53 if (!instr) { 53 if (!instr) {
54 printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); 54 printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
55 spin_lock(&c->erase_completion_lock); 55 spin_lock(&c->erase_completion_lock);
56 list_del(&jeb->list); 56 list_move(&jeb->list, &c->erase_pending_list);
57 list_add(&jeb->list, &c->erase_pending_list);
58 c->erasing_size -= c->sector_size; 57 c->erasing_size -= c->sector_size;
59 c->dirty_size += c->sector_size; 58 c->dirty_size += c->sector_size;
60 jeb->dirty_size = c->sector_size; 59 jeb->dirty_size = c->sector_size;
@@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
86 /* Erase failed immediately. Refile it on the list */ 85 /* Erase failed immediately. Refile it on the list */
87 D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); 86 D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret));
88 spin_lock(&c->erase_completion_lock); 87 spin_lock(&c->erase_completion_lock);
89 list_del(&jeb->list); 88 list_move(&jeb->list, &c->erase_pending_list);
90 list_add(&jeb->list, &c->erase_pending_list);
91 c->erasing_size -= c->sector_size; 89 c->erasing_size -= c->sector_size;
92 c->dirty_size += c->sector_size; 90 c->dirty_size += c->sector_size;
93 jeb->dirty_size = c->sector_size; 91 jeb->dirty_size = c->sector_size;
@@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
161{ 159{
162 D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); 160 D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset));
163 spin_lock(&c->erase_completion_lock); 161 spin_lock(&c->erase_completion_lock);
164 list_del(&jeb->list); 162 list_move_tail(&jeb->list, &c->erase_complete_list);
165 list_add_tail(&jeb->list, &c->erase_complete_list);
166 spin_unlock(&c->erase_completion_lock); 163 spin_unlock(&c->erase_completion_lock);
167 /* Ensure that kupdated calls us again to mark them clean */ 164 /* Ensure that kupdated calls us again to mark them clean */
168 jffs2_erase_pending_trigger(c); 165 jffs2_erase_pending_trigger(c);
@@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
178 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { 175 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
179 /* We'd like to give this block another try. */ 176 /* We'd like to give this block another try. */
180 spin_lock(&c->erase_completion_lock); 177 spin_lock(&c->erase_completion_lock);
181 list_del(&jeb->list); 178 list_move(&jeb->list, &c->erase_pending_list);
182 list_add(&jeb->list, &c->erase_pending_list);
183 c->erasing_size -= c->sector_size; 179 c->erasing_size -= c->sector_size;
184 c->dirty_size += c->sector_size; 180 c->dirty_size += c->sector_size;
185 jeb->dirty_size = c->sector_size; 181 jeb->dirty_size = c->sector_size;
@@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
191 spin_lock(&c->erase_completion_lock); 187 spin_lock(&c->erase_completion_lock);
192 c->erasing_size -= c->sector_size; 188 c->erasing_size -= c->sector_size;
193 c->bad_size += c->sector_size; 189 c->bad_size += c->sector_size;
194 list_del(&jeb->list); 190 list_move(&jeb->list, &c->bad_list);
195 list_add(&jeb->list, &c->bad_list);
196 c->nr_erasing_blocks--; 191 c->nr_erasing_blocks--;
197 spin_unlock(&c->erase_completion_lock); 192 spin_unlock(&c->erase_completion_lock);
198 wake_up(&c->erase_wait); 193 wake_up(&c->erase_wait);
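The jffs2 hunks in this file (and the nodemgmt.c, wbuf.c and libfs.c ones below) are a mechanical conversion from the list_del()+list_add()/list_add_tail() pairs to the list_move()/list_move_tail() helpers, which splice a node from one list onto another in a single call. Their list.h implementation is essentially this (reproduced standalone with a small self-test):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void __list_del(struct list_head *prev, struct list_head *next)
{
        next->prev = prev;
        prev->next = next;
}

static void list_add(struct list_head *entry, struct list_head *head)
{
        entry->prev = head;
        entry->next = head->next;
        head->next->prev = entry;
        head->next = entry;
}

/* Unlink the node from wherever it is and re-add it on another list:
   one call instead of the list_del()+list_add() pair. */
static void list_move(struct list_head *entry, struct list_head *head)
{
        __list_del(entry->prev, entry->next);
        list_add(entry, head);
}

int main(void)
{
        struct list_head a = { &a, &a }, b = { &b, &b }, item;

        list_add(&item, &a);
        list_move(&item, &b);
        printf("moved: %s\n",
               (b.next == &item && a.next == &a) ? "yes" : "no");
        return 0;
}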
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8bedfd2ff689..ac0c350ed7d7 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
211 struct jffs2_eraseblock *ejeb; 211 struct jffs2_eraseblock *ejeb;
212 212
213 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 213 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
214 list_del(&ejeb->list); 214 list_move_tail(&ejeb->list, &c->erase_pending_list);
215 list_add_tail(&ejeb->list, &c->erase_pending_list);
216 c->nr_erasing_blocks++; 215 c->nr_erasing_blocks++;
217 jffs2_erase_pending_trigger(c); 216 jffs2_erase_pending_trigger(c);
218 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 217 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d1..be1acc3dad97 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
43 return -ENOMEM; 43 return -ENOMEM;
44 } 44 }
45 45
46 dbg_summary("returned succesfully\n"); 46 dbg_summary("returned successfully\n");
47 47
48 return 0; 48 return 0;
49} 49}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a7f153f79ecb..b9b700730dfe 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
495 /* Fix up the original jeb now it's on the bad_list */ 495 /* Fix up the original jeb now it's on the bad_list */
496 if (first_raw == jeb->first_node) { 496 if (first_raw == jeb->first_node) {
497 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 497 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
498 list_del(&jeb->list); 498 list_move(&jeb->list, &c->erase_pending_list);
499 list_add(&jeb->list, &c->erase_pending_list);
500 c->nr_erasing_blocks++; 499 c->nr_erasing_blocks++;
501 jffs2_erase_pending_trigger(c); 500 jffs2_erase_pending_trigger(c);
502 } 501 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5549378358bf..4d52593a5fc6 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
126 126
127 /* allocate the disk blocks for the extent. initially, extBalloc() 127 /* allocate the disk blocks for the extent. initially, extBalloc()
128 * will try to allocate disk blocks for the requested size (xlen). 128 * will try to allocate disk blocks for the requested size (xlen).
129 * if this fails (xlen contigious free blocks not avaliable), it'll 129 * if this fails (xlen contiguous free blocks not available), it'll
130 * try to allocate a smaller number of blocks (producing a smaller 130 * try to allocate a smaller number of blocks (producing a smaller
131 * extent), with this smaller number of blocks consisting of the 131 * extent), with this smaller number of blocks consisting of the
132 * requested number of blocks rounded down to the next smaller 132 * requested number of blocks rounded down to the next smaller
@@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp)
493 * 493 *
494 * initially, we will try to allocate disk blocks for the 494 * initially, we will try to allocate disk blocks for the
495 * requested size (nblocks). if this fails (nblocks 495 * requested size (nblocks). if this fails (nblocks
496 * contigious free blocks not avaliable), we'll try to allocate 496 * contiguous free blocks not available), we'll try to allocate
497 * a smaller number of blocks (producing a smaller extent), with 497 * a smaller number of blocks (producing a smaller extent), with
498 * this smaller number of blocks consisting of the requested 498 * this smaller number of blocks consisting of the requested
499 * number of blocks rounded down to the next smaller power of 2 499 * number of blocks rounded down to the next smaller power of 2
@@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
529 529
530 /* get the number of blocks to initially attempt to allocate. 530 /* get the number of blocks to initially attempt to allocate.
531 * we'll first try the number of blocks requested unless this 531 * we'll first try the number of blocks requested unless this
532 * number is greater than the maximum number of contigious free 532 * number is greater than the maximum number of contiguous free
533 * blocks in the map. in that case, we'll start off with the 533 * blocks in the map. in that case, we'll start off with the
534 * maximum free. 534 * maximum free.
535 */ 535 */
@@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
586 * in place. if this fails, we'll try to move the extent 586 * in place. if this fails, we'll try to move the extent
587 * to a new set of blocks. if moving the extent, we initially 587 * to a new set of blocks. if moving the extent, we initially
588 * will try to allocate disk blocks for the requested size 588 * will try to allocate disk blocks for the requested size
589 * (nnew). if this fails (nnew contigious free blocks not 589 * (nnew). if this fails (new contiguous free blocks not
590 * avaliable), we'll try to allocate a smaller number of 590 * avaliable), we'll try to allocate a smaller number of
591 * blocks (producing a smaller extent), with this smaller 591 * blocks (producing a smaller extent), with this smaller
592 * number of blocks consisting of the requested number of 592 * number of blocks consisting of the requested number of
diff --git a/fs/libfs.c b/fs/libfs.c
index 1b1156381787..ac02ea602c3d 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
149 /* fallthrough */ 149 /* fallthrough */
150 default: 150 default:
151 spin_lock(&dcache_lock); 151 spin_lock(&dcache_lock);
152 if (filp->f_pos == 2) { 152 if (filp->f_pos == 2)
153 list_del(q); 153 list_move(q, &dentry->d_subdirs);
154 list_add(q, &dentry->d_subdirs); 154
155 }
156 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 155 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
157 struct dentry *next; 156 struct dentry *next;
158 next = list_entry(p, struct dentry, d_u.d_child); 157 next = list_entry(p, struct dentry, d_u.d_child);
@@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
164 return 0; 163 return 0;
165 spin_lock(&dcache_lock); 164 spin_lock(&dcache_lock);
166 /* next is still alive */ 165 /* next is still alive */
167 list_del(q); 166 list_move(q, p);
168 list_add(q, p);
169 p = q; 167 p = q;
170 filp->f_pos++; 168 filp->f_pos++;
171 } 169 }
@@ -424,13 +422,13 @@ out:
424 422
425static DEFINE_SPINLOCK(pin_fs_lock); 423static DEFINE_SPINLOCK(pin_fs_lock);
426 424
427int simple_pin_fs(char *name, struct vfsmount **mount, int *count) 425int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
428{ 426{
429 struct vfsmount *mnt = NULL; 427 struct vfsmount *mnt = NULL;
430 spin_lock(&pin_fs_lock); 428 spin_lock(&pin_fs_lock);
431 if (unlikely(!*mount)) { 429 if (unlikely(!*mount)) {
432 spin_unlock(&pin_fs_lock); 430 spin_unlock(&pin_fs_lock);
433 mnt = do_kern_mount(name, 0, name, NULL); 431 mnt = vfs_kern_mount(type, 0, type->name, NULL);
434 if (IS_ERR(mnt)) 432 if (IS_ERR(mnt))
435 return PTR_ERR(mnt); 433 return PTR_ERR(mnt);
436 spin_lock(&pin_fs_lock); 434 spin_lock(&pin_fs_lock);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index bce744468708..52774feab93f 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
147 * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number, 147 * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
148 * that we mark locks for reclaiming, and that we bump the pseudo NSM state. 148 * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
149 */ 149 */
150static inline 150static void nlmclnt_prepare_reclaim(struct nlm_host *host)
151void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
152{ 151{
152 down_write(&host->h_rwsem);
153 host->h_monitored = 0; 153 host->h_monitored = 0;
154 host->h_nsmstate = newstate;
155 host->h_state++; 154 host->h_state++;
156 host->h_nextrebind = 0; 155 host->h_nextrebind = 0;
157 nlm_rebind_host(host); 156 nlm_rebind_host(host);
@@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
164 dprintk("NLM: reclaiming locks for host %s", host->h_name); 163 dprintk("NLM: reclaiming locks for host %s", host->h_name);
165} 164}
166 165
166static void nlmclnt_finish_reclaim(struct nlm_host *host)
167{
168 host->h_reclaiming = 0;
169 up_write(&host->h_rwsem);
170 dprintk("NLM: done reclaiming locks for host %s", host->h_name);
171}
172
167/* 173/*
168 * Reclaim all locks on server host. We do this by spawning a separate 174 * Reclaim all locks on server host. We do this by spawning a separate
169 * reclaimer thread. 175 * reclaimer thread.
@@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
171void 177void
172nlmclnt_recovery(struct nlm_host *host, u32 newstate) 178nlmclnt_recovery(struct nlm_host *host, u32 newstate)
173{ 179{
174 if (host->h_reclaiming++) { 180 if (host->h_nsmstate == newstate)
175 if (host->h_nsmstate == newstate) 181 return;
176 return; 182 host->h_nsmstate = newstate;
177 nlmclnt_prepare_reclaim(host, newstate); 183 if (!host->h_reclaiming++) {
178 } else {
179 nlmclnt_prepare_reclaim(host, newstate);
180 nlm_get_host(host); 184 nlm_get_host(host);
181 __module_get(THIS_MODULE); 185 __module_get(THIS_MODULE);
182 if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0) 186 if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
@@ -190,6 +194,7 @@ reclaimer(void *ptr)
190 struct nlm_host *host = (struct nlm_host *) ptr; 194 struct nlm_host *host = (struct nlm_host *) ptr;
191 struct nlm_wait *block; 195 struct nlm_wait *block;
192 struct file_lock *fl, *next; 196 struct file_lock *fl, *next;
197 u32 nsmstate;
193 198
194 daemonize("%s-reclaim", host->h_name); 199 daemonize("%s-reclaim", host->h_name);
195 allow_signal(SIGKILL); 200 allow_signal(SIGKILL);
@@ -199,19 +204,25 @@ reclaimer(void *ptr)
199 lock_kernel(); 204 lock_kernel();
200 lockd_up(); 205 lockd_up();
201 206
207 nlmclnt_prepare_reclaim(host);
202 /* First, reclaim all locks that have been marked. */ 208 /* First, reclaim all locks that have been marked. */
203restart: 209restart:
210 nsmstate = host->h_nsmstate;
204 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { 211 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
205 list_del_init(&fl->fl_u.nfs_fl.list); 212 list_del_init(&fl->fl_u.nfs_fl.list);
206 213
207 if (signalled()) 214 if (signalled())
208 continue; 215 continue;
209 if (nlmclnt_reclaim(host, fl) == 0) 216 if (nlmclnt_reclaim(host, fl) != 0)
210 list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); 217 continue;
211 goto restart; 218 list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
219 if (host->h_nsmstate != nsmstate) {
220 /* Argh! The server rebooted again! */
221 list_splice_init(&host->h_granted, &host->h_reclaim);
222 goto restart;
223 }
212 } 224 }
213 225 nlmclnt_finish_reclaim(host);
214 host->h_reclaiming = 0;
215 226
216 /* Now, wake up all processes that sleep on a blocked lock */ 227 /* Now, wake up all processes that sleep on a blocked lock */
217 list_for_each_entry(block, &nlm_blocked, b_list) { 228 list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e38155b5c..4db62098d3f4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -508,7 +508,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
508 } 508 }
509 509
510 block = nlmclnt_prepare_block(host, fl); 510 block = nlmclnt_prepare_block(host, fl);
511again:
511 for(;;) { 512 for(;;) {
513 /* Reboot protection */
514 fl->fl_u.nfs_fl.state = host->h_state;
512 status = nlmclnt_call(req, NLMPROC_LOCK); 515 status = nlmclnt_call(req, NLMPROC_LOCK);
513 if (status < 0) 516 if (status < 0)
514 goto out_unblock; 517 goto out_unblock;
@@ -531,10 +534,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
531 } 534 }
532 535
533 if (resp->status == NLM_LCK_GRANTED) { 536 if (resp->status == NLM_LCK_GRANTED) {
534 fl->fl_u.nfs_fl.state = host->h_state; 537 down_read(&host->h_rwsem);
538 /* Check whether or not the server has rebooted */
539 if (fl->fl_u.nfs_fl.state != host->h_state) {
540 up_read(&host->h_rwsem);
541 goto again;
542 }
535 fl->fl_flags |= FL_SLEEP; 543 fl->fl_flags |= FL_SLEEP;
536 /* Ensure the resulting lock will get added to granted list */ 544 /* Ensure the resulting lock will get added to granted list */
537 do_vfs_lock(fl); 545 do_vfs_lock(fl);
546 up_read(&host->h_rwsem);
538 } 547 }
539 status = nlm_stat_to_errno(resp->status); 548 status = nlm_stat_to_errno(resp->status);
540out_unblock: 549out_unblock:
@@ -596,6 +605,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
596static int 605static int
597nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) 606nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
598{ 607{
608 struct nlm_host *host = req->a_host;
599 struct nlm_res *resp = &req->a_res; 609 struct nlm_res *resp = &req->a_res;
600 int status; 610 int status;
601 611
@@ -604,7 +614,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
604 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either 614 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
605 * case, we want to unlock. 615 * case, we want to unlock.
606 */ 616 */
617 down_read(&host->h_rwsem);
607 do_vfs_lock(fl); 618 do_vfs_lock(fl);
619 up_read(&host->h_rwsem);
608 620
609 if (req->a_flags & RPC_TASK_ASYNC) 621 if (req->a_flags & RPC_TASK_ASYNC)
610 return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); 622 return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 729ac427d359..38b0e8a1aec0 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,11 +112,12 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
112 host->h_version = version; 112 host->h_version = version;
113 host->h_proto = proto; 113 host->h_proto = proto;
114 host->h_rpcclnt = NULL; 114 host->h_rpcclnt = NULL;
115 init_MUTEX(&host->h_sema); 115 mutex_init(&host->h_mutex);
116 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 116 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
117 host->h_expires = jiffies + NLM_HOST_EXPIRE; 117 host->h_expires = jiffies + NLM_HOST_EXPIRE;
118 atomic_set(&host->h_count, 1); 118 atomic_set(&host->h_count, 1);
119 init_waitqueue_head(&host->h_gracewait); 119 init_waitqueue_head(&host->h_gracewait);
120 init_rwsem(&host->h_rwsem);
120 host->h_state = 0; /* pseudo NSM state */ 121 host->h_state = 0; /* pseudo NSM state */
121 host->h_nsmstate = 0; /* real NSM state */ 122 host->h_nsmstate = 0; /* real NSM state */
122 host->h_server = server; 123 host->h_server = server;
@@ -172,7 +173,7 @@ nlm_bind_host(struct nlm_host *host)
172 (unsigned)ntohl(host->h_addr.sin_addr.s_addr)); 173 (unsigned)ntohl(host->h_addr.sin_addr.s_addr));
173 174
174 /* Lock host handle */ 175 /* Lock host handle */
175 down(&host->h_sema); 176 mutex_lock(&host->h_mutex);
176 177
177 /* If we've already created an RPC client, check whether 178 /* If we've already created an RPC client, check whether
178 * RPC rebind is required 179 * RPC rebind is required
@@ -204,12 +205,12 @@ nlm_bind_host(struct nlm_host *host)
204 host->h_rpcclnt = clnt; 205 host->h_rpcclnt = clnt;
205 } 206 }
206 207
207 up(&host->h_sema); 208 mutex_unlock(&host->h_mutex);
208 return clnt; 209 return clnt;
209 210
210forgetit: 211forgetit:
211 printk("lockd: couldn't create RPC handle for %s\n", host->h_name); 212 printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
212 up(&host->h_sema); 213 mutex_unlock(&host->h_mutex);
213 return NULL; 214 return NULL;
214} 215}
215 216
diff --git a/fs/namei.c b/fs/namei.c
index bb4a3e40e432..c784e8bb57a3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2243,14 +2243,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2243 int error; 2243 int error;
2244 char * to; 2244 char * to;
2245 2245
2246 if (flags != 0) 2246 if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2247 return -EINVAL; 2247 return -EINVAL;
2248 2248
2249 to = getname(newname); 2249 to = getname(newname);
2250 if (IS_ERR(to)) 2250 if (IS_ERR(to))
2251 return PTR_ERR(to); 2251 return PTR_ERR(to);
2252 2252
2253 error = __user_walk_fd(olddfd, oldname, 0, &old_nd); 2253 error = __user_walk_fd(olddfd, oldname,
2254 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2255 &old_nd);
2254 if (error) 2256 if (error)
2255 goto exit; 2257 goto exit;
2256 error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); 2258 error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
diff --git a/fs/namespace.c b/fs/namespace.c
index c13072a5f1ee..b3ed212ea416 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
526{ 526{
527 struct vfsmount *p; 527 struct vfsmount *p;
528 528
529 for (p = mnt; p; p = next_mnt(p, mnt)) { 529 for (p = mnt; p; p = next_mnt(p, mnt))
530 list_del(&p->mnt_hash); 530 list_move(&p->mnt_hash, kill);
531 list_add(&p->mnt_hash, kill);
532 }
533 531
534 if (propagate) 532 if (propagate)
535 propagate_umount(kill); 533 propagate_umount(kill);
@@ -585,8 +583,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
585 */ 583 */
586 584
587 lock_kernel(); 585 lock_kernel();
588 if ((flags & MNT_FORCE) && sb->s_op->umount_begin) 586 if (sb->s_op->umount_begin)
589 sb->s_op->umount_begin(sb); 587 sb->s_op->umount_begin(mnt, flags);
590 unlock_kernel(); 588 unlock_kernel();
591 589
592 /* 590 /*
@@ -1172,13 +1170,46 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
1172} 1170}
1173 1171
1174/* 1172/*
1173 * go through the vfsmounts we've just consigned to the graveyard to
1174 * - check that they're still dead
1175 * - delete the vfsmount from the appropriate namespace under lock
1176 * - dispose of the corpse
1177 */
1178static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
1179{
1180 struct namespace *namespace;
1181 struct vfsmount *mnt;
1182
1183 while (!list_empty(graveyard)) {
1184 LIST_HEAD(umounts);
1185 mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
1186 list_del_init(&mnt->mnt_expire);
1187
1188 /* don't do anything if the namespace is dead - all the
1189 * vfsmounts from it are going away anyway */
1190 namespace = mnt->mnt_namespace;
1191 if (!namespace || !namespace->root)
1192 continue;
1193 get_namespace(namespace);
1194
1195 spin_unlock(&vfsmount_lock);
1196 down_write(&namespace_sem);
1197 expire_mount(mnt, mounts, &umounts);
1198 up_write(&namespace_sem);
1199 release_mounts(&umounts);
1200 mntput(mnt);
1201 put_namespace(namespace);
1202 spin_lock(&vfsmount_lock);
1203 }
1204}
1205
1206/*
1175 * process a list of expirable mountpoints with the intent of discarding any 1207 * process a list of expirable mountpoints with the intent of discarding any
1176 * mountpoints that aren't in use and haven't been touched since last we came 1208 * mountpoints that aren't in use and haven't been touched since last we came
1177 * here 1209 * here
1178 */ 1210 */
1179void mark_mounts_for_expiry(struct list_head *mounts) 1211void mark_mounts_for_expiry(struct list_head *mounts)
1180{ 1212{
1181 struct namespace *namespace;
1182 struct vfsmount *mnt, *next; 1213 struct vfsmount *mnt, *next;
1183 LIST_HEAD(graveyard); 1214 LIST_HEAD(graveyard);
1184 1215
@@ -1202,38 +1233,79 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1202 list_move(&mnt->mnt_expire, &graveyard); 1233 list_move(&mnt->mnt_expire, &graveyard);
1203 } 1234 }
1204 1235
1205 /* 1236 expire_mount_list(&graveyard, mounts);
1206 * go through the vfsmounts we've just consigned to the graveyard to
1207 * - check that they're still dead
1208 * - delete the vfsmount from the appropriate namespace under lock
1209 * - dispose of the corpse
1210 */
1211 while (!list_empty(&graveyard)) {
1212 LIST_HEAD(umounts);
1213 mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
1214 list_del_init(&mnt->mnt_expire);
1215 1237
1216 /* don't do anything if the namespace is dead - all the 1238 spin_unlock(&vfsmount_lock);
1217 * vfsmounts from it are going away anyway */ 1239}
1218 namespace = mnt->mnt_namespace; 1240
1219 if (!namespace || !namespace->root) 1241EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
1242
1243/*
1244 * Ripoff of 'select_parent()'
1245 *
1246 * search the list of submounts for a given mountpoint, and move any
1247 * shrinkable submounts to the 'graveyard' list.
1248 */
1249static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
1250{
1251 struct vfsmount *this_parent = parent;
1252 struct list_head *next;
1253 int found = 0;
1254
1255repeat:
1256 next = this_parent->mnt_mounts.next;
1257resume:
1258 while (next != &this_parent->mnt_mounts) {
1259 struct list_head *tmp = next;
1260 struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
1261
1262 next = tmp->next;
1263 if (!(mnt->mnt_flags & MNT_SHRINKABLE))
1220 continue; 1264 continue;
1221 get_namespace(namespace); 1265 /*
1266 * Descend a level if the d_mounts list is non-empty.
1267 */
1268 if (!list_empty(&mnt->mnt_mounts)) {
1269 this_parent = mnt;
1270 goto repeat;
1271 }
1222 1272
1223 spin_unlock(&vfsmount_lock); 1273 if (!propagate_mount_busy(mnt, 1)) {
1224 down_write(&namespace_sem); 1274 mntget(mnt);
1225 expire_mount(mnt, mounts, &umounts); 1275 list_move_tail(&mnt->mnt_expire, graveyard);
1226 up_write(&namespace_sem); 1276 found++;
1227 release_mounts(&umounts); 1277 }
1228 mntput(mnt); 1278 }
1229 put_namespace(namespace); 1279 /*
1230 spin_lock(&vfsmount_lock); 1280 * All done at this level ... ascend and resume the search
1281 */
1282 if (this_parent != parent) {
1283 next = this_parent->mnt_child.next;
1284 this_parent = this_parent->mnt_parent;
1285 goto resume;
1231 } 1286 }
1287 return found;
1288}
1289
1290/*
1291 * process a list of expirable mountpoints with the intent of discarding any
1292 * submounts of a specific parent mountpoint
1293 */
1294void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
1295{
1296 LIST_HEAD(graveyard);
1297 int found;
1298
1299 spin_lock(&vfsmount_lock);
1300
1301 /* extract submounts of 'mountpoint' from the expiration list */
1302 while ((found = select_submounts(mountpoint, &graveyard)) != 0)
1303 expire_mount_list(&graveyard, mounts);
1232 1304
1233 spin_unlock(&vfsmount_lock); 1305 spin_unlock(&vfsmount_lock);
1234} 1306}
1235 1307
1236EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 1308EXPORT_SYMBOL_GPL(shrink_submounts);
1237 1309
1238/* 1310/*
1239 * Some copy_from_user() implementations do not return the exact number of 1311 * Some copy_from_user() implementations do not return the exact number of
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ec61fd56a1a9..0b572a0c1967 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,14 +4,16 @@
4 4
5obj-$(CONFIG_NFS_FS) += nfs.o 5obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := dir.o file.o inode.o nfs2xdr.o pagelist.o \ 7nfs-y := dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \
8 proc.o read.o symlink.o unlink.o write.o 8 proc.o read.o symlink.o unlink.o write.o \
9 namespace.o
9nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o
10nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o 11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
11nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
12nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ 13nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
13 delegation.o idmap.o \ 14 delegation.o idmap.o \
14 callback.o callback_xdr.o callback_proc.o 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o
15nfs-$(CONFIG_NFS_DIRECTIO) += direct.o 17nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
16nfs-$(CONFIG_SYSCTL) += sysctl.o 18nfs-$(CONFIG_SYSCTL) += sysctl.o
17nfs-objs := $(nfs-y) 19nfs-objs := $(nfs-y)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 90c95adc8c1b..d53f8c6a9ecb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -182,8 +182,6 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
182/* 182/*
183 * Define NFS4 callback program 183 * Define NFS4 callback program
184 */ 184 */
185extern struct svc_version nfs4_callback_version1;
186
187static struct svc_version *nfs4_callback_version[] = { 185static struct svc_version *nfs4_callback_version[] = {
188 [1] = &nfs4_callback_version1, 186 [1] = &nfs4_callback_version1,
189}; 187};
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05c38cf40b69..c92991328d9a 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -202,7 +202,7 @@ static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xd
202 status = decode_fh(xdr, &args->fh); 202 status = decode_fh(xdr, &args->fh);
203out: 203out:
204 dprintk("%s: exit with status = %d\n", __FUNCTION__, status); 204 dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
205 return 0; 205 return status;
206} 206}
207 207
208static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 208static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cae74dd4c7f5..3ddda6f7ecc2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
528 528
529 lock_kernel(); 529 lock_kernel();
530 530
531 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 531 res = nfs_revalidate_mapping(inode, filp->f_mapping);
532 if (res < 0) { 532 if (res < 0) {
533 unlock_kernel(); 533 unlock_kernel();
534 return res; 534 return res;
@@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
868 return (nd->intent.open.flags & O_EXCL) != 0; 868 return (nd->intent.open.flags & O_EXCL) != 0;
869} 869}
870 870
871static inline int nfs_reval_fsid(struct inode *dir,
872 struct nfs_fh *fh, struct nfs_fattr *fattr)
873{
874 struct nfs_server *server = NFS_SERVER(dir);
875
876 if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
877 /* Revalidate fsid on root dir */
878 return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
879 return 0;
880}
881
871static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 882static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
872{ 883{
873 struct dentry *res; 884 struct dentry *res;
@@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
900 res = ERR_PTR(error); 911 res = ERR_PTR(error);
901 goto out_unlock; 912 goto out_unlock;
902 } 913 }
914 error = nfs_reval_fsid(dir, &fhandle, &fattr);
915 if (error < 0) {
916 res = ERR_PTR(error);
917 goto out_unlock;
918 }
903 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 919 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
904 res = (struct dentry *)inode; 920 res = (struct dentry *)inode;
905 if (IS_ERR(res)) 921 if (IS_ERR(res))
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c72b0c07283..8ca9707be6c9 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -892,7 +892,7 @@ out:
892 * nfs_init_directcache - create a slab cache for nfs_direct_req structures 892 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
893 * 893 *
894 */ 894 */
895int nfs_init_directcache(void) 895int __init nfs_init_directcache(void)
896{ 896{
897 nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", 897 nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
898 sizeof(struct nfs_direct_req), 898 sizeof(struct nfs_direct_req),
@@ -906,7 +906,7 @@ int nfs_init_directcache(void)
906} 906}
907 907
908/** 908/**
909 * nfs_init_directcache - destroy the slab cache for nfs_direct_req structures 909 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
910 * 910 *
911 */ 911 */
912void nfs_destroy_directcache(void) 912void nfs_destroy_directcache(void)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fa05c027ea11..add289138836 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -127,23 +127,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
127} 127}
128 128
129/** 129/**
130 * nfs_revalidate_file - Revalidate the page cache & related metadata
131 * @inode - pointer to inode struct
132 * @file - pointer to file
133 */
134static int nfs_revalidate_file(struct inode *inode, struct file *filp)
135{
136 struct nfs_inode *nfsi = NFS_I(inode);
137 int retval = 0;
138
139 if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
140 || nfs_attribute_timeout(inode))
141 retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
142 nfs_revalidate_mapping(inode, filp->f_mapping);
143 return 0;
144}
145
146/**
147 * nfs_revalidate_size - Revalidate the file size 130 * nfs_revalidate_size - Revalidate the file size
148 * @inode - pointer to inode struct 131 * @inode - pointer to inode struct
149 * @file - pointer to struct file 132 * @file - pointer to struct file
@@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
228 dentry->d_parent->d_name.name, dentry->d_name.name, 211 dentry->d_parent->d_name.name, dentry->d_name.name,
229 (unsigned long) count, (unsigned long) pos); 212 (unsigned long) count, (unsigned long) pos);
230 213
231 result = nfs_revalidate_file(inode, iocb->ki_filp); 214 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
232 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); 215 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
233 if (!result) 216 if (!result)
234 result = generic_file_aio_read(iocb, buf, count, pos); 217 result = generic_file_aio_read(iocb, buf, count, pos);
@@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
247 dentry->d_parent->d_name.name, dentry->d_name.name, 230 dentry->d_parent->d_name.name, dentry->d_name.name,
248 (unsigned long) count, (unsigned long long) *ppos); 231 (unsigned long) count, (unsigned long long) *ppos);
249 232
250 res = nfs_revalidate_file(inode, filp); 233 res = nfs_revalidate_mapping(inode, filp->f_mapping);
251 if (!res) 234 if (!res)
252 res = generic_file_sendfile(filp, ppos, count, actor, target); 235 res = generic_file_sendfile(filp, ppos, count, actor, target);
253 return res; 236 return res;
@@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
263 dfprintk(VFS, "nfs: mmap(%s/%s)\n", 246 dfprintk(VFS, "nfs: mmap(%s/%s)\n",
264 dentry->d_parent->d_name.name, dentry->d_name.name); 247 dentry->d_parent->d_name.name, dentry->d_name.name);
265 248
266 status = nfs_revalidate_file(inode, file); 249 status = nfs_revalidate_mapping(inode, file->f_mapping);
267 if (!status) 250 if (!status)
268 status = generic_file_mmap(file, vma); 251 status = generic_file_mmap(file, vma);
269 return status; 252 return status;
@@ -320,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
320 303
321static void nfs_invalidate_page(struct page *page, unsigned long offset) 304static void nfs_invalidate_page(struct page *page, unsigned long offset)
322{ 305{
323 /* FIXME: we really should cancel any unstarted writes on this page */ 306 struct inode *inode = page->mapping->host;
307
308 /* Cancel any unstarted writes on this page */
309 if (offset == 0)
310 nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
324} 311}
325 312
326static int nfs_release_page(struct page *page, gfp_t gfp) 313static int nfs_release_page(struct page *page, gfp_t gfp)
@@ -373,7 +360,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
373 if (result) 360 if (result)
374 goto out; 361 goto out;
375 } 362 }
376 nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
377 363
378 result = count; 364 result = count;
379 if (!count) 365 if (!count)
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3fab5b0cfc5a..b81e7ed3c902 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -47,7 +47,6 @@
47#include <linux/workqueue.h> 47#include <linux/workqueue.h>
48#include <linux/sunrpc/rpc_pipe_fs.h> 48#include <linux/sunrpc/rpc_pipe_fs.h>
49 49
50#include <linux/nfs_fs_sb.h>
51#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
52 51
53#include <linux/nfs_idmap.h> 52#include <linux/nfs_idmap.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 937fbfc381bb..c5b916605fb0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,8 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/nfs_idmap.h> 37#include <linux/nfs_idmap.h>
38#include <linux/vfs.h> 38#include <linux/vfs.h>
39#include <linux/inet.h>
40#include <linux/nfs_xdr.h>
39 41
40#include <asm/system.h> 42#include <asm/system.h>
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -44,89 +46,17 @@
44#include "callback.h" 46#include "callback.h"
45#include "delegation.h" 47#include "delegation.h"
46#include "iostat.h" 48#include "iostat.h"
49#include "internal.h"
47 50
48#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
49#define NFS_PARANOIA 1 52#define NFS_PARANOIA 1
50 53
51/* Maximum number of readahead requests
52 * FIXME: this should really be a sysctl so that users may tune it to suit
53 * their needs. People that do NFS over a slow network, might for
54 * instance want to reduce it to something closer to 1 for improved
55 * interactive response.
56 */
57#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
58
59static void nfs_invalidate_inode(struct inode *); 54static void nfs_invalidate_inode(struct inode *);
60static int nfs_update_inode(struct inode *, struct nfs_fattr *); 55static int nfs_update_inode(struct inode *, struct nfs_fattr *);
61 56
62static struct inode *nfs_alloc_inode(struct super_block *sb);
63static void nfs_destroy_inode(struct inode *);
64static int nfs_write_inode(struct inode *,int);
65static void nfs_delete_inode(struct inode *);
66static void nfs_clear_inode(struct inode *);
67static void nfs_umount_begin(struct super_block *);
68static int nfs_statfs(struct dentry *, struct kstatfs *);
69static int nfs_show_options(struct seq_file *, struct vfsmount *);
70static int nfs_show_stats(struct seq_file *, struct vfsmount *);
71static void nfs_zap_acl_cache(struct inode *); 57static void nfs_zap_acl_cache(struct inode *);
72 58
73static struct rpc_program nfs_program; 59static kmem_cache_t * nfs_inode_cachep;
74
75static struct super_operations nfs_sops = {
76 .alloc_inode = nfs_alloc_inode,
77 .destroy_inode = nfs_destroy_inode,
78 .write_inode = nfs_write_inode,
79 .delete_inode = nfs_delete_inode,
80 .statfs = nfs_statfs,
81 .clear_inode = nfs_clear_inode,
82 .umount_begin = nfs_umount_begin,
83 .show_options = nfs_show_options,
84 .show_stats = nfs_show_stats,
85};
86
87/*
88 * RPC cruft for NFS
89 */
90static struct rpc_stat nfs_rpcstat = {
91 .program = &nfs_program
92};
93static struct rpc_version * nfs_version[] = {
94 NULL,
95 NULL,
96 &nfs_version2,
97#if defined(CONFIG_NFS_V3)
98 &nfs_version3,
99#elif defined(CONFIG_NFS_V4)
100 NULL,
101#endif
102#if defined(CONFIG_NFS_V4)
103 &nfs_version4,
104#endif
105};
106
107static struct rpc_program nfs_program = {
108 .name = "nfs",
109 .number = NFS_PROGRAM,
110 .nrvers = ARRAY_SIZE(nfs_version),
111 .version = nfs_version,
112 .stats = &nfs_rpcstat,
113 .pipe_dir_name = "/nfs",
114};
115
116#ifdef CONFIG_NFS_V3_ACL
117static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
118static struct rpc_version * nfsacl_version[] = {
119 [3] = &nfsacl_version3,
120};
121
122struct rpc_program nfsacl_program = {
123 .name = "nfsacl",
124 .number = NFS_ACL_PROGRAM,
125 .nrvers = ARRAY_SIZE(nfsacl_version),
126 .version = nfsacl_version,
127 .stats = &nfsacl_rpcstat,
128};
129#endif /* CONFIG_NFS_V3_ACL */
130 60
131static inline unsigned long 61static inline unsigned long
132nfs_fattr_to_ino_t(struct nfs_fattr *fattr) 62nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
@@ -134,8 +64,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
134 return nfs_fileid_to_ino_t(fattr->fileid); 64 return nfs_fileid_to_ino_t(fattr->fileid);
135} 65}
136 66
137static int 67int nfs_write_inode(struct inode *inode, int sync)
138nfs_write_inode(struct inode *inode, int sync)
139{ 68{
140 int flags = sync ? FLUSH_SYNC : 0; 69 int flags = sync ? FLUSH_SYNC : 0;
141 int ret; 70 int ret;
@@ -146,31 +75,15 @@ nfs_write_inode(struct inode *inode, int sync)
146 return 0; 75 return 0;
147} 76}
148 77
149static void 78void nfs_clear_inode(struct inode *inode)
150nfs_delete_inode(struct inode * inode)
151{ 79{
152 dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 80 struct nfs_inode *nfsi = NFS_I(inode);
153 81 struct rpc_cred *cred;
154 truncate_inode_pages(&inode->i_data, 0);
155 82
156 nfs_wb_all(inode);
157 /* 83 /*
158 * The following should never happen... 84 * The following should never happen...
159 */ 85 */
160 if (nfs_have_writebacks(inode)) { 86 BUG_ON(nfs_have_writebacks(inode));
161 printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
162 }
163
164 clear_inode(inode);
165}
166
167static void
168nfs_clear_inode(struct inode *inode)
169{
170 struct nfs_inode *nfsi = NFS_I(inode);
171 struct rpc_cred *cred;
172
173 nfs_wb_all(inode);
174 BUG_ON (!list_empty(&nfsi->open_files)); 87 BUG_ON (!list_empty(&nfsi->open_files));
175 nfs_zap_acl_cache(inode); 88 nfs_zap_acl_cache(inode);
176 cred = nfsi->cache_access.cred; 89 cred = nfsi->cache_access.cred;
@@ -179,555 +92,6 @@ nfs_clear_inode(struct inode *inode)
179 BUG_ON(atomic_read(&nfsi->data_updates) != 0); 92 BUG_ON(atomic_read(&nfsi->data_updates) != 0);
180} 93}
181 94
182void
183nfs_umount_begin(struct super_block *sb)
184{
185 struct rpc_clnt *rpc = NFS_SB(sb)->client;
186
187 /* -EIO all pending I/O */
188 if (!IS_ERR(rpc))
189 rpc_killall_tasks(rpc);
190 rpc = NFS_SB(sb)->client_acl;
191 if (!IS_ERR(rpc))
192 rpc_killall_tasks(rpc);
193}
194
195
196static inline unsigned long
197nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
198{
199 /* make sure blocksize is a power of two */
200 if ((bsize & (bsize - 1)) || nrbitsp) {
201 unsigned char nrbits;
202
203 for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
204 ;
205 bsize = 1 << nrbits;
206 if (nrbitsp)
207 *nrbitsp = nrbits;
208 }
209
210 return bsize;
211}
212
213/*
214 * Calculate the number of 512byte blocks used.
215 */
216static inline unsigned long
217nfs_calc_block_size(u64 tsize)
218{
219 loff_t used = (tsize + 511) >> 9;
220 return (used > ULONG_MAX) ? ULONG_MAX : used;
221}
222
223/*
224 * Compute and set NFS server blocksize
225 */
226static inline unsigned long
227nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
228{
229 if (bsize < NFS_MIN_FILE_IO_SIZE)
230 bsize = NFS_DEF_FILE_IO_SIZE;
231 else if (bsize >= NFS_MAX_FILE_IO_SIZE)
232 bsize = NFS_MAX_FILE_IO_SIZE;
233
234 return nfs_block_bits(bsize, nrbitsp);
235}
236
237/*
238 * Obtain the root inode of the file system.
239 */
240static struct inode *
241nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
242{
243 struct nfs_server *server = NFS_SB(sb);
244 int error;
245
246 error = server->rpc_ops->getroot(server, rootfh, fsinfo);
247 if (error < 0) {
248 dprintk("nfs_get_root: getattr error = %d\n", -error);
249 return ERR_PTR(error);
250 }
251
252 return nfs_fhget(sb, rootfh, fsinfo->fattr);
253}
254
255/*
256 * Do NFS version-independent mount processing, and sanity checking
257 */
258static int
259nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
260{
261 struct nfs_server *server;
262 struct inode *root_inode;
263 struct nfs_fattr fattr;
264 struct nfs_fsinfo fsinfo = {
265 .fattr = &fattr,
266 };
267 struct nfs_pathconf pathinfo = {
268 .fattr = &fattr,
269 };
270 int no_root_error = 0;
271 unsigned long max_rpc_payload;
272
273 /* We probably want something more informative here */
274 snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
275
276 server = NFS_SB(sb);
277
278 sb->s_magic = NFS_SUPER_MAGIC;
279
280 server->io_stats = nfs_alloc_iostats();
281 if (server->io_stats == NULL)
282 return -ENOMEM;
283
284 root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
285 /* Did getting the root inode fail? */
286 if (IS_ERR(root_inode)) {
287 no_root_error = PTR_ERR(root_inode);
288 goto out_no_root;
289 }
290 sb->s_root = d_alloc_root(root_inode);
291 if (!sb->s_root) {
292 no_root_error = -ENOMEM;
293 goto out_no_root;
294 }
295 sb->s_root->d_op = server->rpc_ops->dentry_ops;
296
297 /* mount time stamp, in seconds */
298 server->mount_time = jiffies;
299
300 /* Get some general file system info */
301 if (server->namelen == 0 &&
302 server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
303 server->namelen = pathinfo.max_namelen;
304 /* Work out a lot of parameters */
305 if (server->rsize == 0)
306 server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
307 if (server->wsize == 0)
308 server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
309
310 if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
311 server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
312 if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
313 server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
314
315 max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
316 if (server->rsize > max_rpc_payload)
317 server->rsize = max_rpc_payload;
318 if (server->rsize > NFS_MAX_FILE_IO_SIZE)
319 server->rsize = NFS_MAX_FILE_IO_SIZE;
320 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
321
322 if (server->wsize > max_rpc_payload)
323 server->wsize = max_rpc_payload;
324 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
325 server->wsize = NFS_MAX_FILE_IO_SIZE;
326 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
327
328 if (sb->s_blocksize == 0)
329 sb->s_blocksize = nfs_block_bits(server->wsize,
330 &sb->s_blocksize_bits);
331 server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
332
333 server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
334 if (server->dtsize > PAGE_CACHE_SIZE)
335 server->dtsize = PAGE_CACHE_SIZE;
336 if (server->dtsize > server->rsize)
337 server->dtsize = server->rsize;
338
339 if (server->flags & NFS_MOUNT_NOAC) {
340 server->acregmin = server->acregmax = 0;
341 server->acdirmin = server->acdirmax = 0;
342 sb->s_flags |= MS_SYNCHRONOUS;
343 }
344 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
345
346 sb->s_maxbytes = fsinfo.maxfilesize;
347 if (sb->s_maxbytes > MAX_LFS_FILESIZE)
348 sb->s_maxbytes = MAX_LFS_FILESIZE;
349
350 server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
351 server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
352
353 /* We're airborne Set socket buffersize */
354 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
355 return 0;
356 /* Yargs. It didn't work out. */
357out_no_root:
358 dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
359 if (!IS_ERR(root_inode))
360 iput(root_inode);
361 return no_root_error;
362}
363
364static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
365{
366 to->to_initval = timeo * HZ / 10;
367 to->to_retries = retrans;
368 if (!to->to_retries)
369 to->to_retries = 2;
370
371 switch (proto) {
372 case IPPROTO_TCP:
373 if (!to->to_initval)
374 to->to_initval = 60 * HZ;
375 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
376 to->to_initval = NFS_MAX_TCP_TIMEOUT;
377 to->to_increment = to->to_initval;
378 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
379 to->to_exponential = 0;
380 break;
381 case IPPROTO_UDP:
382 default:
383 if (!to->to_initval)
384 to->to_initval = 11 * HZ / 10;
385 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
386 to->to_initval = NFS_MAX_UDP_TIMEOUT;
387 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
388 to->to_exponential = 1;
389 break;
390 }
391}
392
393/*
394 * Create an RPC client handle.
395 */
396static struct rpc_clnt *
397nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
398{
399 struct rpc_timeout timeparms;
400 struct rpc_xprt *xprt = NULL;
401 struct rpc_clnt *clnt = NULL;
402 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
403
404 nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
405
406 server->retrans_timeo = timeparms.to_initval;
407 server->retrans_count = timeparms.to_retries;
408
409 /* create transport and client */
410 xprt = xprt_create_proto(proto, &server->addr, &timeparms);
411 if (IS_ERR(xprt)) {
412 dprintk("%s: cannot create RPC transport. Error = %ld\n",
413 __FUNCTION__, PTR_ERR(xprt));
414 return (struct rpc_clnt *)xprt;
415 }
416 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
417 server->rpc_ops->version, data->pseudoflavor);
418 if (IS_ERR(clnt)) {
419 dprintk("%s: cannot create RPC client. Error = %ld\n",
420 __FUNCTION__, PTR_ERR(xprt));
421 goto out_fail;
422 }
423
424 clnt->cl_intr = 1;
425 clnt->cl_softrtry = 1;
426
427 return clnt;
428
429out_fail:
430 return clnt;
431}
432
433/*
434 * The way this works is that the mount process passes a structure
435 * in the data argument which contains the server's IP address
436 * and the root file handle obtained from the server's mount
437 * daemon. We stash these away in the private superblock fields.
438 */
439static int
440nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
441{
442 struct nfs_server *server;
443 rpc_authflavor_t authflavor;
444
445 server = NFS_SB(sb);
446 sb->s_blocksize_bits = 0;
447 sb->s_blocksize = 0;
448 if (data->bsize)
449 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
450 if (data->rsize)
451 server->rsize = nfs_block_size(data->rsize, NULL);
452 if (data->wsize)
453 server->wsize = nfs_block_size(data->wsize, NULL);
454 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
455
456 server->acregmin = data->acregmin*HZ;
457 server->acregmax = data->acregmax*HZ;
458 server->acdirmin = data->acdirmin*HZ;
459 server->acdirmax = data->acdirmax*HZ;
460
461 /* Start lockd here, before we might error out */
462 if (!(server->flags & NFS_MOUNT_NONLM))
463 lockd_up();
464
465 server->namelen = data->namlen;
466 server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
467 if (!server->hostname)
468 return -ENOMEM;
469 strcpy(server->hostname, data->hostname);
470
471 /* Check NFS protocol revision and initialize RPC op vector
472 * and file handle pool. */
473#ifdef CONFIG_NFS_V3
474 if (server->flags & NFS_MOUNT_VER3) {
475 server->rpc_ops = &nfs_v3_clientops;
476 server->caps |= NFS_CAP_READDIRPLUS;
477 } else {
478 server->rpc_ops = &nfs_v2_clientops;
479 }
480#else
481 server->rpc_ops = &nfs_v2_clientops;
482#endif
483
484 /* Fill in pseudoflavor for mount version < 5 */
485 if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
486 data->pseudoflavor = RPC_AUTH_UNIX;
487 authflavor = data->pseudoflavor; /* save for sb_init() */
488 /* XXX maybe we want to add a server->pseudoflavor field */
489
490 /* Create RPC client handles */
491 server->client = nfs_create_client(server, data);
492 if (IS_ERR(server->client))
493 return PTR_ERR(server->client);
494 /* RFC 2623, sec 2.3.2 */
495 if (authflavor != RPC_AUTH_UNIX) {
496 struct rpc_auth *auth;
497
498 server->client_sys = rpc_clone_client(server->client);
499 if (IS_ERR(server->client_sys))
500 return PTR_ERR(server->client_sys);
501 auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
502 if (IS_ERR(auth))
503 return PTR_ERR(auth);
504 } else {
505 atomic_inc(&server->client->cl_count);
506 server->client_sys = server->client;
507 }
508 if (server->flags & NFS_MOUNT_VER3) {
509#ifdef CONFIG_NFS_V3_ACL
510 if (!(server->flags & NFS_MOUNT_NOACL)) {
511 server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
512 /* No errors! Assume that Sun nfsacls are supported */
513 if (!IS_ERR(server->client_acl))
514 server->caps |= NFS_CAP_ACLS;
515 }
516#else
517 server->flags &= ~NFS_MOUNT_NOACL;
518#endif /* CONFIG_NFS_V3_ACL */
519 /*
520 * The VFS shouldn't apply the umask to mode bits. We will
521 * do so ourselves when necessary.
522 */
523 sb->s_flags |= MS_POSIXACL;
524 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
525 server->namelen = NFS3_MAXNAMLEN;
526 sb->s_time_gran = 1;
527 } else {
528 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
529 server->namelen = NFS2_MAXNAMLEN;
530 }
531
532 sb->s_op = &nfs_sops;
533 return nfs_sb_init(sb, authflavor);
534}
535
536static int
537nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
538{
539 struct super_block *sb = dentry->d_sb;
540 struct nfs_server *server = NFS_SB(sb);
541 unsigned char blockbits;
542 unsigned long blockres;
543 struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
544 struct nfs_fattr fattr;
545 struct nfs_fsstat res = {
546 .fattr = &fattr,
547 };
548 int error;
549
550 lock_kernel();
551
552 error = server->rpc_ops->statfs(server, rootfh, &res);
553 buf->f_type = NFS_SUPER_MAGIC;
554 if (error < 0)
555 goto out_err;
556
557 /*
558 * Current versions of glibc do not correctly handle the
559 * case where f_frsize != f_bsize. Eventually we want to
560 * report the value of wtmult in this field.
561 */
562 buf->f_frsize = sb->s_blocksize;
563
564 /*
565 * On most *nix systems, f_blocks, f_bfree, and f_bavail
566 * are reported in units of f_frsize. Linux hasn't had
567 * an f_frsize field in its statfs struct until recently,
568 * thus historically Linux's sys_statfs reports these
569 * fields in units of f_bsize.
570 */
571 buf->f_bsize = sb->s_blocksize;
572 blockbits = sb->s_blocksize_bits;
573 blockres = (1 << blockbits) - 1;
574 buf->f_blocks = (res.tbytes + blockres) >> blockbits;
575 buf->f_bfree = (res.fbytes + blockres) >> blockbits;
576 buf->f_bavail = (res.abytes + blockres) >> blockbits;
577
578 buf->f_files = res.tfiles;
579 buf->f_ffree = res.afiles;
580
581 buf->f_namelen = server->namelen;
582 out:
583 unlock_kernel();
584 return 0;
585
586 out_err:
587 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
588 buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
589 goto out;
590
591}
592
593static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
594{
595 static struct proc_nfs_info {
596 int flag;
597 char *str;
598 char *nostr;
599 } nfs_info[] = {
600 { NFS_MOUNT_SOFT, ",soft", ",hard" },
601 { NFS_MOUNT_INTR, ",intr", "" },
602 { NFS_MOUNT_NOCTO, ",nocto", "" },
603 { NFS_MOUNT_NOAC, ",noac", "" },
604 { NFS_MOUNT_NONLM, ",nolock", "" },
605 { NFS_MOUNT_NOACL, ",noacl", "" },
606 { 0, NULL, NULL }
607 };
608 struct proc_nfs_info *nfs_infop;
609 char buf[12];
610 char *proto;
611
612 seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
613 seq_printf(m, ",rsize=%d", nfss->rsize);
614 seq_printf(m, ",wsize=%d", nfss->wsize);
615 if (nfss->acregmin != 3*HZ || showdefaults)
616 seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
617 if (nfss->acregmax != 60*HZ || showdefaults)
618 seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
619 if (nfss->acdirmin != 30*HZ || showdefaults)
620 seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
621 if (nfss->acdirmax != 60*HZ || showdefaults)
622 seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
623 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
624 if (nfss->flags & nfs_infop->flag)
625 seq_puts(m, nfs_infop->str);
626 else
627 seq_puts(m, nfs_infop->nostr);
628 }
629 switch (nfss->client->cl_xprt->prot) {
630 case IPPROTO_TCP:
631 proto = "tcp";
632 break;
633 case IPPROTO_UDP:
634 proto = "udp";
635 break;
636 default:
637 snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
638 proto = buf;
639 }
640 seq_printf(m, ",proto=%s", proto);
641 seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
642 seq_printf(m, ",retrans=%u", nfss->retrans_count);
643}
644
645static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
646{
647 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
648
649 nfs_show_mount_options(m, nfss, 0);
650
651 seq_puts(m, ",addr=");
652 seq_escape(m, nfss->hostname, " \t\n\\");
653
654 return 0;
655}
656
657static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
658{
659 int i, cpu;
660 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
661 struct rpc_auth *auth = nfss->client->cl_auth;
662 struct nfs_iostats totals = { };
663
664 seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
665
666 /*
667 * Display all mount option settings
668 */
669 seq_printf(m, "\n\topts:\t");
670 seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
671 seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
672 seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
673 seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
674 nfs_show_mount_options(m, nfss, 1);
675
676 seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
677
678 seq_printf(m, "\n\tcaps:\t");
679 seq_printf(m, "caps=0x%x", nfss->caps);
680 seq_printf(m, ",wtmult=%d", nfss->wtmult);
681 seq_printf(m, ",dtsize=%d", nfss->dtsize);
682 seq_printf(m, ",bsize=%d", nfss->bsize);
683 seq_printf(m, ",namelen=%d", nfss->namelen);
684
685#ifdef CONFIG_NFS_V4
686 if (nfss->rpc_ops->version == 4) {
687 seq_printf(m, "\n\tnfsv4:\t");
688 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
689 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
690 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
691 }
692#endif
693
694 /*
695 * Display security flavor in effect for this mount
696 */
697 seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
698 if (auth->au_flavor)
699 seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
700
701 /*
702 * Display superblock I/O counters
703 */
704 for_each_possible_cpu(cpu) {
705 struct nfs_iostats *stats;
706
707 preempt_disable();
708 stats = per_cpu_ptr(nfss->io_stats, cpu);
709
710 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
711 totals.events[i] += stats->events[i];
712 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
713 totals.bytes[i] += stats->bytes[i];
714
715 preempt_enable();
716 }
717
718 seq_printf(m, "\n\tevents:\t");
719 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
720 seq_printf(m, "%lu ", totals.events[i]);
721 seq_printf(m, "\n\tbytes:\t");
722 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
723 seq_printf(m, "%Lu ", totals.bytes[i]);
724 seq_printf(m, "\n");
725
726 rpc_print_iostats(m, nfss->client);
727
728 return 0;
729}
730
731/** 95/**
732 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk 96 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
733 */ 97 */
@@ -890,6 +254,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
890 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 254 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
891 && fattr->size <= NFS_LIMIT_READDIRPLUS) 255 && fattr->size <= NFS_LIMIT_READDIRPLUS)
892 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); 256 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
257 /* Deal with crossing mountpoints */
258 if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
259 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
260 inode->i_op = &nfs_referral_inode_operations;
261 else
262 inode->i_op = &nfs_mountpoint_inode_operations;
263 inode->i_fop = NULL;
264 }
893 } else if (S_ISLNK(inode->i_mode)) 265 } else if (S_ISLNK(inode->i_mode))
894 inode->i_op = &nfs_symlink_inode_operations; 266 inode->i_op = &nfs_symlink_inode_operations;
895 else 267 else
@@ -1208,6 +580,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1208 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 580 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
1209 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 581 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
1210 582
583 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
1211 lock_kernel(); 584 lock_kernel();
1212 if (!inode || is_bad_inode(inode)) 585 if (!inode || is_bad_inode(inode))
1213 goto out_nowait; 586 goto out_nowait;
@@ -1221,7 +594,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1221 status = -ESTALE; 594 status = -ESTALE;
1222 /* Do we trust the cached ESTALE? */ 595 /* Do we trust the cached ESTALE? */
1223 if (NFS_ATTRTIMEO(inode) != 0) { 596 if (NFS_ATTRTIMEO(inode) != 0) {
1224 if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) { 597 if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
1225 /* no */ 598 /* no */
1226 } else 599 } else
1227 goto out; 600 goto out;
@@ -1252,8 +625,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1252 } 625 }
1253 spin_unlock(&inode->i_lock); 626 spin_unlock(&inode->i_lock);
1254 627
1255 nfs_revalidate_mapping(inode, inode->i_mapping);
1256
1257 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 628 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
1258 nfs_zap_acl_cache(inode); 629 nfs_zap_acl_cache(inode);
1259 630
@@ -1287,8 +658,7 @@ int nfs_attribute_timeout(struct inode *inode)
1287 */ 658 */
1288int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 659int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1289{ 660{
1290 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 661 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
1291 if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
1292 && !nfs_attribute_timeout(inode)) 662 && !nfs_attribute_timeout(inode))
1293 return NFS_STALE(inode) ? -ESTALE : 0; 663 return NFS_STALE(inode) ? -ESTALE : 0;
1294 return __nfs_revalidate_inode(server, inode); 664 return __nfs_revalidate_inode(server, inode);
@@ -1299,9 +669,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1299 * @inode - pointer to host inode 669 * @inode - pointer to host inode
1300 * @mapping - pointer to mapping 670 * @mapping - pointer to mapping
1301 */ 671 */
1302void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 672int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1303{ 673{
1304 struct nfs_inode *nfsi = NFS_I(inode); 674 struct nfs_inode *nfsi = NFS_I(inode);
675 int ret = 0;
676
677 if (NFS_STALE(inode))
678 ret = -ESTALE;
679 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
680 || nfs_attribute_timeout(inode))
681 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
1305 682
1306 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { 683 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
1307 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 684 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
@@ -1322,6 +699,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1322 inode->i_sb->s_id, 699 inode->i_sb->s_id,
1323 (long long)NFS_FILEID(inode)); 700 (long long)NFS_FILEID(inode));
1324 } 701 }
702 return ret;
1325} 703}
1326 704
1327/** 705/**
@@ -1361,12 +739,6 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1361{ 739{
1362 struct nfs_inode *nfsi = NFS_I(inode); 740 struct nfs_inode *nfsi = NFS_I(inode);
1363 741
1364 if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
1365 && nfsi->change_attr == fattr->pre_change_attr) {
1366 nfsi->change_attr = fattr->change_attr;
1367 nfsi->cache_change_attribute = jiffies;
1368 }
1369
1370 /* If we have atomic WCC data, we may update some attributes */ 742 /* If we have atomic WCC data, we may update some attributes */
1371 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 743 if ((fattr->valid & NFS_ATTR_WCC) != 0) {
1372 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { 744 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
@@ -1400,9 +772,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1400 int data_unstable; 772 int data_unstable;
1401 773
1402 774
1403 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1404 return 0;
1405
1406 /* Has the inode gone and changed behind our back? */ 775 /* Has the inode gone and changed behind our back? */
1407 if (nfsi->fileid != fattr->fileid 776 if (nfsi->fileid != fattr->fileid
1408 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 777 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
@@ -1415,20 +784,13 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1415 /* Do atomic weak cache consistency updates */ 784 /* Do atomic weak cache consistency updates */
1416 nfs_wcc_update_inode(inode, fattr); 785 nfs_wcc_update_inode(inode, fattr);
1417 786
1418 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0) { 787 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
1419 if (nfsi->change_attr == fattr->change_attr) 788 nfsi->change_attr != fattr->change_attr)
1420 goto out; 789 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1421 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
1422 if (!data_unstable)
1423 nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
1424 }
1425 790
1426 /* Verify a few of the more important attributes */ 791 /* Verify a few of the more important attributes */
1427 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 792 if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
1428 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 793 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1429 if (!data_unstable)
1430 nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
1431 }
1432 794
1433 cur_size = i_size_read(inode); 795 cur_size = i_size_read(inode);
1434 new_isize = nfs_size_to_loff_t(fattr->size); 796 new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1445,7 +807,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1445 if (inode->i_nlink != fattr->nlink) 807 if (inode->i_nlink != fattr->nlink)
1446 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 808 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
1447 809
1448out:
1449 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 810 if (!timespec_equal(&inode->i_atime, &fattr->atime))
1450 nfsi->cache_validity |= NFS_INO_INVALID_ATIME; 811 nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
1451 812
@@ -1471,7 +832,6 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1471 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 832 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1472 return 0; 833 return 0;
1473 spin_lock(&inode->i_lock); 834 spin_lock(&inode->i_lock);
1474 nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
1475 if (time_after(fattr->time_start, nfsi->last_updated)) 835 if (time_after(fattr->time_start, nfsi->last_updated))
1476 status = nfs_update_inode(inode, fattr); 836 status = nfs_update_inode(inode, fattr);
1477 else 837 else
@@ -1496,7 +856,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1496 856
1497 spin_lock(&inode->i_lock); 857 spin_lock(&inode->i_lock);
1498 if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) { 858 if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
1499 nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; 859 nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1500 goto out; 860 goto out;
1501 } 861 }
1502 status = nfs_update_inode(inode, fattr); 862 status = nfs_update_inode(inode, fattr);
@@ -1519,6 +879,7 @@ out:
1519 */ 879 */
1520static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) 880static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1521{ 881{
882 struct nfs_server *server;
1522 struct nfs_inode *nfsi = NFS_I(inode); 883 struct nfs_inode *nfsi = NFS_I(inode);
1523 loff_t cur_isize, new_isize; 884 loff_t cur_isize, new_isize;
1524 unsigned int invalid = 0; 885 unsigned int invalid = 0;
@@ -1528,9 +889,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1528 __FUNCTION__, inode->i_sb->s_id, inode->i_ino, 889 __FUNCTION__, inode->i_sb->s_id, inode->i_ino,
1529 atomic_read(&inode->i_count), fattr->valid); 890 atomic_read(&inode->i_count), fattr->valid);
1530 891
1531 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1532 return 0;
1533
1534 if (nfsi->fileid != fattr->fileid) 892 if (nfsi->fileid != fattr->fileid)
1535 goto out_fileid; 893 goto out_fileid;
1536 894
@@ -1540,6 +898,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1540 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 898 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1541 goto out_changed; 899 goto out_changed;
1542 900
901 server = NFS_SERVER(inode);
902 /* Update the fsid if and only if this is the root directory */
903 if (inode == inode->i_sb->s_root->d_inode
904 && !nfs_fsid_equal(&server->fsid, &fattr->fsid))
905 server->fsid = fattr->fsid;
906
1543 /* 907 /*
1544 * Update the read time so we don't revalidate too often. 908 * Update the read time so we don't revalidate too often.
1545 */ 909 */
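nfs_fsid_equal() and the new fattr->fsid field used in the hunk above are not defined in this diff; based on the major/minor usage in the nfs2xdr.c and nfs3xdr.c changes below, the unified fsid type is presumably along these lines (a sketch, not the authoritative header):

struct nfs_fsid {
	uint64_t	major;
	uint64_t	minor;
};

static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid *b)
{
	return a->major == b->major && a->minor == b->minor;
}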
@@ -1549,7 +913,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1549 /* Are we racing with known updates of the metadata on the server? */ 913 /* Are we racing with known updates of the metadata on the server? */
1550 data_stable = nfs_verify_change_attribute(inode, fattr->time_start); 914 data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
1551 if (data_stable) 915 if (data_stable)
1552 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); 916 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
1553 917
1554 /* Do atomic weak cache consistency updates */ 918 /* Do atomic weak cache consistency updates */
1555 nfs_wcc_update_inode(inode, fattr); 919 nfs_wcc_update_inode(inode, fattr);
@@ -1613,15 +977,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1613 inode->i_blksize = fattr->du.nfs2.blocksize; 977 inode->i_blksize = fattr->du.nfs2.blocksize;
1614 } 978 }
1615 979
1616 if ((fattr->valid & NFS_ATTR_FATTR_V4)) { 980 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
1617 if (nfsi->change_attr != fattr->change_attr) { 981 nfsi->change_attr != fattr->change_attr) {
1618 dprintk("NFS: change_attr change on server for file %s/%ld\n", 982 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1619 inode->i_sb->s_id, inode->i_ino); 983 inode->i_sb->s_id, inode->i_ino);
1620 nfsi->change_attr = fattr->change_attr; 984 nfsi->change_attr = fattr->change_attr;
1621 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 985 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1622 nfsi->cache_change_attribute = jiffies; 986 nfsi->cache_change_attribute = jiffies;
1623 } else
1624 invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA);
1625 } 987 }
1626 988
1627 /* Update attrtimeo value if we're out of the unstable period */ 989 /* Update attrtimeo value if we're out of the unstable period */
@@ -1669,202 +1031,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1669 goto out_err; 1031 goto out_err;
1670} 1032}
1671 1033
1672/*
1673 * File system information
1674 */
1675
1676static int nfs_set_super(struct super_block *s, void *data)
1677{
1678 s->s_fs_info = data;
1679 return set_anon_super(s, data);
1680}
1681
1682static int nfs_compare_super(struct super_block *sb, void *data)
1683{
1684 struct nfs_server *server = data;
1685 struct nfs_server *old = NFS_SB(sb);
1686
1687 if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
1688 return 0;
1689 if (old->addr.sin_port != server->addr.sin_port)
1690 return 0;
1691 return !nfs_compare_fh(&old->fh, &server->fh);
1692}
1693
1694static int nfs_get_sb(struct file_system_type *fs_type,
1695 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1696{
1697 int error;
1698 struct nfs_server *server = NULL;
1699 struct super_block *s;
1700 struct nfs_fh *root;
1701 struct nfs_mount_data *data = raw_data;
1702
1703 error = -EINVAL;
1704 if (data == NULL) {
1705 dprintk("%s: missing data argument\n", __FUNCTION__);
1706 goto out_err_noserver;
1707 }
1708 if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
1709 dprintk("%s: bad mount version\n", __FUNCTION__);
1710 goto out_err_noserver;
1711 }
1712 switch (data->version) {
1713 case 1:
1714 data->namlen = 0;
1715 case 2:
1716 data->bsize = 0;
1717 case 3:
1718 if (data->flags & NFS_MOUNT_VER3) {
1719 dprintk("%s: mount structure version %d does not support NFSv3\n",
1720 __FUNCTION__,
1721 data->version);
1722 goto out_err_noserver;
1723 }
1724 data->root.size = NFS2_FHSIZE;
1725 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
1726 case 4:
1727 if (data->flags & NFS_MOUNT_SECFLAVOUR) {
1728 dprintk("%s: mount structure version %d does not support strong security\n",
1729 __FUNCTION__,
1730 data->version);
1731 goto out_err_noserver;
1732 }
1733 case 5:
1734 memset(data->context, 0, sizeof(data->context));
1735 }
1736#ifndef CONFIG_NFS_V3
1737 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
1738 error = -EPROTONOSUPPORT;
1739 if (data->flags & NFS_MOUNT_VER3) {
1740 dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
1741 goto out_err_noserver;
1742 }
1743#endif /* CONFIG_NFS_V3 */
1744
1745 error = -ENOMEM;
1746 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
1747 if (!server)
1748 goto out_err_noserver;
1749 /* Zero out the NFS state stuff */
1750 init_nfsv4_state(server);
1751 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
1752
1753 root = &server->fh;
1754 if (data->flags & NFS_MOUNT_VER3)
1755 root->size = data->root.size;
1756 else
1757 root->size = NFS2_FHSIZE;
1758 error = -EINVAL;
1759 if (root->size > sizeof(root->data)) {
1760 dprintk("%s: invalid root filehandle\n", __FUNCTION__);
1761 goto out_err;
1762 }
1763 memcpy(root->data, data->root.data, root->size);
1764
1765 /* We now require that the mount process passes the remote address */
1766 memcpy(&server->addr, &data->addr, sizeof(server->addr));
1767 if (server->addr.sin_addr.s_addr == INADDR_ANY) {
1768 dprintk("%s: mount program didn't pass remote address!\n",
1769 __FUNCTION__);
1770 goto out_err;
1771 }
1772
1773 /* Fire up rpciod if not yet running */
1774 error = rpciod_up();
1775 if (error < 0) {
1776 dprintk("%s: couldn't start rpciod! Error = %d\n",
1777 __FUNCTION__, error);
1778 goto out_err;
1779 }
1780
1781 s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
1782 if (IS_ERR(s)) {
1783 error = PTR_ERR(s);
1784 goto out_err_rpciod;
1785 }
1786
1787 if (s->s_root)
1788 goto out_rpciod_down;
1789
1790 s->s_flags = flags;
1791
1792 error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1793 if (error) {
1794 up_write(&s->s_umount);
1795 deactivate_super(s);
1796 return error;
1797 }
1798 s->s_flags |= MS_ACTIVE;
1799 return simple_set_mnt(mnt, s);
1800
1801out_rpciod_down:
1802 rpciod_down();
1803 kfree(server);
1804 return simple_set_mnt(mnt, s);
1805
1806out_err_rpciod:
1807 rpciod_down();
1808out_err:
1809 kfree(server);
1810out_err_noserver:
1811 return error;
1812}
1813
1814static void nfs_kill_super(struct super_block *s)
1815{
1816 struct nfs_server *server = NFS_SB(s);
1817
1818 kill_anon_super(s);
1819
1820 if (!IS_ERR(server->client))
1821 rpc_shutdown_client(server->client);
1822 if (!IS_ERR(server->client_sys))
1823 rpc_shutdown_client(server->client_sys);
1824 if (!IS_ERR(server->client_acl))
1825 rpc_shutdown_client(server->client_acl);
1826
1827 if (!(server->flags & NFS_MOUNT_NONLM))
1828 lockd_down(); /* release rpc.lockd */
1829
1830 rpciod_down(); /* release rpciod */
1831
1832 nfs_free_iostats(server->io_stats);
1833 kfree(server->hostname);
1834 kfree(server);
1835}
1836
1837static struct file_system_type nfs_fs_type = {
1838 .owner = THIS_MODULE,
1839 .name = "nfs",
1840 .get_sb = nfs_get_sb,
1841 .kill_sb = nfs_kill_super,
1842 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
1843};
1844 1034
1845#ifdef CONFIG_NFS_V4 1035#ifdef CONFIG_NFS_V4
1846 1036
1847static void nfs4_clear_inode(struct inode *);
1848
1849
1850static struct super_operations nfs4_sops = {
1851 .alloc_inode = nfs_alloc_inode,
1852 .destroy_inode = nfs_destroy_inode,
1853 .write_inode = nfs_write_inode,
1854 .delete_inode = nfs_delete_inode,
1855 .statfs = nfs_statfs,
1856 .clear_inode = nfs4_clear_inode,
1857 .umount_begin = nfs_umount_begin,
1858 .show_options = nfs_show_options,
1859 .show_stats = nfs_show_stats,
1860};
1861
1862/* 1037/*
1863 * Clean out any remaining NFSv4 state that might be left over due 1038 * Clean out any remaining NFSv4 state that might be left over due
1864 * to open() calls that passed nfs_atomic_lookup, but failed to call 1039 * to open() calls that passed nfs_atomic_lookup, but failed to call
1865 * nfs_open(). 1040 * nfs_open().
1866 */ 1041 */
1867static void nfs4_clear_inode(struct inode *inode) 1042void nfs4_clear_inode(struct inode *inode)
1868{ 1043{
1869 struct nfs_inode *nfsi = NFS_I(inode); 1044 struct nfs_inode *nfsi = NFS_I(inode);
1870 1045
@@ -1888,365 +1063,9 @@ static void nfs4_clear_inode(struct inode *inode)
1888 nfs4_close_state(state, state->state); 1063 nfs4_close_state(state, state->state);
1889 } 1064 }
1890} 1065}
1891
1892
1893static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
1894{
1895 struct nfs_server *server;
1896 struct nfs4_client *clp = NULL;
1897 struct rpc_xprt *xprt = NULL;
1898 struct rpc_clnt *clnt = NULL;
1899 struct rpc_timeout timeparms;
1900 rpc_authflavor_t authflavour;
1901 int err = -EIO;
1902
1903 sb->s_blocksize_bits = 0;
1904 sb->s_blocksize = 0;
1905 server = NFS_SB(sb);
1906 if (data->rsize != 0)
1907 server->rsize = nfs_block_size(data->rsize, NULL);
1908 if (data->wsize != 0)
1909 server->wsize = nfs_block_size(data->wsize, NULL);
1910 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
1911 server->caps = NFS_CAP_ATOMIC_OPEN;
1912
1913 server->acregmin = data->acregmin*HZ;
1914 server->acregmax = data->acregmax*HZ;
1915 server->acdirmin = data->acdirmin*HZ;
1916 server->acdirmax = data->acdirmax*HZ;
1917
1918 server->rpc_ops = &nfs_v4_clientops;
1919
1920 nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
1921
1922 server->retrans_timeo = timeparms.to_initval;
1923 server->retrans_count = timeparms.to_retries;
1924
1925 clp = nfs4_get_client(&server->addr.sin_addr);
1926 if (!clp) {
1927 dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
1928 return -EIO;
1929 }
1930
1931 /* Now create transport and client */
1932 authflavour = RPC_AUTH_UNIX;
1933 if (data->auth_flavourlen != 0) {
1934 if (data->auth_flavourlen != 1) {
1935 dprintk("%s: Invalid number of RPC auth flavours %d.\n",
1936 __FUNCTION__, data->auth_flavourlen);
1937 err = -EINVAL;
1938 goto out_fail;
1939 }
1940 if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
1941 err = -EFAULT;
1942 goto out_fail;
1943 }
1944 }
1945
1946 down_write(&clp->cl_sem);
1947 if (IS_ERR(clp->cl_rpcclient)) {
1948 xprt = xprt_create_proto(data->proto, &server->addr, &timeparms);
1949 if (IS_ERR(xprt)) {
1950 up_write(&clp->cl_sem);
1951 err = PTR_ERR(xprt);
1952 dprintk("%s: cannot create RPC transport. Error = %d\n",
1953 __FUNCTION__, err);
1954 goto out_fail;
1955 }
1956 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
1957 server->rpc_ops->version, authflavour);
1958 if (IS_ERR(clnt)) {
1959 up_write(&clp->cl_sem);
1960 err = PTR_ERR(clnt);
1961 dprintk("%s: cannot create RPC client. Error = %d\n",
1962 __FUNCTION__, err);
1963 goto out_fail;
1964 }
1965 clnt->cl_intr = 1;
1966 clnt->cl_softrtry = 1;
1967 clp->cl_rpcclient = clnt;
1968 memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
1969 nfs_idmap_new(clp);
1970 }
1971 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1972 clnt = rpc_clone_client(clp->cl_rpcclient);
1973 if (!IS_ERR(clnt))
1974 server->nfs4_state = clp;
1975 up_write(&clp->cl_sem);
1976 clp = NULL;
1977
1978 if (IS_ERR(clnt)) {
1979 err = PTR_ERR(clnt);
1980 dprintk("%s: cannot create RPC client. Error = %d\n",
1981 __FUNCTION__, err);
1982 return err;
1983 }
1984
1985 server->client = clnt;
1986
1987 if (server->nfs4_state->cl_idmap == NULL) {
1988 dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
1989 return -ENOMEM;
1990 }
1991
1992 if (clnt->cl_auth->au_flavor != authflavour) {
1993 struct rpc_auth *auth;
1994
1995 auth = rpcauth_create(authflavour, clnt);
1996 if (IS_ERR(auth)) {
1997 dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
1998 return PTR_ERR(auth);
1999 }
2000 }
2001
2002 sb->s_time_gran = 1;
2003
2004 sb->s_op = &nfs4_sops;
2005 err = nfs_sb_init(sb, authflavour);
2006 if (err == 0)
2007 return 0;
2008out_fail:
2009 if (clp)
2010 nfs4_put_client(clp);
2011 return err;
2012}
2013
2014static int nfs4_compare_super(struct super_block *sb, void *data)
2015{
2016 struct nfs_server *server = data;
2017 struct nfs_server *old = NFS_SB(sb);
2018
2019 if (strcmp(server->hostname, old->hostname) != 0)
2020 return 0;
2021 if (strcmp(server->mnt_path, old->mnt_path) != 0)
2022 return 0;
2023 return 1;
2024}
2025
2026static void *
2027nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
2028{
2029 void *p = NULL;
2030
2031 if (!src->len)
2032 return ERR_PTR(-EINVAL);
2033 if (src->len < maxlen)
2034 maxlen = src->len;
2035 if (dst == NULL) {
2036 p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
2037 if (p == NULL)
2038 return ERR_PTR(-ENOMEM);
2039 }
2040 if (copy_from_user(dst, src->data, maxlen)) {
2041 kfree(p);
2042 return ERR_PTR(-EFAULT);
2043 }
2044 dst[maxlen] = '\0';
2045 return dst;
2046}
2047
2048static int nfs4_get_sb(struct file_system_type *fs_type,
2049 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2050{
2051 int error;
2052 struct nfs_server *server;
2053 struct super_block *s;
2054 struct nfs4_mount_data *data = raw_data;
2055 void *p;
2056
2057 if (data == NULL) {
2058 dprintk("%s: missing data argument\n", __FUNCTION__);
2059 return -EINVAL;
2060 }
2061 if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
2062 dprintk("%s: bad mount version\n", __FUNCTION__);
2063 return -EINVAL;
2064 }
2065
2066 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
2067 if (!server)
2068 return -ENOMEM;
2069 /* Zero out the NFS state stuff */
2070 init_nfsv4_state(server);
2071 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
2072
2073 p = nfs_copy_user_string(NULL, &data->hostname, 256);
2074 if (IS_ERR(p))
2075 goto out_err;
2076 server->hostname = p;
2077
2078 p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
2079 if (IS_ERR(p))
2080 goto out_err;
2081 server->mnt_path = p;
2082
2083 p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
2084 sizeof(server->ip_addr) - 1);
2085 if (IS_ERR(p))
2086 goto out_err;
2087
2088 /* We now require that the mount process passes the remote address */
2089 if (data->host_addrlen != sizeof(server->addr)) {
2090 error = -EINVAL;
2091 goto out_free;
2092 }
2093 if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
2094 error = -EFAULT;
2095 goto out_free;
2096 }
2097 if (server->addr.sin_family != AF_INET ||
2098 server->addr.sin_addr.s_addr == INADDR_ANY) {
2099 dprintk("%s: mount program didn't pass remote IP address!\n",
2100 __FUNCTION__);
2101 error = -EINVAL;
2102 goto out_free;
2103 }
2104
2105 /* Fire up rpciod if not yet running */
2106 error = rpciod_up();
2107 if (error < 0) {
2108 dprintk("%s: couldn't start rpciod! Error = %d\n",
2109 __FUNCTION__, error);
2110 goto out_free;
2111 }
2112
2113 s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
2114 if (IS_ERR(s)) {
2115 error = PTR_ERR(s);
2116 goto out_free;
2117 }
2118
2119 if (s->s_root) {
2120 kfree(server->mnt_path);
2121 kfree(server->hostname);
2122 kfree(server);
2123 return simple_set_mnt(mnt, s);
2124 }
2125
2126 s->s_flags = flags;
2127
2128 error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
2129 if (error) {
2130 up_write(&s->s_umount);
2131 deactivate_super(s);
2132 return error;
2133 }
2134 s->s_flags |= MS_ACTIVE;
2135 return simple_set_mnt(mnt, s);
2136out_err:
2137 error = PTR_ERR(p);
2138out_free:
2139 kfree(server->mnt_path);
2140 kfree(server->hostname);
2141 kfree(server);
2142 return error;
2143}
2144
2145static void nfs4_kill_super(struct super_block *sb)
2146{
2147 struct nfs_server *server = NFS_SB(sb);
2148
2149 nfs_return_all_delegations(sb);
2150 kill_anon_super(sb);
2151
2152 nfs4_renewd_prepare_shutdown(server);
2153
2154 if (server->client != NULL && !IS_ERR(server->client))
2155 rpc_shutdown_client(server->client);
2156
2157 destroy_nfsv4_state(server);
2158
2159 rpciod_down();
2160
2161 nfs_free_iostats(server->io_stats);
2162 kfree(server->hostname);
2163 kfree(server);
2164}
2165
2166static struct file_system_type nfs4_fs_type = {
2167 .owner = THIS_MODULE,
2168 .name = "nfs4",
2169 .get_sb = nfs4_get_sb,
2170 .kill_sb = nfs4_kill_super,
2171 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
2172};
2173
2174static const int nfs_set_port_min = 0;
2175static const int nfs_set_port_max = 65535;
2176static int param_set_port(const char *val, struct kernel_param *kp)
2177{
2178 char *endp;
2179 int num = simple_strtol(val, &endp, 0);
2180 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
2181 return -EINVAL;
2182 *((int *)kp->arg) = num;
2183 return 0;
2184}
2185
2186module_param_call(callback_tcpport, param_set_port, param_get_int,
2187 &nfs_callback_set_tcpport, 0644);
2188
2189static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
2190{
2191 char *endp;
2192 int num = simple_strtol(val, &endp, 0);
2193 int jif = num * HZ;
2194 if (endp == val || *endp || num < 0 || jif < num)
2195 return -EINVAL;
2196 *((int *)kp->arg) = jif;
2197 return 0;
2198}
2199
2200module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
2201 &nfs_idmap_cache_timeout, 0644);
2202
2203#define nfs4_init_once(nfsi) \
2204 do { \
2205 INIT_LIST_HEAD(&(nfsi)->open_states); \
2206 nfsi->delegation = NULL; \
2207 nfsi->delegation_state = 0; \
2208 init_rwsem(&nfsi->rwsem); \
2209 } while(0)
2210
2211static inline int register_nfs4fs(void)
2212{
2213 int ret;
2214
2215 ret = nfs_register_sysctl();
2216 if (ret != 0)
2217 return ret;
2218 ret = register_filesystem(&nfs4_fs_type);
2219 if (ret != 0)
2220 nfs_unregister_sysctl();
2221 return ret;
2222}
2223
2224static inline void unregister_nfs4fs(void)
2225{
2226 unregister_filesystem(&nfs4_fs_type);
2227 nfs_unregister_sysctl();
2228}
2229#else
2230#define nfs4_init_once(nfsi) \
2231 do { } while (0)
2232#define register_nfs4fs() (0)
2233#define unregister_nfs4fs()
2234#endif 1066#endif
2235 1067
2236extern int nfs_init_nfspagecache(void); 1068struct inode *nfs_alloc_inode(struct super_block *sb)
2237extern void nfs_destroy_nfspagecache(void);
2238extern int nfs_init_readpagecache(void);
2239extern void nfs_destroy_readpagecache(void);
2240extern int nfs_init_writepagecache(void);
2241extern void nfs_destroy_writepagecache(void);
2242#ifdef CONFIG_NFS_DIRECTIO
2243extern int nfs_init_directcache(void);
2244extern void nfs_destroy_directcache(void);
2245#endif
2246
2247static kmem_cache_t * nfs_inode_cachep;
2248
2249static struct inode *nfs_alloc_inode(struct super_block *sb)
2250{ 1069{
2251 struct nfs_inode *nfsi; 1070 struct nfs_inode *nfsi;
2252 nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL); 1071 nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
@@ -2265,11 +1084,21 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
2265 return &nfsi->vfs_inode; 1084 return &nfsi->vfs_inode;
2266} 1085}
2267 1086
2268static void nfs_destroy_inode(struct inode *inode) 1087void nfs_destroy_inode(struct inode *inode)
2269{ 1088{
2270 kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); 1089 kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
2271} 1090}
2272 1091
1092static inline void nfs4_init_once(struct nfs_inode *nfsi)
1093{
1094#ifdef CONFIG_NFS_V4
1095 INIT_LIST_HEAD(&nfsi->open_states);
1096 nfsi->delegation = NULL;
1097 nfsi->delegation_state = 0;
1098 init_rwsem(&nfsi->rwsem);
1099#endif
1100}
1101
2273static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) 1102static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
2274{ 1103{
2275 struct nfs_inode *nfsi = (struct nfs_inode *) foo; 1104 struct nfs_inode *nfsi = (struct nfs_inode *) foo;
@@ -2290,7 +1119,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
2290 } 1119 }
2291} 1120}
2292 1121
2293static int nfs_init_inodecache(void) 1122static int __init nfs_init_inodecache(void)
2294{ 1123{
2295 nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", 1124 nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
2296 sizeof(struct nfs_inode), 1125 sizeof(struct nfs_inode),
@@ -2332,29 +1161,22 @@ static int __init init_nfs_fs(void)
2332 if (err) 1161 if (err)
2333 goto out1; 1162 goto out1;
2334 1163
2335#ifdef CONFIG_NFS_DIRECTIO
2336 err = nfs_init_directcache(); 1164 err = nfs_init_directcache();
2337 if (err) 1165 if (err)
2338 goto out0; 1166 goto out0;
2339#endif
2340 1167
2341#ifdef CONFIG_PROC_FS 1168#ifdef CONFIG_PROC_FS
2342 rpc_proc_register(&nfs_rpcstat); 1169 rpc_proc_register(&nfs_rpcstat);
2343#endif 1170#endif
2344 err = register_filesystem(&nfs_fs_type); 1171 if ((err = register_nfs_fs()) != 0)
2345 if (err)
2346 goto out;
2347 if ((err = register_nfs4fs()) != 0)
2348 goto out; 1172 goto out;
2349 return 0; 1173 return 0;
2350out: 1174out:
2351#ifdef CONFIG_PROC_FS 1175#ifdef CONFIG_PROC_FS
2352 rpc_proc_unregister("nfs"); 1176 rpc_proc_unregister("nfs");
2353#endif 1177#endif
2354#ifdef CONFIG_NFS_DIRECTIO
2355 nfs_destroy_directcache(); 1178 nfs_destroy_directcache();
2356out0: 1179out0:
2357#endif
2358 nfs_destroy_writepagecache(); 1180 nfs_destroy_writepagecache();
2359out1: 1181out1:
2360 nfs_destroy_readpagecache(); 1182 nfs_destroy_readpagecache();
@@ -2368,9 +1190,7 @@ out4:
2368 1190
2369static void __exit exit_nfs_fs(void) 1191static void __exit exit_nfs_fs(void)
2370{ 1192{
2371#ifdef CONFIG_NFS_DIRECTIO
2372 nfs_destroy_directcache(); 1193 nfs_destroy_directcache();
2373#endif
2374 nfs_destroy_writepagecache(); 1194 nfs_destroy_writepagecache();
2375 nfs_destroy_readpagecache(); 1195 nfs_destroy_readpagecache();
2376 nfs_destroy_inodecache(); 1196 nfs_destroy_inodecache();
@@ -2378,8 +1198,7 @@ static void __exit exit_nfs_fs(void)
2378#ifdef CONFIG_PROC_FS 1198#ifdef CONFIG_PROC_FS
2379 rpc_proc_unregister("nfs"); 1199 rpc_proc_unregister("nfs");
2380#endif 1200#endif
2381 unregister_filesystem(&nfs_fs_type); 1201 unregister_nfs_fs();
2382 unregister_nfs4fs();
2383} 1202}
2384 1203
2385/* Not quite true; I just maintain it */ 1204/* Not quite true; I just maintain it */
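The filesystem registration removed from init_nfs_fs()/exit_nfs_fs() above is replaced by register_nfs_fs()/unregister_nfs_fs(), which this patch only declares (see internal.h below); their bodies live in the new fs/nfs/super.c, which is not shown here. Assuming they simply fold in the old register_nfs4fs() logic, the registration helper plausibly looks like this sketch:

int __init register_nfs_fs(void)
{
	int ret;

	ret = register_filesystem(&nfs_fs_type);
	if (ret < 0)
		goto error_0;
#ifdef CONFIG_NFS_V4
	ret = nfs_register_sysctl();
	if (ret < 0)
		goto error_1;
	ret = register_filesystem(&nfs4_fs_type);
	if (ret < 0)
		goto error_2;
#endif
	return 0;

#ifdef CONFIG_NFS_V4
error_2:
	nfs_unregister_sysctl();
error_1:
	unregister_filesystem(&nfs_fs_type);
#endif
error_0:
	return ret;
}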
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 000000000000..4fe51c1292bb
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,186 @@
1/*
2 * NFS internal definitions
3 */
4
5#include <linux/mount.h>
6
7struct nfs_clone_mount {
8 const struct super_block *sb;
9 const struct dentry *dentry;
10 struct nfs_fh *fh;
11 struct nfs_fattr *fattr;
12 char *hostname;
13 char *mnt_path;
14 struct sockaddr_in *addr;
15 rpc_authflavor_t authflavor;
16};
17
18/* namespace-nfs4.c */
19#ifdef CONFIG_NFS_V4
20extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
21#else
22static inline
23struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
24{
25 return ERR_PTR(-ENOENT);
26}
27#endif
28
29/* callback_xdr.c */
30extern struct svc_version nfs4_callback_version1;
31
32/* pagelist.c */
33extern int __init nfs_init_nfspagecache(void);
34extern void nfs_destroy_nfspagecache(void);
35extern int __init nfs_init_readpagecache(void);
36extern void nfs_destroy_readpagecache(void);
37extern int __init nfs_init_writepagecache(void);
38extern void nfs_destroy_writepagecache(void);
39
40#ifdef CONFIG_NFS_DIRECTIO
41extern int __init nfs_init_directcache(void);
42extern void nfs_destroy_directcache(void);
43#else
44#define nfs_init_directcache() (0)
45#define nfs_destroy_directcache() do {} while(0)
46#endif
47
48/* nfs2xdr.c */
49extern struct rpc_procinfo nfs_procedures[];
50extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
51
52/* nfs3xdr.c */
53extern struct rpc_procinfo nfs3_procedures[];
54extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
55
56/* nfs4xdr.c */
57extern int nfs_stat_to_errno(int);
58extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
59
60/* nfs4proc.c */
61#ifdef CONFIG_NFS_V4
62extern struct rpc_procinfo nfs4_procedures[];
63
64extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
65 struct nfs4_fs_locations *fs_locations,
66 struct page *page);
67#endif
68
69/* inode.c */
70extern struct inode *nfs_alloc_inode(struct super_block *sb);
71extern void nfs_destroy_inode(struct inode *);
72extern int nfs_write_inode(struct inode *,int);
73extern void nfs_clear_inode(struct inode *);
74#ifdef CONFIG_NFS_V4
75extern void nfs4_clear_inode(struct inode *);
76#endif
77
78/* super.c */
79extern struct file_system_type nfs_referral_nfs4_fs_type;
80extern struct file_system_type clone_nfs_fs_type;
81#ifdef CONFIG_NFS_V4
82extern struct file_system_type clone_nfs4_fs_type;
83#endif
84#ifdef CONFIG_PROC_FS
85extern struct rpc_stat nfs_rpcstat;
86#endif
87extern int __init register_nfs_fs(void);
88extern void __exit unregister_nfs_fs(void);
89
90/* namespace.c */
91extern char *nfs_path(const char *base, const struct dentry *dentry,
92 char *buffer, ssize_t buflen);
93
94/*
95 * Determine the mount path as a string
96 */
97static inline char *
98nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
99{
100#ifdef CONFIG_NFS_V4
101 return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
102#else
103 return NULL;
104#endif
105}
106
107/*
108 * Determine the device name as a string
109 */
110static inline char *nfs_devname(const struct vfsmount *mnt_parent,
111 const struct dentry *dentry,
112 char *buffer, ssize_t buflen)
113{
114 return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
115}
116
117/*
118 * Determine the actual block size (and log2 thereof)
119 */
120static inline
121unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
122{
123 /* make sure blocksize is a power of two */
124 if ((bsize & (bsize - 1)) || nrbitsp) {
125 unsigned char nrbits;
126
127 for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
128 ;
129 bsize = 1 << nrbits;
130 if (nrbitsp)
131 *nrbitsp = nrbits;
132 }
133
134 return bsize;
135}
136
137/*
138 * Calculate the number of 512-byte blocks used.
139 */
140static inline unsigned long nfs_calc_block_size(u64 tsize)
141{
142 loff_t used = (tsize + 511) >> 9;
143 return (used > ULONG_MAX) ? ULONG_MAX : used;
144}
145
146/*
147 * Compute the NFS server block size to use for I/O requests
148 */
149static inline
150unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
151{
152 if (bsize < NFS_MIN_FILE_IO_SIZE)
153 bsize = NFS_DEF_FILE_IO_SIZE;
154 else if (bsize >= NFS_MAX_FILE_IO_SIZE)
155 bsize = NFS_MAX_FILE_IO_SIZE;
156
157 return nfs_block_bits(bsize, nrbitsp);
158}
159
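nfs_block_bits() above silently rounds any size that is not a power of two down to its highest set bit, and nfs_block_size() clamps the request into the supported range first. A small userspace harness (an illustrative replica, not kernel code) makes the rounding concrete:

#include <assert.h>
#include <stdio.h>

/* userspace replica of nfs_block_bits() above */
static unsigned long block_bits(unsigned long bsize, unsigned char *nrbitsp)
{
	if ((bsize & (bsize - 1)) || nrbitsp) {
		unsigned char nrbits;

		for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--)
			;
		bsize = 1UL << nrbits;
		if (nrbitsp)
			*nrbitsp = nrbits;
	}
	return bsize;
}

int main(void)
{
	unsigned char bits;

	assert(block_bits(4096, &bits) == 4096 && bits == 12);
	/* 6000 is not a power of two: rounded down to 4096 */
	assert(block_bits(6000, &bits) == 4096 && bits == 12);
	printf("ok\n");
	return 0;
}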
160/*
161 * Set the maximum file size for a superblock, clamped to MAX_LFS_FILESIZE
162 */
163static inline
164void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
165{
166 sb->s_maxbytes = (loff_t)maxfilesize;
167 if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
168 sb->s_maxbytes = MAX_LFS_FILESIZE;
169}
170
171/*
172 * Check if the string represents a "valid" IPv4 address
173 */
174static inline int valid_ipaddr4(const char *buf)
175{
176 int rc, count, in[4];
177
178 rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
179 if (rc != 4)
180 return -EINVAL;
181 for (count = 0; count < 4; count++) {
182 if (in[count] > 255)
183 return -EINVAL;
184 }
185 return 0;
186}
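valid_ipaddr4() above is a coarse shape check, not a full dotted-quad parser: it verifies four %d fields with an upper bound of 255, but does not reject negative octets or trailing text. A quick userspace harness (illustration only) shows the behaviour:

#include <errno.h>
#include <stdio.h>

/* userspace replica of valid_ipaddr4() above */
static int valid_ipaddr4(const char *buf)
{
	int rc, count, in[4];

	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
	if (rc != 4)
		return -EINVAL;
	for (count = 0; count < 4; count++) {
		if (in[count] > 255)
			return -EINVAL;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", valid_ipaddr4("10.0.0.1"));	/* 0: accepted */
	printf("%d\n", valid_ipaddr4("256.0.0.1"));	/* -EINVAL: octet > 255 */
	printf("%d\n", valid_ipaddr4("not-an-ip"));	/* -EINVAL: wrong shape */
	return 0;
}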
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 000000000000..19b98ca468eb
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,229 @@
1/*
2 * linux/fs/nfs/namespace.c
3 *
4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 * NFS namespace
7 */
8
9#include <linux/config.h>
10
11#include <linux/dcache.h>
12#include <linux/mount.h>
13#include <linux/namei.h>
14#include <linux/nfs_fs.h>
15#include <linux/string.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/vfs.h>
18#include "internal.h"
19
20#define NFSDBG_FACILITY NFSDBG_VFS
21
22static void nfs_expire_automounts(void *list);
23
24LIST_HEAD(nfs_automount_list);
25static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
26int nfs_mountpoint_expiry_timeout = 500 * HZ;
27
28/*
29 * nfs_path - reconstruct the path given an arbitrary dentry
30 * @base - arbitrary string to prepend to the path
31 * @dentry - pointer to dentry
32 * @buffer - result buffer
33 * @buflen - length of buffer
34 *
35 * Helper function for constructing the path from the
36 * root dentry to an arbitrary hashed dentry.
37 *
38 * This is mainly for use in figuring out the path on the
39 * server side when automounting on top of an existing partition.
40 */
41char *nfs_path(const char *base, const struct dentry *dentry,
42 char *buffer, ssize_t buflen)
43{
44 char *end = buffer+buflen;
45 int namelen;
46
47 *--end = '\0';
48 buflen--;
49 spin_lock(&dcache_lock);
50 while (!IS_ROOT(dentry)) {
51 namelen = dentry->d_name.len;
52 buflen -= namelen + 1;
53 if (buflen < 0)
54 goto Elong;
55 end -= namelen;
56 memcpy(end, dentry->d_name.name, namelen);
57 *--end = '/';
58 dentry = dentry->d_parent;
59 }
60 spin_unlock(&dcache_lock);
61 namelen = strlen(base);
62 /* Strip off excess slashes in base string */
63 while (namelen > 0 && base[namelen - 1] == '/')
64 namelen--;
65 buflen -= namelen;
66 if (buflen < 0)
67 goto Elong;
68 end -= namelen;
69 memcpy(end, base, namelen);
70 return end;
71Elong:
72 return ERR_PTR(-ENAMETOOLONG);
73}
74
75/*
76 * nfs_follow_mountpoint - handle crossing a mountpoint on the server
77 * @dentry - dentry of mountpoint
78 * @nd - nameidata info
79 *
80 * When we encounter a mountpoint on the server, we want to set up
81 * a mountpoint on the client too, to prevent inode numbers from
82 * colliding, and to allow "df" to work properly.
83 * On NFSv4, we also want to allow for the fact that different
84 * filesystems may be migrated to different servers in a failover
85 * situation, and that different filesystems may want to use
86 * different security flavours.
87 */
88static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
89{
90 struct vfsmount *mnt;
91 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
92 struct dentry *parent;
93 struct nfs_fh fh;
94 struct nfs_fattr fattr;
95 int err;
96
97 BUG_ON(IS_ROOT(dentry));
98 dprintk("%s: enter\n", __FUNCTION__);
99 dput(nd->dentry);
100 nd->dentry = dget(dentry);
101 if (d_mountpoint(nd->dentry))
102 goto out_follow;
103 /* Look it up again */
104 parent = dget_parent(nd->dentry);
105 err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
106 dput(parent);
107 if (err != 0)
108 goto out_err;
109
110 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
111 mnt = nfs_do_refmount(nd->mnt, nd->dentry);
112 else
113 mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
114 err = PTR_ERR(mnt);
115 if (IS_ERR(mnt))
116 goto out_err;
117
118 mntget(mnt);
119 err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list);
120 if (err < 0) {
121 mntput(mnt);
122 if (err == -EBUSY)
123 goto out_follow;
124 goto out_err;
125 }
126 mntput(nd->mnt);
127 dput(nd->dentry);
128 nd->mnt = mnt;
129 nd->dentry = dget(mnt->mnt_root);
130 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
131out:
132 dprintk("%s: done, returned %d\n", __FUNCTION__, err);
133 return ERR_PTR(err);
134out_err:
135 path_release(nd);
136 goto out;
137out_follow:
138 while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
139 ;
140 err = 0;
141 goto out;
142}
143
144struct inode_operations nfs_mountpoint_inode_operations = {
145 .follow_link = nfs_follow_mountpoint,
146 .getattr = nfs_getattr,
147};
148
149struct inode_operations nfs_referral_inode_operations = {
150 .follow_link = nfs_follow_mountpoint,
151};
152
153static void nfs_expire_automounts(void *data)
154{
155 struct list_head *list = (struct list_head *)data;
156
157 mark_mounts_for_expiry(list);
158 if (!list_empty(list))
159 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
160}
161
162void nfs_release_automount_timer(void)
163{
164 if (list_empty(&nfs_automount_list)) {
165 cancel_delayed_work(&nfs_automount_task);
166 flush_scheduled_work();
167 }
168}
169
170/*
171 * Clone a mountpoint of the appropriate type
172 */
173static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname,
174 struct nfs_clone_mount *mountdata)
175{
176#ifdef CONFIG_NFS_V4
177 struct vfsmount *mnt = NULL;
178 switch (server->rpc_ops->version) {
179 case 2:
180 case 3:
181 mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
182 break;
183 case 4:
184 mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
185 }
186 return mnt;
187#else
188 return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
189#endif
190}
191
192/**
193 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
194 * @mnt_parent - mountpoint of parent directory
 195 * @dentry - dentry of the mountpoint being crossed
196 * @fh - filehandle for new root dentry
197 * @fattr - attributes for new root inode
198 *
199 */
200struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
201 const struct dentry *dentry, struct nfs_fh *fh,
202 struct nfs_fattr *fattr)
203{
204 struct nfs_clone_mount mountdata = {
205 .sb = mnt_parent->mnt_sb,
206 .dentry = dentry,
207 .fh = fh,
208 .fattr = fattr,
209 };
210 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
211 char *page = (char *) __get_free_page(GFP_USER);
212 char *devname;
213
214 dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
215 dentry->d_parent->d_name.name,
216 dentry->d_name.name);
217 if (page == NULL)
218 goto out;
219 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
220 mnt = (struct vfsmount *)devname;
221 if (IS_ERR(devname))
222 goto free_page;
223 mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
224free_page:
225 free_page((unsigned long)page);
226out:
227 dprintk("%s: done\n", __FUNCTION__);
228 return mnt;
229}
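One subtlety of nfs_path() above is that the string is assembled backwards from the tail of the buffer, so the returned pointer aliases the middle of the caller's page; it is the page, not the returned pointer, that must eventually be freed. nfs_do_submount() gets this right; a stripped-down sketch of the same idiom (the helper itself is hypothetical):

static int example_print_devname(const struct vfsmount *mnt_parent,
				 const struct dentry *dentry)
{
	char *page = (char *) __get_free_page(GFP_USER);
	char *devname;
	int err = 0;

	if (page == NULL)
		return -ENOMEM;
	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
	if (IS_ERR(devname))
		err = PTR_ERR(devname);	/* typically -ENAMETOOLONG */
	else
		printk(KERN_DEBUG "NFS: devname is %s\n", devname);
	/* free the original page; devname points into its tail */
	free_page((unsigned long) page);
	return err;
}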
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f0015fa876e1..67391eef6b93 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -23,12 +23,11 @@
23#include <linux/nfs.h> 23#include <linux/nfs.h>
24#include <linux/nfs2.h> 24#include <linux/nfs2.h>
25#include <linux/nfs_fs.h> 25#include <linux/nfs_fs.h>
26#include "internal.h"
26 27
27#define NFSDBG_FACILITY NFSDBG_XDR 28#define NFSDBG_FACILITY NFSDBG_XDR
28/* #define NFS_PARANOIA 1 */ 29/* #define NFS_PARANOIA 1 */
29 30
30extern int nfs_stat_to_errno(int stat);
31
32/* Mapping from NFS error code to "errno" error code. */ 31/* Mapping from NFS error code to "errno" error code. */
33#define errno_NFSERR_IO EIO 32#define errno_NFSERR_IO EIO
34 33
@@ -131,7 +130,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
131 fattr->du.nfs2.blocksize = ntohl(*p++); 130 fattr->du.nfs2.blocksize = ntohl(*p++);
132 rdev = ntohl(*p++); 131 rdev = ntohl(*p++);
133 fattr->du.nfs2.blocks = ntohl(*p++); 132 fattr->du.nfs2.blocks = ntohl(*p++);
134 fattr->fsid_u.nfs3 = ntohl(*p++); 133 fattr->fsid.major = ntohl(*p++);
134 fattr->fsid.minor = 0;
135 fattr->fileid = ntohl(*p++); 135 fattr->fileid = ntohl(*p++);
136 p = xdr_decode_time(p, &fattr->atime); 136 p = xdr_decode_time(p, &fattr->atime);
137 p = xdr_decode_time(p, &fattr->mtime); 137 p = xdr_decode_time(p, &fattr->mtime);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 33287879bd23..7322da4d2055 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -172,8 +172,10 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
172 inode->i_ino, acl, dfacl); 172 inode->i_ino, acl, dfacl);
173 spin_lock(&inode->i_lock); 173 spin_lock(&inode->i_lock);
174 __nfs3_forget_cached_acls(NFS_I(inode)); 174 __nfs3_forget_cached_acls(NFS_I(inode));
175 nfsi->acl_access = posix_acl_dup(acl); 175 if (!IS_ERR(acl))
176 nfsi->acl_default = posix_acl_dup(dfacl); 176 nfsi->acl_access = posix_acl_dup(acl);
177 if (!IS_ERR(dfacl))
178 nfsi->acl_default = posix_acl_dup(dfacl);
177 spin_unlock(&inode->i_lock); 179 spin_unlock(&inode->i_lock);
178} 180}
179 181
@@ -254,7 +256,9 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
254 res.acl_access = NULL; 256 res.acl_access = NULL;
255 } 257 }
256 } 258 }
257 nfs3_cache_acls(inode, res.acl_access, res.acl_default); 259 nfs3_cache_acls(inode,
260 (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL),
261 (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
258 262
259 switch(type) { 263 switch(type) {
260 case ACL_TYPE_ACCESS: 264 case ACL_TYPE_ACCESS:
@@ -329,6 +333,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
329 switch (status) { 333 switch (status) {
330 case 0: 334 case 0:
331 status = nfs_refresh_inode(inode, &fattr); 335 status = nfs_refresh_inode(inode, &fattr);
336 nfs3_cache_acls(inode, acl, dfacl);
332 break; 337 break;
333 case -EPFNOSUPPORT: 338 case -EPFNOSUPPORT:
334 case -EPROTONOSUPPORT: 339 case -EPROTONOSUPPORT:
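The nfs3acl.c fix above uses ERR_PTR(-EINVAL) as an in-band marker meaning "this ACL was not returned by the server, leave the cached slot untouched", and nfs3_cache_acls() now filters on IS_ERR() before caching. The sentinel pattern in isolation, using the names from the hunks above:

/* ERR_PTR(-EINVAL) acts as an "unknown, do not cache" marker */
acl = (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL);
if (!IS_ERR(acl))
	nfsi->acl_access = posix_acl_dup(acl);	/* takes a reference */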
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cf186f0d2b3b..7143b1f82cea 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -20,11 +20,10 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "iostat.h" 22#include "iostat.h"
23#include "internal.h"
23 24
24#define NFSDBG_FACILITY NFSDBG_PROC 25#define NFSDBG_FACILITY NFSDBG_PROC
25 26
26extern struct rpc_procinfo nfs3_procedures[];
27
28/* A wrapper to handle the EJUKEBOX error message */ 27/* A wrapper to handle the EJUKEBOX error message */
29static int 28static int
30nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
@@ -809,8 +808,6 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
809 return status; 808 return status;
810} 809}
811 810
812extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
813
814static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) 811static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
815{ 812{
816 if (nfs3_async_handle_jukebox(task, data->inode)) 813 if (nfs3_async_handle_jukebox(task, data->inode))
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ec233619687e..0250269e9753 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -22,14 +22,13 @@
22#include <linux/nfs3.h> 22#include <linux/nfs3.h>
23#include <linux/nfs_fs.h> 23#include <linux/nfs_fs.h>
24#include <linux/nfsacl.h> 24#include <linux/nfsacl.h>
25#include "internal.h"
25 26
26#define NFSDBG_FACILITY NFSDBG_XDR 27#define NFSDBG_FACILITY NFSDBG_XDR
27 28
28/* Mapping from NFS error code to "errno" error code. */ 29/* Mapping from NFS error code to "errno" error code. */
29#define errno_NFSERR_IO EIO 30#define errno_NFSERR_IO EIO
30 31
31extern int nfs_stat_to_errno(int);
32
33/* 32/*
34 * Declare the space requirements for NFS arguments and replies as 33 * Declare the space requirements for NFS arguments and replies as
35 * number of 32bit-words 34 * number of 32bit-words
@@ -166,7 +165,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
166 if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) 165 if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
167 fattr->rdev = 0; 166 fattr->rdev = 0;
168 167
169 p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3); 168 p = xdr_decode_hyper(p, &fattr->fsid.major);
169 fattr->fsid.minor = 0;
170 p = xdr_decode_hyper(p, &fattr->fileid); 170 p = xdr_decode_hyper(p, &fattr->fileid);
171 p = xdr_decode_time3(p, &fattr->atime); 171 p = xdr_decode_time3(p, &fattr->atime);
172 p = xdr_decode_time3(p, &fattr->mtime); 172 p = xdr_decode_time3(p, &fattr->mtime);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0f5e4e7cddec..9a102860df37 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,9 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); 217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
220extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
221extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
222 struct nfs4_fs_locations *fs_locations, struct page *page);
220 223
221extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; 224extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
222extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; 225extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
@@ -225,6 +228,7 @@ extern const u32 nfs4_fattr_bitmap[2];
225extern const u32 nfs4_statfs_bitmap[2]; 228extern const u32 nfs4_statfs_bitmap[2];
226extern const u32 nfs4_pathconf_bitmap[2]; 229extern const u32 nfs4_pathconf_bitmap[2];
227extern const u32 nfs4_fsinfo_bitmap[2]; 230extern const u32 nfs4_fsinfo_bitmap[2];
231extern const u32 nfs4_fs_locations_bitmap[2];
228 232
229/* nfs4renewd.c */ 233/* nfs4renewd.c */
230extern void nfs4_schedule_state_renewal(struct nfs4_client *); 234extern void nfs4_schedule_state_renewal(struct nfs4_client *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
new file mode 100644
index 000000000000..ea38d27b74e6
--- /dev/null
+++ b/fs/nfs/nfs4namespace.c
@@ -0,0 +1,201 @@
1/*
2 * linux/fs/nfs/nfs4namespace.c
3 *
4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 * NFSv4 namespace
7 */
8
9#include <linux/config.h>
10
11#include <linux/dcache.h>
12#include <linux/mount.h>
13#include <linux/namei.h>
14#include <linux/nfs_fs.h>
15#include <linux/string.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/vfs.h>
18#include <linux/inet.h>
19#include "internal.h"
20
21#define NFSDBG_FACILITY NFSDBG_VFS
22
23/*
 24 * Convert the NFSv4 pathname components into a '/'-separated path string
25 */
26static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
27 char *buffer, ssize_t buflen)
28{
29 char *end = buffer + buflen;
30 int n;
31
32 *--end = '\0';
33 buflen--;
34
35 n = pathname->ncomponents;
36 while (--n >= 0) {
37 struct nfs4_string *component = &pathname->components[n];
38 buflen -= component->len + 1;
39 if (buflen < 0)
40 goto Elong;
41 end -= component->len;
42 memcpy(end, component->data, component->len);
43 *--end = '/';
44 }
45 return end;
46Elong:
47 return ERR_PTR(-ENAMETOOLONG);
48}
49
50
51/**
 52 * nfs_follow_referral - set up a mountpoint upon hitting an NFS4ERR_MOVED referral
 53 * @mnt_parent - mountpoint of parent directory
 54 * @dentry - dentry of the referral
 55 * @locations - fs_locations data returned by the server; each location
 56 *              carries the rootpath on the new server and the list of
 57 *              servers (hostname and address) holding the filesystem
 58 *
59 *
60 */
61static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
62 const struct dentry *dentry,
63 struct nfs4_fs_locations *locations)
64{
65 struct vfsmount *mnt = ERR_PTR(-ENOENT);
66 struct nfs_clone_mount mountdata = {
67 .sb = mnt_parent->mnt_sb,
68 .dentry = dentry,
69 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
70 };
71 char *page, *page2;
72 char *path, *fs_path;
73 char *devname;
74 int loc, s;
75
76 if (locations == NULL || locations->nlocations <= 0)
77 goto out;
78
79 dprintk("%s: referral at %s/%s\n", __FUNCTION__,
80 dentry->d_parent->d_name.name, dentry->d_name.name);
81
82 /* Ensure fs path is a prefix of current dentry path */
83 page = (char *) __get_free_page(GFP_USER);
84 if (page == NULL)
85 goto out;
86 page2 = (char *) __get_free_page(GFP_USER);
87 if (page2 == NULL)
88 goto out;
89
90 path = nfs4_path(dentry, page, PAGE_SIZE);
91 if (IS_ERR(path))
92 goto out_free;
93
94 fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
95 if (IS_ERR(fs_path))
96 goto out_free;
97
98 if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
99 dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
100 goto out_free;
101 }
102
103 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
104 if (IS_ERR(devname)) {
105 mnt = (struct vfsmount *)devname;
106 goto out_free;
107 }
108
109 loc = 0;
110 while (loc < locations->nlocations && IS_ERR(mnt)) {
111 struct nfs4_fs_location *location = &locations->locations[loc];
112 char *mnt_path;
113
114 if (location == NULL || location->nservers <= 0 ||
115 location->rootpath.ncomponents == 0) {
116 loc++;
117 continue;
118 }
119
120 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
121 if (IS_ERR(mnt_path)) {
122 loc++;
123 continue;
124 }
125 mountdata.mnt_path = mnt_path;
126
127 s = 0;
128 while (s < location->nservers) {
129 struct sockaddr_in addr = {};
130
131 if (location->servers[s].len <= 0 ||
132 valid_ipaddr4(location->servers[s].data) < 0) {
133 s++;
134 continue;
135 }
136
137 mountdata.hostname = location->servers[s].data;
138 addr.sin_addr.s_addr = in_aton(mountdata.hostname);
139 addr.sin_family = AF_INET;
140 addr.sin_port = htons(NFS_PORT);
141 mountdata.addr = &addr;
142
143 mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
144 if (!IS_ERR(mnt)) {
145 break;
146 }
147 s++;
148 }
149 loc++;
150 }
151
152out_free:
153 free_page((unsigned long)page);
154 free_page((unsigned long)page2);
155out:
156 dprintk("%s: done\n", __FUNCTION__);
157 return mnt;
158}
159
160/*
 161 * nfs_do_refmount - handle crossing a referral on the server
 162 * @mnt_parent - mountpoint of the parent directory
 163 * @dentry - dentry of the referral
164 *
165 */
166struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
167{
168 struct vfsmount *mnt = ERR_PTR(-ENOENT);
169 struct dentry *parent;
170 struct nfs4_fs_locations *fs_locations = NULL;
171 struct page *page;
172 int err;
173
174 /* BUG_ON(IS_ROOT(dentry)); */
175 dprintk("%s: enter\n", __FUNCTION__);
176
177 page = alloc_page(GFP_KERNEL);
178 if (page == NULL)
179 goto out;
180
181 fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
182 if (fs_locations == NULL)
183 goto out_free;
184
185 /* Get locations */
186 parent = dget_parent(dentry);
187 dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
188 err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
189 dput(parent);
190 if (err != 0 || fs_locations->nlocations <= 0 ||
191 fs_locations->fs_path.ncomponents <= 0)
192 goto out_free;
193
194 mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
195out_free:
196 __free_page(page);
197 kfree(fs_locations);
198out:
199 dprintk("%s: done\n", __FUNCTION__);
200 return mnt;
201}
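nfs4_pathname_string() above walks the component array backwards, prefixing '/' before each component, so {"export", "home"} yields "/export/home". A userspace replica demonstrating this (types simplified; the kernel version takes struct nfs4_pathname and returns ERR_PTR(-ENAMETOOLONG) on overflow):

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

struct component { const char *data; int len; };

static char *pathname_string(const struct component *comp, int ncomponents,
			     char *buffer, ssize_t buflen)
{
	char *end = buffer + buflen;
	int n = ncomponents;

	*--end = '\0';
	buflen--;
	while (--n >= 0) {
		buflen -= comp[n].len + 1;
		if (buflen < 0)
			return NULL;	/* kernel code: ERR_PTR(-ENAMETOOLONG) */
		end -= comp[n].len;
		memcpy(end, comp[n].data, comp[n].len);
		*--end = '/';
	}
	return end;
}

int main(void)
{
	const struct component path[] = { { "export", 6 }, { "home", 4 } };
	char buf[64];

	assert(strcmp(pathname_string(path, 2, buf, sizeof(buf)), "/export/home") == 0);
	printf("ok\n");
	return 0;
}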
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d86c0db7b1e8..b4916b092194 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -65,8 +65,6 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *)
65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); 65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp); 67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp);
68extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
69extern struct rpc_procinfo nfs4_procedures[];
70 68
71/* Prevent leaks of NFSv4 errors into userland */ 69/* Prevent leaks of NFSv4 errors into userland */
72int nfs4_map_errors(int err) 70int nfs4_map_errors(int err)
@@ -121,6 +119,25 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
121 0 119 0
122}; 120};
123 121
122const u32 nfs4_fs_locations_bitmap[2] = {
123 FATTR4_WORD0_TYPE
124 | FATTR4_WORD0_CHANGE
125 | FATTR4_WORD0_SIZE
126 | FATTR4_WORD0_FSID
127 | FATTR4_WORD0_FILEID
128 | FATTR4_WORD0_FS_LOCATIONS,
129 FATTR4_WORD1_MODE
130 | FATTR4_WORD1_NUMLINKS
131 | FATTR4_WORD1_OWNER
132 | FATTR4_WORD1_OWNER_GROUP
133 | FATTR4_WORD1_RAWDEV
134 | FATTR4_WORD1_SPACE_USED
135 | FATTR4_WORD1_TIME_ACCESS
136 | FATTR4_WORD1_TIME_METADATA
137 | FATTR4_WORD1_TIME_MODIFY
138 | FATTR4_WORD1_MOUNTED_ON_FILEID
139};
140
124static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, 141static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
125 struct nfs4_readdir_arg *readdir) 142 struct nfs4_readdir_arg *readdir)
126{ 143{
@@ -185,15 +202,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
185 spin_unlock(&clp->cl_lock); 202 spin_unlock(&clp->cl_lock);
186} 203}
187 204
188static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo) 205static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
189{ 206{
190 struct nfs_inode *nfsi = NFS_I(inode); 207 struct nfs_inode *nfsi = NFS_I(dir);
191 208
192 spin_lock(&inode->i_lock); 209 spin_lock(&dir->i_lock);
193 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 210 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
194 if (cinfo->before == nfsi->change_attr && cinfo->atomic) 211 if (cinfo->before == nfsi->change_attr && cinfo->atomic)
195 nfsi->change_attr = cinfo->after; 212 nfsi->change_attr = cinfo->after;
196 spin_unlock(&inode->i_lock); 213 spin_unlock(&dir->i_lock);
197} 214}
198 215
199struct nfs4_opendata { 216struct nfs4_opendata {
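update_changeattr() above consumes the NFSv4 change_info4 triple that directory-modifying operations return. For reference, the client-side structure is presumably along these lines (cf. RFC 3530; the definition itself is not part of this diff):

struct nfs4_change_info {
	u32	atomic;		/* before/after captured atomically with the op */
	u64	before;		/* directory change attribute before the operation */
	u64	after;		/* directory change attribute after the operation */
};

Only when 'atomic' is set and 'before' matches the cached change_attr may the client step its cache straight to 'after'; otherwise the NFS_INO_INVALID_* flags set above force a revalidation.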
@@ -1331,7 +1348,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1331 return status; 1348 return status;
1332} 1349}
1333 1350
1334static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1351int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1335{ 1352{
1336 struct nfs4_exception exception = { }; 1353 struct nfs4_exception exception = { };
1337 int err; 1354 int err;
@@ -1443,6 +1460,50 @@ out:
1443 return nfs4_map_errors(status); 1460 return nfs4_map_errors(status);
1444} 1461}
1445 1462
1463/*
1464 * Get locations and (maybe) other attributes of a referral.
1465 * Note that we'll actually follow the referral later when
1466 * we detect fsid mismatch in inode revalidation
1467 */
1468static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
1469{
1470 int status = -ENOMEM;
1471 struct page *page = NULL;
1472 struct nfs4_fs_locations *locations = NULL;
1473 struct dentry dentry = {};
1474
1475 page = alloc_page(GFP_KERNEL);
1476 if (page == NULL)
1477 goto out;
1478 locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
1479 if (locations == NULL)
1480 goto out;
1481
1482 dentry.d_name.name = name->name;
1483 dentry.d_name.len = name->len;
1484 status = nfs4_proc_fs_locations(dir, &dentry, locations, page);
1485 if (status != 0)
1486 goto out;
1487 /* Make sure server returned a different fsid for the referral */
1488 if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
1489 dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
1490 status = -EIO;
1491 goto out;
1492 }
1493
1494 memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
1495 fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
1496 if (!fattr->mode)
1497 fattr->mode = S_IFDIR;
1498 memset(fhandle, 0, sizeof(struct nfs_fh));
1499out:
1500 if (page)
1501 __free_page(page);
1502 if (locations)
1503 kfree(locations);
1504 return status;
1505}
1506
1446static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 1507static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
1447{ 1508{
1448 struct nfs4_getattr_arg args = { 1509 struct nfs4_getattr_arg args = {
@@ -1547,6 +1608,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
1547 1608
1548 dprintk("NFS call lookup %s\n", name->name); 1609 dprintk("NFS call lookup %s\n", name->name);
1549 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 1610 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
1611 if (status == -NFS4ERR_MOVED)
1612 status = nfs4_get_referral(dir, name, fattr, fhandle);
1550 dprintk("NFS reply lookup: %d\n", status); 1613 dprintk("NFS reply lookup: %d\n", status);
1551 return status; 1614 return status;
1552} 1615}
@@ -2008,7 +2071,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2008 if (!status) { 2071 if (!status) {
2009 update_changeattr(dir, &res.cinfo); 2072 update_changeattr(dir, &res.cinfo);
2010 nfs_post_op_update_inode(dir, res.dir_attr); 2073 nfs_post_op_update_inode(dir, res.dir_attr);
2011 nfs_refresh_inode(inode, res.fattr); 2074 nfs_post_op_update_inode(inode, res.fattr);
2012 } 2075 }
2013 2076
2014 return status; 2077 return status;
@@ -3570,6 +3633,36 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
3570 return len; 3633 return len;
3571} 3634}
3572 3635
3636int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
3637 struct nfs4_fs_locations *fs_locations, struct page *page)
3638{
3639 struct nfs_server *server = NFS_SERVER(dir);
3640 u32 bitmask[2] = {
3641 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
3642 [1] = FATTR4_WORD1_MOUNTED_ON_FILEID,
3643 };
3644 struct nfs4_fs_locations_arg args = {
3645 .dir_fh = NFS_FH(dir),
3646 .name = &dentry->d_name,
3647 .page = page,
3648 .bitmask = bitmask,
3649 };
3650 struct rpc_message msg = {
3651 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
3652 .rpc_argp = &args,
3653 .rpc_resp = fs_locations,
3654 };
3655 int status;
3656
3657 dprintk("%s: start\n", __FUNCTION__);
3658 fs_locations->fattr.valid = 0;
3659 fs_locations->server = server;
3660 fs_locations->nlocations = 0;
3661 status = rpc_call_sync(server->client, &msg, 0);
3662 dprintk("%s: returned status = %d\n", __FUNCTION__, status);
3663 return status;
3664}
3665
3573struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { 3666struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
3574 .recover_open = nfs4_open_reclaim, 3667 .recover_open = nfs4_open_reclaim,
3575 .recover_lock = nfs4_lock_reclaim, 3668 .recover_lock = nfs4_lock_reclaim,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c5d70efe720..1750d996f49f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int);
411#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ 411#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
412 decode_putfh_maxsz + \ 412 decode_putfh_maxsz + \
413 op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) 413 op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
414#define NFS4_enc_fs_locations_sz \
415 (compound_encode_hdr_maxsz + \
416 encode_putfh_maxsz + \
417 encode_getattr_maxsz)
418#define NFS4_dec_fs_locations_sz \
419 (compound_decode_hdr_maxsz + \
420 decode_putfh_maxsz + \
421 op_decode_hdr_maxsz + \
422 nfs4_fattr_bitmap_maxsz)
414 423
415static struct { 424static struct {
416 unsigned int mode; 425 unsigned int mode;
@@ -722,6 +731,13 @@ static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask)
722 bitmask[1] & nfs4_fsinfo_bitmap[1]); 731 bitmask[1] & nfs4_fsinfo_bitmap[1]);
723} 732}
724 733
734static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
735{
736 return encode_getattr_two(xdr,
737 bitmask[0] & nfs4_fs_locations_bitmap[0],
738 bitmask[1] & nfs4_fs_locations_bitmap[1]);
739}
740
725static int encode_getfh(struct xdr_stream *xdr) 741static int encode_getfh(struct xdr_stream *xdr)
726{ 742{
727 uint32_t *p; 743 uint32_t *p;
@@ -2003,6 +2019,38 @@ out:
2003} 2019}
2004 2020
2005/* 2021/*
2022 * Encode FS_LOCATIONS request
2023 */
2024static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
2025{
2026 struct xdr_stream xdr;
2027 struct compound_hdr hdr = {
2028 .nops = 3,
2029 };
2030 struct rpc_auth *auth = req->rq_task->tk_auth;
2031 int replen;
2032 int status;
2033
2034 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2035 encode_compound_hdr(&xdr, &hdr);
2036 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
2037 goto out;
2038 if ((status = encode_lookup(&xdr, args->name)) != 0)
2039 goto out;
2040 if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
2041 goto out;
2042 /* set up reply
2043 * toplevel_status + OP_PUTFH + status
2044 * + OP_LOOKUP + status + OP_GETATTR + status = 7
2045 */
2046 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
2047 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
2048 0, PAGE_SIZE);
2049out:
2050 return status;
2051}
2052
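The reply-length calculation above is worth unpacking. The fixed-size part of the reply that precedes the paged GETATTR data is seven 32-bit XDR words, exactly as the comment counts them; adding the RPC reply header and the auth verifier slack and converting words to bytes gives:

    /* all quantities in 32-bit words until the final shift */
    replen = (RPC_REPHDRSIZE        /* RPC reply header */
              + auth->au_rslack     /* verifier slack for this flavour */
              + 1                   /* toplevel compound status */
              + 2                   /* OP_PUTFH  opcode + status */
              + 2                   /* OP_LOOKUP opcode + status */
              + 2)                  /* OP_GETATTR opcode + status */
             << 2;                  /* words -> bytes */

xdr_inline_pages() then splices args->page into the receive buffer at that byte offset, so the attribute data lands in the page rather than in the inline buffer.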
2053/*
2006 * START OF "GENERIC" DECODE ROUTINES. 2054 * START OF "GENERIC" DECODE ROUTINES.
2007 * These may look a little ugly since they are imported from a "generic" 2055 * These may look a little ugly since they are imported from a "generic"
2008 * set of XDR encode/decode routines which are intended to be shared by 2056 * set of XDR encode/decode routines which are intended to be shared by
@@ -2036,7 +2084,7 @@ out:
2036 } \ 2084 } \
2037} while (0) 2085} while (0)
2038 2086
2039static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string) 2087static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
2040{ 2088{
2041 uint32_t *p; 2089 uint32_t *p;
2042 2090
@@ -2087,7 +2135,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2087static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp) 2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
2088{ 2136{
2089 uint32_t *p; 2137 uint32_t *p;
2090 uint32_t strlen; 2138 unsigned int strlen;
2091 char *str; 2139 char *str;
2092 2140
2093 READ_BUF(12); 2141 READ_BUF(12);
@@ -2217,7 +2265,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2217 return 0; 2265 return 0;
2218} 2266}
2219 2267
2220static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid) 2268static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2221{ 2269{
2222 uint32_t *p; 2270 uint32_t *p;
2223 2271
@@ -2285,6 +2333,22 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2285 return 0; 2333 return 0;
2286} 2334}
2287 2335
2336static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2337{
2338 uint32_t *p;
2339
2340 *fileid = 0;
2341 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
2342 return -EIO;
2343 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
2344 READ_BUF(8);
2345 READ64(*fileid);
2346 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2347 }
2348 dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid);
2349 return 0;
2350}
2351
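The (flag - 1U) test in this decoder follows the convention used throughout this file: GETATTR attributes appear on the wire in ascending bit order, and every decode_attr_*() helper clears its bit from the bitmap once it has consumed the attribute. If any lower-order bit is still set when this decoder runs, an earlier attribute was never decoded and the stream position is unknowable, hence -EIO. Mounted_on_fileid is attribute 55, i.e. bit 55 - 32 = 23 of word 1, so concretely:

    u32 bit   = 1U << 23;           /* FATTR4_WORD1_MOUNTED_ON_FILEID */
    u32 lower = bit - 1U;           /* 0x007fffff: every lower-order bit */

    /* example: some lower attribute (bit 4 of word 1) was requested but
     * its decoder never ran: */
    u32 bitmap1 = bit | (1U << 4);
    /* bitmap1 & lower == (1U << 4) != 0  =>  this decoder returns -EIO */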
2288static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2352static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2289{ 2353{
2290 uint32_t *p; 2354 uint32_t *p;
@@ -2336,6 +2400,116 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2336 return status; 2400 return status;
2337} 2401}
2338 2402
2403static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
2404{
2405 int n;
2406 uint32_t *p;
2407 int status = 0;
2408
2409 READ_BUF(4);
2410 READ32(n);
2411 if (n < 0)
2412 goto out_eio;
2413 if (n == 0)
2414 goto root_path;
2415 dprintk("path ");
2416 path->ncomponents = 0;
2417 while (path->ncomponents < n) {
2418 struct nfs4_string *component = &path->components[path->ncomponents];
2419 status = decode_opaque_inline(xdr, &component->len, &component->data);
2420 if (unlikely(status != 0))
2421 goto out_eio;
2422 if (path->ncomponents != n)
2423 dprintk("/");
2424 dprintk("%s", component->data);
2425 if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
2426 path->ncomponents++;
2427 else {
2428 dprintk("cannot parse %d components in path\n", n);
2429 goto out_eio;
2430 }
2431 }
2432out:
2433 dprintk("\n");
2434 return status;
2435root_path:
2436/* a root pathname is sent as a pathname4 with zero components */
2437 path->ncomponents = 1;
2438 path->components[0].len = 0;
2439 path->components[0].data = NULL;
2440 dprintk("path /\n");
2441 goto out;
2442out_eio:
2443 dprintk(" status %d", status);
2444 status = -EIO;
2445 goto out;
2446}
2447
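Per RFC 3530 a pathname4 is an array of component4 strings, and a filesystem root is sent as an array with zero components; the decoder normalizes that to a single empty component so callers always see ncomponents >= 1. Conceptually:

    /* wire layout for "/export/home":  n = 2, opaque "export", opaque "home"
     * wire layout for "/":             n = 0  (no components at all)
     *
     * what the decoder hands back for "/" (illustrative values): */
    struct nfs4_pathname root_example = {
            .ncomponents = 1,
            .components  = { [0] = { .len = 0, .data = NULL } },
    };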
2448static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
2449{
2450 int n;
2451 uint32_t *p;
2452 int status = -EIO;
2453
2454 if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS - 1U)))
2455 goto out;
2456 status = 0;
2457 if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
2458 goto out;
2459 dprintk("%s: fsroot ", __FUNCTION__);
2460 status = decode_pathname(xdr, &res->fs_path);
2461 if (unlikely(status != 0))
2462 goto out;
2463 READ_BUF(4);
2464 READ32(n);
2465 if (n <= 0)
2466 goto out_eio;
2467 res->nlocations = 0;
2468 while (res->nlocations < n) {
2469 int m;
2470 struct nfs4_fs_location *loc = &res->locations[res->nlocations];
2471
2472 READ_BUF(4);
2473 READ32(m);
2474 if (m <= 0)
2475 goto out_eio;
2476
2477 loc->nservers = 0;
2478 dprintk("%s: servers ", __FUNCTION__);
2479 while (loc->nservers < m) {
2480 struct nfs4_string *server = &loc->servers[loc->nservers];
2481 status = decode_opaque_inline(xdr, &server->len, &server->data);
2482 if (unlikely(status != 0))
2483 goto out_eio;
2484 dprintk("%s ", server->data);
2485 if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
2486 loc->nservers++;
2487 else {
2488 int i;
2489 dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
2490 for (i = loc->nservers; i < m; i++) {
2491 unsigned int len;
2492 char *data;
2493 status = decode_opaque_inline(xdr, &len, &data);
2494 if (unlikely(status != 0))
2495 goto out_eio;
2496 }
2497 }
2498 }
2499 status = decode_pathname(xdr, &loc->rootpath);
2500 if (unlikely(status != 0))
2501 goto out_eio;
2502 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
2503 res->nlocations++;
2504 }
2505out:
2506 dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
2507 return status;
2508out_eio:
2509 status = -EIO;
2510 goto out;
2511}
2512
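This decoder walks the fs_locations4 attribute from RFC 3530: the root pathname of the filesystem on the present server, then an array of alternative locations, each pairing a list of server hostnames with the filesystem's path on those servers. Entries beyond the compile-time caps (NFS4_FS_LOCATION_MAXSERVERS servers per location, NFS4_FS_LOCATIONS_MAXENTRIES locations) are still parsed, to keep the stream in sync, but are dropped. Schematically:

    /* fs_locations4 (XDR, conceptual):
     *   pathname4  fs_root;                 -> res->fs_path
     *   uint32     nlocations;              must be > 0
     *   nlocations times:
     *       uint32     nservers;            must be > 0
     *       nservers times:
     *           opaque server<>;            hostname/address string
     *       pathname4  rootpath;            path on those servers
     */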
2339static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2513static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2340{ 2514{
2341 uint32_t *p; 2515 uint32_t *p;
@@ -2841,6 +3015,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2841 bitmap[2] = {0}, 3015 bitmap[2] = {0},
2842 type; 3016 type;
2843 int status, fmode = 0; 3017 int status, fmode = 0;
3018 uint64_t fileid;
2844 3019
2845 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 3020 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
2846 goto xdr_error; 3021 goto xdr_error;
@@ -2863,10 +3038,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2863 goto xdr_error; 3038 goto xdr_error;
2864 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) 3039 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
2865 goto xdr_error; 3040 goto xdr_error;
2866 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0) 3041 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
2867 goto xdr_error; 3042 goto xdr_error;
2868 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) 3043 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
2869 goto xdr_error; 3044 goto xdr_error;
3045 if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
3046 struct nfs4_fs_locations,
3047 fattr))) != 0)
3048 goto xdr_error;
2870 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) 3049 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
2871 goto xdr_error; 3050 goto xdr_error;
2872 fattr->mode |= fmode; 3051 fattr->mode |= fmode;
@@ -2886,6 +3065,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2886 goto xdr_error; 3065 goto xdr_error;
2887 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 3066 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
2888 goto xdr_error; 3067 goto xdr_error;
3068 if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
3069 goto xdr_error;
3070 if (fattr->fileid == 0 && fileid != 0)
3071 fattr->fileid = fileid;
2889 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) 3072 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
2890 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 3073 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
2891xdr_error: 3074xdr_error:
@@ -3350,8 +3533,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
3350 attrlen, recvd); 3533 attrlen, recvd);
3351 return -EINVAL; 3534 return -EINVAL;
3352 } 3535 }
3353 if (attrlen <= *acl_len) 3536 xdr_read_pages(xdr, attrlen);
3354 xdr_read_pages(xdr, attrlen);
3355 *acl_len = attrlen; 3537 *acl_len = attrlen;
3356 } else 3538 } else
3357 status = -EOPNOTSUPP; 3539 status = -EOPNOTSUPP;
@@ -4211,6 +4393,29 @@ out:
4211 return status; 4393 return status;
4212} 4394}
4213 4395
4396/*
4397 * FS_LOCATIONS request
4398 */
4399static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
4400{
4401 struct xdr_stream xdr;
4402 struct compound_hdr hdr;
4403 int status;
4404
4405 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
4406 status = decode_compound_hdr(&xdr, &hdr);
4407 if (status != 0)
4408 goto out;
4409 if ((status = decode_putfh(&xdr)) != 0)
4410 goto out;
4411 if ((status = decode_lookup(&xdr)) != 0)
4412 goto out;
4413 xdr_enter_page(&xdr, PAGE_SIZE);
4414 status = decode_getfattr(&xdr, &res->fattr, res->server);
4415out:
4416 return status;
4417}
4418
4214uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus) 4419uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
4215{ 4420{
4216 uint32_t bitmap[2] = {0}; 4421 uint32_t bitmap[2] = {0};
@@ -4382,6 +4587,7 @@ struct rpc_procinfo nfs4_procedures[] = {
4382 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 4587 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
4383 PROC(GETACL, enc_getacl, dec_getacl), 4588 PROC(GETACL, enc_getacl, dec_getacl),
4384 PROC(SETACL, enc_setacl, dec_setacl), 4589 PROC(SETACL, enc_setacl, dec_setacl),
4590 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
4385}; 4591};
4386 4592
4387struct rpc_version nfs_version4 = { 4593struct rpc_version nfs_version4 = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 106aca388ebc..d89f6fb3b3a3 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -325,6 +325,7 @@ out:
325 325
326/** 326/**
327 * nfs_scan_list - Scan a list for matching requests 327 * nfs_scan_list - Scan a list for matching requests
328 * @nfsi: NFS inode
328 * @head: One of the NFS inode request lists 329 * @head: One of the NFS inode request lists
329 * @dst: Destination list 330 * @dst: Destination list
330 * @idx_start: lower bound of page->index to scan 331 * @idx_start: lower bound of page->index to scan
@@ -336,14 +337,15 @@ out:
336 * The requests are *not* checked to ensure that they form a contiguous set. 337 * The requests are *not* checked to ensure that they form a contiguous set.
337 * You must be holding the inode's req_lock when calling this function 338 * You must be holding the inode's req_lock when calling this function
338 */ 339 */
339int 340int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
340nfs_scan_list(struct list_head *head, struct list_head *dst, 341 struct list_head *dst, unsigned long idx_start,
341 unsigned long idx_start, unsigned int npages) 342 unsigned int npages)
342{ 343{
343 struct list_head *pos, *tmp; 344 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
344 struct nfs_page *req; 345 struct nfs_page *req;
345 unsigned long idx_end; 346 unsigned long idx_end;
346 int res; 347 int found, i;
348 int res;
347 349
348 res = 0; 350 res = 0;
349 if (npages == 0) 351 if (npages == 0)
@@ -351,25 +353,32 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
351 else 353 else
352 idx_end = idx_start + npages - 1; 354 idx_end = idx_start + npages - 1;
353 355
354 list_for_each_safe(pos, tmp, head) { 356 for (;;) {
355 357 found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
356 req = nfs_list_entry(pos); 358 (void **)&pgvec[0], idx_start,
357 359 NFS_SCAN_MAXENTRIES);
358 if (req->wb_index < idx_start) 360 if (found <= 0)
359 continue;
360 if (req->wb_index > idx_end)
361 break; 361 break;
362 for (i = 0; i < found; i++) {
363 req = pgvec[i];
364 if (req->wb_index > idx_end)
365 goto out;
366 idx_start = req->wb_index + 1;
367 if (req->wb_list_head != head)
368 continue;
369 if (nfs_set_page_writeback_locked(req)) {
370 nfs_list_remove_request(req);
371 nfs_list_add_request(req, dst);
372 res++;
373 }
374 }
362 375
363 if (!nfs_set_page_writeback_locked(req))
364 continue;
365 nfs_list_remove_request(req);
366 nfs_list_add_request(req, dst);
367 res++;
368 } 376 }
377out:
369 return res; 378 return res;
370} 379}
371 380
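The rewrite replaces a linear walk of the request list with batched lookups in the per-inode radix tree, keyed by page index. The general pattern, which nfs_scan_list() instantiates with NFS_SCAN_MAXENTRIES-sized batches, looks like this (a sketch; key_of() stands in for req->wb_index, and the body of the inner loop is whatever filtering the caller wants):

    void *vec[16];
    unsigned long index = idx_start;
    int i, found;

    for (;;) {
            found = radix_tree_gang_lookup(&tree, vec, index,
                                           ARRAY_SIZE(vec));
            if (found <= 0)
                    break;                  /* tree exhausted */
            for (i = 0; i < found; i++) {
                    if (key_of(vec[i]) > idx_end)
                            goto done;      /* past the scan window */
                    index = key_of(vec[i]) + 1;   /* resume point */
                    /* ... filter and collect vec[i] ... */
            }
    }
    done: ;

Because radix_tree_gang_lookup() returns entries with keys >= index in ascending order, advancing index past the last key seen guarantees forward progress without holding references between batches.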
372int nfs_init_nfspagecache(void) 381int __init nfs_init_nfspagecache(void)
373{ 382{
374 nfs_page_cachep = kmem_cache_create("nfs_page", 383 nfs_page_cachep = kmem_cache_create("nfs_page",
375 sizeof(struct nfs_page), 384 sizeof(struct nfs_page),
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 9dd85cac2df0..b3899ea3229e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -44,11 +44,10 @@
44#include <linux/nfs_page.h> 44#include <linux/nfs_page.h>
45#include <linux/lockd/bind.h> 45#include <linux/lockd/bind.h>
46#include <linux/smp_lock.h> 46#include <linux/smp_lock.h>
47#include "internal.h"
47 48
48#define NFSDBG_FACILITY NFSDBG_PROC 49#define NFSDBG_FACILITY NFSDBG_PROC
49 50
50extern struct rpc_procinfo nfs_procedures[];
51
52/* 51/*
53 * Bare-bones access to getattr: this is for nfs_read_super. 52 * Bare-bones access to getattr: this is for nfs_read_super.
54 */ 53 */
@@ -611,8 +610,6 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
611 return 0; 610 return 0;
612} 611}
613 612
614extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
615
616static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 613static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
617{ 614{
618 if (task->tk_status >= 0) { 615 if (task->tk_status >= 0) {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 624ca7146b6b..32cf3773af0c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -51,14 +51,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
51 if (p) { 51 if (p) {
52 memset(p, 0, sizeof(*p)); 52 memset(p, 0, sizeof(*p));
53 INIT_LIST_HEAD(&p->pages); 53 INIT_LIST_HEAD(&p->pages);
54 if (pagecount < NFS_PAGEVEC_SIZE) 54 if (pagecount <= ARRAY_SIZE(p->page_array))
55 p->pagevec = &p->page_array[0]; 55 p->pagevec = p->page_array;
56 else { 56 else {
57 size_t size = ++pagecount * sizeof(struct page *); 57 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
58 p->pagevec = kmalloc(size, GFP_NOFS); 58 if (!p->pagevec) {
59 if (p->pagevec) {
60 memset(p->pagevec, 0, size);
61 } else {
62 mempool_free(p, nfs_rdata_mempool); 59 mempool_free(p, nfs_rdata_mempool);
63 p = NULL; 60 p = NULL;
64 } 61 }
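Two separate improvements in this hunk: the small-request test becomes inclusive (pagecount <= ARRAY_SIZE(p->page_array)), so a request that exactly fills the embedded array no longer takes the allocation path, and the open-coded kmalloc + memset, which also over-allocated one extra pointer via ++pagecount, becomes a kcalloc() of exactly pagecount zeroed pointers. kcalloc(n, size, flags) behaves roughly like the sketch below, plus a check that n * size does not overflow:

    /* conceptual equivalent of kcalloc(), minus the overflow check */
    static void *kcalloc_sketch(size_t n, size_t size, gfp_t flags)
    {
            void *p = kmalloc(n * size, flags);
            if (p)
                    memset(p, 0, n * size);
            return p;
    }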
@@ -104,6 +101,28 @@ int nfs_return_empty_page(struct page *page)
104 return 0; 101 return 0;
105} 102}
106 103
104static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
105{
106 unsigned int remainder = data->args.count - data->res.count;
107 unsigned int base = data->args.pgbase + data->res.count;
108 unsigned int pglen;
109 struct page **pages;
110
111 if (data->res.eof == 0 || remainder == 0)
112 return;
113 /*
114 * Note: "remainder" can never be negative, since we check for
115 * this in the XDR code.
116 */
117 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
118 base &= ~PAGE_CACHE_MASK;
119 pglen = PAGE_CACHE_SIZE - base;
120 if (pglen < remainder)
121 memclear_highpage_flush(*pages, base, pglen);
122 else
123 memclear_highpage_flush(*pages, base, remainder);
124}
125
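A worked example makes the arithmetic concrete (4096-byte pages assumed). Say the READ asked for args.count = 8192 at args.pgbase = 0 and the server returned res.count = 5000 with eof set:

    /*  remainder = 8192 - 5000          = 3192  (unreturned bytes)
     *  base      = 0 + 5000             = 5000  (first uninitialised byte)
     *  page      = args.pages[5000 >> 12]       (page index 1)
     *  base     &= ~PAGE_CACHE_MASK     = 904   (offset within that page)
     *  pglen     = 4096 - 904           = 3192
     *  pglen < remainder is false, so memclear_highpage_flush() zeroes
     *  bytes 904..4095 of that page - the tail the server never filled.
     */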
107/* 126/*
108 * Read a page synchronously. 127 * Read a page synchronously.
109 */ 128 */
@@ -177,11 +196,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
177 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 196 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
178 spin_unlock(&inode->i_lock); 197 spin_unlock(&inode->i_lock);
179 198
180 if (count) 199 nfs_readpage_truncate_uninitialised_page(rdata);
181 memclear_highpage_flush(page, rdata->args.pgbase, count); 200 if (rdata->res.eof || rdata->res.count == rdata->args.count)
182 SetPageUptodate(page); 201 SetPageUptodate(page);
183 if (PageError(page))
184 ClearPageError(page);
185 result = 0; 202 result = 0;
186 203
187io_error: 204io_error:
@@ -436,20 +453,12 @@ static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
436 struct nfs_page *req = data->req; 453 struct nfs_page *req = data->req;
437 struct page *page = req->wb_page; 454 struct page *page = req->wb_page;
438 455
456 if (likely(task->tk_status >= 0))
457 nfs_readpage_truncate_uninitialised_page(data);
458 else
459 SetPageError(page);
439 if (nfs_readpage_result(task, data) != 0) 460 if (nfs_readpage_result(task, data) != 0)
440 return; 461 return;
441 if (task->tk_status >= 0) {
442 unsigned int request = data->args.count;
443 unsigned int result = data->res.count;
444
445 if (result < request) {
446 memclear_highpage_flush(page,
447 data->args.pgbase + result,
448 request - result);
449 }
450 } else
451 SetPageError(page);
452
453 if (atomic_dec_and_test(&req->wb_complete)) { 462 if (atomic_dec_and_test(&req->wb_complete)) {
454 if (!PageError(page)) 463 if (!PageError(page))
455 SetPageUptodate(page); 464 SetPageUptodate(page);
@@ -462,6 +471,40 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
462 .rpc_release = nfs_readdata_release, 471 .rpc_release = nfs_readdata_release,
463}; 472};
464 473
474static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
475{
476 unsigned int count = data->res.count;
477 unsigned int base = data->args.pgbase;
478 struct page **pages;
479
480 if (unlikely(count == 0))
481 return;
482 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
483 base &= ~PAGE_CACHE_MASK;
484 count += base;
485 for (; count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
486 SetPageUptodate(*pages);
487 /*
488 * Was this an eof or a short read? If the latter, don't mark the page
489 * as uptodate yet.
490 */
491 if (count > 0 && (data->res.eof || data->args.count == data->res.count))
492 SetPageUptodate(*pages);
493}
494
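Only completely-filled pages are marked uptodate by the loop; the trailing, partially-filled page is marked only when the read legitimately ended there (eof, or the server returned every byte asked for), because by then nfs_readpage_truncate_uninitialised_page() has zeroed its tail. Continuing the example above:

    /*  res.count = 5000, pgbase = 0, eof = 1:
     *  count = 5000 -> first pass marks page 0 uptodate, count = 904
     *  count = 904 < PAGE_CACHE_SIZE -> loop exits, pages points at page 1
     *  count > 0 && eof -> page 1 is marked uptodate too; its last 3192
     *  bytes were zeroed by the truncate helper.
     */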
495static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
496{
497 unsigned int count = data->args.count;
498 unsigned int base = data->args.pgbase;
499 struct page **pages;
500
501 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
502 base &= ~PAGE_CACHE_MASK;
503 count += base;
504 for (; count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
505 SetPageError(*pages);
506}
507
465/* 508/*
466 * This is the callback from RPC telling us whether a reply was 509 * This is the callback from RPC telling us whether a reply was
467 * received or some error occurred (timeout or socket shutdown). 510 * received or some error occurred (timeout or socket shutdown).
@@ -469,27 +512,24 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
469static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) 512static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
470{ 513{
471 struct nfs_read_data *data = calldata; 514 struct nfs_read_data *data = calldata;
472 unsigned int count = data->res.count;
473 515
516 /*
517 * Note: nfs_readpage_result may change the values of
518 * data->args. In the multi-page case, we therefore need
519 * to ensure that we call the next nfs_readpage_set_page_uptodate()
520 * first in the multi-page case.
521 */
522 if (likely(task->tk_status >= 0)) {
523 nfs_readpage_truncate_uninitialised_page(data);
524 nfs_readpage_set_pages_uptodate(data);
525 } else
526 nfs_readpage_set_pages_error(data);
474 if (nfs_readpage_result(task, data) != 0) 527 if (nfs_readpage_result(task, data) != 0)
475 return; 528 return;
476 while (!list_empty(&data->pages)) { 529 while (!list_empty(&data->pages)) {
477 struct nfs_page *req = nfs_list_entry(data->pages.next); 530 struct nfs_page *req = nfs_list_entry(data->pages.next);
478 struct page *page = req->wb_page;
479 nfs_list_remove_request(req);
480 531
481 if (task->tk_status >= 0) { 532 nfs_list_remove_request(req);
482 if (count < PAGE_CACHE_SIZE) {
483 if (count < req->wb_bytes)
484 memclear_highpage_flush(page,
485 req->wb_pgbase + count,
486 req->wb_bytes - count);
487 count = 0;
488 } else
489 count -= PAGE_CACHE_SIZE;
490 SetPageUptodate(page);
491 } else
492 SetPageError(page);
493 nfs_readpage_release(req); 533 nfs_readpage_release(req);
494 } 534 }
495} 535}
@@ -654,7 +694,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
654 return ret; 694 return ret;
655} 695}
656 696
657int nfs_init_readpagecache(void) 697int __init nfs_init_readpagecache(void)
658{ 698{
659 nfs_rdata_cachep = kmem_cache_create("nfs_read_data", 699 nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
660 sizeof(struct nfs_read_data), 700 sizeof(struct nfs_read_data),
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
new file mode 100644
index 000000000000..e8a9bee74d9d
--- /dev/null
+++ b/fs/nfs/super.c
@@ -0,0 +1,1537 @@
1/*
2 * linux/fs/nfs/super.c
3 *
4 * Copyright (C) 1992 Rick Sladkey
5 *
6 * nfs superblock handling functions
7 *
8 * Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
9 * experimental NFS changes. Modularisation taken straight from SYS5 fs.
10 *
11 * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
12 * J.S.Peatfield@damtp.cam.ac.uk
13 *
14 * Split from inode.c by David Howells <dhowells@redhat.com>
15 *
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/init.h>
21
22#include <linux/time.h>
23#include <linux/kernel.h>
24#include <linux/mm.h>
25#include <linux/string.h>
26#include <linux/stat.h>
27#include <linux/errno.h>
28#include <linux/unistd.h>
29#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/stats.h>
31#include <linux/sunrpc/metrics.h>
32#include <linux/nfs_fs.h>
33#include <linux/nfs_mount.h>
34#include <linux/nfs4_mount.h>
35#include <linux/lockd/bind.h>
36#include <linux/smp_lock.h>
37#include <linux/seq_file.h>
38#include <linux/mount.h>
39#include <linux/nfs_idmap.h>
40#include <linux/vfs.h>
41#include <linux/inet.h>
42#include <linux/nfs_xdr.h>
43
44#include <asm/system.h>
45#include <asm/uaccess.h>
46
47#include "nfs4_fs.h"
48#include "callback.h"
49#include "delegation.h"
50#include "iostat.h"
51#include "internal.h"
52
53#define NFSDBG_FACILITY NFSDBG_VFS
54
55/* Maximum number of readahead requests
56 * FIXME: this should really be a sysctl so that users may tune it to suit
57 * their needs. People doing NFS over a slow network might, for
58 * instance, want to reduce it to something closer to 1 for improved
59 * interactive response.
60 */
61#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
62
63/*
64 * RPC cruft for NFS
65 */
66static struct rpc_version * nfs_version[] = {
67 NULL,
68 NULL,
69 &nfs_version2,
70#if defined(CONFIG_NFS_V3)
71 &nfs_version3,
72#elif defined(CONFIG_NFS_V4)
73 NULL,
74#endif
75#if defined(CONFIG_NFS_V4)
76 &nfs_version4,
77#endif
78};
79
80static struct rpc_program nfs_program = {
81 .name = "nfs",
82 .number = NFS_PROGRAM,
83 .nrvers = ARRAY_SIZE(nfs_version),
84 .version = nfs_version,
85 .stats = &nfs_rpcstat,
86 .pipe_dir_name = "/nfs",
87};
88
89struct rpc_stat nfs_rpcstat = {
90 .program = &nfs_program
91};
92
93
94#ifdef CONFIG_NFS_V3_ACL
95static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
96static struct rpc_version * nfsacl_version[] = {
97 [3] = &nfsacl_version3,
98};
99
100struct rpc_program nfsacl_program = {
101 .name = "nfsacl",
102 .number = NFS_ACL_PROGRAM,
103 .nrvers = ARRAY_SIZE(nfsacl_version),
104 .version = nfsacl_version,
105 .stats = &nfsacl_rpcstat,
106};
107#endif /* CONFIG_NFS_V3_ACL */
108
109static void nfs_umount_begin(struct vfsmount *, int);
110static int nfs_statfs(struct dentry *, struct kstatfs *);
111static int nfs_show_options(struct seq_file *, struct vfsmount *);
112static int nfs_show_stats(struct seq_file *, struct vfsmount *);
113static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
114static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
115 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
116static void nfs_kill_super(struct super_block *);
117
118static struct file_system_type nfs_fs_type = {
119 .owner = THIS_MODULE,
120 .name = "nfs",
121 .get_sb = nfs_get_sb,
122 .kill_sb = nfs_kill_super,
123 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
124};
125
126struct file_system_type clone_nfs_fs_type = {
127 .owner = THIS_MODULE,
128 .name = "nfs",
129 .get_sb = nfs_clone_nfs_sb,
130 .kill_sb = nfs_kill_super,
131 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
132};
133
134static struct super_operations nfs_sops = {
135 .alloc_inode = nfs_alloc_inode,
136 .destroy_inode = nfs_destroy_inode,
137 .write_inode = nfs_write_inode,
138 .statfs = nfs_statfs,
139 .clear_inode = nfs_clear_inode,
140 .umount_begin = nfs_umount_begin,
141 .show_options = nfs_show_options,
142 .show_stats = nfs_show_stats,
143};
144
145#ifdef CONFIG_NFS_V4
146static int nfs4_get_sb(struct file_system_type *fs_type,
147 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
148static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
149 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
150static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
151 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
152static void nfs4_kill_super(struct super_block *sb);
153
154static struct file_system_type nfs4_fs_type = {
155 .owner = THIS_MODULE,
156 .name = "nfs4",
157 .get_sb = nfs4_get_sb,
158 .kill_sb = nfs4_kill_super,
159 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
160};
161
162struct file_system_type clone_nfs4_fs_type = {
163 .owner = THIS_MODULE,
164 .name = "nfs4",
165 .get_sb = nfs_clone_nfs4_sb,
166 .kill_sb = nfs4_kill_super,
167 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
168};
169
170struct file_system_type nfs_referral_nfs4_fs_type = {
171 .owner = THIS_MODULE,
172 .name = "nfs4",
173 .get_sb = nfs_referral_nfs4_sb,
174 .kill_sb = nfs4_kill_super,
175 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
176};
177
178static struct super_operations nfs4_sops = {
179 .alloc_inode = nfs_alloc_inode,
180 .destroy_inode = nfs_destroy_inode,
181 .write_inode = nfs_write_inode,
182 .statfs = nfs_statfs,
183 .clear_inode = nfs4_clear_inode,
184 .umount_begin = nfs_umount_begin,
185 .show_options = nfs_show_options,
186 .show_stats = nfs_show_stats,
187};
188#endif
189
190#ifdef CONFIG_NFS_V4
191static const int nfs_set_port_min = 0;
192static const int nfs_set_port_max = 65535;
193
194static int param_set_port(const char *val, struct kernel_param *kp)
195{
196 char *endp;
197 int num = simple_strtol(val, &endp, 0);
198 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
199 return -EINVAL;
200 *((int *)kp->arg) = num;
201 return 0;
202}
203
204module_param_call(callback_tcpport, param_set_port, param_get_int,
205 &nfs_callback_set_tcpport, 0644);
206#endif
207
208#ifdef CONFIG_NFS_V4
209static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
210{
211 char *endp;
212 int num = simple_strtol(val, &endp, 0);
213 int jif = num * HZ;
214 if (endp == val || *endp || num < 0 || jif < num)
215 return -EINVAL;
216 *((int *)kp->arg) = jif;
217 return 0;
218}
219
220module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
221 &nfs_idmap_cache_timeout, 0644);
222#endif
223
224/*
225 * Register the NFS filesystems
226 */
227int __init register_nfs_fs(void)
228{
229 int ret;
230
231 ret = register_filesystem(&nfs_fs_type);
232 if (ret < 0)
233 goto error_0;
234
235#ifdef CONFIG_NFS_V4
236 ret = nfs_register_sysctl();
237 if (ret < 0)
238 goto error_1;
239 ret = register_filesystem(&nfs4_fs_type);
240 if (ret < 0)
241 goto error_2;
242#endif
243 return 0;
244
245#ifdef CONFIG_NFS_V4
246error_2:
247 nfs_unregister_sysctl();
248error_1:
249 unregister_filesystem(&nfs_fs_type);
250#endif
251error_0:
252 return ret;
253}
254
255/*
256 * Unregister the NFS filesystems
257 */
258void __exit unregister_nfs_fs(void)
259{
260#ifdef CONFIG_NFS_V4
261 unregister_filesystem(&nfs4_fs_type);
262 nfs_unregister_sysctl();
263#endif
264 unregister_filesystem(&nfs_fs_type);
265}
266
267/*
268 * Deliver file system statistics to userspace
269 */
270static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
271{
272 struct super_block *sb = dentry->d_sb;
273 struct nfs_server *server = NFS_SB(sb);
274 unsigned char blockbits;
275 unsigned long blockres;
276 struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
277 struct nfs_fattr fattr;
278 struct nfs_fsstat res = {
279 .fattr = &fattr,
280 };
281 int error;
282
283 lock_kernel();
284
285 error = server->rpc_ops->statfs(server, rootfh, &res);
286 buf->f_type = NFS_SUPER_MAGIC;
287 if (error < 0)
288 goto out_err;
289
290 /*
291 * Current versions of glibc do not correctly handle the
292 * case where f_frsize != f_bsize. Eventually we want to
293 * report the value of wtmult in this field.
294 */
295 buf->f_frsize = sb->s_blocksize;
296
297 /*
298 * On most *nix systems, f_blocks, f_bfree, and f_bavail
299 * are reported in units of f_frsize. Linux hasn't had
300 * an f_frsize field in its statfs struct until recently,
301 * thus historically Linux's sys_statfs reports these
302 * fields in units of f_bsize.
303 */
304 buf->f_bsize = sb->s_blocksize;
305 blockbits = sb->s_blocksize_bits;
306 blockres = (1 << blockbits) - 1;
307 buf->f_blocks = (res.tbytes + blockres) >> blockbits;
308 buf->f_bfree = (res.fbytes + blockres) >> blockbits;
309 buf->f_bavail = (res.abytes + blockres) >> blockbits;
310
311 buf->f_files = res.tfiles;
312 buf->f_ffree = res.afiles;
313
314 buf->f_namelen = server->namelen;
315 out:
316 unlock_kernel();
317 return 0;
318
319 out_err:
320 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
321 buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
322 goto out;
323
324}
325
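The block conversion rounds byte counts up so that any nonzero space is never reported as zero blocks. With a 4096-byte s_blocksize:

    /*  blockbits = 12, blockres = (1 << 12) - 1 = 4095
     *  res.tbytes = 10,000,000
     *  f_blocks   = (10,000,000 + 4095) >> 12 = 2442
     *  (10,000,000 / 4096 = 2441.4..., rounded up rather than truncated)
     */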
326static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
327{
328 static struct {
329 rpc_authflavor_t flavour;
330 const char *str;
331 } sec_flavours[] = {
332 { RPC_AUTH_NULL, "null" },
333 { RPC_AUTH_UNIX, "sys" },
334 { RPC_AUTH_GSS_KRB5, "krb5" },
335 { RPC_AUTH_GSS_KRB5I, "krb5i" },
336 { RPC_AUTH_GSS_KRB5P, "krb5p" },
337 { RPC_AUTH_GSS_LKEY, "lkey" },
338 { RPC_AUTH_GSS_LKEYI, "lkeyi" },
339 { RPC_AUTH_GSS_LKEYP, "lkeyp" },
340 { RPC_AUTH_GSS_SPKM, "spkm" },
341 { RPC_AUTH_GSS_SPKMI, "spkmi" },
342 { RPC_AUTH_GSS_SPKMP, "spkmp" },
343 { -1, "unknown" }
344 };
345 int i;
346
347 for (i = 0; sec_flavours[i].flavour != -1; i++) {
348 if (sec_flavours[i].flavour == flavour)
349 break;
350 }
351 return sec_flavours[i].str;
352}
353
354/*
355 * Describe the mount options in force on this server representation
356 */
357static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
358{
359 static struct proc_nfs_info {
360 int flag;
361 char *str;
362 char *nostr;
363 } nfs_info[] = {
364 { NFS_MOUNT_SOFT, ",soft", ",hard" },
365 { NFS_MOUNT_INTR, ",intr", "" },
366 { NFS_MOUNT_NOCTO, ",nocto", "" },
367 { NFS_MOUNT_NOAC, ",noac", "" },
368 { NFS_MOUNT_NONLM, ",nolock", "" },
369 { NFS_MOUNT_NOACL, ",noacl", "" },
370 { 0, NULL, NULL }
371 };
372 struct proc_nfs_info *nfs_infop;
373 char buf[12];
374 char *proto;
375
376 seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
377 seq_printf(m, ",rsize=%d", nfss->rsize);
378 seq_printf(m, ",wsize=%d", nfss->wsize);
379 if (nfss->acregmin != 3*HZ || showdefaults)
380 seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
381 if (nfss->acregmax != 60*HZ || showdefaults)
382 seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
383 if (nfss->acdirmin != 30*HZ || showdefaults)
384 seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
385 if (nfss->acdirmax != 60*HZ || showdefaults)
386 seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
387 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
388 if (nfss->flags & nfs_infop->flag)
389 seq_puts(m, nfs_infop->str);
390 else
391 seq_puts(m, nfs_infop->nostr);
392 }
393 switch (nfss->client->cl_xprt->prot) {
394 case IPPROTO_TCP:
395 proto = "tcp";
396 break;
397 case IPPROTO_UDP:
398 proto = "udp";
399 break;
400 default:
401 snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
402 proto = buf;
403 }
404 seq_printf(m, ",proto=%s", proto);
405 seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
406 seq_printf(m, ",retrans=%u", nfss->retrans_count);
407 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
408}
409
410/*
411 * Describe the mount options on this VFS mountpoint
412 */
413static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
414{
415 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
416
417 nfs_show_mount_options(m, nfss, 0);
418
419 seq_puts(m, ",addr=");
420 seq_escape(m, nfss->hostname, " \t\n\\");
421
422 return 0;
423}
424
425/*
426 * Present statistical information for this VFS mountpoint
427 */
428static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
429{
430 int i, cpu;
431 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
432 struct rpc_auth *auth = nfss->client->cl_auth;
433 struct nfs_iostats totals = { };
434
435 seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
436
437 /*
438 * Display all mount option settings
439 */
440 seq_printf(m, "\n\topts:\t");
441 seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
442 seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
443 seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
444 seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
445 nfs_show_mount_options(m, nfss, 1);
446
447 seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
448
449 seq_printf(m, "\n\tcaps:\t");
450 seq_printf(m, "caps=0x%x", nfss->caps);
451 seq_printf(m, ",wtmult=%d", nfss->wtmult);
452 seq_printf(m, ",dtsize=%d", nfss->dtsize);
453 seq_printf(m, ",bsize=%d", nfss->bsize);
454 seq_printf(m, ",namelen=%d", nfss->namelen);
455
456#ifdef CONFIG_NFS_V4
457 if (nfss->rpc_ops->version == 4) {
458 seq_printf(m, "\n\tnfsv4:\t");
459 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
460 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
461 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
462 }
463#endif
464
465 /*
466 * Display security flavor in effect for this mount
467 */
468 seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
469 if (auth->au_flavor)
470 seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
471
472 /*
473 * Display superblock I/O counters
474 */
475 for_each_possible_cpu(cpu) {
476 struct nfs_iostats *stats;
477
478 preempt_disable();
479 stats = per_cpu_ptr(nfss->io_stats, cpu);
480
481 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
482 totals.events[i] += stats->events[i];
483 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
484 totals.bytes[i] += stats->bytes[i];
485
486 preempt_enable();
487 }
488
489 seq_printf(m, "\n\tevents:\t");
490 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
491 seq_printf(m, "%lu ", totals.events[i]);
492 seq_printf(m, "\n\tbytes:\t");
493 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
494 seq_printf(m, "%Lu ", totals.bytes[i]);
495 seq_printf(m, "\n");
496
497 rpc_print_iostats(m, nfss->client);
498
499 return 0;
500}
501
502/*
503 * Begin unmount by attempting to remove all automounted mountpoints we added
504 * in response to traversals
505 */
506static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
507{
508 struct nfs_server *server;
509 struct rpc_clnt *rpc;
510
511 shrink_submounts(vfsmnt, &nfs_automount_list);
512 if (!(flags & MNT_FORCE))
513 return;
514 /* -EIO all pending I/O */
515 server = NFS_SB(vfsmnt->mnt_sb);
516 rpc = server->client;
517 if (!IS_ERR(rpc))
518 rpc_killall_tasks(rpc);
519 rpc = server->client_acl;
520 if (!IS_ERR(rpc))
521 rpc_killall_tasks(rpc);
522}
523
524/*
525 * Obtain the root inode of the file system.
526 */
527static struct inode *
528nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
529{
530 struct nfs_server *server = NFS_SB(sb);
531 int error;
532
533 error = server->rpc_ops->getroot(server, rootfh, fsinfo);
534 if (error < 0) {
535 dprintk("nfs_get_root: getattr error = %d\n", -error);
536 return ERR_PTR(error);
537 }
538
539 server->fsid = fsinfo->fattr->fsid;
540 return nfs_fhget(sb, rootfh, fsinfo->fattr);
541}
542
543/*
544 * Do NFS version-independent mount processing, and sanity checking
545 */
546static int
547nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
548{
549 struct nfs_server *server;
550 struct inode *root_inode;
551 struct nfs_fattr fattr;
552 struct nfs_fsinfo fsinfo = {
553 .fattr = &fattr,
554 };
555 struct nfs_pathconf pathinfo = {
556 .fattr = &fattr,
557 };
558 int no_root_error = 0;
559 unsigned long max_rpc_payload;
560
561 /* We probably want something more informative here */
562 snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
563
564 server = NFS_SB(sb);
565
566 sb->s_magic = NFS_SUPER_MAGIC;
567
568 server->io_stats = nfs_alloc_iostats();
569 if (server->io_stats == NULL)
570 return -ENOMEM;
571
572 root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
573 /* Did getting the root inode fail? */
574 if (IS_ERR(root_inode)) {
575 no_root_error = PTR_ERR(root_inode);
576 goto out_no_root;
577 }
578 sb->s_root = d_alloc_root(root_inode);
579 if (!sb->s_root) {
580 no_root_error = -ENOMEM;
581 goto out_no_root;
582 }
583 sb->s_root->d_op = server->rpc_ops->dentry_ops;
584
585 /* mount time stamp, in seconds */
586 server->mount_time = jiffies;
587
588 /* Get some general file system info */
589 if (server->namelen == 0 &&
590 server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
591 server->namelen = pathinfo.max_namelen;
592 /* Work out a lot of parameters */
593 if (server->rsize == 0)
594 server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
595 if (server->wsize == 0)
596 server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
597
598 if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
599 server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
600 if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
601 server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
602
603 max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
604 if (server->rsize > max_rpc_payload)
605 server->rsize = max_rpc_payload;
606 if (server->rsize > NFS_MAX_FILE_IO_SIZE)
607 server->rsize = NFS_MAX_FILE_IO_SIZE;
608 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
609
610 if (server->wsize > max_rpc_payload)
611 server->wsize = max_rpc_payload;
612 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
613 server->wsize = NFS_MAX_FILE_IO_SIZE;
614 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
615
616 if (sb->s_blocksize == 0)
617 sb->s_blocksize = nfs_block_bits(server->wsize,
618 &sb->s_blocksize_bits);
619 server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
620
621 server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
622 if (server->dtsize > PAGE_CACHE_SIZE)
623 server->dtsize = PAGE_CACHE_SIZE;
624 if (server->dtsize > server->rsize)
625 server->dtsize = server->rsize;
626
627 if (server->flags & NFS_MOUNT_NOAC) {
628 server->acregmin = server->acregmax = 0;
629 server->acdirmin = server->acdirmax = 0;
630 sb->s_flags |= MS_SYNCHRONOUS;
631 }
632 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
633
634 nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
635
636 server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
637 server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
638
639 /* We're airborne. Set socket buffer size. */
640 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
641 return 0;
642 /* Yargs. It didn't work out. */
643out_no_root:
644 dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
645 if (!IS_ERR(root_inode))
646 iput(root_inode);
647 return no_root_error;
648}
649
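The rsize/wsize setup above is a chain of clamps: start from the user's mount option or the server's preferred transfer size, then cap by the server's hard maximum, the RPC transport payload, and the client's NFS_MAX_FILE_IO_SIZE. An illustrative walk-through for rsize (invented but plausible numbers, 4096-byte pages):

    /*  user rsize        = 0        -> take fsinfo.rtpref     = 32768
     *  fsinfo.rtmax      = 65536    -> no clamp (32768 <= 65536)
     *  max_rpc_payload   = 32768    -> no clamp
     *  NFS_MAX_FILE_IO_SIZE         -> no clamp
     *  rpages = (32768 + 4096 - 1) >> 12 = 8 pages per read unit
     */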
650/*
651 * Initialise the timeout values for a connection
652 */
653static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
654{
655 to->to_initval = timeo * HZ / 10;
656 to->to_retries = retrans;
657 if (!to->to_retries)
658 to->to_retries = 2;
659
660 switch (proto) {
661 case IPPROTO_TCP:
662 if (!to->to_initval)
663 to->to_initval = 60 * HZ;
664 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
665 to->to_initval = NFS_MAX_TCP_TIMEOUT;
666 to->to_increment = to->to_initval;
667 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
668 to->to_exponential = 0;
669 break;
670 case IPPROTO_UDP:
671 default:
672 if (!to->to_initval)
673 to->to_initval = 11 * HZ / 10;
674 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
675 to->to_initval = NFS_MAX_UDP_TIMEOUT;
676 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
677 to->to_exponential = 1;
678 break;
679 }
680}
681
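The timeo mount option arrives in tenths of a second, which explains the * HZ / 10 scaling. For TCP with timeo=600, retrans=2 and HZ=1000:

    /*  to_initval   = 600 * 1000 / 10      = 60000 jiffies (60 s)
     *  to_retries   = 2
     *  to_increment = to_initval           = 60000 (linear backoff)
     *  to_maxval    = 60000 + 60000 * 2    = 180000 jiffies (180 s)
     *  to_exponential = 0
     *  UDP instead defaults to 1.1 s and backs off exponentially, capped
     *  at NFS_MAX_UDP_TIMEOUT.
     */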
682/*
683 * Create an RPC client handle.
684 */
685static struct rpc_clnt *
686nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
687{
688 struct rpc_timeout timeparms;
689 struct rpc_xprt *xprt = NULL;
690 struct rpc_clnt *clnt = NULL;
691 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
692
693 nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
694
695 server->retrans_timeo = timeparms.to_initval;
696 server->retrans_count = timeparms.to_retries;
697
698 /* create transport and client */
699 xprt = xprt_create_proto(proto, &server->addr, &timeparms);
700 if (IS_ERR(xprt)) {
701 dprintk("%s: cannot create RPC transport. Error = %ld\n",
702 __FUNCTION__, PTR_ERR(xprt));
703 return (struct rpc_clnt *)xprt;
704 }
705 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
706 server->rpc_ops->version, data->pseudoflavor);
707 if (IS_ERR(clnt)) {
708 dprintk("%s: cannot create RPC client. Error = %ld\n",
709 __FUNCTION__, PTR_ERR(xprt));
710 goto out_fail;
711 }
712
713 clnt->cl_intr = 1;
714 clnt->cl_softrtry = 1;
715
716 return clnt;
717
718out_fail:
719 return clnt;
720}
721
722/*
723 * Clone a server record
724 */
725static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
726{
727 struct nfs_server *server = NFS_SB(sb);
728 struct nfs_server *parent = NFS_SB(data->sb);
729 struct inode *root_inode;
730 struct nfs_fsinfo fsinfo;
731 void *err = ERR_PTR(-ENOMEM);
732
733 sb->s_op = data->sb->s_op;
734 sb->s_blocksize = data->sb->s_blocksize;
735 sb->s_blocksize_bits = data->sb->s_blocksize_bits;
736 sb->s_maxbytes = data->sb->s_maxbytes;
737
738 server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
739 server->io_stats = nfs_alloc_iostats();
740 if (server->io_stats == NULL)
741 goto out;
742
743 server->client = rpc_clone_client(parent->client);
744 if (IS_ERR((err = server->client)))
745 goto out;
746
747 if (!IS_ERR(parent->client_sys)) {
748 server->client_sys = rpc_clone_client(parent->client_sys);
749 if (IS_ERR((err = server->client_sys)))
750 goto out;
751 }
752 if (!IS_ERR(parent->client_acl)) {
753 server->client_acl = rpc_clone_client(parent->client_acl);
754 if (IS_ERR((err = server->client_acl)))
755 goto out;
756 }
757 root_inode = nfs_fhget(sb, data->fh, data->fattr);
758 if (!root_inode)
759 goto out;
760 sb->s_root = d_alloc_root(root_inode);
761 if (!sb->s_root)
762 goto out_put_root;
763 fsinfo.fattr = data->fattr;
764 if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
765 nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
766 sb->s_root->d_op = server->rpc_ops->dentry_ops;
767 sb->s_flags |= MS_ACTIVE;
768 return server;
769out_put_root:
770 iput(root_inode);
771out:
772 return err;
773}
774
775/*
776 * Copy an existing superblock and attach revised data
777 */
778static int nfs_clone_generic_sb(struct nfs_clone_mount *data,
779 struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
780 struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *),
781 struct vfsmount *mnt)
782{
783 struct nfs_server *server;
784 struct nfs_server *parent = NFS_SB(data->sb);
785 struct super_block *sb = ERR_PTR(-EINVAL);
786 char *hostname;
787 int error = -ENOMEM;
788 int len;
789
790 server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
791 if (server == NULL)
792 goto out_err;
793 memcpy(server, parent, sizeof(*server));
794 hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
795 len = strlen(hostname) + 1;
796 server->hostname = kmalloc(len, GFP_KERNEL);
797 if (server->hostname == NULL)
798 goto free_server;
799 memcpy(server->hostname, hostname, len);
800 error = rpciod_up();
801 if (error != 0)
802 goto free_hostname;
803
804 sb = fill_sb(server, data);
805 if (IS_ERR(sb)) {
806 error = PTR_ERR(sb);
807 goto kill_rpciod;
808 }
809
810 if (sb->s_root)
811 goto out_rpciod_down;
812
813 server = fill_server(sb, data);
814 if (IS_ERR(server)) {
815 error = PTR_ERR(server);
816 goto out_deactivate;
817 }
818 return simple_set_mnt(mnt, sb);
819out_deactivate:
820 up_write(&sb->s_umount);
821 deactivate_super(sb);
822 return error;
823out_rpciod_down:
824 rpciod_down();
825 kfree(server->hostname);
826 kfree(server);
827 return simple_set_mnt(mnt, sb);
828kill_rpciod:
829 rpciod_down();
830free_hostname:
831 kfree(server->hostname);
832free_server:
833 kfree(server);
834out_err:
835 return error;
836}
837
838/*
839 * Set up an NFS2/3 superblock
840 *
841 * The way this works is that the mount process passes a structure
842 * in the data argument which contains the server's IP address
843 * and the root file handle obtained from the server's mount
844 * daemon. We stash these away in the private superblock fields.
845 */
846static int
847nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
848{
849 struct nfs_server *server;
850 rpc_authflavor_t authflavor;
851
852 server = NFS_SB(sb);
853 sb->s_blocksize_bits = 0;
854 sb->s_blocksize = 0;
855 if (data->bsize)
856 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
857 if (data->rsize)
858 server->rsize = nfs_block_size(data->rsize, NULL);
859 if (data->wsize)
860 server->wsize = nfs_block_size(data->wsize, NULL);
861 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
862
863 server->acregmin = data->acregmin*HZ;
864 server->acregmax = data->acregmax*HZ;
865 server->acdirmin = data->acdirmin*HZ;
866 server->acdirmax = data->acdirmax*HZ;
867
868 /* Start lockd here, before we might error out */
869 if (!(server->flags & NFS_MOUNT_NONLM))
870 lockd_up();
871
872 server->namelen = data->namlen;
873 server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
874 if (!server->hostname)
875 return -ENOMEM;
876 strcpy(server->hostname, data->hostname);
877
878 /* Check NFS protocol revision and initialize RPC op vector
879 * and file handle pool. */
880#ifdef CONFIG_NFS_V3
881 if (server->flags & NFS_MOUNT_VER3) {
882 server->rpc_ops = &nfs_v3_clientops;
883 server->caps |= NFS_CAP_READDIRPLUS;
884 } else {
885 server->rpc_ops = &nfs_v2_clientops;
886 }
887#else
888 server->rpc_ops = &nfs_v2_clientops;
889#endif
890
891 /* Fill in pseudoflavor for mount version < 5 */
892 if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
893 data->pseudoflavor = RPC_AUTH_UNIX;
894 authflavor = data->pseudoflavor; /* save for sb_init() */
895 /* XXX maybe we want to add a server->pseudoflavor field */
896
897 /* Create RPC client handles */
898 server->client = nfs_create_client(server, data);
899 if (IS_ERR(server->client))
900 return PTR_ERR(server->client);
901 /* RFC 2623, sec 2.3.2 */
902 if (authflavor != RPC_AUTH_UNIX) {
903 struct rpc_auth *auth;
904
905 server->client_sys = rpc_clone_client(server->client);
906 if (IS_ERR(server->client_sys))
907 return PTR_ERR(server->client_sys);
908 auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
909 if (IS_ERR(auth))
910 return PTR_ERR(auth);
911 } else {
912 atomic_inc(&server->client->cl_count);
913 server->client_sys = server->client;
914 }
915 if (server->flags & NFS_MOUNT_VER3) {
916#ifdef CONFIG_NFS_V3_ACL
917 if (!(server->flags & NFS_MOUNT_NOACL)) {
918 server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
919 /* No errors! Assume that Sun nfsacls are supported */
920 if (!IS_ERR(server->client_acl))
921 server->caps |= NFS_CAP_ACLS;
922 }
923#else
924 server->flags &= ~NFS_MOUNT_NOACL;
925#endif /* CONFIG_NFS_V3_ACL */
926 /*
927 * The VFS shouldn't apply the umask to mode bits. We will
928 * do so ourselves when necessary.
929 */
930 sb->s_flags |= MS_POSIXACL;
931 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
932 server->namelen = NFS3_MAXNAMLEN;
933 sb->s_time_gran = 1;
934 } else {
935 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
936 server->namelen = NFS2_MAXNAMLEN;
937 }
938
939 sb->s_op = &nfs_sops;
940 return nfs_sb_init(sb, authflavor);
941}
942
943static int nfs_set_super(struct super_block *s, void *data)
944{
945 s->s_fs_info = data;
946 return set_anon_super(s, data);
947}
948
949static int nfs_compare_super(struct super_block *sb, void *data)
950{
951 struct nfs_server *server = data;
952 struct nfs_server *old = NFS_SB(sb);
953
954 if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
955 return 0;
956 if (old->addr.sin_port != server->addr.sin_port)
957 return 0;
958 return !nfs_compare_fh(&old->fh, &server->fh);
959}
960
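sget() uses this pair of callbacks to share one superblock per (server address, port, root file handle) triple: nfs_compare_super() decides whether an existing superblock matches, nfs_set_super() seeds a new one with the candidate nfs_server. The consequence, visible in nfs_get_sb() below, is that a second mount of the same export reuses the live superblock and throws the tentative server record away:

    s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
    if (s->s_root) {
            /* matched an existing mount: discard our duplicate state */
            rpciod_down();
            kfree(server);
            return simple_set_mnt(mnt, s);
    }
    /* otherwise this is a fresh superblock: fill and activate it */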
961static int nfs_get_sb(struct file_system_type *fs_type,
962 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
963{
964 int error;
965 struct nfs_server *server = NULL;
966 struct super_block *s;
967 struct nfs_fh *root;
968 struct nfs_mount_data *data = raw_data;
969
970 error = -EINVAL;
971 if (data == NULL) {
972 dprintk("%s: missing data argument\n", __FUNCTION__);
973 goto out_err_noserver;
974 }
975 if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
976 dprintk("%s: bad mount version\n", __FUNCTION__);
977 goto out_err_noserver;
978 }
979 switch (data->version) {
980 case 1:
981 data->namlen = 0;
982 case 2:
983 data->bsize = 0;
984 case 3:
985 if (data->flags & NFS_MOUNT_VER3) {
986 dprintk("%s: mount structure version %d does not support NFSv3\n",
987 __FUNCTION__,
988 data->version);
989 goto out_err_noserver;
990 }
991 data->root.size = NFS2_FHSIZE;
992 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
993 case 4:
994 if (data->flags & NFS_MOUNT_SECFLAVOUR) {
995 dprintk("%s: mount structure version %d does not support strong security\n",
996 __FUNCTION__,
997 data->version);
998 goto out_err_noserver;
999 }
1000 case 5:
1001 memset(data->context, 0, sizeof(data->context));
1002 }
1003#ifndef CONFIG_NFS_V3
1004 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
1005 error = -EPROTONOSUPPORT;
1006 if (data->flags & NFS_MOUNT_VER3) {
1007 dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
1008 goto out_err_noserver;
1009 }
1010#endif /* CONFIG_NFS_V3 */
1011
1012 error = -ENOMEM;
1013 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
1014 if (!server)
1015 goto out_err_noserver;
1016 /* Zero out the NFS state stuff */
1017 init_nfsv4_state(server);
1018 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
1019
1020 root = &server->fh;
1021 if (data->flags & NFS_MOUNT_VER3)
1022 root->size = data->root.size;
1023 else
1024 root->size = NFS2_FHSIZE;
1025 error = -EINVAL;
1026 if (root->size > sizeof(root->data)) {
1027 dprintk("%s: invalid root filehandle\n", __FUNCTION__);
1028 goto out_err;
1029 }
1030 memcpy(root->data, data->root.data, root->size);
1031
1032 /* We now require that the mount process passes the remote address */
1033 memcpy(&server->addr, &data->addr, sizeof(server->addr));
1034 if (server->addr.sin_addr.s_addr == INADDR_ANY) {
1035 dprintk("%s: mount program didn't pass remote address!\n",
1036 __FUNCTION__);
1037 goto out_err;
1038 }
1039
1040 /* Fire up rpciod if not yet running */
1041 error = rpciod_up();
1042 if (error < 0) {
1043 dprintk("%s: couldn't start rpciod! Error = %d\n",
1044 __FUNCTION__, error);
1045 goto out_err;
1046 }
1047
1048 s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
1049 if (IS_ERR(s)) {
1050 error = PTR_ERR(s);
1051 goto out_err_rpciod;
1052 }
1053
1054 if (s->s_root)
1055 goto out_rpciod_down;
1056
1057 s->s_flags = flags;
1058
1059 error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1060 if (error) {
1061 up_write(&s->s_umount);
1062 deactivate_super(s);
1063 return error;
1064 }
1065 s->s_flags |= MS_ACTIVE;
1066 return simple_set_mnt(mnt, s);
1067
1068out_rpciod_down:
1069 rpciod_down();
1070 kfree(server);
1071 return simple_set_mnt(mnt, s);
1072
1073out_err_rpciod:
1074 rpciod_down();
1075out_err:
1076 kfree(server);
1077out_err_noserver:
1078 return error;
1079}
1080
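The switch in nfs_get_sb() above relies on deliberate fall-through: each case initializes the fields its mount-data version lacked, then falls into the next, so an old structure is upgraded step by step until it looks like a current one. The same logic with the fall-throughs annotated (the NFS_MOUNT_VER3 and NFS_MOUNT_SECFLAVOUR rejection checks omitted for brevity):

	switch (data->version) {
	case 1:
		data->namlen = 0;			/* v1 had no namlen */
		/* fall through */
	case 2:
		data->bsize = 0;			/* v2 had no bsize */
		/* fall through */
	case 3:						/* v3 used a fixed-size root fh */
		data->root.size = NFS2_FHSIZE;
		memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
		/* fall through */
	case 4:						/* v4 added auth flavours */
		/* fall through */
	case 5:
		memset(data->context, 0, sizeof(data->context));
	}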
1081static void nfs_kill_super(struct super_block *s)
1082{
1083 struct nfs_server *server = NFS_SB(s);
1084
1085 kill_anon_super(s);
1086
1087 if (!IS_ERR(server->client))
1088 rpc_shutdown_client(server->client);
1089 if (!IS_ERR(server->client_sys))
1090 rpc_shutdown_client(server->client_sys);
1091 if (!IS_ERR(server->client_acl))
1092 rpc_shutdown_client(server->client_acl);
1093
1094 if (!(server->flags & NFS_MOUNT_NONLM))
1095 lockd_down(); /* release rpc.lockd */
1096
1097 rpciod_down(); /* release rpciod */
1098
1099 nfs_free_iostats(server->io_stats);
1100 kfree(server->hostname);
1101 kfree(server);
1102 nfs_release_automount_timer();
1103}
1104
1105static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
1106{
1107 struct super_block *sb;
1108
1109 server->fsid = data->fattr->fsid;
1110 nfs_copy_fh(&server->fh, data->fh);
1111 sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
1112 if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
1113 lockd_up();
1114 return sb;
1115}
1116
1117static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
1118 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1119{
1120 struct nfs_clone_mount *data = raw_data;
1121 return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server, mnt);
1122}
1123
1124#ifdef CONFIG_NFS_V4
1125static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
1126 struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
1127{
1128 struct nfs4_client *clp;
1129 struct rpc_xprt *xprt = NULL;
1130 struct rpc_clnt *clnt = NULL;
1131 int err = -EIO;
1132
1133 clp = nfs4_get_client(&server->addr.sin_addr);
1134 if (!clp) {
1135 dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
1136 return ERR_PTR(err);
1137 }
1138
1139 /* Now create transport and client */
1140 down_write(&clp->cl_sem);
1141 if (IS_ERR(clp->cl_rpcclient)) {
1142 xprt = xprt_create_proto(proto, &server->addr, timeparms);
1143 if (IS_ERR(xprt)) {
1144 up_write(&clp->cl_sem);
1145 err = PTR_ERR(xprt);
1146 dprintk("%s: cannot create RPC transport. Error = %d\n",
1147 __FUNCTION__, err);
1148 goto out_fail;
1149 }
1150 /* Bind to a reserved port! */
1151 xprt->resvport = 1;
1152 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
1153 server->rpc_ops->version, flavor);
1154 if (IS_ERR(clnt)) {
1155 up_write(&clp->cl_sem);
1156 err = PTR_ERR(clnt);
1157 dprintk("%s: cannot create RPC client. Error = %d\n",
1158 __FUNCTION__, err);
1159 goto out_fail;
1160 }
1161 clnt->cl_intr = 1;
1162 clnt->cl_softrtry = 1;
1163 clp->cl_rpcclient = clnt;
1164 memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
1165 nfs_idmap_new(clp);
1166 }
1167 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1168 clnt = rpc_clone_client(clp->cl_rpcclient);
1169 if (!IS_ERR(clnt))
1170 server->nfs4_state = clp;
1171 up_write(&clp->cl_sem);
1172 clp = NULL;
1173
1174 if (IS_ERR(clnt)) {
1175 dprintk("%s: cannot create RPC client. Error = %d\n",
 1176 __FUNCTION__, (int)PTR_ERR(clnt));
1177 return clnt;
1178 }
1179
1180 if (server->nfs4_state->cl_idmap == NULL) {
1181 dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
1182 return ERR_PTR(-ENOMEM);
1183 }
1184
1185 if (clnt->cl_auth->au_flavor != flavor) {
1186 struct rpc_auth *auth;
1187
1188 auth = rpcauth_create(flavor, clnt);
1189 if (IS_ERR(auth)) {
1190 dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
1191 return (struct rpc_clnt *)auth;
1192 }
1193 }
1194 return clnt;
1195
1196 out_fail:
1197 if (clp)
1198 nfs4_put_client(clp);
1199 return ERR_PTR(err);
1200}
1201
1202/*
1203 * Set up an NFS4 superblock
1204 */
1205static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
1206{
1207 struct nfs_server *server;
1208 struct rpc_timeout timeparms;
1209 rpc_authflavor_t authflavour;
1210 int err = -EIO;
1211
1212 sb->s_blocksize_bits = 0;
1213 sb->s_blocksize = 0;
1214 server = NFS_SB(sb);
1215 if (data->rsize != 0)
1216 server->rsize = nfs_block_size(data->rsize, NULL);
1217 if (data->wsize != 0)
1218 server->wsize = nfs_block_size(data->wsize, NULL);
1219 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
1220 server->caps = NFS_CAP_ATOMIC_OPEN;
1221
1222 server->acregmin = data->acregmin*HZ;
1223 server->acregmax = data->acregmax*HZ;
1224 server->acdirmin = data->acdirmin*HZ;
1225 server->acdirmax = data->acdirmax*HZ;
1226
1227 server->rpc_ops = &nfs_v4_clientops;
1228
1229 nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
1230
1231 server->retrans_timeo = timeparms.to_initval;
1232 server->retrans_count = timeparms.to_retries;
1233
1234 /* Now create transport and client */
1235 authflavour = RPC_AUTH_UNIX;
1236 if (data->auth_flavourlen != 0) {
1237 if (data->auth_flavourlen != 1) {
1238 dprintk("%s: Invalid number of RPC auth flavours %d.\n",
1239 __FUNCTION__, data->auth_flavourlen);
1240 err = -EINVAL;
1241 goto out_fail;
1242 }
1243 if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
1244 err = -EFAULT;
1245 goto out_fail;
1246 }
1247 }
1248
1249 server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
1250 if (IS_ERR(server->client)) {
1251 err = PTR_ERR(server->client);
1252 dprintk("%s: cannot create RPC client. Error = %d\n",
1253 __FUNCTION__, err);
1254 goto out_fail;
1255 }
1256
1257 sb->s_time_gran = 1;
1258
1259 sb->s_op = &nfs4_sops;
1260 err = nfs_sb_init(sb, authflavour);
1261
1262 out_fail:
1263 return err;
1264}
1265
1266static int nfs4_compare_super(struct super_block *sb, void *data)
1267{
1268 struct nfs_server *server = data;
1269 struct nfs_server *old = NFS_SB(sb);
1270
1271 if (strcmp(server->hostname, old->hostname) != 0)
1272 return 0;
1273 if (strcmp(server->mnt_path, old->mnt_path) != 0)
1274 return 0;
1275 return 1;
1276}
1277
1278static void *
1279nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
1280{
1281 void *p = NULL;
1282
1283 if (!src->len)
1284 return ERR_PTR(-EINVAL);
1285 if (src->len < maxlen)
1286 maxlen = src->len;
1287 if (dst == NULL) {
1288 p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
1289 if (p == NULL)
1290 return ERR_PTR(-ENOMEM);
1291 }
1292 if (copy_from_user(dst, src->data, maxlen)) {
1293 kfree(p);
1294 return ERR_PTR(-EFAULT);
1295 }
1296 dst[maxlen] = '\0';
1297 return dst;
1298}
1299
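nfs_copy_user_string() works in two modes for the callers below: with dst == NULL it kmallocs a buffer of maxlen + 1 bytes; with a non-NULL dst it fills a caller-owned buffer that must have room for maxlen + 1 bytes. Either way the result is NUL-terminated and failures come back as ERR_PTR() values. Usage sketch:

	/* allocating mode - the caller kfree()s the result */
	char *host = nfs_copy_user_string(NULL, &data->hostname, 256);
	if (IS_ERR(host))
		return PTR_ERR(host);

	/* in-place mode - note the "- 1" so the terminating NUL still fits */
	p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
				 sizeof(server->ip_addr) - 1);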
1300static int nfs4_get_sb(struct file_system_type *fs_type,
1301 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1302{
1303 int error;
1304 struct nfs_server *server;
1305 struct super_block *s;
1306 struct nfs4_mount_data *data = raw_data;
1307 void *p;
1308
1309 if (data == NULL) {
1310 dprintk("%s: missing data argument\n", __FUNCTION__);
1311 return -EINVAL;
1312 }
1313 if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
1314 dprintk("%s: bad mount version\n", __FUNCTION__);
1315 return -EINVAL;
1316 }
1317
1318 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
1319 if (!server)
1320 return -ENOMEM;
1321 /* Zero out the NFS state stuff */
1322 init_nfsv4_state(server);
1323 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
1324
1325 p = nfs_copy_user_string(NULL, &data->hostname, 256);
1326 if (IS_ERR(p))
1327 goto out_err;
1328 server->hostname = p;
1329
1330 p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
1331 if (IS_ERR(p))
1332 goto out_err;
1333 server->mnt_path = p;
1334
1335 p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
1336 sizeof(server->ip_addr) - 1);
1337 if (IS_ERR(p))
1338 goto out_err;
1339
1340 /* We now require that the mount process passes the remote address */
1341 if (data->host_addrlen != sizeof(server->addr)) {
1342 error = -EINVAL;
1343 goto out_free;
1344 }
1345 if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
1346 error = -EFAULT;
1347 goto out_free;
1348 }
1349 if (server->addr.sin_family != AF_INET ||
1350 server->addr.sin_addr.s_addr == INADDR_ANY) {
1351 dprintk("%s: mount program didn't pass remote IP address!\n",
1352 __FUNCTION__);
1353 error = -EINVAL;
1354 goto out_free;
1355 }
1356
1357 /* Fire up rpciod if not yet running */
1358 error = rpciod_up();
1359 if (error < 0) {
1360 dprintk("%s: couldn't start rpciod! Error = %d\n",
1361 __FUNCTION__, error);
1362 goto out_free;
1363 }
1364
1365 s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
1366
1367 if (IS_ERR(s)) {
1368 error = PTR_ERR(s);
1369 goto out_free;
1370 }
1371
1372 if (s->s_root) {
1373 kfree(server->mnt_path);
1374 kfree(server->hostname);
1375 kfree(server);
1376 return simple_set_mnt(mnt, s);
1377 }
1378
1379 s->s_flags = flags;
1380
1381 error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1382 if (error) {
1383 up_write(&s->s_umount);
1384 deactivate_super(s);
1385 return error;
1386 }
1387 s->s_flags |= MS_ACTIVE;
1388 return simple_set_mnt(mnt, s);
1389out_err:
1390 error = PTR_ERR(p);
1391out_free:
1392 kfree(server->mnt_path);
1393 kfree(server->hostname);
1394 kfree(server);
1395 return error;
1396}
1397
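One asymmetry against nfs_get_sb() is worth noting: once rpciod_up() has succeeded in nfs4_get_sb(), neither the sget() failure path nor the "s->s_root already set" path calls rpciod_down(), so both appear to leak an rpciod reference. A balanced unwind would mirror the v2/v3 function (sketch; out_err_rpciod is a hypothetical label, not in the patch):

	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
	if (IS_ERR(s)) {
		error = PTR_ERR(s);
		goto out_err_rpciod;
	}
	/* ... */
out_err_rpciod:
	rpciod_down();
	goto out_free;		/* then reuse the existing cleanup */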
1398static void nfs4_kill_super(struct super_block *sb)
1399{
1400 struct nfs_server *server = NFS_SB(sb);
1401
1402 nfs_return_all_delegations(sb);
1403 kill_anon_super(sb);
1404
1405 nfs4_renewd_prepare_shutdown(server);
1406
1407 if (server->client != NULL && !IS_ERR(server->client))
1408 rpc_shutdown_client(server->client);
1409
1410 destroy_nfsv4_state(server);
1411
1412 rpciod_down();
1413
1414 nfs_free_iostats(server->io_stats);
1415 kfree(server->hostname);
1416 kfree(server);
1417 nfs_release_automount_timer();
1418}
1419
1420/*
1421 * Constructs the SERVER-side path
1422 */
1423static inline char *nfs4_dup_path(const struct dentry *dentry)
1424{
1425 char *page = (char *) __get_free_page(GFP_USER);
1426 char *path;
1427
1428 path = nfs4_path(dentry, page, PAGE_SIZE);
1429 if (!IS_ERR(path)) {
1430 int len = PAGE_SIZE + page - path;
1431 char *tmp = path;
1432
1433 path = kmalloc(len, GFP_KERNEL);
1434 if (path)
1435 memcpy(path, tmp, len);
1436 else
1437 path = ERR_PTR(-ENOMEM);
1438 }
1439 free_page((unsigned long)page);
1440 return path;
1441}
1442
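The length arithmetic in nfs4_dup_path() assumes that nfs4_path() builds the string backwards from the end of the scratch page, so the result occupies exactly the final PAGE_SIZE + page - path bytes, NUL included. Illustrative layout:

	/*
	 *  page                        path              page + PAGE_SIZE
	 *   |--------- unused ----------|"/export/home\0"|
	 *
	 *  len = PAGE_SIZE + page - path == strlen(path) + 1
	 *
	 * Note that __get_free_page() can return NULL here; as written, the
	 * helper relies on nfs4_path() tolerating a NULL buffer.
	 */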
1443static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
1444{
1445 const struct dentry *dentry = data->dentry;
1446 struct nfs4_client *clp = server->nfs4_state;
1447 struct super_block *sb;
1448
1449 server->fsid = data->fattr->fsid;
1450 nfs_copy_fh(&server->fh, data->fh);
1451 server->mnt_path = nfs4_dup_path(dentry);
1452 if (IS_ERR(server->mnt_path)) {
1453 sb = (struct super_block *)server->mnt_path;
1454 goto err;
1455 }
1456 sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
1457 if (IS_ERR(sb) || sb->s_root)
1458 goto free_path;
1459 nfs4_server_capabilities(server, &server->fh);
1460
1461 down_write(&clp->cl_sem);
1462 atomic_inc(&clp->cl_count);
1463 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1464 up_write(&clp->cl_sem);
1465 return sb;
1466free_path:
1467 kfree(server->mnt_path);
1468err:
1469 server->mnt_path = NULL;
1470 return sb;
1471}
1472
1473static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
1474 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1475{
1476 struct nfs_clone_mount *data = raw_data;
1477 return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server, mnt);
1478}
1479
1480static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
1481{
1482 struct super_block *sb = ERR_PTR(-ENOMEM);
1483 int len;
1484
1485 len = strlen(data->mnt_path) + 1;
1486 server->mnt_path = kmalloc(len, GFP_KERNEL);
1487 if (server->mnt_path == NULL)
1488 goto err;
1489 memcpy(server->mnt_path, data->mnt_path, len);
1490 memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
1491
1492 sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
1493 if (IS_ERR(sb) || sb->s_root)
1494 goto free_path;
1495 return sb;
1496free_path:
1497 kfree(server->mnt_path);
1498err:
1499 server->mnt_path = NULL;
1500 return sb;
1501}
1502
1503static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
1504{
1505 struct nfs_server *server = NFS_SB(sb);
1506 struct rpc_timeout timeparms;
1507 int proto, timeo, retrans;
1508 void *err;
1509
1510 proto = IPPROTO_TCP;
1511 /* Since we are following a referral and there may be alternatives,
1512 set the timeouts and retries to low values */
1513 timeo = 2;
1514 retrans = 1;
1515 nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
1516
1517 server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
1518 if (IS_ERR((err = server->client)))
1519 goto out_err;
1520
1521 sb->s_time_gran = 1;
1522 sb->s_op = &nfs4_sops;
1523 err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
1524 if (!IS_ERR(err))
1525 return server;
1526out_err:
1527 return (struct nfs_server *)err;
1528}
1529
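The deliberately small values above assume the usual RPC convention that timeo is expressed in tenths of a second, as with the timeo= mount option:

	/* timeo = 2, retrans = 1  =>  ~0.2s initial timeout plus one retry,
	 * so an unresponsive referral target is abandoned in well under a
	 * second, leaving time to try the other locations returned in the
	 * fs_locations attribute. */
	nfs_init_timeout_values(&timeparms, IPPROTO_TCP, 2, 1);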
1530static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
1531 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1532{
1533 struct nfs_clone_mount *data = raw_data;
1534 return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server, mnt);
1535}
1536
1537#endif
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 18dc95b0b646..600bbe630abd 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
52{ 52{
53 struct inode *inode = dentry->d_inode; 53 struct inode *inode = dentry->d_inode;
54 struct page *page; 54 struct page *page;
55 void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode)); 55 void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
56 if (err) 56 if (err)
57 goto read_failed; 57 goto read_failed;
58 page = read_cache_page(&inode->i_data, 0, 58 page = read_cache_page(&inode->i_data, 0,
@@ -75,22 +75,13 @@ read_failed:
75 return NULL; 75 return NULL;
76} 76}
77 77
78static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
79{
80 if (cookie) {
81 struct page *page = cookie;
82 kunmap(page);
83 page_cache_release(page);
84 }
85}
86
87/* 78/*
88 * symlinks can't do much... 79 * symlinks can't do much...
89 */ 80 */
90struct inode_operations nfs_symlink_inode_operations = { 81struct inode_operations nfs_symlink_inode_operations = {
91 .readlink = generic_readlink, 82 .readlink = generic_readlink,
92 .follow_link = nfs_follow_link, 83 .follow_link = nfs_follow_link,
93 .put_link = nfs_put_link, 84 .put_link = page_put_link,
94 .getattr = nfs_getattr, 85 .getattr = nfs_getattr,
95 .setattr = nfs_setattr, 86 .setattr = nfs_setattr,
96}; 87};
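The dropped nfs_put_link() duplicated the generic page helper line for line; for reference, page_put_link() in fs/namei.c of this era is essentially:

void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
}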
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 4c486eb867ca..db61e51bb154 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nfs4.h> 13#include <linux/nfs4.h>
14#include <linux/nfs_idmap.h> 14#include <linux/nfs_idmap.h>
15#include <linux/nfs_fs.h>
15 16
16#include "callback.h" 17#include "callback.h"
17 18
@@ -46,6 +47,15 @@ static ctl_table nfs_cb_sysctls[] = {
46 .strategy = &sysctl_jiffies, 47 .strategy = &sysctl_jiffies,
47 }, 48 },
48#endif 49#endif
50 {
51 .ctl_name = CTL_UNNUMBERED,
52 .procname = "nfs_mountpoint_timeout",
53 .data = &nfs_mountpoint_expiry_timeout,
54 .maxlen = sizeof(nfs_mountpoint_expiry_timeout),
55 .mode = 0644,
56 .proc_handler = &proc_dointvec_jiffies,
57 .strategy = &sysctl_jiffies,
58 },
49 { .ctl_name = 0 } 59 { .ctl_name = 0 }
50}; 60};
51 61
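proc_dointvec_jiffies converts between seconds in userspace and jiffies in the kernel, so the new knob is read and written in seconds while nfs_mountpoint_expiry_timeout stays in jiffies internally. Semantics sketch (the /proc path is inferred from this table being registered under fs.nfs):

	/* echo 300 > /proc/sys/fs/nfs/nfs_mountpoint_timeout  results in: */
	nfs_mountpoint_expiry_timeout = 300 * HZ;	/* i.e. five minutes */
	/* and reading the file performs the inverse division by HZ */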
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cfada2cc09f..8fccb9cb173b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -98,11 +98,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
98 if (p) { 98 if (p) {
99 memset(p, 0, sizeof(*p)); 99 memset(p, 0, sizeof(*p));
100 INIT_LIST_HEAD(&p->pages); 100 INIT_LIST_HEAD(&p->pages);
101 if (pagecount < NFS_PAGEVEC_SIZE) 101 if (pagecount <= ARRAY_SIZE(p->page_array))
102 p->pagevec = &p->page_array[0]; 102 p->pagevec = p->page_array;
103 else { 103 else {
104 size_t size = ++pagecount * sizeof(struct page *); 104 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
105 p->pagevec = kzalloc(size, GFP_NOFS);
106 if (!p->pagevec) { 105 if (!p->pagevec) {
107 mempool_free(p, nfs_commit_mempool); 106 mempool_free(p, nfs_commit_mempool);
108 p = NULL; 107 p = NULL;
@@ -126,14 +125,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
126 if (p) { 125 if (p) {
127 memset(p, 0, sizeof(*p)); 126 memset(p, 0, sizeof(*p));
128 INIT_LIST_HEAD(&p->pages); 127 INIT_LIST_HEAD(&p->pages);
129 if (pagecount < NFS_PAGEVEC_SIZE) 128 if (pagecount <= ARRAY_SIZE(p->page_array))
130 p->pagevec = &p->page_array[0]; 129 p->pagevec = p->page_array;
131 else { 130 else {
132 size_t size = ++pagecount * sizeof(struct page *); 131 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
133 p->pagevec = kmalloc(size, GFP_NOFS); 132 if (!p->pagevec) {
134 if (p->pagevec) {
135 memset(p->pagevec, 0, size);
136 } else {
137 mempool_free(p, nfs_wdata_mempool); 133 mempool_free(p, nfs_wdata_mempool);
138 p = NULL; 134 p = NULL;
139 } 135 }
@@ -583,6 +579,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
583 return ret; 579 return ret;
584} 580}
585 581
582static void nfs_cancel_requests(struct list_head *head)
583{
584 struct nfs_page *req;
 585 while (!list_empty(head)) {
586 req = nfs_list_entry(head->next);
587 nfs_list_remove_request(req);
588 nfs_inode_remove_request(req);
589 nfs_clear_page_writeback(req);
590 }
591}
592
586/* 593/*
587 * nfs_scan_dirty - Scan an inode for dirty requests 594 * nfs_scan_dirty - Scan an inode for dirty requests
588 * @inode: NFS inode to scan 595 * @inode: NFS inode to scan
@@ -627,7 +634,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
627 int res = 0; 634 int res = 0;
628 635
629 if (nfsi->ncommit != 0) { 636 if (nfsi->ncommit != 0) {
630 res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); 637 res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
631 nfsi->ncommit -= res; 638 nfsi->ncommit -= res;
632 if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) 639 if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
633 printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); 640 printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -1495,15 +1502,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
1495 pages = nfs_scan_dirty(inode, &head, idx_start, npages); 1502 pages = nfs_scan_dirty(inode, &head, idx_start, npages);
1496 if (pages != 0) { 1503 if (pages != 0) {
1497 spin_unlock(&nfsi->req_lock); 1504 spin_unlock(&nfsi->req_lock);
1498 ret = nfs_flush_list(inode, &head, pages, how); 1505 if (how & FLUSH_INVALIDATE)
1506 nfs_cancel_requests(&head);
1507 else
1508 ret = nfs_flush_list(inode, &head, pages, how);
1499 spin_lock(&nfsi->req_lock); 1509 spin_lock(&nfsi->req_lock);
1500 continue; 1510 continue;
1501 } 1511 }
1502 if (nocommit) 1512 if (nocommit)
1503 break; 1513 break;
1504 pages = nfs_scan_commit(inode, &head, 0, 0); 1514 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1505 if (pages == 0) 1515 if (pages == 0)
1506 break; 1516 break;
1517 if (how & FLUSH_INVALIDATE) {
1518 spin_unlock(&nfsi->req_lock);
1519 nfs_cancel_requests(&head);
1520 spin_lock(&nfsi->req_lock);
1521 continue;
1522 }
1523 pages += nfs_scan_commit(inode, &head, 0, 0);
1507 spin_unlock(&nfsi->req_lock); 1524 spin_unlock(&nfsi->req_lock);
1508 ret = nfs_commit_list(inode, &head, how); 1525 ret = nfs_commit_list(inode, &head, how);
1509 spin_lock(&nfsi->req_lock); 1526 spin_lock(&nfsi->req_lock);
@@ -1512,7 +1529,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
1512 return ret; 1529 return ret;
1513} 1530}
1514 1531
1515int nfs_init_writepagecache(void) 1532int __init nfs_init_writepagecache(void)
1516{ 1533{
1517 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1534 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1518 sizeof(struct nfs_write_data), 1535 sizeof(struct nfs_write_data),
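Both write.c allocators now share one pattern: small requests use the page array embedded in the request structure, larger ones fall back to kcalloc(), which zeroes the vector and, unlike the removed open-coded ++pagecount * sizeof(...) arithmetic, returns NULL instead of wrapping if the multiplication would overflow. The pattern in isolation (field names as in the era's struct nfs_write_data):

	if (pagecount <= ARRAY_SIZE(p->page_array))
		p->pagevec = p->page_array;	/* common case: no extra alloc */
	else {
		p->pagevec = kcalloc(pagecount, sizeof(struct page *),
				     GFP_NOFS);	/* zeroed, overflow-checked */
		if (!p->pagevec) {
			mempool_free(p, nfs_wdata_mempool);
			p = NULL;
		}
	}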
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 96c7578cbe1e..7c7d01672d35 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
123 */ 123 */
124 124
125/* recall_lock protects the del_recall_lru */ 125/* recall_lock protects the del_recall_lru */
126static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED; 126static DEFINE_SPINLOCK(recall_lock);
127static struct list_head del_recall_lru; 127static struct list_head del_recall_lru;
128 128
129static void 129static void
@@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp)
529 529
530 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); 530 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
531 list_del_init(&clp->cl_strhash); 531 list_del_init(&clp->cl_strhash);
532 list_del_init(&clp->cl_idhash); 532 list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
533 list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
534 strhashval = clientstr_hashval(clp->cl_recdir); 533 strhashval = clientstr_hashval(clp->cl_recdir);
535 list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); 534 list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
536 renew_client(clp); 535 renew_client(clp);
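list_move() and list_move_tail() are exact replacements for the open-coded del + add pairs removed throughout this patch; the list_del_init() was redundant because the entry is immediately re-linked. From <linux/list.h>:

static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add(list, head);
}

static inline void list_move_tail(struct list_head *list,
				  struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add_tail(list, head);
}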
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index d852ebb538e3..fdf7cf3dfadc 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -103,8 +103,7 @@ nfsd_cache_shutdown(void)
103static void 103static void
104lru_put_end(struct svc_cacherep *rp) 104lru_put_end(struct svc_cacherep *rp)
105{ 105{
106 list_del(&rp->c_lru); 106 list_move_tail(&rp->c_lru, &lru_head);
107 list_add_tail(&rp->c_lru, &lru_head);
108} 107}
109 108
110/* 109/*
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 88292f9e4b9b..2e42c2dcae12 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1358,7 +1358,7 @@ err_out:
1358 goto out; 1358 goto out;
1359} 1359}
1360 1360
1361static size_t __ntfs_copy_from_user_iovec(char *vaddr, 1361static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
1362 const struct iovec *iov, size_t iov_ofs, size_t bytes) 1362 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1363{ 1363{
1364 size_t total = 0; 1364 size_t total = 0;
@@ -1376,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr,
1376 bytes -= len; 1376 bytes -= len;
1377 vaddr += len; 1377 vaddr += len;
1378 if (unlikely(left)) { 1378 if (unlikely(left)) {
1379 /*
1380 * Zero the rest of the target like __copy_from_user().
1381 */
1382 memset(vaddr, 0, bytes);
1383 total -= left; 1379 total -= left;
1384 break; 1380 break;
1385 } 1381 }
@@ -1420,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1420 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1416 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1421 * single-segment behaviour. 1417 * single-segment behaviour.
1422 * 1418 *
1423 * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and 1419 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
1424 * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls 1420 * when atomic and when not atomic. This is ok because
1425 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In 1421 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
1426 * fact, the only difference between __copy_from_user_inatomic() and 1422 * and it is ok to call this when non-atomic.
1427 * __copy_from_user() is that the latter calls might_sleep(). And on many 1423 * In fact, the only difference between __copy_from_user_inatomic() and
1424 * __copy_from_user() is that the latter calls might_sleep() and the former
1425 * should not zero the tail of the buffer on error. And on many
1428 * architectures __copy_from_user_inatomic() is just defined to 1426 * architectures __copy_from_user_inatomic() is just defined to
1429 * __copy_from_user() so it makes no difference at all on those architectures. 1427 * __copy_from_user() so it makes no difference at all on those architectures.
1430 */ 1428 */
@@ -1441,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1441 if (len > bytes) 1439 if (len > bytes)
1442 len = bytes; 1440 len = bytes;
1443 kaddr = kmap_atomic(*pages, KM_USER0); 1441 kaddr = kmap_atomic(*pages, KM_USER0);
1444 copied = __ntfs_copy_from_user_iovec(kaddr + ofs, 1442 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
1445 *iov, *iov_ofs, len); 1443 *iov, *iov_ofs, len);
1446 kunmap_atomic(kaddr, KM_USER0); 1444 kunmap_atomic(kaddr, KM_USER0);
1447 if (unlikely(copied != len)) { 1445 if (unlikely(copied != len)) {
1448 /* Do it the slow way. */ 1446 /* Do it the slow way. */
1449 kaddr = kmap(*pages); 1447 kaddr = kmap(*pages);
1450 copied = __ntfs_copy_from_user_iovec(kaddr + ofs, 1448 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
1451 *iov, *iov_ofs, len); 1449 *iov, *iov_ofs, len);
1450 /*
1451 * Zero the rest of the target like __copy_from_user().
1452 */
1453 memset(kaddr + ofs + copied, 0, len - copied);
1452 kunmap(*pages); 1454 kunmap(*pages);
1453 if (unlikely(copied != len)) 1455 if (unlikely(copied != len))
1454 goto err_out; 1456 goto err_out;
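The behavioural point behind this hunk: __copy_from_user() zero-fills the destination tail when the copy faults, while __copy_from_user_inatomic() no longer does. The atomic fast path may therefore leave a partially filled buffer, and the kmap() slow path now compensates by hand:

	/* emulate __copy_from_user()'s zero-the-tail-on-fault behaviour
	 * after the non-atomic retry: */
	memset(kaddr + ofs + copied, 0, len - copied);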
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21f38accd039..1d26cfcd9f84 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem);
54 * multiple hb threads are watching multiple regions. A node is live 54 * multiple hb threads are watching multiple regions. A node is live
55 * whenever any of the threads sees activity from the node in its region. 55 * whenever any of the threads sees activity from the node in its region.
56 */ 56 */
57static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; 57static DEFINE_SPINLOCK(o2hb_live_lock);
58static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; 58static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
59static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 59static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events); 60static LIST_HEAD(o2hb_node_events);
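The SPIN_LOCK_UNLOCKED conversions scattered through this patch are groundwork for the lock validator: a shared static initializer cannot give each lock its own class, whereas the DEFINE_SPINLOCK()/DEFINE_RWLOCK() macros can. In a plain build the expansion is the same thing it replaces:

	/* static DEFINE_SPINLOCK(o2hb_live_lock);  expands, roughly, to: */
	static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
	/* but the macro form lets debug variants attach per-lock state
	 * without touching every declaration site again */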
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f60cc0d3985..1591eb37a723 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@
108 ##args); \ 108 ##args); \
109} while (0) 109} while (0)
110 110
111static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; 111static DEFINE_RWLOCK(o2net_handler_lock);
112static struct rb_root o2net_handler_tree = RB_ROOT; 112static struct rb_root o2net_handler_tree = RB_ROOT;
113 113
114static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; 114static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 355593dd8ef8..42775e2bbe2c 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
197 lock->ml.node == dlm->node_num ? "master" : 197 lock->ml.node == dlm->node_num ? "master" :
198 "remote"); 198 "remote");
199 memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); 199 memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
200 } else if (lksb->flags & DLM_LKSB_PUT_LVB) {
201 mlog(0, "setting lvb from lockres for %s node\n",
202 lock->ml.node == dlm->node_num ? "master" :
203 "remote");
204 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
205 } 200 }
201 /* Do nothing for lvb put requests - they should be done in
202 * place when the lock is downconverted - otherwise we risk
203 * racing gets and puts which could result in old lvb data
204 * being propagated. We leave the put flag set and clear it
205 * here. In the future we might want to clear it at the time
206 * the put is actually done.
207 */
206 spin_unlock(&res->spinlock); 208 spin_unlock(&res->spinlock);
207 } 209 }
208 210
@@ -381,8 +383,7 @@ do_ast:
381 ret = DLM_NORMAL; 383 ret = DLM_NORMAL;
382 if (past->type == DLM_AST) { 384 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 385 /* do not alter lock refcount. switching lists. */
384 list_del_init(&lock->list); 386 list_move_tail(&lock->list, &res->granted);
385 list_add_tail(&lock->list, &res->granted);
386 mlog(0, "ast: adding to granted list... type=%d, " 387 mlog(0, "ast: adding to granted list... type=%d, "
387 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 388 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
388 if (lock->ml.convert_type != LKM_IVMODE) { 389 if (lock->ml.convert_type != LKM_IVMODE) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 88cc43df18f1..9bdc9cf65991 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,17 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) 40#define DLM_HASH_SIZE_DEFAULT (1 << 14)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1
43#else
44# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
45#endif
46#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head))
47#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
48
49/* Intended to make it easier for us to switch out hash functions */
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
41 51
42enum dlm_ast_type { 52enum dlm_ast_type {
43 DLM_AST = 0, 53 DLM_AST = 0,
@@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
61 return 0; 71 return 0;
62} 72}
63 73
64#define DLM_RECO_STATE_ACTIVE 0x0001 74#define DLM_RECO_STATE_ACTIVE 0x0001
75#define DLM_RECO_STATE_FINALIZE 0x0002
65 76
66struct dlm_recovery_ctxt 77struct dlm_recovery_ctxt
67{ 78{
@@ -85,7 +96,7 @@ enum dlm_ctxt_state {
85struct dlm_ctxt 96struct dlm_ctxt
86{ 97{
87 struct list_head list; 98 struct list_head list;
88 struct hlist_head *lockres_hash; 99 struct hlist_head **lockres_hash;
89 struct list_head dirty_list; 100 struct list_head dirty_list;
90 struct list_head purge_list; 101 struct list_head purge_list;
91 struct list_head pending_asts; 102 struct list_head pending_asts;
@@ -120,6 +131,7 @@ struct dlm_ctxt
120 struct o2hb_callback_func dlm_hb_down; 131 struct o2hb_callback_func dlm_hb_down;
121 struct task_struct *dlm_thread_task; 132 struct task_struct *dlm_thread_task;
122 struct task_struct *dlm_reco_thread_task; 133 struct task_struct *dlm_reco_thread_task;
134 struct workqueue_struct *dlm_worker;
123 wait_queue_head_t dlm_thread_wq; 135 wait_queue_head_t dlm_thread_wq;
124 wait_queue_head_t dlm_reco_thread_wq; 136 wait_queue_head_t dlm_reco_thread_wq;
125 wait_queue_head_t ast_wq; 137 wait_queue_head_t ast_wq;
@@ -132,6 +144,11 @@ struct dlm_ctxt
132 struct list_head dlm_eviction_callbacks; 144 struct list_head dlm_eviction_callbacks;
133}; 145};
134 146
147static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
148{
149 return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
150}
151
135/* these keventd work queue items are for less-frequently 152/* these keventd work queue items are for less-frequently
136 * called functions that cannot be directly called from the 153 * called functions that cannot be directly called from the
137 * net message handlers for some reason, usually because 154 * net message handlers for some reason, usually because
@@ -216,20 +233,29 @@ struct dlm_lock_resource
216 /* WARNING: Please see the comment in dlm_init_lockres before 233 /* WARNING: Please see the comment in dlm_init_lockres before
217 * adding fields here. */ 234 * adding fields here. */
218 struct hlist_node hash_node; 235 struct hlist_node hash_node;
236 struct qstr lockname;
219 struct kref refs; 237 struct kref refs;
220 238
221 /* please keep these next 3 in this order 239 /*
222 * some funcs want to iterate over all lists */ 240 * Please keep granted, converting, and blocked in this order,
241 * as some funcs want to iterate over all lists.
242 *
243 * All four lists are protected by the hash's reference.
244 */
223 struct list_head granted; 245 struct list_head granted;
224 struct list_head converting; 246 struct list_head converting;
225 struct list_head blocked; 247 struct list_head blocked;
248 struct list_head purge;
226 249
250 /*
251 * These two lists require you to hold an additional reference
252 * while they are on the list.
253 */
227 struct list_head dirty; 254 struct list_head dirty;
228 struct list_head recovering; // dlm_recovery_ctxt.resources list 255 struct list_head recovering; // dlm_recovery_ctxt.resources list
229 256
230 /* unused lock resources have their last_used stamped and are 257 /* unused lock resources have their last_used stamped and are
231 * put on a list for the dlm thread to run. */ 258 * put on a list for the dlm thread to run. */
232 struct list_head purge;
233 unsigned long last_used; 259 unsigned long last_used;
234 260
235 unsigned migration_pending:1; 261 unsigned migration_pending:1;
@@ -238,7 +264,6 @@ struct dlm_lock_resource
238 wait_queue_head_t wq; 264 wait_queue_head_t wq;
239 u8 owner; //node which owns the lock resource, or unknown 265 u8 owner; //node which owns the lock resource, or unknown
240 u16 state; 266 u16 state;
241 struct qstr lockname;
242 char lvb[DLM_LVB_LEN]; 267 char lvb[DLM_LVB_LEN];
243}; 268};
244 269
@@ -300,6 +325,15 @@ enum dlm_lockres_list {
300 DLM_BLOCKED_LIST 325 DLM_BLOCKED_LIST
301}; 326};
302 327
328static inline int dlm_lvb_is_empty(char *lvb)
329{
330 int i;
331 for (i=0; i<DLM_LVB_LEN; i++)
332 if (lvb[i])
333 return 0;
334 return 1;
335}
336
303static inline struct list_head * 337static inline struct list_head *
304dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) 338dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
305{ 339{
@@ -609,7 +643,8 @@ struct dlm_finalize_reco
609{ 643{
610 u8 node_idx; 644 u8 node_idx;
611 u8 dead_node; 645 u8 dead_node;
612 __be16 pad1; 646 u8 flags;
647 u8 pad1;
613 __be32 pad2; 648 __be32 pad2;
614}; 649};
615 650
@@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
676void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); 711void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
677int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); 712int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
678int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 713int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
714int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
679 715
680void dlm_put(struct dlm_ctxt *dlm); 716void dlm_put(struct dlm_ctxt *dlm);
681struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 717struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
687 struct dlm_lock_resource *res); 723 struct dlm_lock_resource *res);
688void dlm_purge_lockres(struct dlm_ctxt *dlm, 724void dlm_purge_lockres(struct dlm_ctxt *dlm,
689 struct dlm_lock_resource *lockres); 725 struct dlm_lock_resource *lockres);
690void dlm_lockres_get(struct dlm_lock_resource *res); 726static inline void dlm_lockres_get(struct dlm_lock_resource *res)
727{
728 /* This is called on every lookup, so it might be worth
729 * inlining. */
730 kref_get(&res->refs);
731}
691void dlm_lockres_put(struct dlm_lock_resource *res); 732void dlm_lockres_put(struct dlm_lock_resource *res);
692void __dlm_unhash_lockres(struct dlm_lock_resource *res); 733void __dlm_unhash_lockres(struct dlm_lock_resource *res);
693void __dlm_insert_lockres(struct dlm_ctxt *dlm, 734void __dlm_insert_lockres(struct dlm_ctxt *dlm,
694 struct dlm_lock_resource *res); 735 struct dlm_lock_resource *res);
695struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 736struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
696 const char *name, 737 const char *name,
697 unsigned int len); 738 unsigned int len,
739 unsigned int hash);
698struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, 740struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
699 const char *name, 741 const char *name,
700 unsigned int len); 742 unsigned int len);
@@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm,
819 u8 dead_node); 861 u8 dead_node);
820int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 862int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
821 863
864int __dlm_lockres_unused(struct dlm_lock_resource *res);
822 865
823static inline const char * dlm_lock_mode_name(int mode) 866static inline const char * dlm_lock_mode_name(int mode)
824{ 867{
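Worked numbers for the new paged hash, assuming 4KB pages and 8-byte pointers (both vary by architecture):

	/* DLM_BUCKETS_PER_PAGE = 4096 / 8      = 512
	 * DLM_HASH_PAGES       = 16384 / 4096  = 4
	 * DLM_HASH_BUCKETS     = 4 * 512       = 2048
	 *
	 * dlm_lockres_hash(dlm, i) then resolves to
	 *	dlm->lockres_hash[(i / 512) % 4] + (i % 512)
	 *
	 * so the 16KB table is built from four independent order-0 pages
	 * instead of one fragile higher-order allocation. */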
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 8285228d9e37..c764dc8e40a2 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -214,6 +214,9 @@ grant:
214 if (lock->ml.node == dlm->node_num) 214 if (lock->ml.node == dlm->node_num)
215 mlog(0, "doing in-place convert for nonlocal lock\n"); 215 mlog(0, "doing in-place convert for nonlocal lock\n");
216 lock->ml.type = type; 216 lock->ml.type = type;
217 if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
218 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
219
217 status = DLM_NORMAL; 220 status = DLM_NORMAL;
218 *call_ast = 1; 221 *call_ast = 1;
219 goto unlock_exit; 222 goto unlock_exit;
@@ -231,8 +234,7 @@ switch_queues:
231 234
232 lock->ml.convert_type = type; 235 lock->ml.convert_type = type;
233 /* do not alter lock refcount. switching lists. */ 236 /* do not alter lock refcount. switching lists. */
234 list_del_init(&lock->list); 237 list_move_tail(&lock->list, &res->converting);
235 list_add_tail(&lock->list, &res->converting);
236 238
237unlock_exit: 239unlock_exit:
238 spin_unlock(&lock->spinlock); 240 spin_unlock(&lock->spinlock);
@@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res,
248 struct dlm_lock *lock) 250 struct dlm_lock *lock)
249{ 251{
250 /* do not alter lock refcount. switching lists. */ 252 /* do not alter lock refcount. switching lists. */
251 list_del_init(&lock->list); 253 list_move_tail(&lock->list, &res->granted);
252 list_add_tail(&lock->list, &res->granted);
253 lock->ml.convert_type = LKM_IVMODE; 254 lock->ml.convert_type = LKM_IVMODE;
254 lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); 255 lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
255} 256}
@@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
294 res->state |= DLM_LOCK_RES_IN_PROGRESS; 295 res->state |= DLM_LOCK_RES_IN_PROGRESS;
295 /* move lock to local convert queue */ 296 /* move lock to local convert queue */
296 /* do not alter lock refcount. switching lists. */ 297 /* do not alter lock refcount. switching lists. */
297 list_del_init(&lock->list); 298 list_move_tail(&lock->list, &res->converting);
298 list_add_tail(&lock->list, &res->converting);
299 lock->convert_pending = 1; 299 lock->convert_pending = 1;
300 lock->ml.convert_type = type; 300 lock->ml.convert_type = type;
301 301
@@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
464 } 464 }
465 465
466 spin_lock(&res->spinlock); 466 spin_lock(&res->spinlock);
467 status = __dlm_lockres_state_to_status(res);
468 if (status != DLM_NORMAL) {
469 spin_unlock(&res->spinlock);
470 dlm_error(status);
471 goto leave;
472 }
467 list_for_each(iter, &res->granted) { 473 list_for_each(iter, &res->granted) {
468 lock = list_entry(iter, struct dlm_lock, list); 474 lock = list_entry(iter, struct dlm_lock, list);
469 if (lock->ml.cookie == cnv->cookie && 475 if (lock->ml.cookie == cnv->cookie &&
@@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
473 } 479 }
474 lock = NULL; 480 lock = NULL;
475 } 481 }
482 if (!lock) {
483 __dlm_print_one_lock_resource(res);
484 list_for_each(iter, &res->granted) {
485 lock = list_entry(iter, struct dlm_lock, list);
486 if (lock->ml.node == cnv->node_idx) {
487 mlog(ML_ERROR, "There is something here "
488 "for node %u, lock->ml.cookie=%llu, "
489 "cnv->cookie=%llu\n", cnv->node_idx,
490 (unsigned long long)lock->ml.cookie,
491 (unsigned long long)cnv->cookie);
492 break;
493 }
494 }
495 lock = NULL;
496 }
476 spin_unlock(&res->spinlock); 497 spin_unlock(&res->spinlock);
477 if (!lock) { 498 if (!lock) {
478 status = DLM_IVLOCKID; 499 status = DLM_IVLOCKID;
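Taken together with the dlmast.c hunk earlier, this moves the only write of lksb LVB data into the lockres to the in-place downconvert, performed under res->spinlock; dlm_update_lvb() no longer mirrors puts, so a racing get cannot re-propagate stale LVB contents. The invariant in two lines:

	/* only here, at downconvert time, does lksb lvb reach the lockres */
	if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
		memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);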
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c7eae5d3324e..3f6c8d88f7af 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -37,10 +37,8 @@
37 37
38#include "dlmapi.h" 38#include "dlmapi.h"
39#include "dlmcommon.h" 39#include "dlmcommon.h"
40#include "dlmdebug.h"
41 40
42#include "dlmdomain.h" 41#include "dlmdomain.h"
43#include "dlmdebug.h"
44 42
45#define MLOG_MASK_PREFIX ML_DLM 43#define MLOG_MASK_PREFIX ML_DLM
46#include "cluster/masklog.h" 44#include "cluster/masklog.h"
@@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
120} 118}
121EXPORT_SYMBOL_GPL(dlm_print_one_lock); 119EXPORT_SYMBOL_GPL(dlm_print_one_lock);
122 120
121#if 0
123void dlm_dump_lock_resources(struct dlm_ctxt *dlm) 122void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
124{ 123{
125 struct dlm_lock_resource *res; 124 struct dlm_lock_resource *res;
@@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
136 135
137 spin_lock(&dlm->spinlock); 136 spin_lock(&dlm->spinlock);
138 for (i=0; i<DLM_HASH_BUCKETS; i++) { 137 for (i=0; i<DLM_HASH_BUCKETS; i++) {
139 bucket = &(dlm->lockres_hash[i]); 138 bucket = dlm_lockres_hash(dlm, i);
140 hlist_for_each_entry(res, iter, bucket, hash_node) 139 hlist_for_each_entry(res, iter, bucket, hash_node)
141 dlm_print_one_lock_resource(res); 140 dlm_print_one_lock_resource(res);
142 } 141 }
143 spin_unlock(&dlm->spinlock); 142 spin_unlock(&dlm->spinlock);
144} 143}
144#endif /* 0 */
145 145
146static const char *dlm_errnames[] = { 146static const char *dlm_errnames[] = {
147 [DLM_NORMAL] = "DLM_NORMAL", 147 [DLM_NORMAL] = "DLM_NORMAL",
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
deleted file mode 100644
index 6858510c3ccd..000000000000
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ /dev/null
@@ -1,30 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 021110-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
29
30#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8f3a9e3106fd..b8c23f7ba67e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -41,7 +41,6 @@
41#include "dlmapi.h" 41#include "dlmapi.h"
42#include "dlmcommon.h" 42#include "dlmcommon.h"
43 43
44#include "dlmdebug.h"
45#include "dlmdomain.h" 44#include "dlmdomain.h"
46 45
47#include "dlmver.h" 46#include "dlmver.h"
@@ -49,6 +48,33 @@
49#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 48#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
50#include "cluster/masklog.h" 49#include "cluster/masklog.h"
51 50
51static void dlm_free_pagevec(void **vec, int pages)
52{
53 while (pages--)
54 free_page((unsigned long)vec[pages]);
55 kfree(vec);
56}
57
58static void **dlm_alloc_pagevec(int pages)
59{
60 void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
61 int i;
62
63 if (!vec)
64 return NULL;
65
66 for (i = 0; i < pages; i++)
67 if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
68 goto out_free;
69
70 mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
 71 pages, (unsigned long)DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE);
72 return vec;
73out_free:
74 dlm_free_pagevec(vec, i);
75 return NULL;
76}
77
52/* 78/*
53 * 79 *
54 * spinlock lock ordering: if multiple locks are needed, obey this ordering: 80 * spinlock lock ordering: if multiple locks are needed, obey this ordering:
@@ -62,7 +88,7 @@
62 * 88 *
63 */ 89 */
64 90
65spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; 91DEFINE_SPINLOCK(dlm_domain_lock);
66LIST_HEAD(dlm_domains); 92LIST_HEAD(dlm_domains);
67static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); 93static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
68 94
@@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
90 assert_spin_locked(&dlm->spinlock); 116 assert_spin_locked(&dlm->spinlock);
91 117
92 q = &res->lockname; 118 q = &res->lockname;
93 q->hash = full_name_hash(q->name, q->len); 119 bucket = dlm_lockres_hash(dlm, q->hash);
94 bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
95 120
96 /* get a reference for our hashtable */ 121 /* get a reference for our hashtable */
97 dlm_lockres_get(res); 122 dlm_lockres_get(res);
@@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
100} 125}
101 126
102struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 127struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
103 const char *name, 128 const char *name,
104 unsigned int len) 129 unsigned int len,
130 unsigned int hash)
105{ 131{
106 unsigned int hash;
107 struct hlist_node *iter;
108 struct dlm_lock_resource *tmpres=NULL;
109 struct hlist_head *bucket; 132 struct hlist_head *bucket;
133 struct hlist_node *list;
110 134
111 mlog_entry("%.*s\n", len, name); 135 mlog_entry("%.*s\n", len, name);
112 136
113 assert_spin_locked(&dlm->spinlock); 137 assert_spin_locked(&dlm->spinlock);
114 138
115 hash = full_name_hash(name, len); 139 bucket = dlm_lockres_hash(dlm, hash);
116
117 bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
118
119 /* check for pre-existing lock */
120 hlist_for_each(iter, bucket) {
121 tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
122 if (tmpres->lockname.len == len &&
123 memcmp(tmpres->lockname.name, name, len) == 0) {
124 dlm_lockres_get(tmpres);
125 break;
126 }
127 140
128 tmpres = NULL; 141 hlist_for_each(list, bucket) {
142 struct dlm_lock_resource *res = hlist_entry(list,
143 struct dlm_lock_resource, hash_node);
144 if (res->lockname.name[0] != name[0])
145 continue;
146 if (unlikely(res->lockname.len != len))
147 continue;
148 if (memcmp(res->lockname.name + 1, name + 1, len - 1))
149 continue;
150 dlm_lockres_get(res);
151 return res;
129 } 152 }
130 return tmpres; 153 return NULL;
131} 154}
132 155
133struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, 156struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
135 unsigned int len) 158 unsigned int len)
136{ 159{
137 struct dlm_lock_resource *res; 160 struct dlm_lock_resource *res;
161 unsigned int hash = dlm_lockid_hash(name, len);
138 162
139 spin_lock(&dlm->spinlock); 163 spin_lock(&dlm->spinlock);
140 res = __dlm_lookup_lockres(dlm, name, len); 164 res = __dlm_lookup_lockres(dlm, name, len, hash);
141 spin_unlock(&dlm->spinlock); 165 spin_unlock(&dlm->spinlock);
142 return res; 166 return res;
143} 167}
@@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain)
194static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 218static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
195{ 219{
196 if (dlm->lockres_hash) 220 if (dlm->lockres_hash)
197 free_page((unsigned long) dlm->lockres_hash); 221 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
198 222
199 if (dlm->name) 223 if (dlm->name)
200 kfree(dlm->name); 224 kfree(dlm->name);
@@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
278 return ret; 302 return ret;
279} 303}
280 304
305static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
306{
307 if (dlm->dlm_worker) {
308 flush_workqueue(dlm->dlm_worker);
309 destroy_workqueue(dlm->dlm_worker);
310 dlm->dlm_worker = NULL;
311 }
312}
313
281static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) 314static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
282{ 315{
283 dlm_unregister_domain_handlers(dlm); 316 dlm_unregister_domain_handlers(dlm);
284 dlm_complete_thread(dlm); 317 dlm_complete_thread(dlm);
285 dlm_complete_recovery_thread(dlm); 318 dlm_complete_recovery_thread(dlm);
319 dlm_destroy_dlm_worker(dlm);
286 320
287 /* We've left the domain. Now we can take ourselves out of the 321 /* We've left the domain. Now we can take ourselves out of the
288 * list and allow the kref stuff to help us free the 322 * list and allow the kref stuff to help us free the
@@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
304restart: 338restart:
305 spin_lock(&dlm->spinlock); 339 spin_lock(&dlm->spinlock);
306 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 340 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
307 while (!hlist_empty(&dlm->lockres_hash[i])) { 341 while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
308 res = hlist_entry(dlm->lockres_hash[i].first, 342 res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
309 struct dlm_lock_resource, hash_node); 343 struct dlm_lock_resource, hash_node);
310 /* need reference when manually grabbing lockres */ 344 /* need reference when manually grabbing lockres */
311 dlm_lockres_get(res); 345 dlm_lockres_get(res);
@@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1126 goto bail; 1160 goto bail;
1127 } 1161 }
1128 1162
1163 dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
1164 if (!dlm->dlm_worker) {
1165 status = -ENOMEM;
1166 mlog_errno(status);
1167 goto bail;
1168 }
1169
1129 do { 1170 do {
1130 unsigned int backoff; 1171 unsigned int backoff;
1131 status = dlm_try_to_join_domain(dlm); 1172 status = dlm_try_to_join_domain(dlm);
@@ -1166,6 +1207,7 @@ bail:
1166 dlm_unregister_domain_handlers(dlm); 1207 dlm_unregister_domain_handlers(dlm);
1167 dlm_complete_thread(dlm); 1208 dlm_complete_thread(dlm);
1168 dlm_complete_recovery_thread(dlm); 1209 dlm_complete_recovery_thread(dlm);
1210 dlm_destroy_dlm_worker(dlm);
1169 } 1211 }
1170 1212
1171 return status; 1213 return status;
@@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1191 goto leave; 1233 goto leave;
1192 } 1234 }
1193 1235
1194 dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); 1236 dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
1195 if (!dlm->lockres_hash) { 1237 if (!dlm->lockres_hash) {
1196 mlog_errno(-ENOMEM); 1238 mlog_errno(-ENOMEM);
1197 kfree(dlm->name); 1239 kfree(dlm->name);
@@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1200 goto leave; 1242 goto leave;
1201 } 1243 }
1202 1244
1203 for (i=0; i<DLM_HASH_BUCKETS; i++) 1245 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1204 INIT_HLIST_HEAD(&dlm->lockres_hash[i]); 1246 INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
1205 1247
1206 strcpy(dlm->name, domain); 1248 strcpy(dlm->name, domain);
1207 dlm->key = key; 1249 dlm->key = key;
@@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1231 1273
1232 dlm->dlm_thread_task = NULL; 1274 dlm->dlm_thread_task = NULL;
1233 dlm->dlm_reco_thread_task = NULL; 1275 dlm->dlm_reco_thread_task = NULL;
1276 dlm->dlm_worker = NULL;
1234 init_waitqueue_head(&dlm->dlm_thread_wq); 1277 init_waitqueue_head(&dlm->dlm_thread_wq);
1235 init_waitqueue_head(&dlm->dlm_reco_thread_wq); 1278 init_waitqueue_head(&dlm->dlm_reco_thread_wq);
1236 init_waitqueue_head(&dlm->reco.event); 1279 init_waitqueue_head(&dlm->reco.event);
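Besides the pagevec change, note the rewritten __dlm_lookup_lockres(): it now takes a precomputed hash and, within a bucket, compares name[0] before anything else. That first byte is a cheap discriminator because ocfs2 lockids begin with a single lock-type character (an assumption based on the lockid format), so most same-bucket collisions differ immediately. The comparison ladder:

	if (res->lockname.name[0] != name[0])	/* type byte differs */
		continue;
	if (res->lockname.len != len)
		continue;
	if (memcmp(res->lockname.name + 1, name + 1, len - 1))
		continue;
	dlm_lockres_get(res);			/* found: take a reference */
	return res;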
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7273d9fa6bab..033ad1701232 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode,
116 * doesn't make sense for LVB writes. */ 116 * doesn't make sense for LVB writes. */
117 file->f_flags &= ~O_APPEND; 117 file->f_flags &= ~O_APPEND;
118 118
119 fp = kmalloc(sizeof(*fp), GFP_KERNEL); 119 fp = kmalloc(sizeof(*fp), GFP_NOFS);
120 if (!fp) { 120 if (!fp) {
121 status = -ENOMEM; 121 status = -ENOMEM;
122 goto bail; 122 goto bail;
@@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
196 else 196 else
197 readlen = count - *ppos; 197 readlen = count - *ppos;
198 198
199 lvb_buf = kmalloc(readlen, GFP_KERNEL); 199 lvb_buf = kmalloc(readlen, GFP_NOFS);
200 if (!lvb_buf) 200 if (!lvb_buf)
201 return -ENOMEM; 201 return -ENOMEM;
202 202
@@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
240 else 240 else
241 writelen = count - *ppos; 241 writelen = count - *ppos;
242 242
243 lvb_buf = kmalloc(writelen, GFP_KERNEL); 243 lvb_buf = kmalloc(writelen, GFP_NOFS);
244 if (!lvb_buf) 244 if (!lvb_buf)
245 return -ENOMEM; 245 return -ENOMEM;
246 246
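The GFP_KERNEL to GFP_NOFS switches here follow the standard deadlock-avoidance rule: a GFP_KERNEL allocation may recurse into filesystem writeback to reclaim memory, and writeback through ocfs2 can block on the very DLM locks these paths are servicing. GFP_NOFS clears __GFP_FS and forbids that re-entry:

	/* inside dlm/dlmfs paths, allocate without filesystem reclaim */
	lvb_buf = kmalloc(writelen, GFP_NOFS);
	if (!lvb_buf)
		return -ENOMEM;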
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 6fea28318d6d..5ca57ec650c7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,7 +53,7 @@
53#define MLOG_MASK_PREFIX ML_DLM 53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h" 54#include "cluster/masklog.h"
55 55
56static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; 56static DEFINE_SPINLOCK(dlm_cookie_lock);
57static u64 dlm_next_cookie = 1; 57static u64 dlm_next_cookie = 1;
58 58
59static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, 59static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
@@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
201 struct dlm_lock *lock, int flags) 201 struct dlm_lock *lock, int flags)
202{ 202{
203 enum dlm_status status = DLM_DENIED; 203 enum dlm_status status = DLM_DENIED;
204 int lockres_changed = 1;
204 205
205 mlog_entry("type=%d\n", lock->ml.type); 206 mlog_entry("type=%d\n", lock->ml.type);
206 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, 207 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
@@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
226 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 227 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
227 lock->lock_pending = 0; 228 lock->lock_pending = 0;
228 if (status != DLM_NORMAL) { 229 if (status != DLM_NORMAL) {
229 if (status != DLM_NOTQUEUED) 230 if (status == DLM_RECOVERING &&
231 dlm_is_recovery_lock(res->lockname.name,
232 res->lockname.len)) {
233 /* recovery lock was mastered by dead node.
234 * we need to have calc_usage shoot down this
235 * lockres and completely remaster it. */
236 mlog(0, "%s: recovery lock was owned by "
237 "dead node %u, remaster it now.\n",
238 dlm->name, res->owner);
239 } else if (status != DLM_NOTQUEUED) {
240 /*
241 * DO NOT call calc_usage, as this would unhash
242 * the remote lockres before we ever get to use
243 * it. treat as if we never made any change to
244 * the lockres.
245 */
246 lockres_changed = 0;
230 dlm_error(status); 247 dlm_error(status);
248 }
231 dlm_revert_pending_lock(res, lock); 249 dlm_revert_pending_lock(res, lock);
232 dlm_lock_put(lock); 250 dlm_lock_put(lock);
233 } else if (dlm_is_recovery_lock(res->lockname.name, 251 } else if (dlm_is_recovery_lock(res->lockname.name,
@@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
239 mlog(0, "%s: $RECOVERY lock for this node (%u) is " 257 mlog(0, "%s: $RECOVERY lock for this node (%u) is "
240 "mastered by %u; got lock, manually granting (no ast)\n", 258 "mastered by %u; got lock, manually granting (no ast)\n",
241 dlm->name, dlm->node_num, res->owner); 259 dlm->name, dlm->node_num, res->owner);
242 list_del_init(&lock->list); 260 list_move_tail(&lock->list, &res->granted);
243 list_add_tail(&lock->list, &res->granted);
244 } 261 }
245 spin_unlock(&res->spinlock); 262 spin_unlock(&res->spinlock);
246 263
247 dlm_lockres_calc_usage(dlm, res); 264 if (lockres_changed)
265 dlm_lockres_calc_usage(dlm, res);
248 266
249 wake_up(&res->wq); 267 wake_up(&res->wq);
250 return status; 268 return status;
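
Two details in this hunk are easy to miss: list_move_tail() folds the old list_del_init()/list_add_tail() pair into one primitive, and the new lockres_changed flag keeps dlm_lockres_calc_usage() from unhashing a remote lockres that the caller still needs. The list primitive, in a self-contained sketch:

    #include <linux/list.h>

    struct demo_lock {
        struct list_head list;          /* linkage on a per-resource queue */
    };

    /* granting a queued lock: list_move_tail() is the one-step
     * equivalent of list_del_init() followed by list_add_tail(). */
    static void demo_grant(struct demo_lock *lk, struct list_head *granted)
    {
        list_move_tail(&lk->list, granted);
    }
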
@@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
281 if (tmpret >= 0) { 299 if (tmpret >= 0) {
282 // successfully sent and received 300 // successfully sent and received
283 ret = status; // this is already a dlm_status 301 ret = status; // this is already a dlm_status
302 if (ret == DLM_REJECTED) {
303 mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
304 "no longer owned by %u. that node is coming back "
305 "up currently.\n", dlm->name, create.namelen,
306 create.name, res->owner);
307 dlm_print_one_lock_resource(res);
308 BUG();
309 }
284 } else { 310 } else {
285 mlog_errno(tmpret); 311 mlog_errno(tmpret);
286 if (dlm_is_host_down(tmpret)) { 312 if (dlm_is_host_down(tmpret)) {
@@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
382 struct dlm_lock *lock; 408 struct dlm_lock *lock;
383 int kernel_allocated = 0; 409 int kernel_allocated = 0;
384 410
385 lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); 411 lock = kcalloc(1, sizeof(*lock), GFP_NOFS);
386 if (!lock) 412 if (!lock)
387 return NULL; 413 return NULL;
388 414
389 if (!lksb) { 415 if (!lksb) {
390 /* zero memory only if kernel-allocated */ 416 /* zero memory only if kernel-allocated */
391 lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); 417 lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS);
392 if (!lksb) { 418 if (!lksb) {
393 kfree(lock); 419 kfree(lock);
394 return NULL; 420 return NULL;
@@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
429 if (!dlm_grab(dlm)) 455 if (!dlm_grab(dlm))
430 return DLM_REJECTED; 456 return DLM_REJECTED;
431 457
432 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
433 "Domain %s not fully joined!\n", dlm->name);
434
435 name = create->name; 458 name = create->name;
436 namelen = create->namelen; 459 namelen = create->namelen;
460 status = DLM_REJECTED;
461 if (!dlm_domain_fully_joined(dlm)) {
462 mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
463 "sending a create_lock message for lock %.*s!\n",
464 dlm->name, create->node_idx, namelen, name);
465 dlm_error(status);
466 goto leave;
467 }
437 468
438 status = DLM_IVBUFLEN; 469 status = DLM_IVBUFLEN;
439 if (namelen > DLM_LOCKID_NAME_MAX) { 470 if (namelen > DLM_LOCKID_NAME_MAX) {
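
The handler previously punished the receiving node with mlog_bug_on_msg() when a create_lock message arrived before the domain was fully joined; the rewrite logs the event and returns DLM_REJECTED so only the premature sender is affected. The general shape, sketched with hypothetical stubs:

    #include <linux/errno.h>

    struct demo_ctx;
    extern int demo_ctx_ready(struct demo_ctx *c);  /* hypothetical */
    extern int demo_do_work(struct demo_ctx *c);    /* hypothetical */

    /* validate preconditions first and fail the request back to the
     * sender, instead of BUG()ing the node that merely received a
     * premature message. */
    static int demo_handler(struct demo_ctx *c)
    {
        if (!demo_ctx_ready(c))
            return -EAGAIN;             /* reject; sender can retry */
        return demo_do_work(c);
    }
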
@@ -669,18 +700,22 @@ retry_lock:
669 msleep(100); 700 msleep(100);
670 /* no waiting for dlm_reco_thread */ 701 /* no waiting for dlm_reco_thread */
671 if (recovery) { 702 if (recovery) {
672 if (status == DLM_RECOVERING) { 703 if (status != DLM_RECOVERING)
673 mlog(0, "%s: got RECOVERING " 704 goto retry_lock;
674 "for $REOCVERY lock, master " 705
675 "was %u\n", dlm->name, 706 mlog(0, "%s: got RECOVERING "
676 res->owner); 707 "for $RECOVERY lock, master "
677 dlm_wait_for_node_death(dlm, res->owner, 708 "was %u\n", dlm->name,
678 DLM_NODE_DEATH_WAIT_MAX); 709 res->owner);
679 } 710 /* wait to see the node go down, then
711 * drop down and allow the lockres to
712 * get cleaned up. need to remaster. */
713 dlm_wait_for_node_death(dlm, res->owner,
714 DLM_NODE_DEATH_WAIT_MAX);
680 } else { 715 } else {
681 dlm_wait_for_recovery(dlm); 716 dlm_wait_for_recovery(dlm);
717 goto retry_lock;
682 } 718 }
683 goto retry_lock;
684 } 719 }
685 720
686 if (status != DLM_NORMAL) { 721 if (status != DLM_NORMAL) {
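
The retry logic is inverted here: the old code only waited on DLM_RECOVERING for the $RECOVERY lock and then retried unconditionally, while the new code retries within each branch and lets the $RECOVERY case fall through after the dead master is confirmed gone, so the lockres can be cleaned up and remastered. A control-flow sketch with placeholder functions:

    #include <linux/errno.h>

    extern int try_acquire(void);               /* hypothetical stubs */
    extern int lock_is_recovery_lock(void);
    extern int master_is_recovering(void);
    extern void wait_for_node_death(void);
    extern void wait_for_recovery(void);

    static int acquire_with_retry(void)
    {
        int status;
    retry:
        status = try_acquire();
        if (status == -EAGAIN) {
            if (!lock_is_recovery_lock()) {
                wait_for_recovery();
                goto retry;             /* ordinary lock: always retry */
            }
            if (!master_is_recovering())
                goto retry;
            wait_for_node_death();      /* then fall through: remaster */
        }
        return status;
    }
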
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 940be4c13b1f..1b8346dd0572 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -47,7 +47,6 @@
47 47
48#include "dlmapi.h" 48#include "dlmapi.h"
49#include "dlmcommon.h" 49#include "dlmcommon.h"
50#include "dlmdebug.h"
51#include "dlmdomain.h" 50#include "dlmdomain.h"
52 51
53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
@@ -74,6 +73,7 @@ struct dlm_master_list_entry
74 wait_queue_head_t wq; 73 wait_queue_head_t wq;
75 atomic_t woken; 74 atomic_t woken;
76 struct kref mle_refs; 75 struct kref mle_refs;
76 int inuse;
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
127 return 1; 127 return 1;
128} 128}
129 129
130#if 0 130#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m)
131/* Code here is included but defined out as it aids debugging */ 131static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
132{
133 int i;
134 printk("%s=[ ", mapname);
135 for (i=0; i<O2NM_MAX_NODES; i++)
136 if (test_bit(i, map))
137 printk("%d ", i);
138 printk("]");
139}
132 140
133void dlm_print_one_mle(struct dlm_master_list_entry *mle) 141static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
134{ 142{
135 int i = 0, refs; 143 int refs;
136 char *type; 144 char *type;
137 char attached; 145 char attached;
138 u8 master; 146 u8 master;
139 unsigned int namelen; 147 unsigned int namelen;
140 const char *name; 148 const char *name;
141 struct kref *k; 149 struct kref *k;
150 unsigned long *maybe = mle->maybe_map,
151 *vote = mle->vote_map,
152 *resp = mle->response_map,
153 *node = mle->node_map;
142 154
143 k = &mle->mle_refs; 155 k = &mle->mle_refs;
144 if (mle->type == DLM_MLE_BLOCK) 156 if (mle->type == DLM_MLE_BLOCK)
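
dlm_print_nodemap() leans on the preprocessor's stringizing operator: passing #m alongside m lets one helper label every map it prints. A runnable demonstration of the same trick:

    #include <stdio.h>

    /* the # operator turns the macro argument into a string literal,
     * so the dump labels itself with the variable's name. */
    #define print_named(v) print_named_impl((v), #v)

    static void print_named_impl(int v, const char *name)
    {
        printf("%s=%d\n", name, v);
    }

    int main(void)
    {
        int vote_map = 42;

        print_named(vote_map);          /* prints: vote_map=42 */
        return 0;
    }
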
@@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
159 name = mle->u.res->lockname.name; 171 name = mle->u.res->lockname.name;
160 } 172 }
161 173
162 mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", 174 mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
163 i, type, refs, master, mle->new_master, attached, 175 namelen, name, type, refs, master, mle->new_master, attached,
164 namelen, namelen, name); 176 mle->inuse);
177 dlm_print_nodemap(maybe);
178 printk(", ");
179 dlm_print_nodemap(vote);
180 printk(", ");
181 dlm_print_nodemap(resp);
182 printk(", ");
183 dlm_print_nodemap(node);
184 printk(", ");
185 printk("\n");
165} 186}
166 187
188#if 0
189/* Code here is included but defined out as it aids debugging */
190
167static void dlm_dump_mles(struct dlm_ctxt *dlm) 191static void dlm_dump_mles(struct dlm_ctxt *dlm)
168{ 192{
169 struct dlm_master_list_entry *mle; 193 struct dlm_master_list_entry *mle;
170 struct list_head *iter; 194 struct list_head *iter;
171 195
172 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); 196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
173 mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
174 spin_lock(&dlm->master_lock); 197 spin_lock(&dlm->master_lock);
175 list_for_each(iter, &dlm->master_list) { 198 list_for_each(iter, &dlm->master_list) {
176 mle = list_entry(iter, struct dlm_master_list_entry, list); 199 mle = list_entry(iter, struct dlm_master_list_entry, list);
@@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
314 spin_unlock(&dlm->spinlock); 337 spin_unlock(&dlm->spinlock);
315} 338}
316 339
340static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
341{
342 struct dlm_ctxt *dlm;
343 dlm = mle->dlm;
344
345 assert_spin_locked(&dlm->spinlock);
346 assert_spin_locked(&dlm->master_lock);
347 mle->inuse++;
348 kref_get(&mle->mle_refs);
349}
350
351static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
352{
353 struct dlm_ctxt *dlm;
354 dlm = mle->dlm;
355
356 spin_lock(&dlm->spinlock);
357 spin_lock(&dlm->master_lock);
358 mle->inuse--;
359 __dlm_put_mle(mle);
360 spin_unlock(&dlm->master_lock);
361 spin_unlock(&dlm->spinlock);
362
363}
364
317/* remove from list and free */ 365/* remove from list and free */
318static void __dlm_put_mle(struct dlm_master_list_entry *mle) 366static void __dlm_put_mle(struct dlm_master_list_entry *mle)
319{ 367{
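
dlm_get_mle_inuse()/dlm_put_mle_inuse() layer an inuse counter on top of the plain kref, recording that some code path is actively using the mle rather than merely holding a reference; the put side retakes both spinlocks because __dlm_put_mle() asserts them. The pattern, with illustrative names:

    #include <linux/kref.h>
    #include <linux/spinlock.h>

    struct tracked {
        struct kref refs;
        int inuse;                      /* long-lived active uses */
        spinlock_t *guard;
    };

    extern void tracked_release(struct kref *k);    /* frees the object */

    static void tracked_get_inuse(struct tracked *t)
    {
        assert_spin_locked(t->guard);   /* caller already holds the lock */
        t->inuse++;
        kref_get(&t->refs);
    }

    static void tracked_put_inuse(struct tracked *t)
    {
        spin_lock(t->guard);            /* the __put helper asserts this */
        t->inuse--;
        kref_put(&t->refs, tracked_release);
        spin_unlock(t->guard);
    }

The inuse count lets later sanity checks compute the minimum refcount the object may legally have.
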
@@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
322 370
323 assert_spin_locked(&dlm->spinlock); 371 assert_spin_locked(&dlm->spinlock);
324 assert_spin_locked(&dlm->master_lock); 372 assert_spin_locked(&dlm->master_lock);
325 BUG_ON(!atomic_read(&mle->mle_refs.refcount)); 373 if (!atomic_read(&mle->mle_refs.refcount)) {
326 374 /* this may or may not crash, but who cares.
327 kref_put(&mle->mle_refs, dlm_mle_release); 375 * it's a BUG. */
376 mlog(ML_ERROR, "bad mle: %p\n", mle);
377 dlm_print_one_mle(mle);
378 BUG();
379 } else
380 kref_put(&mle->mle_refs, dlm_mle_release);
328} 381}
329 382
330 383
@@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
367 memset(mle->response_map, 0, sizeof(mle->response_map)); 420 memset(mle->response_map, 0, sizeof(mle->response_map));
368 mle->master = O2NM_MAX_NODES; 421 mle->master = O2NM_MAX_NODES;
369 mle->new_master = O2NM_MAX_NODES; 422 mle->new_master = O2NM_MAX_NODES;
423 mle->inuse = 0;
370 424
371 if (mle->type == DLM_MLE_MASTER) { 425 if (mle->type == DLM_MLE_MASTER) {
372 BUG_ON(!res); 426 BUG_ON(!res);
@@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref)
564 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 618 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
565 res->lockname.name); 619 res->lockname.name);
566 620
621 if (!hlist_unhashed(&res->hash_node) ||
622 !list_empty(&res->granted) ||
623 !list_empty(&res->converting) ||
624 !list_empty(&res->blocked) ||
625 !list_empty(&res->dirty) ||
626 !list_empty(&res->recovering) ||
627 !list_empty(&res->purge)) {
628 mlog(ML_ERROR,
629 "Going to BUG for resource %.*s."
630 " We're on a list! [%c%c%c%c%c%c%c]\n",
631 res->lockname.len, res->lockname.name,
632 !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
633 !list_empty(&res->granted) ? 'G' : ' ',
634 !list_empty(&res->converting) ? 'C' : ' ',
635 !list_empty(&res->blocked) ? 'B' : ' ',
636 !list_empty(&res->dirty) ? 'D' : ' ',
637 !list_empty(&res->recovering) ? 'R' : ' ',
638 !list_empty(&res->purge) ? 'P' : ' ');
639
640 dlm_print_one_lock_resource(res);
641 }
642
567 /* By the time we're ready to blow this guy away, we shouldn't 643 /* By the time we're ready to blow this guy away, we shouldn't
568 * be on any lists. */ 644 * be on any lists. */
569 BUG_ON(!hlist_unhashed(&res->hash_node)); 645 BUG_ON(!hlist_unhashed(&res->hash_node));
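
Rather than letting the bare BUG_ON()s below fire with no context, the release path now tests every list membership up front and prints a one-character-per-list summary plus the full lockres before crashing. The idiom in miniature:

    #include <linux/bug.h>
    #include <linux/list.h>
    #include <linux/printk.h>

    struct demo_res {
        struct list_head granted, dirty;
    };

    /* log which invariant broke before the fatal assertion, so the
     * oops says which list the object was still on. */
    static void demo_release_check(struct demo_res *res)
    {
        if (!list_empty(&res->granted) || !list_empty(&res->dirty))
            pr_err("res %p still queued [%c%c]\n", res,
                   !list_empty(&res->granted) ? 'G' : ' ',
                   !list_empty(&res->dirty) ? 'D' : ' ');
        BUG_ON(!list_empty(&res->granted));
        BUG_ON(!list_empty(&res->dirty));
    }
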
@@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref)
579 kfree(res); 655 kfree(res);
580} 656}
581 657
582void dlm_lockres_get(struct dlm_lock_resource *res)
583{
584 kref_get(&res->refs);
585}
586
587void dlm_lockres_put(struct dlm_lock_resource *res) 658void dlm_lockres_put(struct dlm_lock_resource *res)
588{ 659{
589 kref_put(&res->refs, dlm_lockres_release); 660 kref_put(&res->refs, dlm_lockres_release);
@@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
603 memcpy(qname, name, namelen); 674 memcpy(qname, name, namelen);
604 675
605 res->lockname.len = namelen; 676 res->lockname.len = namelen;
606 res->lockname.hash = full_name_hash(name, namelen); 677 res->lockname.hash = dlm_lockid_hash(name, namelen);
607 678
608 init_waitqueue_head(&res->wq); 679 init_waitqueue_head(&res->wq);
609 spin_lock_init(&res->spinlock); 680 spin_lock_init(&res->spinlock);
@@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
637{ 708{
638 struct dlm_lock_resource *res; 709 struct dlm_lock_resource *res;
639 710
640 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); 711 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
641 if (!res) 712 if (!res)
642 return NULL; 713 return NULL;
643 714
644 res->lockname.name = kmalloc(namelen, GFP_KERNEL); 715 res->lockname.name = kmalloc(namelen, GFP_NOFS);
645 if (!res->lockname.name) { 716 if (!res->lockname.name) {
646 kfree(res); 717 kfree(res);
647 return NULL; 718 return NULL;
@@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
677 int blocked = 0; 748 int blocked = 0;
678 int ret, nodenum; 749 int ret, nodenum;
679 struct dlm_node_iter iter; 750 struct dlm_node_iter iter;
680 unsigned int namelen; 751 unsigned int namelen, hash;
681 int tries = 0; 752 int tries = 0;
682 int bit, wait_on_recovery = 0; 753 int bit, wait_on_recovery = 0;
683 754
684 BUG_ON(!lockid); 755 BUG_ON(!lockid);
685 756
686 namelen = strlen(lockid); 757 namelen = strlen(lockid);
758 hash = dlm_lockid_hash(lockid, namelen);
687 759
688 mlog(0, "get lockres %s (len %d)\n", lockid, namelen); 760 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
689 761
690lookup: 762lookup:
691 spin_lock(&dlm->spinlock); 763 spin_lock(&dlm->spinlock);
692 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); 764 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
693 if (tmpres) { 765 if (tmpres) {
694 spin_unlock(&dlm->spinlock); 766 spin_unlock(&dlm->spinlock);
695 mlog(0, "found in hash!\n"); 767 mlog(0, "found in hash!\n");
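
dlm_get_lock_resource() can bounce back to the lookup label many times while recovery settles, so the name is hashed once with dlm_lockid_hash() and the value handed to __dlm_lookup_lockres() on every pass; the message handlers below get the same treatment. A sketch with placeholder helpers:

    struct dom;
    struct res;
    extern unsigned int hash_name(const char *id, unsigned int len);
    extern struct res *lookup_hashed(struct dom *d, const char *id,
                                     unsigned int len, unsigned int hash);
    extern int wait_for_change(struct dom *d);

    static struct res *find_res(struct dom *d, const char *id, unsigned int len)
    {
        unsigned int hash = hash_name(id, len); /* computed once */
        struct res *r;

        do {
            r = lookup_hashed(d, id, len, hash);  /* reused per retry */
        } while (!r && wait_for_change(d));
        return r;
    }
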
@@ -704,7 +776,7 @@ lookup:
704 mlog(0, "allocating a new resource\n"); 776 mlog(0, "allocating a new resource\n");
705 /* nothing found and we need to allocate one. */ 777 /* nothing found and we need to allocate one. */
706 alloc_mle = (struct dlm_master_list_entry *) 778 alloc_mle = (struct dlm_master_list_entry *)
707 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); 779 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
708 if (!alloc_mle) 780 if (!alloc_mle)
709 goto leave; 781 goto leave;
710 res = dlm_new_lockres(dlm, lockid, namelen); 782 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -790,10 +862,11 @@ lookup:
790 * if so, the creator of the BLOCK may try to put the last 862 * if so, the creator of the BLOCK may try to put the last
791 * ref at this time in the assert master handler, so we 863 * ref at this time in the assert master handler, so we
792 * need an extra one to keep from a bad ptr deref. */ 864 * need an extra one to keep from a bad ptr deref. */
793 dlm_get_mle(mle); 865 dlm_get_mle_inuse(mle);
794 spin_unlock(&dlm->master_lock); 866 spin_unlock(&dlm->master_lock);
795 spin_unlock(&dlm->spinlock); 867 spin_unlock(&dlm->spinlock);
796 868
869redo_request:
797 while (wait_on_recovery) { 870 while (wait_on_recovery) {
798 /* any cluster changes that occurred after dropping the 871 /* any cluster changes that occurred after dropping the
799 * dlm spinlock would be detectable be a change on the mle, 872 * dlm spinlock would be detectable be a change on the mle,
@@ -812,7 +885,7 @@ lookup:
812 } 885 }
813 886
814 dlm_kick_recovery_thread(dlm); 887 dlm_kick_recovery_thread(dlm);
815 msleep(100); 888 msleep(1000);
816 dlm_wait_for_recovery(dlm); 889 dlm_wait_for_recovery(dlm);
817 890
818 spin_lock(&dlm->spinlock); 891 spin_lock(&dlm->spinlock);
@@ -825,13 +898,15 @@ lookup:
825 } else 898 } else
826 wait_on_recovery = 0; 899 wait_on_recovery = 0;
827 spin_unlock(&dlm->spinlock); 900 spin_unlock(&dlm->spinlock);
901
902 if (wait_on_recovery)
903 dlm_wait_for_node_recovery(dlm, bit, 10000);
828 } 904 }
829 905
830 /* must wait for lock to be mastered elsewhere */ 906 /* must wait for lock to be mastered elsewhere */
831 if (blocked) 907 if (blocked)
832 goto wait; 908 goto wait;
833 909
834redo_request:
835 ret = -EINVAL; 910 ret = -EINVAL;
836 dlm_node_iter_init(mle->vote_map, &iter); 911 dlm_node_iter_init(mle->vote_map, &iter);
837 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 912 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
@@ -856,6 +931,7 @@ wait:
856 /* keep going until the response map includes all nodes */ 931 /* keep going until the response map includes all nodes */
857 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 932 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
858 if (ret < 0) { 933 if (ret < 0) {
934 wait_on_recovery = 1;
859 mlog(0, "%s:%.*s: node map changed, redo the " 935 mlog(0, "%s:%.*s: node map changed, redo the "
860 "master request now, blocked=%d\n", 936 "master request now, blocked=%d\n",
861 dlm->name, res->lockname.len, 937 dlm->name, res->lockname.len,
@@ -866,7 +942,7 @@ wait:
866 dlm->name, res->lockname.len, 942 dlm->name, res->lockname.len,
867 res->lockname.name, blocked); 943 res->lockname.name, blocked);
868 dlm_print_one_lock_resource(res); 944 dlm_print_one_lock_resource(res);
869 /* dlm_print_one_mle(mle); */ 945 dlm_print_one_mle(mle);
870 tries = 0; 946 tries = 0;
871 } 947 }
872 goto redo_request; 948 goto redo_request;
@@ -880,7 +956,7 @@ wait:
880 dlm_mle_detach_hb_events(dlm, mle); 956 dlm_mle_detach_hb_events(dlm, mle);
881 dlm_put_mle(mle); 957 dlm_put_mle(mle);
882 /* put the extra ref */ 958 /* put the extra ref */
883 dlm_put_mle(mle); 959 dlm_put_mle_inuse(mle);
884 960
885wake_waiters: 961wake_waiters:
886 spin_lock(&res->spinlock); 962 spin_lock(&res->spinlock);
@@ -921,12 +997,14 @@ recheck:
921 spin_unlock(&res->spinlock); 997 spin_unlock(&res->spinlock);
922 /* this will cause the master to re-assert across 998 /* this will cause the master to re-assert across
923 * the whole cluster, freeing up mles */ 999 * the whole cluster, freeing up mles */
924 ret = dlm_do_master_request(mle, res->owner); 1000 if (res->owner != dlm->node_num) {
925 if (ret < 0) { 1001 ret = dlm_do_master_request(mle, res->owner);
926 /* give recovery a chance to run */ 1002 if (ret < 0) {
927 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); 1003 /* give recovery a chance to run */
928 msleep(500); 1004 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
929 goto recheck; 1005 msleep(500);
1006 goto recheck;
1007 }
930 } 1008 }
931 ret = 0; 1009 ret = 0;
932 goto leave; 1010 goto leave;
@@ -962,6 +1040,12 @@ recheck:
962 "rechecking now\n", dlm->name, res->lockname.len, 1040 "rechecking now\n", dlm->name, res->lockname.len,
963 res->lockname.name); 1041 res->lockname.name);
964 goto recheck; 1042 goto recheck;
1043 } else {
1044 if (!voting_done) {
1045 mlog(0, "map not changed and voting not done "
1046 "for %s:%.*s\n", dlm->name, res->lockname.len,
1047 res->lockname.name);
1048 }
965 } 1049 }
966 1050
967 if (m != O2NM_MAX_NODES) { 1051 if (m != O2NM_MAX_NODES) {
@@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1129 set_bit(node, mle->vote_map); 1213 set_bit(node, mle->vote_map);
1130 } else { 1214 } else {
1131 mlog(ML_ERROR, "node down! %d\n", node); 1215 mlog(ML_ERROR, "node down! %d\n", node);
1132
1133 /* if the node wasn't involved in mastery skip it,
1134 * but clear it out from the maps so that it will
1135 * not affect mastery of this lockres */
1136 clear_bit(node, mle->response_map);
1137 clear_bit(node, mle->vote_map);
1138 if (!test_bit(node, mle->maybe_map))
1139 goto next;
1140
1141 /* if we're already blocked on lock mastery, and the
1142 * dead node wasn't the expected master, or there is
1143 * another node in the maybe_map, keep waiting */
1144 if (blocked) { 1216 if (blocked) {
1145 int lowest = find_next_bit(mle->maybe_map, 1217 int lowest = find_next_bit(mle->maybe_map,
1146 O2NM_MAX_NODES, 0); 1218 O2NM_MAX_NODES, 0);
@@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1148 /* act like it was never there */ 1220 /* act like it was never there */
1149 clear_bit(node, mle->maybe_map); 1221 clear_bit(node, mle->maybe_map);
1150 1222
1151 if (node != lowest) 1223 if (node == lowest) {
1152 goto next; 1224 mlog(0, "expected master %u died"
1153 1225 " while this node was blocked "
1154 mlog(ML_ERROR, "expected master %u died while " 1226 "waiting on it!\n", node);
1155 "this node was blocked waiting on it!\n", 1227 lowest = find_next_bit(mle->maybe_map,
1156 node); 1228 O2NM_MAX_NODES,
1157 lowest = find_next_bit(mle->maybe_map, 1229 lowest+1);
1158 O2NM_MAX_NODES, 1230 if (lowest < O2NM_MAX_NODES) {
1159 lowest+1); 1231 mlog(0, "%s:%.*s:still "
1160 if (lowest < O2NM_MAX_NODES) { 1232 "blocked. waiting on %u "
1161 mlog(0, "still blocked. waiting " 1233 "now\n", dlm->name,
1162 "on %u now\n", lowest); 1234 res->lockname.len,
1163 goto next; 1235 res->lockname.name,
1236 lowest);
1237 } else {
1238 /* mle is an MLE_BLOCK, but
1239 * there is now nothing left to
1240 * block on. we need to return
1241 * all the way back out and try
1242 * again with an MLE_MASTER.
1243 * dlm_do_local_recovery_cleanup
1244 * has already run, so the mle
1245 * refcount is ok */
1246 mlog(0, "%s:%.*s: no "
1247 "longer blocking. try to "
1248 "master this here\n",
1249 dlm->name,
1250 res->lockname.len,
1251 res->lockname.name);
1252 mle->type = DLM_MLE_MASTER;
1253 mle->u.res = res;
1254 }
1164 } 1255 }
1165
1166 /* mle is an MLE_BLOCK, but there is now
1167 * nothing left to block on. we need to return
1168 * all the way back out and try again with
1169 * an MLE_MASTER. dlm_do_local_recovery_cleanup
1170 * has already run, so the mle refcount is ok */
1171 mlog(0, "no longer blocking. we can "
1172 "try to master this here\n");
1173 mle->type = DLM_MLE_MASTER;
1174 memset(mle->maybe_map, 0,
1175 sizeof(mle->maybe_map));
1176 memset(mle->response_map, 0,
1177 sizeof(mle->maybe_map));
1178 memcpy(mle->vote_map, mle->node_map,
1179 sizeof(mle->node_map));
1180 mle->u.res = res;
1181 set_bit(dlm->node_num, mle->maybe_map);
1182
1183 ret = -EAGAIN;
1184 goto next;
1185 } 1256 }
1186 1257
1187 clear_bit(node, mle->maybe_map); 1258 /* now blank out everything, as if we had never
1188 if (node > dlm->node_num) 1259 * contacted anyone */
1189 goto next; 1260 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
1190 1261 memset(mle->response_map, 0, sizeof(mle->response_map));
1191 mlog(0, "dead node in map!\n"); 1262 /* reset the vote_map to the current node_map */
1192 /* yuck. go back and re-contact all nodes 1263 memcpy(mle->vote_map, mle->node_map,
1193 * in the vote_map, removing this node. */ 1264 sizeof(mle->node_map));
1194 memset(mle->response_map, 0, 1265 /* put myself into the maybe map */
1195 sizeof(mle->response_map)); 1266 if (mle->type != DLM_MLE_BLOCK)
1267 set_bit(dlm->node_num, mle->maybe_map);
1196 } 1268 }
1197 ret = -EAGAIN; 1269 ret = -EAGAIN;
1198next:
1199 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1270 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1200 } 1271 }
1201 return ret; 1272 return ret;
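
The rewrite collapses several special-cased goto next exits into one recovery rule, sketched below with illustrative names: whenever a node dies mid-mastery, blank the per-vote state, rebuild the voter set from the live-node map, re-enter this node as a candidate unless blocked, and return -EAGAIN so the caller reruns the election.

    #include <linux/bitmap.h>
    #include <linux/bitops.h>
    #include <linux/errno.h>

    #define MAX_NODES 255

    static int restart_election(unsigned long *maybe, unsigned long *resp,
                                unsigned long *vote, const unsigned long *nodes,
                                int blocked, int me)
    {
        bitmap_zero(maybe, MAX_NODES);          /* forget all candidates */
        bitmap_zero(resp, MAX_NODES);           /* forget all responses */
        bitmap_copy(vote, nodes, MAX_NODES);    /* only live nodes vote */
        if (!blocked)
            set_bit(me, maybe);                 /* candidate again */
        return -EAGAIN;                         /* caller redoes the vote */
    }
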
@@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1316 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; 1387 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1317 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; 1388 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1318 char *name; 1389 char *name;
1319 unsigned int namelen; 1390 unsigned int namelen, hash;
1320 int found, ret; 1391 int found, ret;
1321 int set_maybe; 1392 int set_maybe;
1322 int dispatch_assert = 0; 1393 int dispatch_assert = 0;
@@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1331 1402
1332 name = request->name; 1403 name = request->name;
1333 namelen = request->namelen; 1404 namelen = request->namelen;
1405 hash = dlm_lockid_hash(name, namelen);
1334 1406
1335 if (namelen > DLM_LOCKID_NAME_MAX) { 1407 if (namelen > DLM_LOCKID_NAME_MAX) {
1336 response = DLM_IVBUFLEN; 1408 response = DLM_IVBUFLEN;
@@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1339 1411
1340way_up_top: 1412way_up_top:
1341 spin_lock(&dlm->spinlock); 1413 spin_lock(&dlm->spinlock);
1342 res = __dlm_lookup_lockres(dlm, name, namelen); 1414 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1343 if (res) { 1415 if (res) {
1344 spin_unlock(&dlm->spinlock); 1416 spin_unlock(&dlm->spinlock);
1345 1417
@@ -1459,21 +1531,18 @@ way_up_top:
1459 spin_unlock(&dlm->spinlock); 1531 spin_unlock(&dlm->spinlock);
1460 1532
1461 mle = (struct dlm_master_list_entry *) 1533 mle = (struct dlm_master_list_entry *)
1462 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); 1534 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1463 if (!mle) { 1535 if (!mle) {
1464 response = DLM_MASTER_RESP_ERROR; 1536 response = DLM_MASTER_RESP_ERROR;
1465 mlog_errno(-ENOMEM); 1537 mlog_errno(-ENOMEM);
1466 goto send_response; 1538 goto send_response;
1467 } 1539 }
1468 spin_lock(&dlm->spinlock);
1469 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
1470 name, namelen);
1471 spin_unlock(&dlm->spinlock);
1472 goto way_up_top; 1540 goto way_up_top;
1473 } 1541 }
1474 1542
1475 // mlog(0, "this is second time thru, already allocated, " 1543 // mlog(0, "this is second time thru, already allocated, "
1476 // "add the block.\n"); 1544 // "add the block.\n");
1545 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1477 set_bit(request->node_idx, mle->maybe_map); 1546 set_bit(request->node_idx, mle->maybe_map);
1478 list_add(&mle->list, &dlm->master_list); 1547 list_add(&mle->list, &dlm->master_list);
1479 response = DLM_MASTER_RESP_NO; 1548 response = DLM_MASTER_RESP_NO;
@@ -1556,6 +1625,8 @@ again:
1556 dlm_node_iter_init(nodemap, &iter); 1625 dlm_node_iter_init(nodemap, &iter);
1557 while ((to = dlm_node_iter_next(&iter)) >= 0) { 1626 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1558 int r = 0; 1627 int r = 0;
1628 struct dlm_master_list_entry *mle = NULL;
1629
1559 mlog(0, "sending assert master to %d (%.*s)\n", to, 1630 mlog(0, "sending assert master to %d (%.*s)\n", to,
1560 namelen, lockname); 1631 namelen, lockname);
1561 memset(&assert, 0, sizeof(assert)); 1632 memset(&assert, 0, sizeof(assert));
@@ -1567,20 +1638,28 @@ again:
1567 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1638 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1568 &assert, sizeof(assert), to, &r); 1639 &assert, sizeof(assert), to, &r);
1569 if (tmpret < 0) { 1640 if (tmpret < 0) {
1570 mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); 1641 mlog(0, "assert_master returned %d!\n", tmpret);
1571 if (!dlm_is_host_down(tmpret)) { 1642 if (!dlm_is_host_down(tmpret)) {
1572 mlog(ML_ERROR, "unhandled error!\n"); 1643 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1573 BUG(); 1644 BUG();
1574 } 1645 }
1575 /* a node died. finish out the rest of the nodes. */ 1646 /* a node died. finish out the rest of the nodes. */
1576 mlog(ML_ERROR, "link to %d went down!\n", to); 1647 mlog(0, "link to %d went down!\n", to);
1577 /* any nonzero status return will do */ 1648 /* any nonzero status return will do */
1578 ret = tmpret; 1649 ret = tmpret;
1579 } else if (r < 0) { 1650 } else if (r < 0) {
1580 /* ok, something horribly messed. kill thyself. */ 1651 /* ok, something horribly messed. kill thyself. */
1581 mlog(ML_ERROR,"during assert master of %.*s to %u, " 1652 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1582 "got %d.\n", namelen, lockname, to, r); 1653 "got %d.\n", namelen, lockname, to, r);
1583 dlm_dump_lock_resources(dlm); 1654 spin_lock(&dlm->spinlock);
1655 spin_lock(&dlm->master_lock);
1656 if (dlm_find_mle(dlm, &mle, (char *)lockname,
1657 namelen)) {
1658 dlm_print_one_mle(mle);
1659 __dlm_put_mle(mle);
1660 }
1661 spin_unlock(&dlm->master_lock);
1662 spin_unlock(&dlm->spinlock);
1584 BUG(); 1663 BUG();
1585 } else if (r == EAGAIN) { 1664 } else if (r == EAGAIN) {
1586 mlog(0, "%.*s: node %u create mles on other " 1665 mlog(0, "%.*s: node %u create mles on other "
@@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1612 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; 1691 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1613 struct dlm_lock_resource *res = NULL; 1692 struct dlm_lock_resource *res = NULL;
1614 char *name; 1693 char *name;
1615 unsigned int namelen; 1694 unsigned int namelen, hash;
1616 u32 flags; 1695 u32 flags;
1617 int master_request = 0; 1696 int master_request = 0;
1618 int ret = 0; 1697 int ret = 0;
@@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1622 1701
1623 name = assert->name; 1702 name = assert->name;
1624 namelen = assert->namelen; 1703 namelen = assert->namelen;
1704 hash = dlm_lockid_hash(name, namelen);
1625 flags = be32_to_cpu(assert->flags); 1705 flags = be32_to_cpu(assert->flags);
1626 1706
1627 if (namelen > DLM_LOCKID_NAME_MAX) { 1707 if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1646 if (bit >= O2NM_MAX_NODES) { 1726 if (bit >= O2NM_MAX_NODES) {
1647 /* not necessarily an error, though less likely. 1727 /* not necessarily an error, though less likely.
1648 * could be master just re-asserting. */ 1728 * could be master just re-asserting. */
1649 mlog(ML_ERROR, "no bits set in the maybe_map, but %u " 1729 mlog(0, "no bits set in the maybe_map, but %u "
1650 "is asserting! (%.*s)\n", assert->node_idx, 1730 "is asserting! (%.*s)\n", assert->node_idx,
1651 namelen, name); 1731 namelen, name);
1652 } else if (bit != assert->node_idx) { 1732 } else if (bit != assert->node_idx) {
@@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1658 * number winning the mastery will respond 1738 * number winning the mastery will respond
1659 * YES to mastery requests, but this node 1739 * YES to mastery requests, but this node
1660 * had no way of knowing. let it pass. */ 1740 * had no way of knowing. let it pass. */
1661 mlog(ML_ERROR, "%u is the lowest node, " 1741 mlog(0, "%u is the lowest node, "
1662 "%u is asserting. (%.*s) %u must " 1742 "%u is asserting. (%.*s) %u must "
1663 "have begun after %u won.\n", bit, 1743 "have begun after %u won.\n", bit,
1664 assert->node_idx, namelen, name, bit, 1744 assert->node_idx, namelen, name, bit,
1665 assert->node_idx); 1745 assert->node_idx);
1666 } 1746 }
1667 } 1747 }
1748 if (mle->type == DLM_MLE_MIGRATION) {
1749 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1750 mlog(0, "%s:%.*s: got cleanup assert"
1751 " from %u for migration\n",
1752 dlm->name, namelen, name,
1753 assert->node_idx);
1754 } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
1755 mlog(0, "%s:%.*s: got unrelated assert"
1756 " from %u for migration, ignoring\n",
1757 dlm->name, namelen, name,
1758 assert->node_idx);
1759 __dlm_put_mle(mle);
1760 spin_unlock(&dlm->master_lock);
1761 spin_unlock(&dlm->spinlock);
1762 goto done;
1763 }
1764 }
1668 } 1765 }
1669 spin_unlock(&dlm->master_lock); 1766 spin_unlock(&dlm->master_lock);
1670 1767
1671 /* ok everything checks out with the MLE 1768 /* ok everything checks out with the MLE
1672 * now check to see if there is a lockres */ 1769 * now check to see if there is a lockres */
1673 res = __dlm_lookup_lockres(dlm, name, namelen); 1770 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1674 if (res) { 1771 if (res) {
1675 spin_lock(&res->spinlock); 1772 spin_lock(&res->spinlock);
1676 if (res->state & DLM_LOCK_RES_RECOVERING) { 1773 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1679 goto kill; 1776 goto kill;
1680 } 1777 }
1681 if (!mle) { 1778 if (!mle) {
1682 if (res->owner != assert->node_idx) { 1779 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1780 res->owner != assert->node_idx) {
1683 mlog(ML_ERROR, "assert_master from " 1781 mlog(ML_ERROR, "assert_master from "
1684 "%u, but current owner is " 1782 "%u, but current owner is "
1685 "%u! (%.*s)\n", 1783 "%u! (%.*s)\n",
@@ -1732,6 +1830,7 @@ ok:
1732 if (mle) { 1830 if (mle) {
1733 int extra_ref = 0; 1831 int extra_ref = 0;
1734 int nn = -1; 1832 int nn = -1;
1833 int rr, err = 0;
1735 1834
1736 spin_lock(&mle->spinlock); 1835 spin_lock(&mle->spinlock);
1737 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1836 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
@@ -1751,27 +1850,64 @@ ok:
1751 wake_up(&mle->wq); 1850 wake_up(&mle->wq);
1752 spin_unlock(&mle->spinlock); 1851 spin_unlock(&mle->spinlock);
1753 1852
1754 if (mle->type == DLM_MLE_MIGRATION && res) { 1853 if (res) {
1755 mlog(0, "finishing off migration of lockres %.*s, "
1756 "from %u to %u\n",
1757 res->lockname.len, res->lockname.name,
1758 dlm->node_num, mle->new_master);
1759 spin_lock(&res->spinlock); 1854 spin_lock(&res->spinlock);
1760 res->state &= ~DLM_LOCK_RES_MIGRATING; 1855 if (mle->type == DLM_MLE_MIGRATION) {
1761 dlm_change_lockres_owner(dlm, res, mle->new_master); 1856 mlog(0, "finishing off migration of lockres %.*s, "
1762 BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 1857 "from %u to %u\n",
1858 res->lockname.len, res->lockname.name,
1859 dlm->node_num, mle->new_master);
1860 res->state &= ~DLM_LOCK_RES_MIGRATING;
1861 dlm_change_lockres_owner(dlm, res, mle->new_master);
1862 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1863 } else {
1864 dlm_change_lockres_owner(dlm, res, mle->master);
1865 }
1763 spin_unlock(&res->spinlock); 1866 spin_unlock(&res->spinlock);
1764 } 1867 }
1765 /* master is known, detach if not already detached */ 1868
1766 dlm_mle_detach_hb_events(dlm, mle); 1869 /* master is known, detach if not already detached.
1767 dlm_put_mle(mle); 1870 * ensures that only one assert_master call will happen
1768 1871 * on this mle. */
1872 spin_lock(&dlm->spinlock);
1873 spin_lock(&dlm->master_lock);
1874
1875 rr = atomic_read(&mle->mle_refs.refcount);
1876 if (mle->inuse > 0) {
1877 if (extra_ref && rr < 3)
1878 err = 1;
1879 else if (!extra_ref && rr < 2)
1880 err = 1;
1881 } else {
1882 if (extra_ref && rr < 2)
1883 err = 1;
1884 else if (!extra_ref && rr < 1)
1885 err = 1;
1886 }
1887 if (err) {
1888 mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
1889 "that will mess up this node, refs=%d, extra=%d, "
1890 "inuse=%d\n", dlm->name, namelen, name,
1891 assert->node_idx, rr, extra_ref, mle->inuse);
1892 dlm_print_one_mle(mle);
1893 }
1894 list_del_init(&mle->list);
1895 __dlm_mle_detach_hb_events(dlm, mle);
1896 __dlm_put_mle(mle);
1769 if (extra_ref) { 1897 if (extra_ref) {
1770 /* the assert master message now balances the extra 1898 /* the assert master message now balances the extra
1771 * ref given by the master / migration request message. 1899 * ref given by the master / migration request message.
1772 * if this is the last put, it will be removed 1900 * if this is the last put, it will be removed
1773 * from the list. */ 1901 * from the list. */
1774 dlm_put_mle(mle); 1902 __dlm_put_mle(mle);
1903 }
1904 spin_unlock(&dlm->master_lock);
1905 spin_unlock(&dlm->spinlock);
1906 } else if (res) {
1907 if (res->owner != assert->node_idx) {
1908 mlog(0, "assert_master from %u, but current "
1909 "owner is %u (%.*s), no mle\n", assert->node_idx,
1910 res->owner, namelen, name);
1775 } 1911 }
1776 } 1912 }
1777 1913
@@ -1788,12 +1924,12 @@ done:
1788 1924
1789kill: 1925kill:
1790 /* kill the caller! */ 1926 /* kill the caller! */
1927 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
1928 "and killing the other node now! This node is OK and can continue.\n");
1929 __dlm_print_one_lock_resource(res);
1791 spin_unlock(&res->spinlock); 1930 spin_unlock(&res->spinlock);
1792 spin_unlock(&dlm->spinlock); 1931 spin_unlock(&dlm->spinlock);
1793 dlm_lockres_put(res); 1932 dlm_lockres_put(res);
1794 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
1795 "and killing the other node now! This node is OK and can continue.\n");
1796 dlm_dump_lock_resources(dlm);
1797 dlm_put(dlm); 1933 dlm_put(dlm);
1798 return -EINVAL; 1934 return -EINVAL;
1799} 1935}
@@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1803 int ignore_higher, u8 request_from, u32 flags) 1939 int ignore_higher, u8 request_from, u32 flags)
1804{ 1940{
1805 struct dlm_work_item *item; 1941 struct dlm_work_item *item;
1806 item = kcalloc(1, sizeof(*item), GFP_KERNEL); 1942 item = kcalloc(1, sizeof(*item), GFP_NOFS);
1807 if (!item) 1943 if (!item)
1808 return -ENOMEM; 1944 return -ENOMEM;
1809 1945
@@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1825 list_add_tail(&item->list, &dlm->work_list); 1961 list_add_tail(&item->list, &dlm->work_list);
1826 spin_unlock(&dlm->work_lock); 1962 spin_unlock(&dlm->work_lock);
1827 1963
1828 schedule_work(&dlm->dispatched_work); 1964 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
1829 return 0; 1965 return 0;
1830} 1966}
1831 1967
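
Deferred assert-master work moves off the shared keventd queue (schedule_work) onto the domain's private dlm->dlm_worker, so slow lock and recovery work can no longer stall unrelated kernel work or be stalled by it. A sketch using the current workqueue API:

    #include <linux/errno.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *demo_wq;

    static void demo_fn(struct work_struct *w)
    {
        /* the deferred work runs here */
    }
    static DECLARE_WORK(demo_work, demo_fn);

    static int demo_init(void)
    {
        demo_wq = create_singlethread_workqueue("demo_worker");
        if (!demo_wq)
            return -ENOMEM;
        queue_work(demo_wq, &demo_work);  /* was: schedule_work(&demo_work) */
        return 0;
    }
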
@@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1866 } 2002 }
1867 } 2003 }
1868 2004
2005 /*
2006 * If we're migrating this lock to someone else, we are no
 2007 * longer allowed to assert our own mastery. OTOH, we need to
2008 * prevent migration from starting while we're still asserting
2009 * our dominance. The reserved ast delays migration.
2010 */
2011 spin_lock(&res->spinlock);
2012 if (res->state & DLM_LOCK_RES_MIGRATING) {
2013 mlog(0, "Someone asked us to assert mastery, but we're "
2014 "in the middle of migration. Skipping assert, "
2015 "the new master will handle that.\n");
2016 spin_unlock(&res->spinlock);
2017 goto put;
2018 } else
2019 __dlm_lockres_reserve_ast(res);
2020 spin_unlock(&res->spinlock);
2021
1869 /* this call now finishes out the nodemap 2022 /* this call now finishes out the nodemap
1870 * even if one or more nodes die */ 2023 * even if one or more nodes die */
1871 mlog(0, "worker about to master %.*s here, this=%u\n", 2024 mlog(0, "worker about to master %.*s here, this=%u\n",
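
The worker must not assert mastery for a lockres that is mid-migration, and migration must not start while an assert is in flight; reserving an AST under res->spinlock gives both directions of that exclusion, and the reservation is released once the broadcast completes. The gate, with hypothetical stubs:

    #include <linux/spinlock.h>

    #define RES_MIGRATING 0x01

    struct demo_res {
        spinlock_t lock;
        unsigned int state;
    };

    extern void reserve_ast(struct demo_res *r);    /* hypothetical */
    extern void release_ast(struct demo_res *r);
    extern void broadcast_assert(struct demo_res *r);
    extern void res_put(struct demo_res *r);

    static void demo_assert_worker(struct demo_res *r)
    {
        spin_lock(&r->lock);
        if (r->state & RES_MIGRATING) {
            spin_unlock(&r->lock);
            goto put;                   /* the new master asserts instead */
        }
        reserve_ast(r);                 /* migration must now wait */
        spin_unlock(&r->lock);

        broadcast_assert(r);
        release_ast(r);                 /* migration may proceed */
    put:
        res_put(r);
    }
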
@@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1875 nodemap, flags); 2028 nodemap, flags);
1876 if (ret < 0) { 2029 if (ret < 0) {
1877 /* no need to restart, we are done */ 2030 /* no need to restart, we are done */
1878 mlog_errno(ret); 2031 if (!dlm_is_host_down(ret))
2032 mlog_errno(ret);
1879 } 2033 }
1880 2034
2035 /* Ok, we've asserted ourselves. Let's let migration start. */
2036 dlm_lockres_release_ast(dlm, res);
2037
2038put:
1881 dlm_lockres_put(res); 2039 dlm_lockres_put(res);
1882 2040
1883 mlog(0, "finished with dlm_assert_master_worker\n"); 2041 mlog(0, "finished with dlm_assert_master_worker\n");
@@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
1916 BUG(); 2074 BUG();
1917 /* host is down, so answer for that node would be 2075 /* host is down, so answer for that node would be
1918 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ 2076 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
2077 ret = 0;
1919 } 2078 }
1920 2079
1921 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { 2080 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
@@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2016 */ 2175 */
2017 2176
2018 ret = -ENOMEM; 2177 ret = -ENOMEM;
2019 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); 2178 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2020 if (!mres) { 2179 if (!mres) {
2021 mlog_errno(ret); 2180 mlog_errno(ret);
2022 goto leave; 2181 goto leave;
2023 } 2182 }
2024 2183
2025 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2184 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2026 GFP_KERNEL); 2185 GFP_NOFS);
2027 if (!mle) { 2186 if (!mle) {
2028 mlog_errno(ret); 2187 mlog_errno(ret);
2029 goto leave; 2188 goto leave;
@@ -2117,7 +2276,7 @@ fail:
2117 * take both dlm->spinlock and dlm->master_lock */ 2276 * take both dlm->spinlock and dlm->master_lock */
2118 spin_lock(&dlm->spinlock); 2277 spin_lock(&dlm->spinlock);
2119 spin_lock(&dlm->master_lock); 2278 spin_lock(&dlm->master_lock);
2120 dlm_get_mle(mle); 2279 dlm_get_mle_inuse(mle);
2121 spin_unlock(&dlm->master_lock); 2280 spin_unlock(&dlm->master_lock);
2122 spin_unlock(&dlm->spinlock); 2281 spin_unlock(&dlm->spinlock);
2123 2282
@@ -2134,7 +2293,10 @@ fail:
2134 /* migration failed, detach and clean up mle */ 2293 /* migration failed, detach and clean up mle */
2135 dlm_mle_detach_hb_events(dlm, mle); 2294 dlm_mle_detach_hb_events(dlm, mle);
2136 dlm_put_mle(mle); 2295 dlm_put_mle(mle);
2137 dlm_put_mle(mle); 2296 dlm_put_mle_inuse(mle);
2297 spin_lock(&res->spinlock);
2298 res->state &= ~DLM_LOCK_RES_MIGRATING;
2299 spin_unlock(&res->spinlock);
2138 goto leave; 2300 goto leave;
2139 } 2301 }
2140 2302
@@ -2164,8 +2326,8 @@ fail:
2164 /* avoid hang during shutdown when migrating lockres 2326 /* avoid hang during shutdown when migrating lockres
2165 * to a node which also goes down */ 2327 * to a node which also goes down */
2166 if (dlm_is_node_dead(dlm, target)) { 2328 if (dlm_is_node_dead(dlm, target)) {
2167 mlog(0, "%s:%.*s: expected migration target %u " 2329 mlog(0, "%s:%.*s: expected migration "
2168 "is no longer up. restarting.\n", 2330 "target %u is no longer up, restarting\n",
2169 dlm->name, res->lockname.len, 2331 dlm->name, res->lockname.len,
2170 res->lockname.name, target); 2332 res->lockname.name, target);
2171 ret = -ERESTARTSYS; 2333 ret = -ERESTARTSYS;
@@ -2175,7 +2337,10 @@ fail:
2175 /* migration failed, detach and clean up mle */ 2337 /* migration failed, detach and clean up mle */
2176 dlm_mle_detach_hb_events(dlm, mle); 2338 dlm_mle_detach_hb_events(dlm, mle);
2177 dlm_put_mle(mle); 2339 dlm_put_mle(mle);
2178 dlm_put_mle(mle); 2340 dlm_put_mle_inuse(mle);
2341 spin_lock(&res->spinlock);
2342 res->state &= ~DLM_LOCK_RES_MIGRATING;
2343 spin_unlock(&res->spinlock);
2179 goto leave; 2344 goto leave;
2180 } 2345 }
2181 /* TODO: if node died: stop, clean up, return error */ 2346 /* TODO: if node died: stop, clean up, return error */
@@ -2191,7 +2356,7 @@ fail:
2191 2356
2192 /* master is known, detach if not already detached */ 2357 /* master is known, detach if not already detached */
2193 dlm_mle_detach_hb_events(dlm, mle); 2358 dlm_mle_detach_hb_events(dlm, mle);
2194 dlm_put_mle(mle); 2359 dlm_put_mle_inuse(mle);
2195 ret = 0; 2360 ret = 0;
2196 2361
2197 dlm_lockres_calc_usage(dlm, res); 2362 dlm_lockres_calc_usage(dlm, res);
@@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2462 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; 2627 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
2463 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; 2628 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
2464 const char *name; 2629 const char *name;
2465 unsigned int namelen; 2630 unsigned int namelen, hash;
2466 int ret = 0; 2631 int ret = 0;
2467 2632
2468 if (!dlm_grab(dlm)) 2633 if (!dlm_grab(dlm))
@@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2470 2635
2471 name = migrate->name; 2636 name = migrate->name;
2472 namelen = migrate->namelen; 2637 namelen = migrate->namelen;
2638 hash = dlm_lockid_hash(name, namelen);
2473 2639
2474 /* preallocate.. if this fails, abort */ 2640 /* preallocate.. if this fails, abort */
2475 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2641 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2476 GFP_KERNEL); 2642 GFP_NOFS);
2477 2643
2478 if (!mle) { 2644 if (!mle) {
2479 ret = -ENOMEM; 2645 ret = -ENOMEM;
@@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2482 2648
2483 /* check for pre-existing lock */ 2649 /* check for pre-existing lock */
2484 spin_lock(&dlm->spinlock); 2650 spin_lock(&dlm->spinlock);
2485 res = __dlm_lookup_lockres(dlm, name, namelen); 2651 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
2486 spin_lock(&dlm->master_lock); 2652 spin_lock(&dlm->master_lock);
2487 2653
2488 if (res) { 2654 if (res) {
@@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
2580 /* remove it from the list so that only one 2746 /* remove it from the list so that only one
2581 * mle will be found */ 2747 * mle will be found */
2582 list_del_init(&tmp->list); 2748 list_del_init(&tmp->list);
2749 __dlm_mle_detach_hb_events(dlm, mle);
2583 } 2750 }
2584 spin_unlock(&tmp->spinlock); 2751 spin_unlock(&tmp->spinlock);
2585 } 2752 }
@@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
2601 struct list_head *iter, *iter2; 2768 struct list_head *iter, *iter2;
2602 struct dlm_master_list_entry *mle; 2769 struct dlm_master_list_entry *mle;
2603 struct dlm_lock_resource *res; 2770 struct dlm_lock_resource *res;
2771 unsigned int hash;
2604 2772
2605 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 2773 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
2606top: 2774top:
@@ -2640,7 +2808,7 @@ top:
2640 * may result in the mle being unlinked and 2808 * may result in the mle being unlinked and
2641 * freed, but there may still be a process 2809 * freed, but there may still be a process
2642 * waiting in the dlmlock path which is fine. */ 2810 * waiting in the dlmlock path which is fine. */
2643 mlog(ML_ERROR, "node %u was expected master\n", 2811 mlog(0, "node %u was expected master\n",
2644 dead_node); 2812 dead_node);
2645 atomic_set(&mle->woken, 1); 2813 atomic_set(&mle->woken, 1);
2646 spin_unlock(&mle->spinlock); 2814 spin_unlock(&mle->spinlock);
@@ -2673,19 +2841,21 @@ top:
2673 2841
2674 /* remove from the list early. NOTE: unlinking 2842 /* remove from the list early. NOTE: unlinking
2675 * list_head while in list_for_each_safe */ 2843 * list_head while in list_for_each_safe */
2844 __dlm_mle_detach_hb_events(dlm, mle);
2676 spin_lock(&mle->spinlock); 2845 spin_lock(&mle->spinlock);
2677 list_del_init(&mle->list); 2846 list_del_init(&mle->list);
2678 atomic_set(&mle->woken, 1); 2847 atomic_set(&mle->woken, 1);
2679 spin_unlock(&mle->spinlock); 2848 spin_unlock(&mle->spinlock);
2680 wake_up(&mle->wq); 2849 wake_up(&mle->wq);
2681 2850
2682 mlog(0, "node %u died during migration from " 2851 mlog(0, "%s: node %u died during migration from "
2683 "%u to %u!\n", dead_node, 2852 "%u to %u!\n", dlm->name, dead_node,
2684 mle->master, mle->new_master); 2853 mle->master, mle->new_master);
2685 /* if there is a lockres associated with this 2854 /* if there is a lockres associated with this
2686 * mle, find it and set its owner to UNKNOWN */ 2855 * mle, find it and set its owner to UNKNOWN */
2856 hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
2687 res = __dlm_lookup_lockres(dlm, mle->u.name.name, 2857 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
2688 mle->u.name.len); 2858 mle->u.name.len, hash);
2689 if (res) { 2859 if (res) {
2690 /* unfortunately if we hit this rare case, our 2860 /* unfortunately if we hit this rare case, our
2691 * lock ordering is messed. we need to drop 2861 * lock ordering is messed. we need to drop
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 805cbabac051..29b2845f370d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
98 98
99static u64 dlm_get_next_mig_cookie(void); 99static u64 dlm_get_next_mig_cookie(void);
100 100
101static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; 101static DEFINE_SPINLOCK(dlm_reco_state_lock);
102static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; 102static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
103static u64 dlm_mig_cookie = 1; 103static u64 dlm_mig_cookie = 1;
104 104
105static u64 dlm_get_next_mig_cookie(void) 105static u64 dlm_get_next_mig_cookie(void)
@@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void)
115 return c; 115 return c;
116} 116}
117 117
118static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
119 u8 dead_node)
120{
121 assert_spin_locked(&dlm->spinlock);
122 if (dlm->reco.dead_node != dead_node)
123 mlog(0, "%s: changing dead_node from %u to %u\n",
124 dlm->name, dlm->reco.dead_node, dead_node);
125 dlm->reco.dead_node = dead_node;
126}
127
128static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
129 u8 master)
130{
131 assert_spin_locked(&dlm->spinlock);
132 mlog(0, "%s: changing new_master from %u to %u\n",
133 dlm->name, dlm->reco.new_master, master);
134 dlm->reco.new_master = master;
135}
136
137static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
138{
139 assert_spin_locked(&dlm->spinlock);
140 clear_bit(dlm->reco.dead_node, dlm->recovery_map);
141 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
142 dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
143}
144
118static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) 145static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
119{ 146{
120 spin_lock(&dlm->spinlock); 147 spin_lock(&dlm->spinlock);
121 clear_bit(dlm->reco.dead_node, dlm->recovery_map); 148 __dlm_reset_recovery(dlm);
122 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
123 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
124 spin_unlock(&dlm->spinlock); 149 spin_unlock(&dlm->spinlock);
125} 150}
126 151
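
Every write to reco.dead_node and reco.new_master now funnels through a setter that asserts dlm->spinlock and logs the transition, and __dlm_reset_recovery() is the lock-held flavor wrapped by dlm_reset_recovery(). A sketch of that locked/unlocked helper convention:

    #include <linux/printk.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct demo_dom {
        spinlock_t lock;
        u8 state;
    };

    /* __helper expects the lock held; the plain helper takes it. All
     * transitions pass one logging point instead of direct stores. */
    static inline void __demo_set_state(struct demo_dom *d, u8 v)
    {
        assert_spin_locked(&d->lock);
        if (d->state != v)
            pr_info("state %u -> %u\n", d->state, v);
        d->state = v;
    }

    static inline void demo_set_state(struct demo_dom *d, u8 v)
    {
        spin_lock(&d->lock);
        __demo_set_state(d, v);
        spin_unlock(&d->lock);
    }
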
@@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data)
132 struct list_head *iter, *iter2; 157 struct list_head *iter, *iter2;
133 struct dlm_work_item *item; 158 struct dlm_work_item *item;
134 dlm_workfunc_t *workfunc; 159 dlm_workfunc_t *workfunc;
160 int tot=0;
161
162 if (!dlm_joined(dlm))
163 return;
135 164
136 spin_lock(&dlm->work_lock); 165 spin_lock(&dlm->work_lock);
137 list_splice_init(&dlm->work_list, &tmp_list); 166 list_splice_init(&dlm->work_list, &tmp_list);
138 spin_unlock(&dlm->work_lock); 167 spin_unlock(&dlm->work_lock);
139 168
140 list_for_each_safe(iter, iter2, &tmp_list) { 169 list_for_each_safe(iter, iter2, &tmp_list) {
170 tot++;
171 }
172 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
173
174 list_for_each_safe(iter, iter2, &tmp_list) {
141 item = list_entry(iter, struct dlm_work_item, list); 175 item = list_entry(iter, struct dlm_work_item, list);
142 workfunc = item->func; 176 workfunc = item->func;
143 list_del_init(&item->list); 177 list_del_init(&item->list);
@@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
220 * 254 *
221 */ 255 */
222 256
257static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
258{
259 struct dlm_reco_node_data *ndata;
260 struct dlm_lock_resource *res;
261
262 mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
263 dlm->name, dlm->dlm_reco_thread_task->pid,
264 dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
265 dlm->reco.dead_node, dlm->reco.new_master);
266
267 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
268 char *st = "unknown";
269 switch (ndata->state) {
270 case DLM_RECO_NODE_DATA_INIT:
271 st = "init";
272 break;
273 case DLM_RECO_NODE_DATA_REQUESTING:
274 st = "requesting";
275 break;
276 case DLM_RECO_NODE_DATA_DEAD:
277 st = "dead";
278 break;
279 case DLM_RECO_NODE_DATA_RECEIVING:
280 st = "receiving";
281 break;
282 case DLM_RECO_NODE_DATA_REQUESTED:
283 st = "requested";
284 break;
285 case DLM_RECO_NODE_DATA_DONE:
286 st = "done";
287 break;
288 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
289 st = "finalize-sent";
290 break;
291 default:
292 st = "bad";
293 break;
294 }
295 mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
296 dlm->name, ndata->node_num, st);
297 }
298 list_for_each_entry(res, &dlm->reco.resources, recovering) {
299 mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
300 dlm->name, res->lockname.len, res->lockname.name);
301 }
302}
223 303
224#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) 304#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
225 305
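
For what it is worth, the state-to-string switch above could also be written as a designated-initializer table; this alternative sketch assumes the DLM_RECO_NODE_DATA_* constants in dlmcommon.h form a small zero-based enum, which the switch form does not require:

    #include <linux/kernel.h>
    #include "dlmcommon.h"

    static const char * const reco_state_str[] = {
        [DLM_RECO_NODE_DATA_INIT]          = "init",
        [DLM_RECO_NODE_DATA_REQUESTING]    = "requesting",
        [DLM_RECO_NODE_DATA_DEAD]          = "dead",
        [DLM_RECO_NODE_DATA_RECEIVING]     = "receiving",
        [DLM_RECO_NODE_DATA_REQUESTED]     = "requested",
        [DLM_RECO_NODE_DATA_DONE]          = "done",
        [DLM_RECO_NODE_DATA_FINALIZE_SENT] = "finalize-sent",
    };

    static const char *reco_state_name(unsigned int st)
    {
        if (st >= ARRAY_SIZE(reco_state_str) || !reco_state_str[st])
            return "bad";
        return reco_state_str[st];
    }
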
@@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
267{ 347{
268 int dead; 348 int dead;
269 spin_lock(&dlm->spinlock); 349 spin_lock(&dlm->spinlock);
270 dead = test_bit(node, dlm->domain_map); 350 dead = !test_bit(node, dlm->domain_map);
271 spin_unlock(&dlm->spinlock); 351 spin_unlock(&dlm->spinlock);
272 return dead; 352 return dead;
273} 353}
274 354
355/* returns true if node is no longer in the domain
356 * could be dead or just not joined */
357static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
358{
359 int recovered;
360 spin_lock(&dlm->spinlock);
361 recovered = !test_bit(node, dlm->recovery_map);
362 spin_unlock(&dlm->spinlock);
363 return recovered;
364}
365
366
275int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 367int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
276{ 368{
277 if (timeout) { 369 if (timeout) {
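
Note the one-character bugfix at the top of this hunk: test_bit(node, domain_map) is nonzero while the node is still alive, so dlm_is_node_dead() used to return the opposite of its name; the negation fixes it, and dlm_is_node_recovered() applies the same (correct) shape to recovery_map. Callers pair such predicates with wait_event_timeout(), roughly:

    #include <linux/jiffies.h>
    #include <linux/wait.h>

    extern wait_queue_head_t reco_wq;   /* woken as the maps change */
    extern int node_gone(int node);     /* e.g. !test_bit(node, map) */

    static void wait_node_gone(int node, int timeout_ms)
    {
        if (timeout_ms)
            wait_event_timeout(reco_wq, node_gone(node),
                               msecs_to_jiffies(timeout_ms));
        else
            wait_event(reco_wq, node_gone(node));
    }
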
@@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
290 return 0; 382 return 0;
291} 383}
292 384
385int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
386{
387 if (timeout) {
388 mlog(0, "%s: waiting %dms for notification of "
389 "recovery of node %u\n", dlm->name, timeout, node);
390 wait_event_timeout(dlm->dlm_reco_thread_wq,
391 dlm_is_node_recovered(dlm, node),
392 msecs_to_jiffies(timeout));
393 } else {
394 mlog(0, "%s: waiting indefinitely for notification "
395 "of recovery of node %u\n", dlm->name, node);
396 wait_event(dlm->dlm_reco_thread_wq,
397 dlm_is_node_recovered(dlm, node));
398 }
399 /* for now, return 0 */
400 return 0;
401}
402
293/* callers of the top-level api calls (dlmlock/dlmunlock) should 403/* callers of the top-level api calls (dlmlock/dlmunlock) should
294 * block on the dlm->reco.event when recovery is in progress. 404 * block on the dlm->reco.event when recovery is in progress.
295 * the dlm recovery thread will set this state when it begins 405 * the dlm recovery thread will set this state when it begins
@@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
308 418
309void dlm_wait_for_recovery(struct dlm_ctxt *dlm) 419void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
310{ 420{
421 if (dlm_in_recovery(dlm)) {
422 mlog(0, "%s: reco thread %d in recovery: "
423 "state=%d, master=%u, dead=%u\n",
424 dlm->name, dlm->dlm_reco_thread_task->pid,
425 dlm->reco.state, dlm->reco.new_master,
426 dlm->reco.dead_node);
427 }
311 wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); 428 wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
312} 429}
313 430
@@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		mlog(0, "new master %u died while recovering %u!\n",
 		     dlm->reco.new_master, dlm->reco.dead_node);
 		/* unset the new_master, leave dead_node */
-		dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+		dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
 	}
 
 	/* select a target to recover */
@@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 
 		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
 		if (bit >= O2NM_MAX_NODES || bit < 0)
-			dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 		else
-			dlm->reco.dead_node = bit;
+			dlm_set_reco_dead_node(dlm, bit);
 	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
 		/* BUG? */
 		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
 		     dlm->reco.dead_node);
-		dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+		dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 	}
 
 	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
@@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		/* return to main thread loop and sleep. */
 		return 0;
 	}
-	mlog(0, "recovery thread found node %u in the recovery map!\n",
+	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
+	     dlm->name, dlm->dlm_reco_thread_task->pid,
 	     dlm->reco.dead_node);
 	spin_unlock(&dlm->spinlock);
 
@@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		}
 		mlog(0, "another node will master this recovery session.\n");
 	}
-	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
-	     dlm->name, dlm->reco.new_master,
+	mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
+	     dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
 	     dlm->node_num, dlm->reco.dead_node);
 
 	/* it is safe to start everything back up here
@@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	return 0;
 
 master_here:
-	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+	mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
+	     dlm->dlm_reco_thread_task->pid,
 	     dlm->name, dlm->reco.dead_node, dlm->node_num);
 
 	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
 	if (status < 0) {
+		/* we should never hit this anymore */
 		mlog(ML_ERROR, "error %d remastering locks for node %u, "
 		     "retrying.\n", status, dlm->reco.dead_node);
 		/* yield a bit to allow any final network messages
@@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 	int destroy = 0;
 	int pass = 0;
 
-	status = dlm_init_recovery_area(dlm, dead_node);
-	if (status < 0)
-		goto leave;
+	do {
+		/* we have become recovery master.  there is no escaping
+		 * this, so just keep trying until we get it. */
+		status = dlm_init_recovery_area(dlm, dead_node);
+		if (status < 0) {
+			mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+			     "retrying\n", dlm->name);
+			msleep(1000);
+		}
+	} while (status != 0);
 
 	/* safe to access the node data list without a lock, since this
 	 * process is the only one to change the list */
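
Once this node has been elected recovery master there is no caller to report an allocation failure to, so the old goto-on-error turns into a sleep-and-retry loop that cannot give up. The bare pattern, with an illustrative (hypothetical) fallible step:

	#include <linux/delay.h>

	int ret;

	do {
		ret = try_allocate();	/* hypothetical step that can fail */
		if (ret < 0)
			msleep(1000);	/* back off, then retry forever */
	} while (ret != 0);
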
@@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			continue;
 		}
 
-		status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
-		if (status < 0) {
-			mlog_errno(status);
-			if (dlm_is_host_down(status))
-				ndata->state = DLM_RECO_NODE_DATA_DEAD;
-			else {
-				destroy = 1;
-				goto leave;
+		do {
+			status = dlm_request_all_locks(dlm, ndata->node_num,
+						       dead_node);
+			if (status < 0) {
+				mlog_errno(status);
+				if (dlm_is_host_down(status)) {
+					/* node died, ignore it for recovery */
+					status = 0;
+					ndata->state = DLM_RECO_NODE_DATA_DEAD;
+					/* wait for the domain map to catch up
+					 * with the network state. */
+					wait_event_timeout(dlm->dlm_reco_thread_wq,
+							   dlm_is_node_dead(dlm,
+								ndata->node_num),
+							   msecs_to_jiffies(1000));
+					mlog(0, "waited 1 sec for %u, "
+					     "dead? %s\n", ndata->node_num,
+					     dlm_is_node_dead(dlm, ndata->node_num) ?
+					     "yes" : "no");
+				} else {
+					/* -ENOMEM on the other node */
+					mlog(0, "%s: node %u returned "
+					     "%d during recovery, retrying "
+					     "after a short wait\n",
+					     dlm->name, ndata->node_num,
+					     status);
+					msleep(100);
+				}
 			}
-		}
+		} while (status != 0);
 
 		switch (ndata->state) {
 		case DLM_RECO_NODE_DATA_INIT:
@@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				mlog(0, "node %u died after requesting "
 				     "recovery info for node %u\n",
 				     ndata->node_num, dead_node);
-				// start all over
-				destroy = 1;
-				status = -EAGAIN;
-				goto leave;
+				/* fine.  don't need this node's info.
+				 * continue without it. */
+				break;
 			case DLM_RECO_NODE_DATA_REQUESTING:
 				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
 				mlog(0, "now receiving recovery data from "
@@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				BUG();
 				break;
 			case DLM_RECO_NODE_DATA_DEAD:
-				mlog(ML_NOTICE, "node %u died after "
+				mlog(0, "node %u died after "
 				     "requesting recovery info for "
 				     "node %u\n", ndata->node_num,
 				     dead_node);
-				spin_unlock(&dlm_reco_state_lock);
-				// start all over
-				destroy = 1;
-				status = -EAGAIN;
-				/* instead of spinning like crazy here,
-				 * wait for the domain map to catch up
-				 * with the network state. otherwise this
-				 * can be hit hundreds of times before
-				 * the node is really seen as dead. */
-				wait_event_timeout(dlm->dlm_reco_thread_wq,
-						   dlm_is_node_dead(dlm,
-							ndata->node_num),
-						   msecs_to_jiffies(1000));
-				mlog(0, "waited 1 sec for %u, "
-				     "dead? %s\n", ndata->node_num,
-				     dlm_is_node_dead(dlm, ndata->node_num) ?
-				     "yes" : "no");
-				goto leave;
+				break;
 			case DLM_RECO_NODE_DATA_RECEIVING:
 			case DLM_RECO_NODE_DATA_REQUESTED:
+				mlog(0, "%s: node %u still in state %s\n",
+				     dlm->name, ndata->node_num,
+				     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
+				     "receiving" : "requested");
 				all_nodes_done = 0;
 				break;
 			case DLM_RECO_NODE_DATA_DONE:
+				mlog(0, "%s: node %u state is done\n",
+				     dlm->name, ndata->node_num);
 				break;
 			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				mlog(0, "%s: node %u state is finalize\n",
+				     dlm->name, ndata->node_num);
 				break;
 			}
 		}
@@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			     jiffies, dlm->reco.dead_node,
 			     dlm->node_num, dlm->reco.new_master);
 			destroy = 1;
-			status = ret;
+			status = 0;
 			/* rescan everything marked dirty along the way */
 			dlm_kick_thread(dlm, NULL);
 			break;
@@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
 	}
 
-leave:
 	if (destroy)
 		dlm_destroy_recovery_area(dlm, dead_node);
 
@@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 	}
 	BUG_ON(num == dead_node);
 
-	ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+	ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
 	if (!ndata) {
 		dlm_destroy_recovery_area(dlm, dead_node);
 		return -ENOMEM;
@@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return -EINVAL;
 
+	if (lr->dead_node != dlm->reco.dead_node) {
+		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
+		     "dead_node is %u\n", dlm->name, lr->node_idx,
+		     lr->dead_node, dlm->reco.dead_node);
+		dlm_print_reco_node_status(dlm);
+		/* this is a hack */
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
 	BUG_ON(lr->dead_node != dlm->reco.dead_node);
 
-	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_NOFS);
 	if (!item) {
 		dlm_put(dlm);
 		return -ENOMEM;
 	}
 
 	/* this will get freed by dlm_request_all_locks_worker */
-	buf = (char *) __get_free_page(GFP_KERNEL);
+	buf = (char *) __get_free_page(GFP_NOFS);
 	if (!buf) {
 		kfree(item);
 		dlm_put(dlm);
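
The allocations in this handler (and several below) switch from GFP_KERNEL to GFP_NOFS. GFP_KERNEL allows the allocator to recurse into filesystem writeback to reclaim memory, which can deadlock when the allocating thread is itself part of the machinery that writeback may be waiting on; GFP_NOFS still sleeps but will not re-enter the filesystem. A hedged illustration (the struct is a stand-in, not a real dlm type):

	#include <linux/slab.h>

	struct work_payload *item;	/* hypothetical structure */

	/* safe in reclaim-sensitive fs/cluster paths */
	item = kcalloc(1, sizeof(*item), GFP_NOFS);
	if (!item)
		return -ENOMEM;
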
@@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
-	schedule_work(&dlm->dispatched_work);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 	dlm_put(dlm);
 	return 0;
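
Work dispatch also moves from schedule_work(), which shares the kernel-wide default workqueue, to queue_work() on a per-domain workqueue (dlm->dlm_worker, created elsewhere in this patch set), so dlm work can neither stall nor be stalled by unrelated work items. A sketch of the idiom with illustrative names (note that the work-item initialization macros changed signatures across kernel versions):

	#include <linux/workqueue.h>

	static struct workqueue_struct *wq;

	wq = create_singlethread_workqueue("my_worker");
	if (!wq)
		return -ENOMEM;

	queue_work(wq, &my_work);	/* instead of schedule_work(&my_work) */

	destroy_workqueue(wq);		/* on teardown */
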
@@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	struct list_head *iter;
 	int ret;
 	u8 dead_node, reco_master;
+	int skip_all_done = 0;
 
 	dlm = item->dlm;
 	dead_node = item->u.ral.dead_node;
 	reco_master = item->u.ral.reco_master;
 	mres = (struct dlm_migratable_lockres *)data;
 
+	mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
+	     dlm->name, dead_node, reco_master);
+
 	if (dead_node != dlm->reco.dead_node ||
 	    reco_master != dlm->reco.new_master) {
-		/* show extra debug info if the recovery state is messed */
-		mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
-		     "request(dead=%u, master=%u)\n",
-		     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
-		     dead_node, reco_master);
-		mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
-		     "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
-		     dlm->name, mres->lockname_len, mres->lockname, mres->master,
-		     mres->num_locks, mres->total_locks, mres->flags,
-		     dlm_get_lock_cookie_node(mres->ml[0].cookie),
-		     dlm_get_lock_cookie_seq(mres->ml[0].cookie),
-		     mres->ml[0].list, mres->ml[0].flags,
-		     mres->ml[0].type, mres->ml[0].convert_type,
-		     mres->ml[0].highest_blocked, mres->ml[0].node);
-		BUG();
+		/* worker could have been created before the recovery master
+		 * died.  if so, do not continue, but do not error. */
+		if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+			mlog(ML_NOTICE, "%s: will not send recovery state, "
+			     "recovery master %u died, thread=(dead=%u,mas=%u)"
+			     " current=(dead=%u,mas=%u)\n", dlm->name,
+			     reco_master, dead_node, reco_master,
+			     dlm->reco.dead_node, dlm->reco.new_master);
+		} else {
+			mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+			     "master=%u), request(dead=%u, master=%u)\n",
+			     dlm->name, dlm->reco.dead_node,
+			     dlm->reco.new_master, dead_node, reco_master);
+		}
+		goto leave;
 	}
-	BUG_ON(dead_node != dlm->reco.dead_node);
-	BUG_ON(reco_master != dlm->reco.new_master);
 
 	/* lock resources should have already been moved to the
 	 * dlm->reco.resources list.  now move items from that list
@@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
 
 	/* now we can begin blasting lockreses without the dlm lock */
+
+	/* any errors returned will be due to the new_master dying,
+	 * the dlm_reco_thread should detect this */
 	list_for_each(iter, &resources) {
 		res = list_entry (iter, struct dlm_lock_resource, recovering);
 		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
 					   DLM_MRES_RECOVERY);
-		if (ret < 0)
-			mlog_errno(ret);
+		if (ret < 0) {
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery state for dead node %u, ret=%d\n", dlm->name,
+			     reco_master, dead_node, ret);
+			skip_all_done = 1;
+			break;
+		}
 	}
 
 	/* move the resources back to the list */
@@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	list_splice_init(&resources, &dlm->reco.resources);
 	spin_unlock(&dlm->spinlock);
 
-	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
-	if (ret < 0)
-		mlog_errno(ret);
-
+	if (!skip_all_done) {
+		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+		if (ret < 0) {
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery all-done for dead node %u, ret=%d\n",
+			     dlm->name, reco_master, dead_node, ret);
+		}
+	}
+leave:
 	free_page((unsigned long)data);
 }
 
@@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
 
 	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
 				 sizeof(done_msg), send_to, &tmpret);
-	/* negative status is ignored by the caller */
-	if (ret >= 0)
+	if (ret < 0) {
+		if (!dlm_is_host_down(ret)) {
+			mlog_errno(ret);
+			mlog(ML_ERROR, "%s: unknown error sending data-done "
+			     "to %u\n", dlm->name, send_to);
+			BUG();
+		}
+	} else
 		ret = tmpret;
 	return ret;
 }
@@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
 	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
 	     "node_idx=%u, this node=%u\n", done->dead_node,
 	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
-	BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+	mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
+			"Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+			"node_idx=%u, this node=%u\n", done->dead_node,
+			dlm->reco.dead_node, done->node_idx, dlm->node_num);
 
 	spin_lock(&dlm_reco_state_lock);
 	list_for_each(iter, &dlm->reco.node_data) {
@@ -905,13 +1075,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 			mlog(0, "found lockres owned by dead node while "
 			     "doing recovery for node %u. sending it.\n",
 			     dead_node);
-			list_del_init(&res->recovering);
-			list_add_tail(&res->recovering, list);
+			list_move_tail(&res->recovering, list);
 		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
 			mlog(0, "found UNKNOWN owner while doing recovery "
 			     "for node %u. sending it.\n", dead_node);
-			list_del_init(&res->recovering);
-			list_add_tail(&res->recovering, list);
+			list_move_tail(&res->recovering, list);
 		}
 	}
 	spin_unlock(&dlm->spinlock);
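
The paired list calls above collapse into list_move_tail(), which performs exactly that unlink-then-append in one step:

	/* list_move_tail(entry, head) has the same net effect as: */
	list_del_init(&res->recovering);	/* drop from current list */
	list_add_tail(&res->recovering, list);	/* append to target list */
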
@@ -1023,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
 		    ml->type == LKM_PRMODE) {
 			/* if it is already set, this had better be a PR
 			 * and it has to match */
-			if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
-			    memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+			if (!dlm_lvb_is_empty(mres->lvb) &&
+			    (ml->type == LKM_EXMODE ||
+			     memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
 				mlog(ML_ERROR, "mismatched lvbs!\n");
 				__dlm_print_one_lock_resource(lock->lockres);
 				BUG();
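
Checking only mres->lvb[0] misclassified any LVB whose first byte happened to be zero as empty; dlm_lvb_is_empty(), introduced elsewhere in this series, examines the whole value block. A plausible shape for such a helper, shown as an illustration rather than the exact in-tree definition:

	static inline int dlm_lvb_is_empty(char *lvb)
	{
		int i;

		for (i = 0; i < DLM_LVB_LEN; i++)
			if (lvb[i])
				return 0;
		return 1;
	}
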
@@ -1083,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 * we must send it immediately. */
 			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
 						       res, total_locks);
-			if (ret < 0) {
-				// TODO
-				mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
-				     "returned %d, TODO\n", ret);
-				BUG();
-			}
+			if (ret < 0)
+				goto error;
 		}
 	}
 	/* flush any remaining locks */
 	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
-	if (ret < 0) {
-		// TODO
-		mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
-		     "TODO\n", ret);
+	if (ret < 0)
+		goto error;
+	return ret;
+
+error:
+	mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
+	     dlm->name, ret);
+	if (!dlm_is_host_down(ret))
 		BUG();
-	}
+	mlog(0, "%s: node %u went down while sending %s "
+	     "lockres %.*s\n", dlm->name, send_to,
+	     flags & DLM_MRES_RECOVERY ? "recovery" : "migration",
+	     res->lockname.len, res->lockname.name);
 	return ret;
 }
 
@@ -1146,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
 		mlog(0, "all done flag. all lockres data received!\n");
 
 	ret = -ENOMEM;
-	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
-	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+	item = kcalloc(1, sizeof(*item), GFP_NOFS);
 	if (!buf || !item)
 		goto leave;
 
@@ -1238,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
-	schedule_work(&dlm->dispatched_work);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 leave:
 	dlm_put(dlm);
@@ -1406,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_ctxt *dlm = data;
 	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
 	struct dlm_lock_resource *res = NULL;
+	unsigned int hash;
 	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
 	u32 flags = DLM_ASSERT_MASTER_REQUERY;
 
@@ -1415,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
 		return master;
 	}
 
+	hash = dlm_lockid_hash(req->name, req->namelen);
+
 	spin_lock(&dlm->spinlock);
-	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
 	if (res) {
 		spin_lock(&res->spinlock);
 		master = res->owner;
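
Precomputing the lockid hash and passing it into __dlm_lookup_lockres() keeps a pure computation outside the dlm spinlock and lets callers reuse the value across lookups. The general shape, using the names from this hunk:

	unsigned int hash;

	hash = dlm_lockid_hash(req->name, req->namelen);	/* no lock held */

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
	spin_unlock(&dlm->spinlock);
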
@@ -1483,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 	struct dlm_lock *newlock = NULL;
 	struct dlm_lockstatus *lksb = NULL;
 	int ret = 0;
-	int i;
+	int i, bad;
 	struct list_head *iter;
 	struct dlm_lock *lock = NULL;
 
@@ -1529,8 +1704,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 
 			/* move the lock to its proper place */
 			/* do not alter lock refcount.  switching lists. */
-			list_del_init(&lock->list);
-			list_add_tail(&lock->list, queue);
+			list_move_tail(&lock->list, queue);
 			spin_unlock(&res->spinlock);
 
 			mlog(0, "just reordered a local lock!\n");
@@ -1553,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		}
 			lksb->flags |= (ml->flags &
 					(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
 
-			if (mres->lvb[0]) {
+			if (ml->type == LKM_NLMODE)
+				goto skip_lvb;
+
+			if (!dlm_lvb_is_empty(mres->lvb)) {
 				if (lksb->flags & DLM_LKSB_PUT_LVB) {
 					/* other node was trying to update
 					 * lvb when node died.  recreate the
 					 * lksb with the updated lvb. */
 					memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+					/* the lock resource lvb update must happen
+					 * NOW, before the spinlock is dropped.
+					 * we no longer wait for the AST to update
+					 * the lvb. */
+					memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
 				} else {
 					/* otherwise, the node is sending its
 					 * most recent valid lvb info */
 					BUG_ON(ml->type != LKM_EXMODE &&
 					       ml->type != LKM_PRMODE);
-					if (res->lvb[0] && (ml->type == LKM_EXMODE ||
-					    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
-						mlog(ML_ERROR, "received bad lvb!\n");
-						__dlm_print_one_lock_resource(res);
-						BUG();
+					if (!dlm_lvb_is_empty(res->lvb) &&
+					    (ml->type == LKM_EXMODE ||
+					     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+						int i;
+						mlog(ML_ERROR, "%s:%.*s: received bad "
+						     "lvb! type=%d\n", dlm->name,
+						     res->lockname.len,
+						     res->lockname.name, ml->type);
+						printk("lockres lvb=[");
+						for (i=0; i<DLM_LVB_LEN; i++)
+							printk("%02x", res->lvb[i]);
+						printk("]\nmigrated lvb=[");
+						for (i=0; i<DLM_LVB_LEN; i++)
+							printk("%02x", mres->lvb[i]);
+						printk("]\n");
+						dlm_print_one_lock_resource(res);
+						BUG();
 					}
 					memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
 				}
 			}
-
+skip_lvb:
 
 		/* NOTE:
 		 * wrt lock queue ordering and recovery:
@@ -1592,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		 * relative to each other, but clearly *not*
 		 * preserved relative to locks from other nodes.
 		 */
+		bad = 0;
 		spin_lock(&res->spinlock);
-		dlm_lock_get(newlock);
-		list_add_tail(&newlock->list, queue);
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.cookie == ml->cookie) {
+				u64 c = lock->ml.cookie;
+				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
+				     "exists on this lockres!\n", dlm->name,
+				     res->lockname.len, res->lockname.name,
+				     dlm_get_lock_cookie_node(c),
+				     dlm_get_lock_cookie_seq(c));
+
+				mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
+				     "node=%u, cookie=%u:%llu, queue=%d\n",
+				     ml->type, ml->convert_type, ml->node,
+				     dlm_get_lock_cookie_node(ml->cookie),
+				     dlm_get_lock_cookie_seq(ml->cookie),
+				     ml->list);
+
+				__dlm_print_one_lock_resource(res);
+				bad = 1;
+				break;
+			}
+		}
+		if (!bad) {
+			dlm_lock_get(newlock);
+			list_add_tail(&newlock->list, queue);
+		}
 		spin_unlock(&res->spinlock);
 	}
 	mlog(0, "done running all the locks\n");
@@ -1618,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 	struct dlm_lock *lock;
 
 	res->state |= DLM_LOCK_RES_RECOVERING;
-	if (!list_empty(&res->recovering))
+	if (!list_empty(&res->recovering)) {
+		mlog(0,
+		     "Recovering res %s:%.*s, is already on recovery list!\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
 		list_del_init(&res->recovering);
+	}
+	/* We need to hold a reference while on the recovery list */
+	dlm_lockres_get(res);
 	list_add_tail(&res->recovering, &dlm->reco.resources);
 
 	/* find any pending locks and put them back on proper list */
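
dlm_lockres_get() before the list_add_tail() applies the usual rule that membership on a list owns a reference; the matching dlm_lockres_put() calls appear in the hunks below, where resources leave the recovery list. The pairing, schematically:

	/* on insert: the list itself holds a reference */
	dlm_lockres_get(res);
	list_add_tail(&res->recovering, &dlm->reco.resources);

	/* on removal: drop the list's reference */
	list_del_init(&res->recovering);
	dlm_lockres_put(res);
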
@@ -1708,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 			spin_lock(&res->spinlock);
 			dlm_change_lockres_owner(dlm, res, new_master);
 			res->state &= ~DLM_LOCK_RES_RECOVERING;
-			__dlm_dirty_lockres(dlm, res);
+			if (!__dlm_lockres_unused(res))
+				__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 			wake_up(&res->wq);
+			dlm_lockres_put(res);
 		}
 	}
 
@@ -1719,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	 * the RECOVERING state and set the owner
 	 * if necessary */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
 			if (res->state & DLM_LOCK_RES_RECOVERING) {
 				if (res->owner == dead_node) {
@@ -1743,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					dlm->name, res->lockname.len,
 					res->lockname.name, res->owner);
 				list_del_init(&res->recovering);
+				dlm_lockres_put(res);
 			}
 			spin_lock(&res->spinlock);
 			dlm_change_lockres_owner(dlm, res, new_master);
 			res->state &= ~DLM_LOCK_RES_RECOVERING;
-			__dlm_dirty_lockres(dlm, res);
+			if (!__dlm_lockres_unused(res))
+				__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 			wake_up(&res->wq);
 		}
@@ -1884,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	 * need to be fired as a result.
 	 */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, iter, bucket, hash_node) {
 			/* always prune any $RECOVERY entries for dead nodes,
 			 * otherwise hangs can occur during later recovery */
@@ -1924,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 {
 	assert_spin_locked(&dlm->spinlock);
 
+	if (dlm->reco.new_master == idx) {
+		mlog(0, "%s: recovery master %d just died\n",
+		     dlm->name, idx);
+		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+			/* finalize1 was reached, so it is safe to clear
+			 * the new_master and dead_node.  that recovery
+			 * is complete. */
+			mlog(0, "%s: dead master %d had reached "
+			     "finalize1 state, clearing\n", dlm->name, idx);
+			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+			__dlm_reset_recovery(dlm);
+		}
+	}
+
 	/* check to see if the node is already considered dead */
 	if (!test_bit(idx, dlm->live_nodes_map)) {
 		mlog(0, "for domain %s, node %d is already dead. "
@@ -2087,7 +2329,7 @@ again:
 
 		/* set the new_master to this node */
 		spin_lock(&dlm->spinlock);
-		dlm->reco.new_master = dlm->node_num;
+		dlm_set_reco_master(dlm, dlm->node_num);
 		spin_unlock(&dlm->spinlock);
 	}
 
@@ -2125,6 +2367,10 @@ again:
 		mlog(0, "%s: reco master %u is ready to recover %u\n",
 		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
 		status = -EEXIST;
+	} else if (ret == DLM_RECOVERING) {
+		mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		goto again;
 	} else {
 		struct dlm_lock_resource *res;
 
@@ -2156,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 
 	mlog_entry("%u\n", dead_node);
 
-	mlog(0, "dead node is %u\n", dead_node);
+	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
 
 	spin_lock(&dlm->spinlock);
 	dlm_node_iter_init(dlm->domain_map, &iter);
@@ -2214,6 +2460,14 @@ retry:
 			 * another ENOMEM */
 			msleep(100);
 			goto retry;
+		} else if (ret == EAGAIN) {
+			mlog(0, "%s: trying to start recovery of node "
+			     "%u, but node %u is waiting for last recovery "
+			     "to complete, backoff for a bit\n", dlm->name,
+			     dead_node, nodenum);
+			/* TODO Look into replacing msleep with cond_resched() */
+			msleep(100);
+			goto retry;
 		}
 	}
 
@@ -2229,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return 0;
 
-	mlog(0, "node %u wants to recover node %u\n",
-	     br->node_idx, br->dead_node);
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+		mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+		     "but this node is in finalize state, waiting on finalize2\n",
+		     dlm->name, br->node_idx, br->dead_node,
+		     dlm->reco.dead_node, dlm->reco.new_master);
+		spin_unlock(&dlm->spinlock);
+		return EAGAIN;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
 
 	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
 
@@ -2252,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 		     "node %u changing it to %u\n", dlm->name,
 		     dlm->reco.dead_node, br->node_idx, br->dead_node);
 	}
-	dlm->reco.new_master = br->node_idx;
-	dlm->reco.dead_node = br->dead_node;
+	dlm_set_reco_master(dlm, br->node_idx);
+	dlm_set_reco_dead_node(dlm, br->dead_node);
 	if (!test_bit(br->dead_node, dlm->recovery_map)) {
 		mlog(0, "recovery master %u sees %u as dead, but this "
 		     "node has not yet.  marking %u as dead\n",
@@ -2272,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_unlock(&dlm->spinlock);
 
 	dlm_kick_recovery_thread(dlm);
+
+	mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
+
 	dlm_put(dlm);
 	return 0;
 }
 
+#define DLM_FINALIZE_STAGE2  0x01
 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 {
 	int ret = 0;
@@ -2283,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 	struct dlm_node_iter iter;
 	int nodenum;
 	int status;
+	int stage = 1;
 
-	mlog(0, "finishing recovery for node %s:%u\n",
-	     dlm->name, dlm->reco.dead_node);
+	mlog(0, "finishing recovery for node %s:%u, "
+	     "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
 
 	spin_lock(&dlm->spinlock);
 	dlm_node_iter_init(dlm->domain_map, &iter);
 	spin_unlock(&dlm->spinlock);
 
+stage2:
 	memset(&fr, 0, sizeof(fr));
 	fr.node_idx = dlm->node_num;
 	fr.dead_node = dlm->reco.dead_node;
+	if (stage == 2)
+		fr.flags |= DLM_FINALIZE_STAGE2;
 
 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 		if (nodenum == dlm->node_num)
 			continue;
 		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
 					 &fr, sizeof(fr), nodenum, &status);
-		if (ret >= 0) {
+		if (ret >= 0)
 			ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
 			if (dlm_is_host_down(ret)) {
 				/* this has no effect on this recovery
 				 * session, so set the status to zero to
@@ -2309,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 				mlog(ML_ERROR, "node %u went down after this "
 				     "node finished recovery.\n", nodenum);
 				ret = 0;
+				continue;
 			}
-		}
-		if (ret < 0) {
-			mlog_errno(ret);
 			break;
 		}
 	}
+	if (stage == 1) {
+		/* reset the node_iter back to the top and send finalize2 */
+		iter.curnode = -1;
+		stage = 2;
+		goto stage2;
+	}
 
 	return ret;
 }
@@ -2324,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+	int stage = 1;
 
 	/* ok to return 0, domain has gone away */
 	if (!dlm_grab(dlm))
 		return 0;
 
-	mlog(0, "node %u finalizing recovery of node %u\n",
-	     fr->node_idx, fr->dead_node);
+	if (fr->flags & DLM_FINALIZE_STAGE2)
+		stage = 2;
 
+	mlog(0, "%s: node %u finalizing recovery stage%d of "
+	     "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+	     fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
+
 	spin_lock(&dlm->spinlock);
 
 	if (dlm->reco.new_master != fr->node_idx) {
@@ -2347,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 		BUG();
 	}
 
-	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
-
-	spin_unlock(&dlm->spinlock);
+	switch (stage) {
+	case 1:
+		dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+			mlog(ML_ERROR, "%s: received finalize1 from "
+			     "new master %u for dead node %u, but "
+			     "this node has already received it!\n",
+			     dlm->name, fr->node_idx, fr->dead_node);
+			dlm_print_reco_node_status(dlm);
+			BUG();
+		}
+		dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+		spin_unlock(&dlm->spinlock);
+		break;
+	case 2:
+		if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+			mlog(ML_ERROR, "%s: received finalize2 from "
+			     "new master %u for dead node %u, but "
+			     "this node did not have finalize1!\n",
+			     dlm->name, fr->node_idx, fr->dead_node);
+			dlm_print_reco_node_status(dlm);
+			BUG();
+		}
+		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+		spin_unlock(&dlm->spinlock);
+		dlm_reset_recovery(dlm);
+		dlm_kick_recovery_thread(dlm);
+		break;
+	default:
+		BUG();
+	}
 
-	dlm_reset_recovery(dlm);
+	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
+	     dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
 
-	dlm_kick_recovery_thread(dlm);
 	dlm_put(dlm);
 	return 0;
 }
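
Taken together, these hunks turn finalization into a two-stage handshake: finalize1 makes every node set DLM_RECO_STATE_FINALIZE while keeping dead_node/new_master, and finalize2 clears the flag and resets recovery state. That lets __dlm_hb_node_down() (earlier in this diff) decide, when the recovery master dies, whether the recovery it mastered had already committed. A schematic of the protocol -- pseudocode, not literal code from the patch:

	/* recovery master */
	send_to_all_nodes(FINALIZE_MSG, /* stage */ 1);	/* set FINALIZE flag */
	send_to_all_nodes(FINALIZE_MSG, /* stage */ 2);	/* clear flag, reset */

	/* any node, when the recovery master dies */
	if (reco.state & DLM_RECO_STATE_FINALIZE)
		__dlm_reset_recovery(dlm);	/* stage 1 seen: reco complete */
	/* otherwise a new master is elected and the recovery is redone */
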
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 5be9d14f12cb..0c822f3ffb05 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -39,6 +39,7 @@
 #include <linux/inet.h>
 #include <linux/timer.h>
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 
 #include "cluster/heartbeat.h"
@@ -53,6 +54,8 @@
 #include "cluster/masklog.h"
 
 static int dlm_thread(void *data);
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *lockres);
 
 static void dlm_flush_asts(struct dlm_ctxt *dlm);
 
@@ -80,7 +83,7 @@ repeat:
 }
 
 
-static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
 	if (list_empty(&res->granted) &&
 	    list_empty(&res->converting) &&
@@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 	assert_spin_locked(&res->spinlock);
 
 	if (__dlm_lockres_unused(res)){
+		/* For now, just keep any resource we master */
+		if (res->owner == dlm->node_num)
+		{
+			if (!list_empty(&res->purge)) {
+				mlog(0, "we master %s:%.*s, but it is on "
+				     "the purge list.  Removing\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name);
+				list_del_init(&res->purge);
+				dlm->purge_count--;
+			}
+			return;
+		}
+
 		if (list_empty(&res->purge)) {
 			mlog(0, "putting lockres %.*s from purge list\n",
 			     res->lockname.len, res->lockname.name);
@@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 			res->last_used = jiffies;
 			list_add_tail(&res->purge, &dlm->purge_list);
 			dlm->purge_count++;
+
+			/* if this node is not the owner, there is
+			 * no way to keep track of who the owner could be.
+			 * unhash it to avoid serious problems. */
+			if (res->owner != dlm->node_num) {
+				mlog(0, "%s:%.*s: doing immediate "
+				     "purge of lockres owned by %u\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name, res->owner);
+
+				dlm_purge_lockres_now(dlm, res);
+			}
 		}
 	} else if (!list_empty(&res->purge)) {
-		mlog(0, "removing lockres %.*s from purge list\n",
-		     res->lockname.len, res->lockname.name);
+		mlog(0, "removing lockres %.*s from purge list, "
+		     "owner=%u\n", res->lockname.len, res->lockname.name,
+		     res->owner);
 
 		list_del_init(&res->purge);
 		dlm->purge_count--;
@@ -165,6 +195,7 @@ again:
 	} else if (ret < 0) {
 		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
 		     lockres->lockname.len, lockres->lockname.name);
+		msleep(100);
 		goto again;
 	}
 
@@ -178,6 +209,24 @@ finish:
 	__dlm_unhash_lockres(lockres);
 }
 
+/* make an unused lockres go away immediately.
+ * as soon as the dlm spinlock is dropped, this lockres
+ * will not be found. kfree still happens on last put. */
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *lockres)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&lockres->spinlock);
+
+	BUG_ON(!__dlm_lockres_unused(lockres));
+
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 			       int purge_now)
 {
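
dlm_purge_lockres_now() differs from the lazy purge path only in when the resource disappears: it is unhashed immediately, under both spinlocks, so no new lookup can find it, while the final kfree still waits for the last reference to drop. The lookup/unhash/put relationship, using names from the surrounding code:

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, len, hash);	/* may find it */
	...
	__dlm_unhash_lockres(res);	/* future lookups now miss */
	spin_unlock(&dlm->spinlock);

	dlm_lockres_put(res);		/* freed only when refcount hits 0 */
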
@@ -318,8 +367,7 @@ converting:
 
 		target->ml.type = target->ml.convert_type;
 		target->ml.convert_type = LKM_IVMODE;
-		list_del_init(&target->list);
-		list_add_tail(&target->list, &res->granted);
+		list_move_tail(&target->list, &res->granted);
 
 		BUG_ON(!target->lksb);
 		target->lksb->status = DLM_NORMAL;
@@ -380,8 +428,7 @@ blocked:
 		     target->ml.type, target->ml.node);
 
 		// target->ml.type is already correct
-		list_del_init(&target->list);
-		list_add_tail(&target->list, &res->granted);
+		list_move_tail(&target->list, &res->granted);
 
 		BUG_ON(!target->lksb);
 		target->lksb->status = DLM_NORMAL;
@@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	/* don't shuffle secondary queues */
 	if ((res->owner == dlm->node_num) &&
 	    !(res->state & DLM_LOCK_RES_DIRTY)) {
+		/* ref for dirty_list */
+		dlm_lockres_get(res);
 		list_add_tail(&res->dirty, &dlm->dirty_list);
 		res->state |= DLM_LOCK_RES_DIRTY;
 	}
@@ -606,6 +655,8 @@ static int dlm_thread(void *data)
 			list_del_init(&res->dirty);
 			spin_unlock(&res->spinlock);
 			spin_unlock(&dlm->spinlock);
+			/* Drop dirty_list ref */
+			dlm_lockres_put(res);
 
 			/* lockres can be re-dirtied/re-added to the
 			 * dirty_list in this gap, but that is ok */
@@ -642,8 +693,9 @@ static int dlm_thread(void *data)
 			 * spinlock and do NOT have the dlm lock.
 			 * safe to reserve/queue asts and run the lists. */
 
-			mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
-			     "res=%p\n", dlm, res);
+			mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
+			     "res=%.*s\n", dlm->name,
+			     res->lockname.len, res->lockname.name);
 
 			/* called while holding lockres lock */
 			dlm_shuffle_lists(dlm, res);
@@ -657,6 +709,8 @@ in_progress:
 			/* if the lock was in-progress, stick
 			 * it on the back of the list */
 			if (delay) {
+				/* ref for dirty_list */
+				dlm_lockres_get(res);
 				spin_lock(&res->spinlock);
 				list_add_tail(&res->dirty, &dlm->dirty_list);
 				res->state |= DLM_LOCK_RES_DIRTY;
@@ -677,7 +731,7 @@ in_progress:
 
 		/* yield and continue right away if there is more work to do */
 		if (!n) {
-			yield();
+			cond_resched();
 			continue;
 		}
 
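
Replacing yield() with cond_resched() is the standard fix for busy kernel threads: yield() unconditionally drops the task to the back of the runqueue even when nothing else wants the CPU, whereas cond_resched() reschedules only if another runnable task is actually waiting. The usual loop shape (do_some_work() is an illustrative placeholder):

	#include <linux/kthread.h>
	#include <linux/sched.h>

	while (!kthread_should_stop()) {
		do_some_work();
		cond_resched();	/* stay preemptible without a forced yield */
	}
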
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 7b1a27542674..b0c3134f4f70 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
 void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
 			       struct dlm_lock *lock)
 {
-	list_del_init(&lock->list);
-	list_add_tail(&lock->list, &res->granted);
+	list_move_tail(&lock->list, &res->granted);
 	lock->ml.convert_type = LKM_IVMODE;
 }
 
@@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 
 	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
 
+	if (owner == dlm->node_num) {
+		/* ended up trying to contact ourself.  this means
+		 * that the lockres had been remote but became local
+		 * via a migration.  just retry it, now as local */
+		mlog(0, "%s:%.*s: this node became the master due to a "
+		     "migration, re-evaluate now\n", dlm->name,
+		     res->lockname.len, res->lockname.name);
+		return DLM_FORWARD;
+	}
+
 	memset(&unlock, 0, sizeof(unlock));
 	unlock.node_idx = dlm->node_num;
 	unlock.flags = cpu_to_be32(flags);
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index 74ca4e5f9765..e641b084b343 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
 	u32 dlm_key;
 	char *domain;
 
-	domain = kmalloc(name->len + 1, GFP_KERNEL);
+	domain = kmalloc(name->len + 1, GFP_NOFS);
 	if (!domain) {
 		mlog_errno(-ENOMEM);
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd52860c87..4acd37286bdd 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 	mlog_exit_void();
 }
 
-static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
 
 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
 				       struct ocfs2_dlm_debug *dlm_debug)
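
SPIN_LOCK_UNLOCKED is a shared static initializer that gives every such lock the same lock class, defeating lock debugging, and it was eventually removed from the kernel; DEFINE_SPINLOCK() declares and initializes the lock with a distinct class in one step:

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(my_lock);	/* preferred for static locks */

	/* for locks embedded in dynamically allocated objects: */
	spin_lock_init(&obj->lock);
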
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index eebc3cfa6be8..910a601b2e98 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -49,7 +49,7 @@
 
 #include "buffer_head_io.h"
 
-spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
@@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
 	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
 
 	OCFS2_I(inode)->ip_handle = handle;
-	list_del(&(OCFS2_I(inode)->ip_handle_list));
-	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+	list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
 }
 
 static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index ee42765a8553..cf70fe2075b8 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb)
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
@@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
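
Both cleanups rely on kfree(NULL) being a defined no-op, so the guarding if is redundant; the same holds for vfree() in the kernel and free() in userspace. The simplified error-label idiom:

	void *request = NULL;

	if (need_buffer)
		request = kmalloc(size, GFP_KERNEL);
	...
bail:
	kfree(request);	/* safe even if the allocation never happened */
	return status;
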
diff --git a/fs/open.c b/fs/open.c
index 5fb16e5267dc..303f06d2a7b9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -322,7 +322,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = do_truncate(dentry, length, 0, file);
+		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
 	fput(file);
 out:
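
Passing ATTR_MTIME|ATTR_CTIME into do_truncate() makes ftruncate() update the file times even when the length does not change, as POSIX requires. A small userspace program that observes the behavior this one-liner enforces (assumes a POSIX-conforming system):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct stat st1, st2;
		int fd = open("f.tmp", O_RDWR | O_CREAT, 0600);

		if (fd < 0)
			return 1;
		fstat(fd, &st1);
		sleep(1);
		ftruncate(fd, st1.st_size);	/* same length as before */
		fstat(fd, &st2);
		printf("mtime updated: %s\n",
		       st1.st_mtime != st2.st_mtime ? "yes" : "no");
		close(fd);
		return 0;
	}
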
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 464e2bce0203..93a56bd4a2b7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1,5 +1,4 @@
-/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $
- * openpromfs.c: /proc/openprom handling routines
+/* inode.c: /proc/openprom handling routines
  *
  * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com)
  * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
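
The rewrite that follows drops the static node/property tables in favor of per-inode state embedded in the inode itself, recovered with the standard container_of() idiom (struct op_inode_info and OP_I() below). The idiom in isolation, with illustrative names:

	#include <linux/fs.h>
	#include <linux/kernel.h>

	struct my_inode_info {
		struct inode	vfs_inode;	/* embedded VFS inode */
		int		extra_state;	/* fs-private data */
	};

	static inline struct my_inode_info *MY_I(struct inode *inode)
	{
		/* recover the wrapper from the embedded member */
		return container_of(inode, struct my_inode_info, vfs_inode);
	}
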
@@ -12,756 +11,245 @@
 #include <linux/openprom_fs.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
 
 #include <asm/openprom.h>
 #include <asm/oplib.h>
+#include <asm/prom.h>
 #include <asm/uaccess.h>
 
-#define ALIASES_NNODES 64
+static DEFINE_MUTEX(op_mutex);
 
-typedef struct {
-	u16 parent;
-	u16 next;
-	u16 child;
-	u16 first_prop;
-	u32 node;
-} openpromfs_node;
-
-typedef struct {
-#define OPP_STRING	0x10
-#define OPP_STRINGLIST	0x20
-#define OPP_BINARY	0x40
-#define OPP_HEXSTRING	0x80
-#define OPP_DIRTY	0x01
-#define OPP_QUOTED	0x02
-#define OPP_NOTQUOTED	0x04
-#define OPP_ASCIIZ	0x08
-	u32 flag;
-	u32 alloclen;
-	u32 len;
-	char *value;
-	char name[8];
-} openprom_property;
-
-static openpromfs_node *nodes;
-static int alloced;
-static u16 last_node;
-static u16 first_prop;
-static u16 options = 0xffff;
-static u16 aliases = 0xffff;
-static int aliases_nodes;
-static char *alias_names [ALIASES_NNODES];
-
-#define OPENPROM_ROOT_INO	16
-#define OPENPROM_FIRST_INO	OPENPROM_ROOT_INO
-#define NODE(ino)	nodes[ino - OPENPROM_FIRST_INO]
-#define NODE2INO(node)	(node + OPENPROM_FIRST_INO)
-#define NODEP2INO(no)	(no + OPENPROM_FIRST_INO + last_node)
-
-static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *);
-static int openpromfs_readdir(struct file *, void *, filldir_t);
-static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd);
-static int openpromfs_unlink (struct inode *, struct dentry *dentry);
+#define OPENPROM_ROOT_INO	0
+
+enum op_inode_type {
+	op_inode_node,
+	op_inode_prop,
+};
+
+union op_inode_data {
+	struct device_node *node;
+	struct property *prop;
+};
 
-static ssize_t nodenum_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *ppos)
+struct op_inode_info {
+	struct inode vfs_inode;
+	enum op_inode_type type;
+	union op_inode_data u;
+};
+
+static inline struct op_inode_info *OP_I(struct inode *inode)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	char buffer[10];
-
-	if (count < 0 || !inode->u.generic_ip)
-		return -EINVAL;
-	sprintf (buffer, "%8.8x\n", (u32)(long)(inode->u.generic_ip));
-	if (file->f_pos >= 9)
-		return 0;
-	if (count > 9 - file->f_pos)
-		count = 9 - file->f_pos;
-	if (copy_to_user(buf, buffer + file->f_pos, count))
-		return -EFAULT;
-	*ppos += count;
-	return count;
+	return container_of(inode, struct op_inode_info, vfs_inode);
 }
 
-static ssize_t property_read(struct file *filp, char __user *buf,
-			     size_t count, loff_t *ppos)
+static int is_string(unsigned char *p, int len)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
-	int i, j, k;
-	u32 node;
-	char *p, *s;
-	u32 *q;
-	openprom_property *op;
-	char buffer[64];
-
-	if (!filp->private_data) {
-		node = nodes[(u16)((long)inode->u.generic_ip)].node;
-		i = ((u32)(long)inode->u.generic_ip) >> 16;
-		if ((u16)((long)inode->u.generic_ip) == aliases) {
-			if (i >= aliases_nodes)
-				p = NULL;
-			else
-				p = alias_names [i];
-		} else
-			for (p = prom_firstprop (node, buffer);
-			     i && p && *p;
-			     p = prom_nextprop (node, p, buffer), i--)
-				/* nothing */ ;
-		if (!p || !*p)
-			return -EIO;
-		i = prom_getproplen (node, p);
-		if (i < 0) {
-			if ((u16)((long)inode->u.generic_ip) == aliases)
-				i = 0;
-			else
-				return -EIO;
-		}
-		k = i;
-		if (i < 64) i = 64;
-		filp->private_data = kmalloc (sizeof (openprom_property)
-					      + (j = strlen (p)) + 2 * i,
-					      GFP_KERNEL);
-		if (!filp->private_data)
-			return -ENOMEM;
-		op = (openprom_property *)filp->private_data;
-		op->flag = 0;
-		op->alloclen = 2 * i;
-		strcpy (op->name, p);
-		op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3);
-		op->len = k;
-		if (k && prom_getproperty (node, p, op->value, i) < 0)
-			return -EIO;
-		op->value [k] = 0;
-		if (k) {
-			for (s = NULL, p = op->value; p < op->value + k; p++) {
-				if ((*p >= ' ' && *p <= '~') || *p == '\n') {
-					op->flag |= OPP_STRING;
-					s = p;
-					continue;
-				}
-				if (p > op->value && !*p && s == p - 1) {
-					if (p < op->value + k - 1)
-						op->flag |= OPP_STRINGLIST;
-					else
-						op->flag |= OPP_ASCIIZ;
-					continue;
-				}
-				if (k == 1 && !*p) {
-					op->flag |= (OPP_STRING|OPP_ASCIIZ);
-					break;
-				}
-				op->flag &= ~(OPP_STRING|OPP_STRINGLIST);
-				if (k & 3)
-					op->flag |= OPP_HEXSTRING;
-				else
-					op->flag |= OPP_BINARY;
-				break;
-			}
-			if (op->flag & OPP_STRINGLIST)
-				op->flag &= ~(OPP_STRING);
-			if (op->flag & OPP_ASCIIZ)
-				op->len--;
-		}
-	} else
-		op = (openprom_property *)filp->private_data;
-	if (!count || !(op->len || (op->flag & OPP_ASCIIZ)))
-		return 0;
-	if (*ppos >= 0xffffff || count >= 0xffffff)
-		return -EINVAL;
-	if (op->flag & OPP_STRINGLIST) {
-		for (k = 0, p = op->value; p < op->value + op->len; p++)
-			if (!*p)
-				k++;
-		i = op->len + 4 * k + 3;
-	} else if (op->flag & OPP_STRING) {
-		i = op->len + 3;
-	} else if (op->flag & OPP_BINARY) {
-		i = (op->len * 9) >> 2;
-	} else {
-		i = (op->len << 1) + 1;
-	}
-	k = *ppos;
-	if (k >= i) return 0;
-	if (count > i - k) count = i - k;
-	if (op->flag & OPP_STRING) {
-		if (!k) {
-			if (put_user('\'', buf))
-				return -EFAULT;
-			k++;
-			count--;
-		}
+	int i;
 
-		if (k + count >= i - 2)
-			j = i - 2 - k;
-		else
-			j = count;
-
-		if (j >= 0) {
-			if (copy_to_user(buf + k - *ppos,
-					 op->value + k - 1, j))
-				return -EFAULT;
-			count -= j;
-			k += j;
-		}
+	for (i = 0; i < len; i++) {
+		unsigned char val = p[i];
 
-		if (count) {
-			if (put_user('\'', &buf [k++ - *ppos]))
-				return -EFAULT;
+		if ((i && !val) ||
+		    (val >= ' ' && val <= '~'))
+			continue;
-		}
-		if (count > 1) {
-			if (put_user('\n', &buf [k++ - *ppos]))
-				return -EFAULT;
-		}
-	} else if (op->flag & OPP_STRINGLIST) {
-		char *tmp;
-
-		tmp = kmalloc (i, GFP_KERNEL);
-		if (!tmp)
-			return -ENOMEM;
-
-		s = tmp;
-		*s++ = '\'';
-		for (p = op->value; p < op->value + op->len; p++) {
-			if (!*p) {
-				strcpy(s, "' + '");
-				s += 5;
-				continue;
-			}
-			*s++ = *p;
-		}
-		strcpy(s, "'\n");
-
-		if (copy_to_user(buf, tmp + k, count))
-			return -EFAULT;
-
-		kfree(tmp);
-		k += count;
-
-	} else if (op->flag & OPP_BINARY) {
-		char buffer[10];
-		u32 *first, *last;
-		int first_off, last_cnt;
-
-		first = ((u32 *)op->value) + k / 9;
-		first_off = k % 9;
247 last = ((u32 *)op->value) + (k + count - 1) / 9;
248 last_cnt = (k + count) % 9;
249 if (!last_cnt) last_cnt = 9;
250
251 if (first == last) {
252 sprintf (buffer, "%08x.", *first);
253 if (copy_to_user(buf, buffer + first_off,
254 last_cnt - first_off))
255 return -EFAULT;
256 buf += last_cnt - first_off;
257 } else {
258 for (q = first; q <= last; q++) {
259 sprintf (buffer, "%08x.", *q);
260 if (q == first) {
261 if (copy_to_user(buf, buffer + first_off,
262 9 - first_off))
263 return -EFAULT;
264 buf += 9 - first_off;
265 } else if (q == last) {
266 if (copy_to_user(buf, buffer, last_cnt))
267 return -EFAULT;
268 buf += last_cnt;
269 } else {
270 if (copy_to_user(buf, buffer, 9))
271 return -EFAULT;
272 buf += 9;
273 }
274 }
275 }
276 56
277 if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) { 57 return 0;
278 if (put_user('\n', (buf - 1))) 58 }
279 return -EFAULT;
280 }
281 59
282 k += count; 60 return 1;
61}
283 62
284 } else if (op->flag & OPP_HEXSTRING) { 63static int property_show(struct seq_file *f, void *v)
285 char buffer[3]; 64{
65 struct property *prop = f->private;
66 void *pval;
67 int len;
286 68
287 if ((k < i - 1) && (k & 1)) { 69 len = prop->length;
288 sprintf (buffer, "%02x", 70 pval = prop->value;
289 (unsigned char) *(op->value + (k >> 1)) & 0xff);
290 if (put_user(buffer[1], &buf[k++ - *ppos]))
291 return -EFAULT;
292 count--;
293 }
294 71
295 for (; (count > 1) && (k < i - 1); k += 2) { 72 if (is_string(pval, len)) {
296 sprintf (buffer, "%02x", 73 while (len > 0) {
297 (unsigned char) *(op->value + (k >> 1)) & 0xff); 74 int n = strlen(pval);
298 if (copy_to_user(buf + k - *ppos, buffer, 2))
299 return -EFAULT;
300 count -= 2;
301 }
302 75
303 if (count && (k < i - 1)) { 76 seq_printf(f, "%s", (char *) pval);
304 sprintf (buffer, "%02x",
305 (unsigned char) *(op->value + (k >> 1)) & 0xff);
306 if (put_user(buffer[0], &buf[k++ - *ppos]))
307 return -EFAULT;
308 count--;
309 }
310 77
311 if (count) { 78 /* Skip over the NULL byte too. */
312 if (put_user('\n', &buf [k++ - *ppos])) 79 pval += n + 1;
313 return -EFAULT; 80 len -= n + 1;
314 }
315 }
316 count = k - *ppos;
317 *ppos = k;
318 return count;
319}
320 81
321static ssize_t property_write(struct file *filp, const char __user *buf, 82 if (len > 0)
322 size_t count, loff_t *ppos) 83 seq_printf(f, " + ");
323{
324 int i, j, k;
325 char *p;
326 u32 *q;
327 void *b;
328 openprom_property *op;
329
330 if (*ppos >= 0xffffff || count >= 0xffffff)
331 return -EINVAL;
332 if (!filp->private_data) {
333 i = property_read (filp, NULL, 0, NULL);
334 if (i)
335 return i;
336 }
337 k = *ppos;
338 op = (openprom_property *)filp->private_data;
339 if (!(op->flag & OPP_STRING)) {
340 u32 *first, *last;
341 int first_off, last_cnt;
342 u32 mask, mask2;
343 char tmp [9];
344 int forcelen = 0;
345
346 j = k % 9;
347 for (i = 0; i < count; i++, j++) {
348 if (j == 9) j = 0;
349 if (!j) {
350 char ctmp;
351 if (get_user(ctmp, &buf[i]))
352 return -EFAULT;
353 if (ctmp != '.') {
354 if (ctmp != '\n') {
355 if (op->flag & OPP_BINARY)
356 return -EINVAL;
357 else
358 goto write_try_string;
359 } else {
360 count = i + 1;
361 forcelen = 1;
362 break;
363 }
364 }
365 } else {
366 char ctmp;
367 if (get_user(ctmp, &buf[i]))
368 return -EFAULT;
369 if (ctmp < '0' ||
370 (ctmp > '9' && ctmp < 'A') ||
371 (ctmp > 'F' && ctmp < 'a') ||
372 ctmp > 'f') {
373 if (op->flag & OPP_BINARY)
374 return -EINVAL;
375 else
376 goto write_try_string;
377 }
378 }
379 }
380 op->flag |= OPP_BINARY;
381 tmp [8] = 0;
382 i = ((count + k + 8) / 9) << 2;
383 if (op->alloclen <= i) {
384 b = kmalloc (sizeof (openprom_property) + 2 * i,
385 GFP_KERNEL);
386 if (!b)
387 return -ENOMEM;
388 memcpy (b, filp->private_data,
389 sizeof (openprom_property)
390 + strlen (op->name) + op->alloclen);
391 memset (((char *)b) + sizeof (openprom_property)
392 + strlen (op->name) + op->alloclen,
393 0, 2 * i - op->alloclen);
394 op = (openprom_property *)b;
395 op->alloclen = 2*i;
396 b = filp->private_data;
397 filp->private_data = (void *)op;
398 kfree (b);
399 } 84 }
400 first = ((u32 *)op->value) + (k / 9); 85 } else {
401 first_off = k % 9; 86 if (len & 3) {
402 last = (u32 *)(op->value + i); 87 while (len) {
403 last_cnt = (k + count) % 9; 88 len--;
404 if (first + 1 == last) { 89 if (len)
405 memset (tmp, '0', 8); 90 seq_printf(f, "%02x.",
406 if (copy_from_user(tmp + first_off, buf, 91 *(unsigned char *) pval);
407 (count + first_off > 8) ? 92 else
408 8 - first_off : count)) 93 seq_printf(f, "%02x",
409 return -EFAULT; 94 *(unsigned char *) pval);
410 mask = 0xffffffff; 95 pval++;
411 mask2 = 0xffffffff;
412 for (j = 0; j < first_off; j++)
413 mask >>= 1;
414 for (j = 8 - count - first_off; j > 0; j--)
415 mask2 <<= 1;
416 mask &= mask2;
417 if (mask) {
418 *first &= ~mask;
419 *first |= simple_strtoul (tmp, NULL, 16);
420 op->flag |= OPP_DIRTY;
421 } 96 }
422 } else { 97 } else {
423 op->flag |= OPP_DIRTY; 98 while (len >= 4) {
424 for (q = first; q < last; q++) { 99 len -= 4;
425 if (q == first) { 100
426 if (first_off < 8) { 101 if (len)
427 memset (tmp, '0', 8); 102 seq_printf(f, "%08x.",
428 if (copy_from_user(tmp + first_off, 103 *(unsigned int *) pval);
429 buf, 104 else
430 8 - first_off)) 105 seq_printf(f, "%08x",
431 return -EFAULT; 106 *(unsigned int *) pval);
432 mask = 0xffffffff; 107 pval += 4;
433 for (j = 0; j < first_off; j++)
434 mask >>= 1;
435 *q &= ~mask;
436 *q |= simple_strtoul (tmp,NULL,16);
437 }
438 buf += 9;
439 } else if ((q == last - 1) && last_cnt
440 && (last_cnt < 8)) {
441 memset (tmp, '0', 8);
442 if (copy_from_user(tmp, buf, last_cnt))
443 return -EFAULT;
444 mask = 0xffffffff;
445 for (j = 0; j < 8 - last_cnt; j++)
446 mask <<= 1;
447 *q &= ~mask;
448 *q |= simple_strtoul (tmp, NULL, 16);
449 buf += last_cnt;
450 } else {
451 char tchars[17]; /* XXX yuck... */
452
453 if (copy_from_user(tchars, buf, 16))
454 return -EFAULT;
455 *q = simple_strtoul (tchars, NULL, 16);
456 buf += 9;
457 }
458 }
459 }
460 if (!forcelen) {
461 if (op->len < i)
462 op->len = i;
463 } else
464 op->len = i;
465 *ppos += count;
466 }
467write_try_string:
468 if (!(op->flag & OPP_BINARY)) {
469 if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) {
470 char ctmp;
471
472 /* No way, if somebody starts writing from the middle,
473 * we don't know whether he uses quotes around or not
474 */
475 if (k > 0)
476 return -EINVAL;
477 if (get_user(ctmp, buf))
478 return -EFAULT;
479 if (ctmp == '\'') {
480 op->flag |= OPP_QUOTED;
481 buf++;
482 count--;
483 (*ppos)++;
484 if (!count) {
485 op->flag |= OPP_STRING;
486 return 1;
487 }
488 } else
489 op->flag |= OPP_NOTQUOTED;
490 }
491 op->flag |= OPP_STRING;
492 if (op->alloclen <= count + *ppos) {
493 b = kmalloc (sizeof (openprom_property)
494 + 2 * (count + *ppos), GFP_KERNEL);
495 if (!b)
496 return -ENOMEM;
497 memcpy (b, filp->private_data,
498 sizeof (openprom_property)
499 + strlen (op->name) + op->alloclen);
500 memset (((char *)b) + sizeof (openprom_property)
501 + strlen (op->name) + op->alloclen,
502 0, 2*(count - *ppos) - op->alloclen);
503 op = (openprom_property *)b;
504 op->alloclen = 2*(count + *ppos);
505 b = filp->private_data;
506 filp->private_data = (void *)op;
507 kfree (b);
508 }
509 p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0);
510 if (copy_from_user(p, buf, count))
511 return -EFAULT;
512 op->flag |= OPP_DIRTY;
513 for (i = 0; i < count; i++, p++)
514 if (*p == '\n') {
515 *p = 0;
516 break;
517 } 108 }
518 if (i < count) {
519 op->len = p - op->value;
520 *ppos += i + 1;
521 if ((p > op->value) && (op->flag & OPP_QUOTED)
522 && (*(p - 1) == '\''))
523 op->len--;
524 } else {
525 if (p - op->value > op->len)
526 op->len = p - op->value;
527 *ppos += count;
528 } 109 }
529 } 110 }
530 return *ppos - k; 111 seq_printf(f, "\n");
112
113 return 0;
531} 114}
532 115
533int property_release (struct inode *inode, struct file *filp) 116static void *property_start(struct seq_file *f, loff_t *pos)
534{ 117{
535 openprom_property *op = (openprom_property *)filp->private_data; 118 if (*pos == 0)
536 int error; 119 return pos;
537 u32 node; 120 return NULL;
538 121}
539 if (!op) 122
540 return 0; 123static void *property_next(struct seq_file *f, void *v, loff_t *pos)
541 lock_kernel(); 124{
542 node = nodes[(u16)((long)inode->u.generic_ip)].node; 125 (*pos)++;
543 if ((u16)((long)inode->u.generic_ip) == aliases) { 126 return NULL;
544 if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) { 127}
545 char *p = op->name; 128
546 int i = (op->value - op->name) - strlen (op->name) - 1; 129static void property_stop(struct seq_file *f, void *v)
547 op->value [op->len] = 0; 130{
548 *(op->value - 1) = ' '; 131 /* Nothing to do */
549 if (i) { 132}
550 for (p = op->value - i - 2; p >= op->name; p--) 133
551 p[i] = *p; 134static struct seq_operations property_op = {
552 p = op->name + i; 135 .start = property_start,
553 } 136 .next = property_next,
554 memcpy (p - 8, "nvalias ", 8); 137 .stop = property_stop,
555 prom_feval (p - 8); 138 .show = property_show
556 } 139};
557 } else if (op->flag & OPP_DIRTY) { 140
558 if (op->flag & OPP_STRING) { 141static int property_open(struct inode *inode, struct file *file)
559 op->value [op->len] = 0; 142{
560 error = prom_setprop (node, op->name, 143 struct op_inode_info *oi = OP_I(inode);
561 op->value, op->len + 1); 144 int ret;
562 if (error <= 0) 145
563 printk (KERN_WARNING "openpromfs: " 146 BUG_ON(oi->type != op_inode_prop);
564 "Couldn't write property %s\n", 147
565 op->name); 148 ret = seq_open(file, &property_op);
566 } else if ((op->flag & OPP_BINARY) || !op->len) { 149 if (!ret) {
567 error = prom_setprop (node, op->name, 150 struct seq_file *m = file->private_data;
568 op->value, op->len); 151 m->private = oi->u.prop;
569 if (error <= 0)
570 printk (KERN_WARNING "openpromfs: "
571 "Couldn't write property %s\n",
572 op->name);
573 } else {
574 printk (KERN_WARNING "openpromfs: "
575 "Unknown property type of %s\n",
576 op->name);
577 }
578 } 152 }
579 unlock_kernel(); 153 return ret;
580 kfree (filp->private_data);
581 return 0;
582} 154}
583 155
584static const struct file_operations openpromfs_prop_ops = { 156static const struct file_operations openpromfs_prop_ops = {
585 .read = property_read, 157 .open = property_open,
586 .write = property_write, 158 .read = seq_read,
587 .release = property_release, 159 .llseek = seq_lseek,
160 .release = seq_release,
588}; 161};
589 162
590static const struct file_operations openpromfs_nodenum_ops = { 163static int openpromfs_readdir(struct file *, void *, filldir_t);
591 .read = nodenum_read,
592};
593 164
594static const struct file_operations openprom_operations = { 165static const struct file_operations openprom_operations = {
595 .read = generic_read_dir, 166 .read = generic_read_dir,
596 .readdir = openpromfs_readdir, 167 .readdir = openpromfs_readdir,
597}; 168};
598 169
599static struct inode_operations openprom_alias_inode_operations = { 170static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
600 .create = openpromfs_create,
601 .lookup = openpromfs_lookup,
602 .unlink = openpromfs_unlink,
603};
604 171
605static struct inode_operations openprom_inode_operations = { 172static struct inode_operations openprom_inode_operations = {
606 .lookup = openpromfs_lookup, 173 .lookup = openpromfs_lookup,
607}; 174};
608 175
609static int lookup_children(u16 n, const char * name, int len) 176static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
610{
611 int ret;
612 u16 node;
613 for (; n != 0xffff; n = nodes[n].next) {
614 node = nodes[n].child;
615 if (node != 0xffff) {
616 char buffer[128];
617 int i;
618 char *p;
619
620 while (node != 0xffff) {
621 if (prom_getname (nodes[node].node,
622 buffer, 128) >= 0) {
623 i = strlen (buffer);
624 if ((len == i)
625 && !strncmp (buffer, name, len))
626 return NODE2INO(node);
627 p = strchr (buffer, '@');
628 if (p && (len == p - buffer)
629 && !strncmp (buffer, name, len))
630 return NODE2INO(node);
631 }
632 node = nodes[node].next;
633 }
634 } else
635 continue;
636 ret = lookup_children (nodes[n].child, name, len);
637 if (ret) return ret;
638 }
639 return 0;
640}
641
642static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
643{ 177{
644 int ino = 0; 178 struct op_inode_info *ent_oi, *oi = OP_I(dir);
645#define OPFSL_DIR 0 179 struct device_node *dp, *child;
646#define OPFSL_PROPERTY 1 180 struct property *prop;
647#define OPFSL_NODENUM 2 181 enum op_inode_type ent_type;
648 int type = 0; 182 union op_inode_data ent_data;
649 char buffer[128];
650 char *p;
651 const char *name; 183 const char *name;
652 u32 n;
653 u16 dirnode;
654 unsigned int len;
655 int i;
656 struct inode *inode; 184 struct inode *inode;
657 char buffer2[64]; 185 unsigned int ino;
186 int len;
658 187
659 inode = NULL; 188 BUG_ON(oi->type != op_inode_node);
189
190 dp = oi->u.node;
191
660 name = dentry->d_name.name; 192 name = dentry->d_name.name;
661 len = dentry->d_name.len; 193 len = dentry->d_name.len;
662 lock_kernel(); 194
663 if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) { 195 mutex_lock(&op_mutex);
664 ino = NODEP2INO(NODE(dir->i_ino).first_prop); 196
665 type = OPFSL_NODENUM; 197 child = dp->child;
666 } 198 while (child) {
667 if (!ino) { 199 int n = strlen(child->path_component_name);
668 u16 node = NODE(dir->i_ino).child; 200
669 while (node != 0xffff) { 201 if (len == n &&
670 if (prom_getname (nodes[node].node, buffer, 128) >= 0) { 202 !strncmp(child->path_component_name, name, len)) {
671 i = strlen (buffer); 203 ent_type = op_inode_node;
672 if (len == i && !strncmp (buffer, name, len)) { 204 ent_data.node = child;
673 ino = NODE2INO(node); 205 ino = child->unique_id;
674 type = OPFSL_DIR; 206 goto found;
675 break;
676 }
677 p = strchr (buffer, '@');
678 if (p && (len == p - buffer)
679 && !strncmp (buffer, name, len)) {
680 ino = NODE2INO(node);
681 type = OPFSL_DIR;
682 break;
683 }
684 }
685 node = nodes[node].next;
686 }
687 }
688 n = NODE(dir->i_ino).node;
689 dirnode = dir->i_ino - OPENPROM_FIRST_INO;
690 if (!ino) {
691 int j = NODEP2INO(NODE(dir->i_ino).first_prop);
692 if (dirnode != aliases) {
693 for (p = prom_firstprop (n, buffer2);
694 p && *p;
695 p = prom_nextprop (n, p, buffer2)) {
696 j++;
697 if ((len == strlen (p))
698 && !strncmp (p, name, len)) {
699 ino = j;
700 type = OPFSL_PROPERTY;
701 break;
702 }
703 }
704 } else {
705 int k;
706 for (k = 0; k < aliases_nodes; k++) {
707 j++;
708 if (alias_names [k]
709 && (len == strlen (alias_names [k]))
710 && !strncmp (alias_names [k], name, len)) {
711 ino = j;
712 type = OPFSL_PROPERTY;
713 break;
714 }
715 }
716 } 207 }
208 child = child->sibling;
717 } 209 }
718 if (!ino) { 210
719 ino = lookup_children (NODE(dir->i_ino).child, name, len); 211 prop = dp->properties;
720 if (ino) 212 while (prop) {
721 type = OPFSL_DIR; 213 int n = strlen(prop->name);
722 else { 214
723 unlock_kernel(); 215 if (len == n && !strncmp(prop->name, name, len)) {
724 return ERR_PTR(-ENOENT); 216 ent_type = op_inode_prop;
217 ent_data.prop = prop;
218 ino = prop->unique_id;
219 goto found;
725 } 220 }
221
222 prop = prop->next;
726 } 223 }
727 inode = iget (dir->i_sb, ino); 224
728 unlock_kernel(); 225 mutex_unlock(&op_mutex);
226 return ERR_PTR(-ENOENT);
227
228found:
229 inode = iget(dir->i_sb, ino);
230 mutex_unlock(&op_mutex);
729 if (!inode) 231 if (!inode)
730 return ERR_PTR(-EINVAL); 232 return ERR_PTR(-EINVAL);
731 switch (type) { 233 ent_oi = OP_I(inode);
732 case OPFSL_DIR: 234 ent_oi->type = ent_type;
235 ent_oi->u = ent_data;
236
237 switch (ent_type) {
238 case op_inode_node:
733 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 239 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
734 if (ino == OPENPROM_FIRST_INO + aliases) { 240 inode->i_op = &openprom_inode_operations;
735 inode->i_mode |= S_IWUSR;
736 inode->i_op = &openprom_alias_inode_operations;
737 } else
738 inode->i_op = &openprom_inode_operations;
739 inode->i_fop = &openprom_operations; 241 inode->i_fop = &openprom_operations;
740 inode->i_nlink = 2; 242 inode->i_nlink = 2;
741 break; 243 break;
742 case OPFSL_NODENUM: 244 case op_inode_prop:
743 inode->i_mode = S_IFREG | S_IRUGO; 245 if (!strcmp(dp->name, "options") && (len == 17) &&
744 inode->i_fop = &openpromfs_nodenum_ops; 246 !strncmp (name, "security-password", 17))
745 inode->i_nlink = 1;
746 inode->u.generic_ip = (void *)(long)(n);
747 break;
748 case OPFSL_PROPERTY:
749 if ((dirnode == options) && (len == 17)
750 && !strncmp (name, "security-password", 17))
751 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; 247 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
752 else { 248 else
753 inode->i_mode = S_IFREG | S_IRUGO; 249 inode->i_mode = S_IFREG | S_IRUGO;
754 if (dirnode == options || dirnode == aliases) {
755 if (len != 4 || strncmp (name, "name", 4))
756 inode->i_mode |= S_IWUSR;
757 }
758 }
759 inode->i_fop = &openpromfs_prop_ops; 250 inode->i_fop = &openpromfs_prop_ops;
760 inode->i_nlink = 1; 251 inode->i_nlink = 1;
761 if (inode->i_size < 0) 252 inode->i_size = ent_oi->u.prop->length;
762 inode->i_size = 0;
763 inode->u.generic_ip = (void *)(long)(((u16)dirnode) |
764 (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16));
765 break; 253 break;
766 } 254 }
767 255
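The property_* functions above are the heart of the rewrite: instead of hand-maintained offsets into a private buffer, each property file is rendered through the seq_file API. Below is a minimal sketch of the same single-record pattern against the 2.6-era API; the demo_ names and the payload string are illustrative, not from the patch.

#include <linux/fs.h>
#include <linux/seq_file.h>

static char demo_payload[] = "hello, seq_file";

static void *demo_start(struct seq_file *f, loff_t *pos)
{
	return *pos == 0 ? pos : NULL;		/* exactly one record */
}

static void *demo_next(struct seq_file *f, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;				/* no second record */
}

static void demo_stop(struct seq_file *f, void *v)
{
	/* nothing to unlock or free */
}

static int demo_show(struct seq_file *f, void *v)
{
	seq_printf(f, "%s\n", (char *)f->private);
	return 0;
}

static struct seq_operations demo_op = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	int ret = seq_open(file, &demo_op);

	if (!ret) {
		struct seq_file *m = file->private_data;
		m->private = demo_payload;	/* handed to ->show */
	}
	return ret;
}

seq_read(), seq_lseek() and seq_release() then supply read(), llseek() and release() for free, which is what lets the patch delete every line of manual *ppos arithmetic in the old property_read().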
@@ -775,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr
775static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 263static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
776{ 264{
777 struct inode *inode = filp->f_dentry->d_inode; 265 struct inode *inode = filp->f_dentry->d_inode;
266 struct op_inode_info *oi = OP_I(inode);
267 struct device_node *dp = oi->u.node;
268 struct device_node *child;
269 struct property *prop;
778 unsigned int ino; 270 unsigned int ino;
779 u32 n; 271 int i;
780 int i, j; 272
781 char buffer[128]; 273 mutex_lock(&op_mutex);
782 u16 node;
783 char *p;
784 char buffer2[64];
785
786 lock_kernel();
787 274
788 ino = inode->i_ino; 275 ino = inode->i_ino;
789 i = filp->f_pos; 276 i = filp->f_pos;
790 switch (i) { 277 switch (i) {
791 case 0: 278 case 0:
792 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out; 279 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
280 goto out;
793 i++; 281 i++;
794 filp->f_pos++; 282 filp->f_pos++;
795 /* fall thru */ 283 /* fall thru */
796 case 1: 284 case 1:
797 if (filldir(dirent, "..", 2, i, 285 if (filldir(dirent, "..", 2, i,
798 (NODE(ino).parent == 0xffff) ? 286 (dp->parent == NULL ?
799 OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) 287 OPENPROM_ROOT_INO :
288 dp->parent->unique_id), DT_DIR) < 0)
800 goto out; 289 goto out;
801 i++; 290 i++;
802 filp->f_pos++; 291 filp->f_pos++;
803 /* fall thru */ 292 /* fall thru */
804 default: 293 default:
805 i -= 2; 294 i -= 2;
806 node = NODE(ino).child; 295
807 while (i && node != 0xffff) { 296 /* First, the children nodes as directories. */
808 node = nodes[node].next; 297 child = dp->child;
298 while (i && child) {
299 child = child->sibling;
809 i--; 300 i--;
810 } 301 }
811 while (node != 0xffff) { 302 while (child) {
812 if (prom_getname (nodes[node].node, buffer, 128) < 0) 303 if (filldir(dirent,
813 goto out; 304 child->path_component_name,
814 if (filldir(dirent, buffer, strlen(buffer), 305 strlen(child->path_component_name),
815 filp->f_pos, NODE2INO(node), DT_DIR) < 0) 306 filp->f_pos, child->unique_id, DT_DIR) < 0)
816 goto out; 307 goto out;
308
817 filp->f_pos++; 309 filp->f_pos++;
818 node = nodes[node].next; 310 child = child->sibling;
819 } 311 }
820 j = NODEP2INO(NODE(ino).first_prop); 312
821 if (!i) { 313 /* Next, the properties as files. */
822 if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0) 314 prop = dp->properties;
315 while (i && prop) {
316 prop = prop->next;
317 i--;
318 }
319 while (prop) {
320 if (filldir(dirent, prop->name, strlen(prop->name),
321 filp->f_pos, prop->unique_id, DT_REG) < 0)
823 goto out; 322 goto out;
323
824 filp->f_pos++; 324 filp->f_pos++;
825 } else 325 prop = prop->next;
826 i--;
827 n = NODE(ino).node;
828 if (ino == OPENPROM_FIRST_INO + aliases) {
829 for (j++; i < aliases_nodes; i++, j++) {
830 if (alias_names [i]) {
831 if (filldir (dirent, alias_names [i],
832 strlen (alias_names [i]),
833 filp->f_pos, j, DT_REG) < 0) goto out;
834 filp->f_pos++;
835 }
836 }
837 } else {
838 for (p = prom_firstprop (n, buffer2);
839 p && *p;
840 p = prom_nextprop (n, p, buffer2)) {
841 j++;
842 if (i) i--;
843 else {
844 if (filldir(dirent, p, strlen(p),
845 filp->f_pos, j, DT_REG) < 0)
846 goto out;
847 filp->f_pos++;
848 }
849 }
850 } 326 }
851 } 327 }
852out: 328out:
853 unlock_kernel(); 329 mutex_unlock(&op_mutex);
854 return 0;
855}
856
857static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode,
858 struct nameidata *nd)
859{
860 char *p;
861 struct inode *inode;
862
863 if (!dir)
864 return -ENOENT;
865 if (dentry->d_name.len > 256)
866 return -EINVAL;
867 p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL);
868 if (!p)
869 return -ENOMEM;
870 strncpy (p, dentry->d_name.name, dentry->d_name.len);
871 p [dentry->d_name.len] = 0;
872 lock_kernel();
873 if (aliases_nodes == ALIASES_NNODES) {
874 kfree(p);
875 unlock_kernel();
876 return -EIO;
877 }
878 alias_names [aliases_nodes++] = p;
879 inode = iget (dir->i_sb,
880 NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes);
881 if (!inode) {
882 unlock_kernel();
883 return -EINVAL;
884 }
885 inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR;
886 inode->i_fop = &openpromfs_prop_ops;
887 inode->i_nlink = 1;
888 if (inode->i_size < 0) inode->i_size = 0;
889 inode->u.generic_ip = (void *)(long)(((u16)aliases) |
890 (((u16)(aliases_nodes - 1)) << 16));
891 unlock_kernel();
892 d_instantiate(dentry, inode);
893 return 0; 330 return 0;
894} 331}
895 332
896static int openpromfs_unlink (struct inode *dir, struct dentry *dentry) 333static kmem_cache_t *op_inode_cachep;
897{
898 unsigned int len;
899 char *p;
900 const char *name;
901 int i;
902
903 name = dentry->d_name.name;
904 len = dentry->d_name.len;
905 lock_kernel();
906 for (i = 0; i < aliases_nodes; i++)
907 if ((strlen (alias_names [i]) == len)
908 && !strncmp (name, alias_names[i], len)) {
909 char buffer[512];
910
911 p = alias_names [i];
912 alias_names [i] = NULL;
913 kfree (p);
914 strcpy (buffer, "nvunalias ");
915 memcpy (buffer + 10, name, len);
916 buffer [10 + len] = 0;
917 prom_feval (buffer);
918 }
919 unlock_kernel();
920 return 0;
921}
922 334
923/* {{{ init section */ 335static struct inode *openprom_alloc_inode(struct super_block *sb)
924static int __init check_space (u16 n)
925{ 336{
926 unsigned long pages; 337 struct op_inode_info *oi;
927 338
928 if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) { 339 oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL);
929 pages = __get_free_pages (GFP_KERNEL, alloced + 1); 340 if (!oi)
930 if (!pages) 341 return NULL;
931 return -1;
932 342
933 if (nodes) { 343 return &oi->vfs_inode;
934 memcpy ((char *)pages, (char *)nodes,
935 (1 << alloced) * PAGE_SIZE);
936 free_pages ((unsigned long)nodes, alloced);
937 }
938 alloced++;
939 nodes = (openpromfs_node *)pages;
940 }
941 return 0;
942} 344}
943 345
944static u16 __init get_nodes (u16 parent, u32 node) 346static void openprom_destroy_inode(struct inode *inode)
945{ 347{
946 char *p; 348 kmem_cache_free(op_inode_cachep, OP_I(inode));
947 u16 n = last_node++, i;
948 char buffer[64];
949
950 if (check_space (n) < 0)
951 return 0xffff;
952 nodes[n].parent = parent;
953 nodes[n].node = node;
954 nodes[n].next = 0xffff;
955 nodes[n].child = 0xffff;
956 nodes[n].first_prop = first_prop++;
957 if (!parent) {
958 char buffer[8];
959 int j;
960
961 if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) {
962 buffer[j] = 0;
963 if (!strcmp (buffer, "options"))
964 options = n;
965 else if (!strcmp (buffer, "aliases"))
966 aliases = n;
967 }
968 }
969 if (n != aliases)
970 for (p = prom_firstprop (node, buffer);
971 p && p != (char *)-1 && *p;
972 p = prom_nextprop (node, p, buffer))
973 first_prop++;
974 else {
975 char *q;
976 for (p = prom_firstprop (node, buffer);
977 p && p != (char *)-1 && *p;
978 p = prom_nextprop (node, p, buffer)) {
979 if (aliases_nodes == ALIASES_NNODES)
980 break;
981 for (i = 0; i < aliases_nodes; i++)
982 if (!strcmp (p, alias_names [i]))
983 break;
984 if (i < aliases_nodes)
985 continue;
986 q = kmalloc (strlen (p) + 1, GFP_KERNEL);
987 if (!q)
988 return 0xffff;
989 strcpy (q, p);
990 alias_names [aliases_nodes++] = q;
991 }
992 first_prop += ALIASES_NNODES;
993 }
994 node = prom_getchild (node);
995 if (node) {
996 parent = get_nodes (n, node);
997 if (parent == 0xffff)
998 return 0xffff;
999 nodes[n].child = parent;
1000 while ((node = prom_getsibling (node)) != 0) {
1001 i = get_nodes (n, node);
1002 if (i == 0xffff)
1003 return 0xffff;
1004 nodes[parent].next = i;
1005 parent = i;
1006 }
1007 }
1008 return n;
1009} 349}
1010 350
1011static void openprom_read_inode(struct inode * inode) 351static void openprom_read_inode(struct inode * inode)
@@ -1025,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data)
1025} 365}
1026 366
1027static struct super_operations openprom_sops = { 367static struct super_operations openprom_sops = {
368 .alloc_inode = openprom_alloc_inode,
369 .destroy_inode = openprom_destroy_inode,
1028 .read_inode = openprom_read_inode, 370 .read_inode = openprom_read_inode,
1029 .statfs = simple_statfs, 371 .statfs = simple_statfs,
1030 .remount_fs = openprom_remount, 372 .remount_fs = openprom_remount,
@@ -1032,7 +374,8 @@ static struct super_operations openprom_sops = {
1032 374
1033static int openprom_fill_super(struct super_block *s, void *data, int silent) 375static int openprom_fill_super(struct super_block *s, void *data, int silent)
1034{ 376{
1035 struct inode * root_inode; 377 struct inode *root_inode;
378 struct op_inode_info *oi;
1036 379
1037 s->s_flags |= MS_NOATIME; 380 s->s_flags |= MS_NOATIME;
1038 s->s_blocksize = 1024; 381 s->s_blocksize = 1024;
@@ -1043,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
1043 root_inode = iget(s, OPENPROM_ROOT_INO); 386 root_inode = iget(s, OPENPROM_ROOT_INO);
1044 if (!root_inode) 387 if (!root_inode)
1045 goto out_no_root; 388 goto out_no_root;
389
390 oi = OP_I(root_inode);
391 oi->type = op_inode_node;
392 oi->u.node = of_find_node_by_path("/");
393
1046 s->s_root = d_alloc_root(root_inode); 394 s->s_root = d_alloc_root(root_inode);
1047 if (!s->s_root) 395 if (!s->s_root)
1048 goto out_no_root; 396 goto out_no_root;
@@ -1067,29 +415,39 @@ static struct file_system_type openprom_fs_type = {
1067 .kill_sb = kill_anon_super, 415 .kill_sb = kill_anon_super,
1068}; 416};
1069 417
418static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags)
419{
420 struct op_inode_info *oi = (struct op_inode_info *) data;
421
422 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
423 SLAB_CTOR_CONSTRUCTOR)
424 inode_init_once(&oi->vfs_inode);
425}
426
1070static int __init init_openprom_fs(void) 427static int __init init_openprom_fs(void)
1071{ 428{
1072 nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0); 429 int err;
1073 if (!nodes) { 430
1074 printk (KERN_WARNING "openpromfs: can't get free page\n"); 431 op_inode_cachep = kmem_cache_create("op_inode_cache",
1075 return -EIO; 432 sizeof(struct op_inode_info),
1076 } 433 0,
1077 if (get_nodes (0xffff, prom_root_node) == 0xffff) { 434 (SLAB_RECLAIM_ACCOUNT |
1078 printk (KERN_WARNING "openpromfs: couldn't setup tree\n"); 435 SLAB_MEM_SPREAD),
1079 return -EIO; 436 op_inode_init_once, NULL);
1080 } 437 if (!op_inode_cachep)
1081 nodes[last_node].first_prop = first_prop; 438 return -ENOMEM;
1082 return register_filesystem(&openprom_fs_type); 439
440 err = register_filesystem(&openprom_fs_type);
441 if (err)
442 kmem_cache_destroy(op_inode_cachep);
443
444 return err;
1083} 445}
1084 446
1085static void __exit exit_openprom_fs(void) 447static void __exit exit_openprom_fs(void)
1086{ 448{
1087 int i;
1088 unregister_filesystem(&openprom_fs_type); 449 unregister_filesystem(&openprom_fs_type);
1089 free_pages ((unsigned long)nodes, alloced); 450 kmem_cache_destroy(op_inode_cachep);
1090 for (i = 0; i < aliases_nodes; i++)
1091 kfree (alias_names [i]);
1092 nodes = NULL;
1093} 451}
1094 452
1095module_init(init_openprom_fs) 453module_init(init_openprom_fs)
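The openprom_alloc_inode()/openprom_destroy_inode()/op_inode_init_once() trio above is the standard slab-embedded inode idiom: the filesystem allocates a containing structure and recovers it from the VFS inode with container_of(). A minimal sketch under the 2.6-era slab flags; the demo_ names are illustrative.

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/slab.h>

struct demo_inode_info {
	struct inode	vfs_inode;	/* embedded, never pointed to */
	void		*fs_private;
};

/* created with kmem_cache_create(), as in init_openprom_fs() above */
static kmem_cache_t *demo_cachep;

static inline struct demo_inode_info *DEMO_I(struct inode *inode)
{
	return container_of(inode, struct demo_inode_info, vfs_inode);
}

static struct inode *demo_alloc_inode(struct super_block *sb)
{
	struct demo_inode_info *i = kmem_cache_alloc(demo_cachep, SLAB_KERNEL);

	return i ? &i->vfs_inode : NULL;
}

static void demo_destroy_inode(struct inode *inode)
{
	kmem_cache_free(demo_cachep, DEMO_I(inode));
}

static void demo_init_once(void *data, kmem_cache_t *cachep, unsigned long flags)
{
	/* runs once per slab object, not once per allocation */
	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
		inode_init_once(&((struct demo_inode_info *)data)->vfs_inode);
}

Because the constructor runs only when the slab takes a fresh object, only state identical across all uses (the VFS inode initialisation) belongs there; per-inode state, like op_inode_info's type and u, is filled in after iget(), exactly as openpromfs_lookup() and openprom_fill_super() do above.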
diff --git a/fs/pnode.c b/fs/pnode.c
index 37b568ed0e05..da42ee61c1df 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt)
53 if (master) { 53 if (master) {
54 list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) 54 list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
55 slave_mnt->mnt_master = master; 55 slave_mnt->mnt_master = master;
56 list_del(&mnt->mnt_slave); 56 list_move(&mnt->mnt_slave, &master->mnt_slave_list);
57 list_add(&mnt->mnt_slave, &master->mnt_slave_list);
58 list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); 57 list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
59 INIT_LIST_HEAD(&mnt->mnt_slave_list); 58 INIT_LIST_HEAD(&mnt->mnt_slave_list);
60 } else { 59 } else {
@@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt)
283 * umount the child only if the child has no 282 * umount the child only if the child has no
284 * other children 283 * other children
285 */ 284 */
286 if (child && list_empty(&child->mnt_mounts)) { 285 if (child && list_empty(&child->mnt_mounts))
287 list_del(&child->mnt_hash); 286 list_move_tail(&child->mnt_hash, &mnt->mnt_hash);
288 list_add_tail(&child->mnt_hash, &mnt->mnt_hash);
289 }
290 } 287 }
291} 288}
292 289
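Both pnode.c hunks are behaviour-preserving cleanups: list_move() and list_move_tail() from <linux/list.h> are exact shorthands for the unlink/relink pairs they replace. Their 2.6-era definitions are essentially:

static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);	/* unlink, no poisoning */
	list_add(list, head);
}

static inline void list_move_tail(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add_tail(list, head);
}

The only difference from open-coded list_del()+list_add() is that __list_del() skips the pointer poisoning, which is unobservable here because the entry is immediately relinked.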
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6afff725a8c9..6ba7785319de 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -74,6 +74,16 @@
74#include <linux/poll.h> 74#include <linux/poll.h>
75#include "internal.h" 75#include "internal.h"
76 76
77/* NOTE:
78 * Implementing inode permission operations in /proc is almost
79 * certainly an error. Permission checks need to happen during
80 * each system call not at open time. The reason is that most of
81 * what we wish to check for permissions in /proc varies at runtime.
82 *
83 * The classic example of a problem is opening file descriptors
84 * in /proc for a task before it execs a suid executable.
85 */
86
77/* 87/*
78 * For hysterical raisins we keep the same inumbers as in the old procfs. 88 * For hysterical raisins we keep the same inumbers as in the old procfs.
79 * Feel free to change the macro below - just keep the range distinct from 89 * Feel free to change the macro below - just keep the range distinct from
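A hedged userspace sketch (not from the patch; the passwd path is merely illustrative) of the time-of-check/time-of-use hole the note describes: a descriptor opened while the target is still an ordinary process survives the target's exec of a setuid binary, so a permission check done only at open time has already been bypassed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	pid_t pid = fork();
	int fd;

	if (pid == 0) {
		sleep(1);			/* let the parent open first */
		execl("/usr/bin/passwd", "passwd", (char *)NULL); /* setuid */
		_exit(1);
	}
	snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
	fd = open(path, O_RDONLY);		/* check happens only here... */
	sleep(2);				/* ...but the fd outlives the exec */
	printf("still-open fd %d now names a setuid process's memory\n", fd);
	return 0;
}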
@@ -121,6 +131,8 @@ enum pid_directory_inos {
121 PROC_TGID_ATTR_PREV, 131 PROC_TGID_ATTR_PREV,
122 PROC_TGID_ATTR_EXEC, 132 PROC_TGID_ATTR_EXEC,
123 PROC_TGID_ATTR_FSCREATE, 133 PROC_TGID_ATTR_FSCREATE,
134 PROC_TGID_ATTR_KEYCREATE,
135 PROC_TGID_ATTR_SOCKCREATE,
124#endif 136#endif
125#ifdef CONFIG_AUDITSYSCALL 137#ifdef CONFIG_AUDITSYSCALL
126 PROC_TGID_LOGINUID, 138 PROC_TGID_LOGINUID,
@@ -162,6 +174,8 @@ enum pid_directory_inos {
162 PROC_TID_ATTR_PREV, 174 PROC_TID_ATTR_PREV,
163 PROC_TID_ATTR_EXEC, 175 PROC_TID_ATTR_EXEC,
164 PROC_TID_ATTR_FSCREATE, 176 PROC_TID_ATTR_FSCREATE,
177 PROC_TID_ATTR_KEYCREATE,
178 PROC_TID_ATTR_SOCKCREATE,
165#endif 179#endif
166#ifdef CONFIG_AUDITSYSCALL 180#ifdef CONFIG_AUDITSYSCALL
167 PROC_TID_LOGINUID, 181 PROC_TID_LOGINUID,
@@ -173,6 +187,9 @@ enum pid_directory_inos {
173 PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ 187 PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
174}; 188};
175 189
190/* Worst case buffer size needed for holding an integer. */
191#define PROC_NUMBUF 10
192
176struct pid_entry { 193struct pid_entry {
177 int type; 194 int type;
178 int len; 195 int len;
@@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = {
275 E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), 292 E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO),
276 E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), 293 E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO),
277 E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), 294 E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
295 E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
296 E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
278 {0,0,NULL,0} 297 {0,0,NULL,0}
279}; 298};
280static struct pid_entry tid_attr_stuff[] = { 299static struct pid_entry tid_attr_stuff[] = {
@@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = {
282 E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), 301 E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO),
283 E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), 302 E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO),
284 E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), 303 E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
304 E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
305 E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
285 {0,0,NULL,0} 306 {0,0,NULL,0}
286}; 307};
287#endif 308#endif
@@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = {
290 311
291static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) 312static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
292{ 313{
293 struct task_struct *task = proc_task(inode); 314 struct task_struct *task = get_proc_task(inode);
294 struct files_struct *files; 315 struct files_struct *files = NULL;
295 struct file *file; 316 struct file *file;
296 int fd = proc_type(inode) - PROC_TID_FD_DIR; 317 int fd = proc_fd(inode);
297 318
298 files = get_files_struct(task); 319 if (task) {
320 files = get_files_struct(task);
321 put_task_struct(task);
322 }
299 if (files) { 323 if (files) {
300 /* 324 /*
301 * We are not taking a ref to the file structure, so we must 325 * We are not taking a ref to the file structure, so we must
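This hunk sets the template applied through the rest of the file: get_proc_task() returns a counted task reference or NULL, so every handler must tolerate a vanished task and balance the lookup with put_task_struct(). Sketched minimally below, assuming the surrounding file's context; the demo_ wrapper is illustrative and oomkilladj merely stands in for any per-task field.

static int demo_read_task_field(struct inode *inode, int *out)
{
	struct task_struct *task = get_proc_task(inode);

	if (!task)
		return -ESRCH;		/* task exited; the inode is stale */
	*out = task->oomkilladj;
	put_task_struct(task);		/* always balance the reference */
	return 0;
}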
@@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
327 return fs; 351 return fs;
328} 352}
329 353
330static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) 354static int get_nr_threads(struct task_struct *tsk)
331{ 355{
332 struct fs_struct *fs = get_fs_struct(proc_task(inode)); 356 /* Must be called with the rcu_read_lock held */
333 int result = -ENOENT; 357 unsigned long flags;
334 if (fs) { 358 int count = 0;
335 read_lock(&fs->lock); 359
336 *mnt = mntget(fs->pwdmnt); 360 if (lock_task_sighand(tsk, &flags)) {
337 *dentry = dget(fs->pwd); 361 count = atomic_read(&tsk->signal->count);
338 read_unlock(&fs->lock); 362 unlock_task_sighand(tsk, &flags);
339 result = 0;
340 put_fs_struct(fs);
341 } 363 }
342 return result; 364 return count;
343} 365}
344 366
345static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) 367static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
346{ 368{
347 struct fs_struct *fs = get_fs_struct(proc_task(inode)); 369 struct task_struct *task = get_proc_task(inode);
370 struct fs_struct *fs = NULL;
348 int result = -ENOENT; 371 int result = -ENOENT;
372
373 if (task) {
374 fs = get_fs_struct(task);
375 put_task_struct(task);
376 }
349 if (fs) { 377 if (fs) {
350 read_lock(&fs->lock); 378 read_lock(&fs->lock);
351 *mnt = mntget(fs->rootmnt); 379 *mnt = mntget(fs->pwdmnt);
352 *dentry = dget(fs->root); 380 *dentry = dget(fs->pwd);
353 read_unlock(&fs->lock); 381 read_unlock(&fs->lock);
354 result = 0; 382 result = 0;
355 put_fs_struct(fs); 383 put_fs_struct(fs);
@@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
357 return result; 385 return result;
358} 386}
359 387
360 388static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
361/* Same as proc_root_link, but this addionally tries to get fs from other
362 * threads in the group */
363static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
364 struct vfsmount **mnt)
365{ 389{
366 struct fs_struct *fs; 390 struct task_struct *task = get_proc_task(inode);
391 struct fs_struct *fs = NULL;
367 int result = -ENOENT; 392 int result = -ENOENT;
368 struct task_struct *leader = proc_task(inode);
369 393
370 task_lock(leader); 394 if (task) {
371 fs = leader->fs; 395 fs = get_fs_struct(task);
372 if (fs) { 396 put_task_struct(task);
373 atomic_inc(&fs->count);
374 task_unlock(leader);
375 } else {
376 /* Try to get fs from other threads */
377 task_unlock(leader);
378 read_lock(&tasklist_lock);
379 if (pid_alive(leader)) {
380 struct task_struct *task = leader;
381
382 while ((task = next_thread(task)) != leader) {
383 task_lock(task);
384 fs = task->fs;
385 if (fs) {
386 atomic_inc(&fs->count);
387 task_unlock(task);
388 break;
389 }
390 task_unlock(task);
391 }
392 }
393 read_unlock(&tasklist_lock);
394 } 397 }
395
396 if (fs) { 398 if (fs) {
397 read_lock(&fs->lock); 399 read_lock(&fs->lock);
398 *mnt = mntget(fs->rootmnt); 400 *mnt = mntget(fs->rootmnt);
@@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
404 return result; 406 return result;
405} 407}
406 408
407
408#define MAY_PTRACE(task) \ 409#define MAY_PTRACE(task) \
409 (task == current || \ 410 (task == current || \
410 (task->parent == current && \ 411 (task->parent == current && \
@@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
535/************************************************************************/ 536/************************************************************************/
536 537
537/* permission checks */ 538/* permission checks */
538 539static int proc_fd_access_allowed(struct inode *inode)
539/* If the process being read is separated by chroot from the reading process,
540 * don't let the reader access the threads.
541 *
542 * note: this does dput(root) and mntput(vfsmnt) on exit.
543 */
544static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
545{
546 struct dentry *de, *base;
547 struct vfsmount *our_vfsmnt, *mnt;
548 int res = 0;
549
550 read_lock(&current->fs->lock);
551 our_vfsmnt = mntget(current->fs->rootmnt);
552 base = dget(current->fs->root);
553 read_unlock(&current->fs->lock);
554
555 spin_lock(&vfsmount_lock);
556 de = root;
557 mnt = vfsmnt;
558
559 while (mnt != our_vfsmnt) {
560 if (mnt == mnt->mnt_parent)
561 goto out;
562 de = mnt->mnt_mountpoint;
563 mnt = mnt->mnt_parent;
564 }
565
566 if (!is_subdir(de, base))
567 goto out;
568 spin_unlock(&vfsmount_lock);
569
570exit:
571 dput(base);
572 mntput(our_vfsmnt);
573 dput(root);
574 mntput(vfsmnt);
575 return res;
576out:
577 spin_unlock(&vfsmount_lock);
578 res = -EACCES;
579 goto exit;
580}
581
582static int proc_check_root(struct inode *inode)
583{
584 struct dentry *root;
585 struct vfsmount *vfsmnt;
586
587 if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */
588 return -ENOENT;
589 return proc_check_chroot(root, vfsmnt);
590}
591
592static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
593{
594 if (generic_permission(inode, mask, NULL) != 0)
595 return -EACCES;
596 return proc_check_root(inode);
597}
598
599static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd)
600{
601 struct dentry *root;
602 struct vfsmount *vfsmnt;
603
604 if (generic_permission(inode, mask, NULL) != 0)
605 return -EACCES;
606
607 if (proc_task_root_link(inode, &root, &vfsmnt))
608 return -ENOENT;
609
610 return proc_check_chroot(root, vfsmnt);
611}
612
613extern struct seq_operations proc_pid_maps_op;
614static int maps_open(struct inode *inode, struct file *file)
615{
616 struct task_struct *task = proc_task(inode);
617 int ret = seq_open(file, &proc_pid_maps_op);
618 if (!ret) {
619 struct seq_file *m = file->private_data;
620 m->private = task;
621 }
622 return ret;
623}
624
625static struct file_operations proc_maps_operations = {
626 .open = maps_open,
627 .read = seq_read,
628 .llseek = seq_lseek,
629 .release = seq_release,
630};
631
632#ifdef CONFIG_NUMA
633extern struct seq_operations proc_pid_numa_maps_op;
634static int numa_maps_open(struct inode *inode, struct file *file)
635{
636 struct task_struct *task = proc_task(inode);
637 int ret = seq_open(file, &proc_pid_numa_maps_op);
638 if (!ret) {
639 struct seq_file *m = file->private_data;
640 m->private = task;
641 }
642 return ret;
643}
644
645static struct file_operations proc_numa_maps_operations = {
646 .open = numa_maps_open,
647 .read = seq_read,
648 .llseek = seq_lseek,
649 .release = seq_release,
650};
651#endif
652
653#ifdef CONFIG_MMU
654extern struct seq_operations proc_pid_smaps_op;
655static int smaps_open(struct inode *inode, struct file *file)
656{ 540{
657 struct task_struct *task = proc_task(inode); 541 struct task_struct *task;
658 int ret = seq_open(file, &proc_pid_smaps_op); 542 int allowed = 0;
659 if (!ret) { 543 /* Allow access to a task's file descriptors if it is us or we
660 struct seq_file *m = file->private_data; 544 * may use ptrace attach to the process and find out that
661 m->private = task; 545 * information.
546 */
547 task = get_proc_task(inode);
548 if (task) {
549 allowed = ptrace_may_attach(task);
550 put_task_struct(task);
662 } 551 }
663 return ret; 552 return allowed;
664} 553}
665 554
666static struct file_operations proc_smaps_operations = {
667 .open = smaps_open,
668 .read = seq_read,
669 .llseek = seq_lseek,
670 .release = seq_release,
671};
672#endif
673
674extern struct seq_operations mounts_op; 555extern struct seq_operations mounts_op;
675struct proc_mounts { 556struct proc_mounts {
676 struct seq_file m; 557 struct seq_file m;
@@ -679,16 +560,19 @@ struct proc_mounts {
679 560
680static int mounts_open(struct inode *inode, struct file *file) 561static int mounts_open(struct inode *inode, struct file *file)
681{ 562{
682 struct task_struct *task = proc_task(inode); 563 struct task_struct *task = get_proc_task(inode);
683 struct namespace *namespace; 564 struct namespace *namespace = NULL;
684 struct proc_mounts *p; 565 struct proc_mounts *p;
685 int ret = -EINVAL; 566 int ret = -EINVAL;
686 567
687 task_lock(task); 568 if (task) {
688 namespace = task->namespace; 569 task_lock(task);
689 if (namespace) 570 namespace = task->namespace;
690 get_namespace(namespace); 571 if (namespace)
691 task_unlock(task); 572 get_namespace(namespace);
573 task_unlock(task);
574 put_task_struct(task);
575 }
692 576
693 if (namespace) { 577 if (namespace) {
694 ret = -ENOMEM; 578 ret = -ENOMEM;
@@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = {
745extern struct seq_operations mountstats_op; 629extern struct seq_operations mountstats_op;
746static int mountstats_open(struct inode *inode, struct file *file) 630static int mountstats_open(struct inode *inode, struct file *file)
747{ 631{
748 struct task_struct *task = proc_task(inode);
749 int ret = seq_open(file, &mountstats_op); 632 int ret = seq_open(file, &mountstats_op);
750 633
751 if (!ret) { 634 if (!ret) {
752 struct seq_file *m = file->private_data; 635 struct seq_file *m = file->private_data;
753 struct namespace *namespace; 636 struct namespace *namespace = NULL;
754 task_lock(task); 637 struct task_struct *task = get_proc_task(inode);
755 namespace = task->namespace; 638
756 if (namespace) 639 if (task) {
757 get_namespace(namespace); 640 task_lock(task);
758 task_unlock(task); 641 namespace = task->namespace;
642 if (namespace)
643 get_namespace(namespace);
644 task_unlock(task);
645 put_task_struct(task);
646 }
759 647
760 if (namespace) 648 if (namespace)
761 m->private = namespace; 649 m->private = namespace;
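Sketch of the ownership handoff mounts_open() and mountstats_open() now share: pin the task, take a counted namespace reference under task_lock(), drop the task pin immediately, and park the namespace in the seq_file for the matching ->release to drop. The demo_ helper is illustrative and the error handling is pared down.

static int demo_grab_namespace(struct inode *inode, struct seq_file *m)
{
	struct task_struct *task = get_proc_task(inode);
	struct namespace *ns = NULL;

	if (task) {
		task_lock(task);
		ns = task->namespace;
		if (ns)
			get_namespace(ns);	/* counted ref outlives the task pin */
		task_unlock(task);
		put_task_struct(task);
	}
	if (!ns)
		return -EINVAL;
	m->private = ns;			/* ->release must put_namespace() */
	return 0;
}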
@@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf,
782 struct inode * inode = file->f_dentry->d_inode; 670 struct inode * inode = file->f_dentry->d_inode;
783 unsigned long page; 671 unsigned long page;
784 ssize_t length; 672 ssize_t length;
785 struct task_struct *task = proc_task(inode); 673 struct task_struct *task = get_proc_task(inode);
674
675 length = -ESRCH;
676 if (!task)
677 goto out_no_task;
786 678
787 if (count > PROC_BLOCK_SIZE) 679 if (count > PROC_BLOCK_SIZE)
788 count = PROC_BLOCK_SIZE; 680 count = PROC_BLOCK_SIZE;
681
682 length = -ENOMEM;
789 if (!(page = __get_free_page(GFP_KERNEL))) 683 if (!(page = __get_free_page(GFP_KERNEL)))
790 return -ENOMEM; 684 goto out;
791 685
792 length = PROC_I(inode)->op.proc_read(task, (char*)page); 686 length = PROC_I(inode)->op.proc_read(task, (char*)page);
793 687
794 if (length >= 0) 688 if (length >= 0)
795 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); 689 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
796 free_page(page); 690 free_page(page);
691out:
692 put_task_struct(task);
693out_no_task:
797 return length; 694 return length;
798} 695}
799 696
@@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file)
810static ssize_t mem_read(struct file * file, char __user * buf, 707static ssize_t mem_read(struct file * file, char __user * buf,
811 size_t count, loff_t *ppos) 708 size_t count, loff_t *ppos)
812{ 709{
813 struct task_struct *task = proc_task(file->f_dentry->d_inode); 710 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
814 char *page; 711 char *page;
815 unsigned long src = *ppos; 712 unsigned long src = *ppos;
816 int ret = -ESRCH; 713 int ret = -ESRCH;
817 struct mm_struct *mm; 714 struct mm_struct *mm;
818 715
716 if (!task)
717 goto out_no_task;
718
819 if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) 719 if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
820 goto out; 720 goto out;
821 721
@@ -865,6 +765,8 @@ out_put:
865out_free: 765out_free:
866 free_page((unsigned long) page); 766 free_page((unsigned long) page);
867out: 767out:
768 put_task_struct(task);
769out_no_task:
868 return ret; 770 return ret;
869} 771}
870 772
@@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf,
877{ 779{
878 int copied = 0; 780 int copied = 0;
879 char *page; 781 char *page;
880 struct task_struct *task = proc_task(file->f_dentry->d_inode); 782 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
881 unsigned long dst = *ppos; 783 unsigned long dst = *ppos;
882 784
785 copied = -ESRCH;
786 if (!task)
787 goto out_no_task;
788
883 if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) 789 if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
884 return -ESRCH; 790 goto out;
885 791
792 copied = -ENOMEM;
886 page = (char *)__get_free_page(GFP_USER); 793 page = (char *)__get_free_page(GFP_USER);
887 if (!page) 794 if (!page)
888 return -ENOMEM; 795 goto out;
889 796
890 while (count > 0) { 797 while (count > 0) {
891 int this_len, retval; 798 int this_len, retval;
@@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf,
908 } 815 }
909 *ppos = dst; 816 *ppos = dst;
910 free_page((unsigned long) page); 817 free_page((unsigned long) page);
818out:
819 put_task_struct(task);
820out_no_task:
911 return copied; 821 return copied;
912} 822}
913#endif 823#endif
@@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = {
938static ssize_t oom_adjust_read(struct file *file, char __user *buf, 848static ssize_t oom_adjust_read(struct file *file, char __user *buf,
939 size_t count, loff_t *ppos) 849 size_t count, loff_t *ppos)
940{ 850{
941 struct task_struct *task = proc_task(file->f_dentry->d_inode); 851 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
942 char buffer[8]; 852 char buffer[PROC_NUMBUF];
943 size_t len; 853 size_t len;
944 int oom_adjust = task->oomkilladj; 854 int oom_adjust;
945 loff_t __ppos = *ppos; 855 loff_t __ppos = *ppos;
946 856
947 len = sprintf(buffer, "%i\n", oom_adjust); 857 if (!task)
858 return -ESRCH;
859 oom_adjust = task->oomkilladj;
860 put_task_struct(task);
861
862 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
948 if (__ppos >= len) 863 if (__ppos >= len)
949 return 0; 864 return 0;
950 if (count > len-__ppos) 865 if (count > len-__ppos)
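The read side above also shows the bounded-formatting pattern, sketched here assuming PROC_NUMBUF as defined earlier in the patch. Using scnprintf() (already used by proc_loginuid_read() below), which returns the truncated length, guarantees the copy can never run past the buffer even if the value needs more digits than the buffer holds; the *ppos arithmetic makes short and repeated reads behave.

static ssize_t demo_int_read(int value, char __user *buf,
			     size_t count, loff_t *ppos)
{
	char buffer[PROC_NUMBUF];
	size_t len = scnprintf(buffer, sizeof(buffer), "%i\n", value);

	if (*ppos >= len)
		return 0;			/* EOF on re-read */
	if (count > len - *ppos)
		count = len - *ppos;
	if (copy_to_user(buf, buffer + *ppos, count))
		return -EFAULT;
	*ppos += count;
	return count;
}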
@@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
958static ssize_t oom_adjust_write(struct file *file, const char __user *buf, 873static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
959 size_t count, loff_t *ppos) 874 size_t count, loff_t *ppos)
960{ 875{
961 struct task_struct *task = proc_task(file->f_dentry->d_inode); 876 struct task_struct *task;
962 char buffer[8], *end; 877 char buffer[PROC_NUMBUF], *end;
963 int oom_adjust; 878 int oom_adjust;
964 879
965 if (!capable(CAP_SYS_RESOURCE)) 880 if (!capable(CAP_SYS_RESOURCE))
966 return -EPERM; 881 return -EPERM;
967 memset(buffer, 0, 8); 882 memset(buffer, 0, sizeof(buffer));
968 if (count > 6) 883 if (count > sizeof(buffer) - 1)
969 count = 6; 884 count = sizeof(buffer) - 1;
970 if (copy_from_user(buffer, buf, count)) 885 if (copy_from_user(buffer, buf, count))
971 return -EFAULT; 886 return -EFAULT;
972 oom_adjust = simple_strtol(buffer, &end, 0); 887 oom_adjust = simple_strtol(buffer, &end, 0);
@@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
974 return -EINVAL; 889 return -EINVAL;
975 if (*end == '\n') 890 if (*end == '\n')
976 end++; 891 end++;
892 task = get_proc_task(file->f_dentry->d_inode);
893 if (!task)
894 return -ESRCH;
977 task->oomkilladj = oom_adjust; 895 task->oomkilladj = oom_adjust;
896 put_task_struct(task);
978 if (end - buffer == 0) 897 if (end - buffer == 0)
979 return -EIO; 898 return -EIO;
980 return end - buffer; 899 return end - buffer;
@@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = {
985 .write = oom_adjust_write, 904 .write = oom_adjust_write,
986}; 905};
987 906
988static struct inode_operations proc_mem_inode_operations = {
989 .permission = proc_permission,
990};
991
992#ifdef CONFIG_AUDITSYSCALL 907#ifdef CONFIG_AUDITSYSCALL
993#define TMPBUFLEN 21 908#define TMPBUFLEN 21
994static ssize_t proc_loginuid_read(struct file * file, char __user * buf, 909static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
995 size_t count, loff_t *ppos) 910 size_t count, loff_t *ppos)
996{ 911{
997 struct inode * inode = file->f_dentry->d_inode; 912 struct inode * inode = file->f_dentry->d_inode;
998 struct task_struct *task = proc_task(inode); 913 struct task_struct *task = get_proc_task(inode);
999 ssize_t length; 914 ssize_t length;
1000 char tmpbuf[TMPBUFLEN]; 915 char tmpbuf[TMPBUFLEN];
1001 916
917 if (!task)
918 return -ESRCH;
1002 length = scnprintf(tmpbuf, TMPBUFLEN, "%u", 919 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1003 audit_get_loginuid(task->audit_context)); 920 audit_get_loginuid(task->audit_context));
921 put_task_struct(task);
1004 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); 922 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1005} 923}
1006 924
@@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1010 struct inode * inode = file->f_dentry->d_inode; 928 struct inode * inode = file->f_dentry->d_inode;
1011 char *page, *tmp; 929 char *page, *tmp;
1012 ssize_t length; 930 ssize_t length;
1013 struct task_struct *task = proc_task(inode);
1014 uid_t loginuid; 931 uid_t loginuid;
1015 932
1016 if (!capable(CAP_AUDIT_CONTROL)) 933 if (!capable(CAP_AUDIT_CONTROL))
1017 return -EPERM; 934 return -EPERM;
1018 935
1019 if (current != task) 936 if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
1020 return -EPERM; 937 return -EPERM;
1021 938
1022 if (count >= PAGE_SIZE) 939 if (count >= PAGE_SIZE)
@@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1040 goto out_free_page; 957 goto out_free_page;
1041 958
1042 } 959 }
1043 length = audit_set_loginuid(task, loginuid); 960 length = audit_set_loginuid(current, loginuid);
1044 if (likely(length == 0)) 961 if (likely(length == 0))
1045 length = count; 962 length = count;
1046 963
@@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = {
1059static ssize_t seccomp_read(struct file *file, char __user *buf, 976static ssize_t seccomp_read(struct file *file, char __user *buf,
1060 size_t count, loff_t *ppos) 977 size_t count, loff_t *ppos)
1061{ 978{
1062 struct task_struct *tsk = proc_task(file->f_dentry->d_inode); 979 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
1063 char __buf[20]; 980 char __buf[20];
1064 loff_t __ppos = *ppos; 981 loff_t __ppos = *ppos;
1065 size_t len; 982 size_t len;
1066 983
984 if (!tsk)
985 return -ESRCH;
1067 /* no need to print the trailing zero, so use only len */ 986 /* no need to print the trailing zero, so use only len */
1068 len = sprintf(__buf, "%u\n", tsk->seccomp.mode); 987 len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
988 put_task_struct(tsk);
1069 if (__ppos >= len) 989 if (__ppos >= len)
1070 return 0; 990 return 0;
1071 if (count > len - __ppos) 991 if (count > len - __ppos)
@@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
1079static ssize_t seccomp_write(struct file *file, const char __user *buf, 999static ssize_t seccomp_write(struct file *file, const char __user *buf,
1080 size_t count, loff_t *ppos) 1000 size_t count, loff_t *ppos)
1081{ 1001{
1082 struct task_struct *tsk = proc_task(file->f_dentry->d_inode); 1002 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
1083 char __buf[20], *end; 1003 char __buf[20], *end;
1084 unsigned int seccomp_mode; 1004 unsigned int seccomp_mode;
1005 ssize_t result;
1006
1007 result = -ESRCH;
1008 if (!tsk)
1009 goto out_no_task;
1085 1010
1086 /* can set it only once to be even more secure */ 1011 /* can set it only once to be even more secure */
1012 result = -EPERM;
1087 if (unlikely(tsk->seccomp.mode)) 1013 if (unlikely(tsk->seccomp.mode))
1088 return -EPERM; 1014 goto out;
1089 1015
1016 result = -EFAULT;
1090 memset(__buf, 0, sizeof(__buf)); 1017 memset(__buf, 0, sizeof(__buf));
1091 count = min(count, sizeof(__buf) - 1); 1018 count = min(count, sizeof(__buf) - 1);
1092 if (copy_from_user(__buf, buf, count)) 1019 if (copy_from_user(__buf, buf, count))
1093 return -EFAULT; 1020 goto out;
1021
1094 seccomp_mode = simple_strtoul(__buf, &end, 0); 1022 seccomp_mode = simple_strtoul(__buf, &end, 0);
1095 if (*end == '\n') 1023 if (*end == '\n')
1096 end++; 1024 end++;
1025 result = -EINVAL;
1097 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { 1026 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
1098 tsk->seccomp.mode = seccomp_mode; 1027 tsk->seccomp.mode = seccomp_mode;
1099 set_tsk_thread_flag(tsk, TIF_SECCOMP); 1028 set_tsk_thread_flag(tsk, TIF_SECCOMP);
1100 } else 1029 } else
1101 return -EINVAL; 1030 goto out;
1031 result = -EIO;
1102 if (unlikely(!(end - __buf))) 1032 if (unlikely(!(end - __buf)))
1103 return -EIO; 1033 goto out;
1104 return end - __buf; 1034 result = end - __buf;
1035out:
1036 put_task_struct(tsk);
1037out_no_task:
1038 return result;
1105} 1039}
1106 1040
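The rewritten seccomp_write() above follows the kernel's single-exit convention: preload the error code, then jump to the label that unwinds exactly the state acquired so far, so the task reference taken by get_proc_task() is dropped on every path. A minimal userspace sketch of the same shape (do_write() and the acquire()/release() pair are hypothetical stand-ins, not from this patch):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins for get_proc_task()/put_task_struct(). */
static char *acquire(void) { return strdup("resource"); }
static void release(char *r) { free(r); }

static int do_write(const char *buf)
{
        int result;
        char *res = acquire();

        result = -ESRCH;
        if (!res)
                goto out_no_res;        /* nothing acquired yet */

        result = -EINVAL;
        if (buf[0] == '\0')
                goto out;               /* must still release 'res' */

        result = (int)strlen(buf);      /* success: bytes consumed */
out:
        release(res);
out_no_res:
        return result;
}

int main(void)
{
        /* prints "2 -22" on Linux (EINVAL == 22) */
        printf("%d %d\n", do_write("1\n"), do_write(""));
        return 0;
}
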
1107static struct file_operations proc_seccomp_operations = { 1041static struct file_operations proc_seccomp_operations = {
@@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1118 /* We don't need a base pointer in the /proc filesystem */ 1052 /* We don't need a base pointer in the /proc filesystem */
1119 path_release(nd); 1053 path_release(nd);
1120 1054
1121	if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) 1055	/* Are we allowed to snoop on the task's file descriptors? */
1122 goto out; 1056 if (!proc_fd_access_allowed(inode))
1123 error = proc_check_root(inode);
1124 if (error)
1125 goto out; 1057 goto out;
1126 1058
1127 error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); 1059 error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
@@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
1163 struct dentry *de; 1095 struct dentry *de;
1164 struct vfsmount *mnt = NULL; 1096 struct vfsmount *mnt = NULL;
1165 1097
1166	lock_kernel(); 1098	/* Are we allowed to snoop on the task's file descriptors? */
1167 1099 if (!proc_fd_access_allowed(inode))
1168 if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
1169 goto out;
1170 error = proc_check_root(inode);
1171 if (error)
1172 goto out; 1100 goto out;
1173 1101
1174 error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); 1102 error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
@@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
1179 dput(de); 1107 dput(de);
1180 mntput(mnt); 1108 mntput(mnt);
1181out: 1109out:
1182 unlock_kernel();
1183 return error; 1110 return error;
1184} 1111}
1185 1112
@@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = {
1188 .follow_link = proc_pid_follow_link 1115 .follow_link = proc_pid_follow_link
1189}; 1116};
1190 1117
1191#define NUMBUF 10
1192
1193static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) 1118static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1194{ 1119{
1195 struct inode *inode = filp->f_dentry->d_inode; 1120 struct dentry *dentry = filp->f_dentry;
1196 struct task_struct *p = proc_task(inode); 1121 struct inode *inode = dentry->d_inode;
1122 struct task_struct *p = get_proc_task(inode);
1197 unsigned int fd, tid, ino; 1123 unsigned int fd, tid, ino;
1198 int retval; 1124 int retval;
1199 char buf[NUMBUF]; 1125 char buf[PROC_NUMBUF];
1200 struct files_struct * files; 1126 struct files_struct * files;
1201 struct fdtable *fdt; 1127 struct fdtable *fdt;
1202 1128
1203 retval = -ENOENT; 1129 retval = -ENOENT;
1204 if (!pid_alive(p)) 1130 if (!p)
1205 goto out; 1131 goto out_no_task;
1206 retval = 0; 1132 retval = 0;
1207 tid = p->pid; 1133 tid = p->pid;
1208 1134
@@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1213 goto out; 1139 goto out;
1214 filp->f_pos++; 1140 filp->f_pos++;
1215 case 1: 1141 case 1:
1216 ino = fake_ino(tid, PROC_TID_INO); 1142 ino = parent_ino(dentry);
1217 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) 1143 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1218 goto out; 1144 goto out;
1219 filp->f_pos++; 1145 filp->f_pos++;
@@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1232 continue; 1158 continue;
1233 rcu_read_unlock(); 1159 rcu_read_unlock();
1234 1160
1235 j = NUMBUF; 1161 j = PROC_NUMBUF;
1236 i = fd; 1162 i = fd;
1237 do { 1163 do {
1238 j--; 1164 j--;
@@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1241 } while (i); 1167 } while (i);
1242 1168
1243 ino = fake_ino(tid, PROC_TID_FD_DIR + fd); 1169 ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
1244 if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { 1170 if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
1245 rcu_read_lock(); 1171 rcu_read_lock();
1246 break; 1172 break;
1247 } 1173 }
@@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1251 put_files_struct(files); 1177 put_files_struct(files);
1252 } 1178 }
1253out: 1179out:
1180 put_task_struct(p);
1181out_no_task:
1254 return retval; 1182 return retval;
1255} 1183}
1256 1184
@@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp,
1262 int pid; 1190 int pid;
1263 struct dentry *dentry = filp->f_dentry; 1191 struct dentry *dentry = filp->f_dentry;
1264 struct inode *inode = dentry->d_inode; 1192 struct inode *inode = dentry->d_inode;
1193 struct task_struct *task = get_proc_task(inode);
1265 struct pid_entry *p; 1194 struct pid_entry *p;
1266 ino_t ino; 1195 ino_t ino;
1267 int ret; 1196 int ret;
1268 1197
1269 ret = -ENOENT; 1198 ret = -ENOENT;
1270 if (!pid_alive(proc_task(inode))) 1199 if (!task)
1271 goto out; 1200 goto out;
1272 1201
1273 ret = 0; 1202 ret = 0;
1274 pid = proc_task(inode)->pid; 1203 pid = task->pid;
1204 put_task_struct(task);
1275 i = filp->f_pos; 1205 i = filp->f_pos;
1276 switch (i) { 1206 switch (i) {
1277 case 0: 1207 case 0:
@@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1354 1284
1355 /* Common stuff */ 1285 /* Common stuff */
1356 ei = PROC_I(inode); 1286 ei = PROC_I(inode);
1357 ei->task = NULL;
1358 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1287 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1359 inode->i_ino = fake_ino(task->pid, ino); 1288 inode->i_ino = fake_ino(task->pid, ino);
1360 1289
1361 if (!pid_alive(task))
1362 goto out_unlock;
1363
1364 /* 1290 /*
1365 * grab the reference to task. 1291 * grab the reference to task.
1366 */ 1292 */
1367 get_task_struct(task); 1293 ei->pid = get_pid(task->pids[PIDTYPE_PID].pid);
1368 ei->task = task; 1294 if (!ei->pid)
1369 ei->type = ino; 1295 goto out_unlock;
1296
1370 inode->i_uid = 0; 1297 inode->i_uid = 0;
1371 inode->i_gid = 0; 1298 inode->i_gid = 0;
1372 if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) { 1299 if (task_dumpable(task)) {
1373 inode->i_uid = task->euid; 1300 inode->i_uid = task->euid;
1374 inode->i_gid = task->egid; 1301 inode->i_gid = task->egid;
1375 } 1302 }
@@ -1379,7 +1306,6 @@ out:
1379 return inode; 1306 return inode;
1380 1307
1381out_unlock: 1308out_unlock:
1382 ei->pde = NULL;
1383 iput(inode); 1309 iput(inode);
1384 return NULL; 1310 return NULL;
1385} 1311}
@@ -1393,13 +1319,21 @@ out_unlock:
1393 * 1319 *
1394 * Rewrite the inode's ownerships here because the owning task may have 1320 * Rewrite the inode's ownerships here because the owning task may have
1395 * performed a setuid(), etc. 1321 * performed a setuid(), etc.
1322 *
1323 * Before the /proc/pid/status file was created the only way to read
1324 * the effective uid of a process was to stat /proc/pid. Reading
1325 * /proc/pid/status is slow enough that procps and other packages
1326 * kept stat()ing /proc/pid. To keep the rules in /proc simple I have
1327 * made this apply to all per-process world-readable and executable
1328 * directories.
1396 */ 1329 */
1397static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1330static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1398{ 1331{
1399 struct inode *inode = dentry->d_inode; 1332 struct inode *inode = dentry->d_inode;
1400 struct task_struct *task = proc_task(inode); 1333 struct task_struct *task = get_proc_task(inode);
1401 if (pid_alive(task)) { 1334 if (task) {
1402 if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) { 1335 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1336 task_dumpable(task)) {
1403 inode->i_uid = task->euid; 1337 inode->i_uid = task->euid;
1404 inode->i_gid = task->egid; 1338 inode->i_gid = task->egid;
1405 } else { 1339 } else {
@@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1407 inode->i_gid = 0; 1341 inode->i_gid = 0;
1408 } 1342 }
1409 security_task_to_inode(task, inode); 1343 security_task_to_inode(task, inode);
1344 put_task_struct(task);
1410 return 1; 1345 return 1;
1411 } 1346 }
1412 d_drop(dentry); 1347 d_drop(dentry);
1413 return 0; 1348 return 0;
1414} 1349}
1415 1350
1351static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1352{
1353 struct inode *inode = dentry->d_inode;
1354 struct task_struct *task;
1355 generic_fillattr(inode, stat);
1356
1357 rcu_read_lock();
1358 stat->uid = 0;
1359 stat->gid = 0;
1360 task = pid_task(proc_pid(inode), PIDTYPE_PID);
1361 if (task) {
1362 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1363 task_dumpable(task)) {
1364 stat->uid = task->euid;
1365 stat->gid = task->egid;
1366 }
1367 }
1368 rcu_read_unlock();
1369 return 0;
1370}
1371
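pid_revalidate() and pid_getattr() above apply the same ownership rule: report the task's euid/egid only for the world readable-and-executable directories (/proc/<pid> and /proc/<pid>/task/<tid>, whose mode is exactly S_IFDIR|S_IRUGO|S_IXUGO) or while the task is dumpable; everything else falls back to root. A hedged sketch of that shared rule, factored into a helper that is not part of the patch:

/* Hypothetical helper; this is the rule pid_revalidate() and
 * pid_getattr() each open-code above. */
static void pid_ownership(struct inode *inode, struct task_struct *task,
                          uid_t *uid, gid_t *gid)
{
        *uid = 0;                       /* default: owned by root */
        *gid = 0;
        if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
            task_dumpable(task)) {
                *uid = task->euid;      /* track setuid() etc. */
                *gid = task->egid;
        }
}
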
1416static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) 1372static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1417{ 1373{
1418 struct inode *inode = dentry->d_inode; 1374 struct inode *inode = dentry->d_inode;
1419 struct task_struct *task = proc_task(inode); 1375 struct task_struct *task = get_proc_task(inode);
1420 int fd = proc_type(inode) - PROC_TID_FD_DIR; 1376 int fd = proc_fd(inode);
1421 struct files_struct *files; 1377 struct files_struct *files;
1422 1378
1423 files = get_files_struct(task); 1379 if (task) {
1424 if (files) { 1380 files = get_files_struct(task);
1425 rcu_read_lock(); 1381 if (files) {
1426 if (fcheck_files(files, fd)) { 1382 rcu_read_lock();
1383 if (fcheck_files(files, fd)) {
1384 rcu_read_unlock();
1385 put_files_struct(files);
1386 if (task_dumpable(task)) {
1387 inode->i_uid = task->euid;
1388 inode->i_gid = task->egid;
1389 } else {
1390 inode->i_uid = 0;
1391 inode->i_gid = 0;
1392 }
1393 security_task_to_inode(task, inode);
1394 put_task_struct(task);
1395 return 1;
1396 }
1427 rcu_read_unlock(); 1397 rcu_read_unlock();
1428 put_files_struct(files); 1398 put_files_struct(files);
1429 if (task_dumpable(task)) {
1430 inode->i_uid = task->euid;
1431 inode->i_gid = task->egid;
1432 } else {
1433 inode->i_uid = 0;
1434 inode->i_gid = 0;
1435 }
1436 security_task_to_inode(task, inode);
1437 return 1;
1438 } 1399 }
1439 rcu_read_unlock(); 1400 put_task_struct(task);
1440 put_files_struct(files);
1441 } 1401 }
1442 d_drop(dentry); 1402 d_drop(dentry);
1443 return 0; 1403 return 0;
1444} 1404}
1445 1405
1446static void pid_base_iput(struct dentry *dentry, struct inode *inode)
1447{
1448 struct task_struct *task = proc_task(inode);
1449 spin_lock(&task->proc_lock);
1450 if (task->proc_dentry == dentry)
1451 task->proc_dentry = NULL;
1452 spin_unlock(&task->proc_lock);
1453 iput(inode);
1454}
1455
1456static int pid_delete_dentry(struct dentry * dentry) 1406static int pid_delete_dentry(struct dentry * dentry)
1457{ 1407{
1458 /* Is the task we represent dead? 1408 /* Is the task we represent dead?
1459 * If so, then don't put the dentry on the lru list, 1409 * If so, then don't put the dentry on the lru list,
1460 * kill it immediately. 1410 * kill it immediately.
1461 */ 1411 */
1462 return !pid_alive(proc_task(dentry->d_inode)); 1412 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1463} 1413}
1464 1414
1465static struct dentry_operations tid_fd_dentry_operations = 1415static struct dentry_operations tid_fd_dentry_operations =
@@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations =
1474 .d_delete = pid_delete_dentry, 1424 .d_delete = pid_delete_dentry,
1475}; 1425};
1476 1426
1477static struct dentry_operations pid_base_dentry_operations =
1478{
1479 .d_revalidate = pid_revalidate,
1480 .d_iput = pid_base_iput,
1481 .d_delete = pid_delete_dentry,
1482};
1483
1484/* Lookups */ 1427/* Lookups */
1485 1428
1486static unsigned name_to_int(struct dentry *dentry) 1429static unsigned name_to_int(struct dentry *dentry)
@@ -1508,22 +1451,24 @@ out:
1508/* SMP-safe */ 1451/* SMP-safe */
1509static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) 1452static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
1510{ 1453{
1511 struct task_struct *task = proc_task(dir); 1454 struct task_struct *task = get_proc_task(dir);
1512 unsigned fd = name_to_int(dentry); 1455 unsigned fd = name_to_int(dentry);
1456 struct dentry *result = ERR_PTR(-ENOENT);
1513 struct file * file; 1457 struct file * file;
1514 struct files_struct * files; 1458 struct files_struct * files;
1515 struct inode *inode; 1459 struct inode *inode;
1516 struct proc_inode *ei; 1460 struct proc_inode *ei;
1517 1461
1462 if (!task)
1463 goto out_no_task;
1518 if (fd == ~0U) 1464 if (fd == ~0U)
1519 goto out; 1465 goto out;
1520 if (!pid_alive(task))
1521 goto out;
1522 1466
1523 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); 1467 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
1524 if (!inode) 1468 if (!inode)
1525 goto out; 1469 goto out;
1526 ei = PROC_I(inode); 1470 ei = PROC_I(inode);
1471 ei->fd = fd;
1527 files = get_files_struct(task); 1472 files = get_files_struct(task);
1528 if (!files) 1473 if (!files)
1529 goto out_unlock; 1474 goto out_unlock;
@@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
1548 ei->op.proc_get_link = proc_fd_link; 1493 ei->op.proc_get_link = proc_fd_link;
1549 dentry->d_op = &tid_fd_dentry_operations; 1494 dentry->d_op = &tid_fd_dentry_operations;
1550 d_add(dentry, inode); 1495 d_add(dentry, inode);
1551 return NULL; 1496 /* Close the race of the process dying before we return the dentry */
1497 if (tid_fd_revalidate(dentry, NULL))
1498 result = NULL;
1499out:
1500 put_task_struct(task);
1501out_no_task:
1502 return result;
1552 1503
1553out_unlock2: 1504out_unlock2:
1554 spin_unlock(&files->file_lock); 1505 spin_unlock(&files->file_lock);
1555 put_files_struct(files); 1506 put_files_struct(files);
1556out_unlock: 1507out_unlock:
1557 iput(inode); 1508 iput(inode);
1558out: 1509 goto out;
1559 return ERR_PTR(-ENOENT);
1560} 1510}
1561 1511
1562static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); 1512static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir);
1563static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); 1513static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd);
1514static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
1564 1515
1565static struct file_operations proc_fd_operations = { 1516static struct file_operations proc_fd_operations = {
1566 .read = generic_read_dir, 1517 .read = generic_read_dir,
@@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = {
1577 */ 1528 */
1578static struct inode_operations proc_fd_inode_operations = { 1529static struct inode_operations proc_fd_inode_operations = {
1579 .lookup = proc_lookupfd, 1530 .lookup = proc_lookupfd,
1580 .permission = proc_permission,
1581}; 1531};
1582 1532
1583static struct inode_operations proc_task_inode_operations = { 1533static struct inode_operations proc_task_inode_operations = {
1584 .lookup = proc_task_lookup, 1534 .lookup = proc_task_lookup,
1585 .permission = proc_task_permission, 1535 .getattr = proc_task_getattr,
1586}; 1536};
1587 1537
1588#ifdef CONFIG_SECURITY 1538#ifdef CONFIG_SECURITY
@@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
1592 struct inode * inode = file->f_dentry->d_inode; 1542 struct inode * inode = file->f_dentry->d_inode;
1593 unsigned long page; 1543 unsigned long page;
1594 ssize_t length; 1544 ssize_t length;
1595 struct task_struct *task = proc_task(inode); 1545 struct task_struct *task = get_proc_task(inode);
1546
1547 length = -ESRCH;
1548 if (!task)
1549 goto out_no_task;
1596 1550
1597 if (count > PAGE_SIZE) 1551 if (count > PAGE_SIZE)
1598 count = PAGE_SIZE; 1552 count = PAGE_SIZE;
1553 length = -ENOMEM;
1599 if (!(page = __get_free_page(GFP_KERNEL))) 1554 if (!(page = __get_free_page(GFP_KERNEL)))
1600 return -ENOMEM; 1555 goto out;
1601 1556
1602 length = security_getprocattr(task, 1557 length = security_getprocattr(task,
1603 (char*)file->f_dentry->d_name.name, 1558 (char*)file->f_dentry->d_name.name,
@@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
1605 if (length >= 0) 1560 if (length >= 0)
1606 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); 1561 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
1607 free_page(page); 1562 free_page(page);
1563out:
1564 put_task_struct(task);
1565out_no_task:
1608 return length; 1566 return length;
1609} 1567}
1610 1568
@@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
1614 struct inode * inode = file->f_dentry->d_inode; 1572 struct inode * inode = file->f_dentry->d_inode;
1615 char *page; 1573 char *page;
1616 ssize_t length; 1574 ssize_t length;
1617 struct task_struct *task = proc_task(inode); 1575 struct task_struct *task = get_proc_task(inode);
1618 1576
1577 length = -ESRCH;
1578 if (!task)
1579 goto out_no_task;
1619 if (count > PAGE_SIZE) 1580 if (count > PAGE_SIZE)
1620 count = PAGE_SIZE; 1581 count = PAGE_SIZE;
1621 if (*ppos != 0) { 1582
1622 /* No partial writes. */ 1583 /* No partial writes. */
1623 return -EINVAL; 1584 length = -EINVAL;
1624 } 1585 if (*ppos != 0)
1586 goto out;
1587
1588 length = -ENOMEM;
1625 page = (char*)__get_free_page(GFP_USER); 1589 page = (char*)__get_free_page(GFP_USER);
1626 if (!page) 1590 if (!page)
1627 return -ENOMEM; 1591 goto out;
1592
1628 length = -EFAULT; 1593 length = -EFAULT;
1629 if (copy_from_user(page, buf, count)) 1594 if (copy_from_user(page, buf, count))
1630 goto out; 1595 goto out_free;
1631 1596
1632 length = security_setprocattr(task, 1597 length = security_setprocattr(task,
1633 (char*)file->f_dentry->d_name.name, 1598 (char*)file->f_dentry->d_name.name,
1634 (void*)page, count); 1599 (void*)page, count);
1635out: 1600out_free:
1636 free_page((unsigned long) page); 1601 free_page((unsigned long) page);
1602out:
1603 put_task_struct(task);
1604out_no_task:
1637 return length; 1605 return length;
1638} 1606}
1639 1607
@@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations;
1648static struct inode_operations proc_tgid_attr_inode_operations; 1616static struct inode_operations proc_tgid_attr_inode_operations;
1649#endif 1617#endif
1650 1618
1651static int get_tid_list(int index, unsigned int *tids, struct inode *dir);
1652
1653/* SMP-safe */ 1619/* SMP-safe */
1654static struct dentry *proc_pident_lookup(struct inode *dir, 1620static struct dentry *proc_pident_lookup(struct inode *dir,
1655 struct dentry *dentry, 1621 struct dentry *dentry,
1656 struct pid_entry *ents) 1622 struct pid_entry *ents)
1657{ 1623{
1658 struct inode *inode; 1624 struct inode *inode;
1659 int error; 1625 struct dentry *error;
1660 struct task_struct *task = proc_task(dir); 1626 struct task_struct *task = get_proc_task(dir);
1661 struct pid_entry *p; 1627 struct pid_entry *p;
1662 struct proc_inode *ei; 1628 struct proc_inode *ei;
1663 1629
1664 error = -ENOENT; 1630 error = ERR_PTR(-ENOENT);
1665 inode = NULL; 1631 inode = NULL;
1666 1632
1667 if (!pid_alive(task)) 1633 if (!task)
1668 goto out; 1634 goto out_no_task;
1669 1635
1670 for (p = ents; p->name; p++) { 1636 for (p = ents; p->name; p++) {
1671 if (p->len != dentry->d_name.len) 1637 if (p->len != dentry->d_name.len)
@@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1676 if (!p->name) 1642 if (!p->name)
1677 goto out; 1643 goto out;
1678 1644
1679 error = -EINVAL; 1645 error = ERR_PTR(-EINVAL);
1680 inode = proc_pid_make_inode(dir->i_sb, task, p->type); 1646 inode = proc_pid_make_inode(dir->i_sb, task, p->type);
1681 if (!inode) 1647 if (!inode)
1682 goto out; 1648 goto out;
@@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1689 */ 1655 */
1690 switch(p->type) { 1656 switch(p->type) {
1691 case PROC_TGID_TASK: 1657 case PROC_TGID_TASK:
1692 inode->i_nlink = 2 + get_tid_list(2, NULL, dir); 1658 inode->i_nlink = 2;
1693 inode->i_op = &proc_task_inode_operations; 1659 inode->i_op = &proc_task_inode_operations;
1694 inode->i_fop = &proc_task_operations; 1660 inode->i_fop = &proc_task_operations;
1695 break; 1661 break;
@@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1759#endif 1725#endif
1760 case PROC_TID_MEM: 1726 case PROC_TID_MEM:
1761 case PROC_TGID_MEM: 1727 case PROC_TGID_MEM:
1762 inode->i_op = &proc_mem_inode_operations;
1763 inode->i_fop = &proc_mem_operations; 1728 inode->i_fop = &proc_mem_operations;
1764 break; 1729 break;
1765#ifdef CONFIG_SECCOMP 1730#ifdef CONFIG_SECCOMP
@@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1801 case PROC_TGID_ATTR_EXEC: 1766 case PROC_TGID_ATTR_EXEC:
1802 case PROC_TID_ATTR_FSCREATE: 1767 case PROC_TID_ATTR_FSCREATE:
1803 case PROC_TGID_ATTR_FSCREATE: 1768 case PROC_TGID_ATTR_FSCREATE:
1769 case PROC_TID_ATTR_KEYCREATE:
1770 case PROC_TGID_ATTR_KEYCREATE:
1771 case PROC_TID_ATTR_SOCKCREATE:
1772 case PROC_TGID_ATTR_SOCKCREATE:
1804 inode->i_fop = &proc_pid_attr_operations; 1773 inode->i_fop = &proc_pid_attr_operations;
1805 break; 1774 break;
1806#endif 1775#endif
@@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1842 default: 1811 default:
1843 printk("procfs: impossible type (%d)",p->type); 1812 printk("procfs: impossible type (%d)",p->type);
1844 iput(inode); 1813 iput(inode);
1845 return ERR_PTR(-EINVAL); 1814 error = ERR_PTR(-EINVAL);
1815 goto out;
1846 } 1816 }
1847 dentry->d_op = &pid_dentry_operations; 1817 dentry->d_op = &pid_dentry_operations;
1848 d_add(dentry, inode); 1818 d_add(dentry, inode);
1849 return NULL; 1819 /* Close the race of the process dying before we return the dentry */
1850 1820 if (pid_revalidate(dentry, NULL))
1821 error = NULL;
1851out: 1822out:
1852 return ERR_PTR(error); 1823 put_task_struct(task);
1824out_no_task:
1825 return error;
1853} 1826}
1854 1827
1855static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ 1828static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = {
1872 1845
1873static struct inode_operations proc_tgid_base_inode_operations = { 1846static struct inode_operations proc_tgid_base_inode_operations = {
1874 .lookup = proc_tgid_base_lookup, 1847 .lookup = proc_tgid_base_lookup,
1848 .getattr = pid_getattr,
1875}; 1849};
1876 1850
1877static struct inode_operations proc_tid_base_inode_operations = { 1851static struct inode_operations proc_tid_base_inode_operations = {
1878 .lookup = proc_tid_base_lookup, 1852 .lookup = proc_tid_base_lookup,
1853 .getattr = pid_getattr,
1879}; 1854};
1880 1855
1881#ifdef CONFIG_SECURITY 1856#ifdef CONFIG_SECURITY
@@ -1917,10 +1892,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
1917 1892
1918static struct inode_operations proc_tgid_attr_inode_operations = { 1893static struct inode_operations proc_tgid_attr_inode_operations = {
1919 .lookup = proc_tgid_attr_lookup, 1894 .lookup = proc_tgid_attr_lookup,
1895 .getattr = pid_getattr,
1920}; 1896};
1921 1897
1922static struct inode_operations proc_tid_attr_inode_operations = { 1898static struct inode_operations proc_tid_attr_inode_operations = {
1923 .lookup = proc_tid_attr_lookup, 1899 .lookup = proc_tid_attr_lookup,
1900 .getattr = pid_getattr,
1924}; 1901};
1925#endif 1902#endif
1926 1903
@@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = {
1930static int proc_self_readlink(struct dentry *dentry, char __user *buffer, 1907static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
1931 int buflen) 1908 int buflen)
1932{ 1909{
1933 char tmp[30]; 1910 char tmp[PROC_NUMBUF];
1934 sprintf(tmp, "%d", current->tgid); 1911 sprintf(tmp, "%d", current->tgid);
1935 return vfs_readlink(dentry,buffer,buflen,tmp); 1912 return vfs_readlink(dentry,buffer,buflen,tmp);
1936} 1913}
1937 1914
1938static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 1915static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
1939{ 1916{
1940 char tmp[30]; 1917 char tmp[PROC_NUMBUF];
1941 sprintf(tmp, "%d", current->tgid); 1918 sprintf(tmp, "%d", current->tgid);
1942 return ERR_PTR(vfs_follow_link(nd,tmp)); 1919 return ERR_PTR(vfs_follow_link(nd,tmp));
1943} 1920}
@@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = {
1948}; 1925};
1949 1926
1950/** 1927/**
1951 * proc_pid_unhash - Unhash /proc/@pid entry from the dcache. 1928 * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
1952 * @p: task that should be flushed. 1929 *
1930 * @task: task that should be flushed.
1931 *
1932 * Looks in the dcache for
1933 * /proc/@pid
1934 * /proc/@tgid/task/@pid
1936 * if either directory is present, flushes it and all of its children
1936 * from the dcache.
1953 * 1937 *
1954 * Drops the /proc/@pid dcache entry from the hash chains. 1938 * It is safe and reasonable to cache /proc entries for a task until
1939 * that task exits. After that they just clog up the dcache with
1940 * useless entries, possibly causing useful dcache entries to be
1941 * flushed instead. This routine is provided to flush those useless
1942 * dcache entries at process exit time.
1955 * 1943 *
1956 * Dropping /proc/@pid entries and detach_pid must be synchroneous, 1944 * NOTE: This routine is just an optimization so it does not guarantee
1957 * otherwise e.g. /proc/@pid/exe might point to the wrong executable, 1945 * that no dcache entries will exist at process exit time it
1958 * if the pid value is immediately reused. This is enforced by 1946 * just makes it very unlikely that any will persist.
1959 * - caller must acquire spin_lock(p->proc_lock)
1960 * - must be called before detach_pid()
1961 * - proc_pid_lookup acquires proc_lock, and checks that
1962 * the target is not dead by looking at the attach count
1963 * of PIDTYPE_PID.
1964 */ 1947 */
1965 1948void proc_flush_task(struct task_struct *task)
1966struct dentry *proc_pid_unhash(struct task_struct *p)
1967{ 1949{
1968 struct dentry *proc_dentry; 1950 struct dentry *dentry, *leader, *dir;
1951 char buf[PROC_NUMBUF];
1952 struct qstr name;
1953
1954 name.name = buf;
1955 name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
1956 dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name);
1957 if (dentry) {
1958 shrink_dcache_parent(dentry);
1959 d_drop(dentry);
1960 dput(dentry);
1961 }
1969 1962
1970 proc_dentry = p->proc_dentry; 1963 if (thread_group_leader(task))
1971 if (proc_dentry != NULL) { 1964 goto out;
1972 1965
1973 spin_lock(&dcache_lock); 1966 name.name = buf;
1974 spin_lock(&proc_dentry->d_lock); 1967 name.len = snprintf(buf, sizeof(buf), "%d", task->tgid);
1975 if (!d_unhashed(proc_dentry)) { 1968 leader = d_hash_and_lookup(proc_mnt->mnt_root, &name);
1976 dget_locked(proc_dentry); 1969 if (!leader)
1977 __d_drop(proc_dentry); 1970 goto out;
1978 spin_unlock(&proc_dentry->d_lock);
1979 } else {
1980 spin_unlock(&proc_dentry->d_lock);
1981 proc_dentry = NULL;
1982 }
1983 spin_unlock(&dcache_lock);
1984 }
1985 return proc_dentry;
1986}
1987 1971
1988/** 1972 name.name = "task";
1989 * proc_pid_flush - recover memory used by stale /proc/@pid/x entries 1973 name.len = strlen(name.name);
1990 * @proc_dentry: directoy to prune. 1974 dir = d_hash_and_lookup(leader, &name);
1991 * 1975 if (!dir)
1992 * Shrink the /proc directory that was used by the just killed thread. 1976 goto out_put_leader;
1993 */ 1977
1994 1978 name.name = buf;
1995void proc_pid_flush(struct dentry *proc_dentry) 1979 name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
1996{ 1980 dentry = d_hash_and_lookup(dir, &name);
1997 might_sleep(); 1981 if (dentry) {
1998 if(proc_dentry != NULL) { 1982 shrink_dcache_parent(dentry);
1999 shrink_dcache_parent(proc_dentry); 1983 d_drop(dentry);
2000 dput(proc_dentry); 1984 dput(dentry);
2001 } 1985 }
1986
1987 dput(dir);
1988out_put_leader:
1989 dput(leader);
1990out:
1991 return;
2002} 1992}
2003 1993
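proc_flush_task() above never walks the whole dcache; it performs at most three name lookups (the pid, the tgid, and "task") and prunes whatever it finds. The repeated lookup-shrink-drop step, factored into a hypothetical helper for illustration (not part of the patch; d_hash_and_lookup(), shrink_dcache_parent(), d_drop() and dput() are the real VFS calls used above):

/* Sketch only: look a name up in the dcache without instantiating it,
 * evict its children, then unhash the entry itself. */
static void flush_proc_dentry(struct dentry *parent, const char *s)
{
        struct qstr name;
        struct dentry *dentry;

        name.name = s;
        name.len = strlen(s);
        dentry = d_hash_and_lookup(parent, &name);
        if (dentry) {
                shrink_dcache_parent(dentry);   /* evict children first */
                d_drop(dentry);                 /* unhash the directory */
                dput(dentry);                   /* drop lookup reference */
        }
}
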
2004/* SMP-safe */ 1994/* SMP-safe */
2005struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1995struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2006{ 1996{
1997 struct dentry *result = ERR_PTR(-ENOENT);
2007 struct task_struct *task; 1998 struct task_struct *task;
2008 struct inode *inode; 1999 struct inode *inode;
2009 struct proc_inode *ei; 2000 struct proc_inode *ei;
2010 unsigned tgid; 2001 unsigned tgid;
2011 int died;
2012 2002
2013 if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { 2003 if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
2014 inode = new_inode(dir->i_sb); 2004 inode = new_inode(dir->i_sb);
@@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
2029 if (tgid == ~0U) 2019 if (tgid == ~0U)
2030 goto out; 2020 goto out;
2031 2021
2032 read_lock(&tasklist_lock); 2022 rcu_read_lock();
2033 task = find_task_by_pid(tgid); 2023 task = find_task_by_pid(tgid);
2034 if (task) 2024 if (task)
2035 get_task_struct(task); 2025 get_task_struct(task);
2036 read_unlock(&tasklist_lock); 2026 rcu_read_unlock();
2037 if (!task) 2027 if (!task)
2038 goto out; 2028 goto out;
2039 2029
2040 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); 2030 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO);
2031 if (!inode)
2032 goto out_put_task;
2041 2033
2042
2043 if (!inode) {
2044 put_task_struct(task);
2045 goto out;
2046 }
2047 inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; 2034 inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2048 inode->i_op = &proc_tgid_base_inode_operations; 2035 inode->i_op = &proc_tgid_base_inode_operations;
2049 inode->i_fop = &proc_tgid_base_operations; 2036 inode->i_fop = &proc_tgid_base_operations;
@@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
2054 inode->i_nlink = 4; 2041 inode->i_nlink = 4;
2055#endif 2042#endif
2056 2043
2057 dentry->d_op = &pid_base_dentry_operations; 2044 dentry->d_op = &pid_dentry_operations;
2058 2045
2059 died = 0;
2060 d_add(dentry, inode); 2046 d_add(dentry, inode);
2061 spin_lock(&task->proc_lock); 2047 /* Close the race of the process dying before we return the dentry */
2062 task->proc_dentry = dentry; 2048 if (pid_revalidate(dentry, NULL))
2063 if (!pid_alive(task)) { 2049 result = NULL;
2064 dentry = proc_pid_unhash(task);
2065 died = 1;
2066 }
2067 spin_unlock(&task->proc_lock);
2068 2050
2051out_put_task:
2069 put_task_struct(task); 2052 put_task_struct(task);
2070 if (died) {
2071 proc_pid_flush(dentry);
2072 goto out;
2073 }
2074 return NULL;
2075out: 2053out:
2076 return ERR_PTR(-ENOENT); 2054 return result;
2077} 2055}
2078 2056
2079/* SMP-safe */ 2057/* SMP-safe */
2080static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 2058static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2081{ 2059{
2060 struct dentry *result = ERR_PTR(-ENOENT);
2082 struct task_struct *task; 2061 struct task_struct *task;
2083 struct task_struct *leader = proc_task(dir); 2062 struct task_struct *leader = get_proc_task(dir);
2084 struct inode *inode; 2063 struct inode *inode;
2085 unsigned tid; 2064 unsigned tid;
2086 2065
2066 if (!leader)
2067 goto out_no_task;
2068
2087 tid = name_to_int(dentry); 2069 tid = name_to_int(dentry);
2088 if (tid == ~0U) 2070 if (tid == ~0U)
2089 goto out; 2071 goto out;
2090 2072
2091 read_lock(&tasklist_lock); 2073 rcu_read_lock();
2092 task = find_task_by_pid(tid); 2074 task = find_task_by_pid(tid);
2093 if (task) 2075 if (task)
2094 get_task_struct(task); 2076 get_task_struct(task);
2095 read_unlock(&tasklist_lock); 2077 rcu_read_unlock();
2096 if (!task) 2078 if (!task)
2097 goto out; 2079 goto out;
2098 if (leader->tgid != task->tgid) 2080 if (leader->tgid != task->tgid)
@@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
2113 inode->i_nlink = 3; 2095 inode->i_nlink = 3;
2114#endif 2096#endif
2115 2097
2116 dentry->d_op = &pid_base_dentry_operations; 2098 dentry->d_op = &pid_dentry_operations;
2117 2099
2118 d_add(dentry, inode); 2100 d_add(dentry, inode);
2101 /* Close the race of the process dying before we return the dentry */
2102 if (pid_revalidate(dentry, NULL))
2103 result = NULL;
2119 2104
2120 put_task_struct(task);
2121 return NULL;
2122out_drop_task: 2105out_drop_task:
2123 put_task_struct(task); 2106 put_task_struct(task);
2124out: 2107out:
2125 return ERR_PTR(-ENOENT); 2108 put_task_struct(leader);
2109out_no_task:
2110 return result;
2126} 2111}
2127 2112
2128#define PROC_NUMBUF 10
2129#define PROC_MAXPIDS 20
2130
2131/* 2113/*
2132 * Get a few tgid's to return for filldir - we need to hold the 2114 * Find the first tgid to return to user space.
2133 * tasklist lock while doing this, and we must release it before 2115 *
2134 * we actually do the filldir itself, so we use a temp buffer.. 2116 * Usually this is just whatever follows &init_task, but if the users
2117 * buffer was too small to hold the full list or there was a seek into
2118 * the middle of the directory we have more work to do.
2119 *
2120 * In the case of a short read we start with find_task_by_pid.
2121 *
2122 * In the case of a seek we start with &init_task and walk nr
2123 * threads past it.
2135 */ 2124 */
2136static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) 2125static struct task_struct *first_tgid(int tgid, unsigned int nr)
2137{ 2126{
2138 struct task_struct *p; 2127 struct task_struct *pos;
2139 int nr_tgids = 0; 2128 rcu_read_lock();
2140 2129 if (tgid && nr) {
2141 index--; 2130 pos = find_task_by_pid(tgid);
2142 read_lock(&tasklist_lock); 2131 if (pos && thread_group_leader(pos))
2143 p = NULL; 2132 goto found;
2144 if (version) {
2145 p = find_task_by_pid(version);
2146 if (p && !thread_group_leader(p))
2147 p = NULL;
2148 } 2133 }
2134 /* If nr exceeds the number of processes get out quickly */
2135 pos = NULL;
2136 if (nr && nr >= nr_processes())
2137 goto done;
2149 2138
2150 if (p) 2139 /* If we haven't found our starting place yet start with
2151 index = 0; 2140 * the init_task and walk nr tasks forward.
2152 else 2141 */
2153 p = next_task(&init_task); 2142 for (pos = next_task(&init_task); nr > 0; --nr) {
2154 2143 pos = next_task(pos);
2155 for ( ; p != &init_task; p = next_task(p)) { 2144 if (pos == &init_task) {
2156 int tgid = p->pid; 2145 pos = NULL;
2157 if (!pid_alive(p)) 2146 goto done;
2158 continue; 2147 }
2159 if (--index >= 0)
2160 continue;
2161 tgids[nr_tgids] = tgid;
2162 nr_tgids++;
2163 if (nr_tgids >= PROC_MAXPIDS)
2164 break;
2165 } 2148 }
2166 read_unlock(&tasklist_lock); 2149found:
2167 return nr_tgids; 2150 get_task_struct(pos);
2151done:
2152 rcu_read_unlock();
2153 return pos;
2168} 2154}
2169 2155
2170/* 2156/*
2171 * Get a few tid's to return for filldir - we need to hold the 2157 * Find the next task in the task list.
2172 * tasklist lock while doing this, and we must release it before 2158 * Return NULL if we loop or there is any error.
2173 * we actually do the filldir itself, so we use a temp buffer.. 2159 *
2160 * The reference to the input task_struct is released.
2174 */ 2161 */
2175static int get_tid_list(int index, unsigned int *tids, struct inode *dir) 2162static struct task_struct *next_tgid(struct task_struct *start)
2176{ 2163{
2177 struct task_struct *leader_task = proc_task(dir); 2164 struct task_struct *pos;
2178 struct task_struct *task = leader_task; 2165 rcu_read_lock();
2179 int nr_tids = 0; 2166 pos = start;
2180 2167 if (pid_alive(start))
2181 index -= 2; 2168 pos = next_task(start);
2182 read_lock(&tasklist_lock); 2169 if (pid_alive(pos) && (pos != &init_task)) {
2183 /* 2170 get_task_struct(pos);
2184 * The starting point task (leader_task) might be an already 2171 goto done;
2185 * unlinked task, which cannot be used to access the task-list 2172 }
2186 * via next_thread(). 2173 pos = NULL;
2187 */ 2174done:
2188 if (pid_alive(task)) do { 2175 rcu_read_unlock();
2189 int tid = task->pid; 2176 put_task_struct(start);
2190 2177 return pos;
2191 if (--index >= 0)
2192 continue;
2193 if (tids != NULL)
2194 tids[nr_tids] = tid;
2195 nr_tids++;
2196 if (nr_tids >= PROC_MAXPIDS)
2197 break;
2198 } while ((task = next_thread(task)) != leader_task);
2199 read_unlock(&tasklist_lock);
2200 return nr_tids;
2201} 2178}
2202 2179
2203/* for the /proc/ directory itself, after non-process stuff has been done */ 2180/* for the /proc/ directory itself, after non-process stuff has been done */
2204int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2181int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2205{ 2182{
2206 unsigned int tgid_array[PROC_MAXPIDS];
2207 char buf[PROC_NUMBUF]; 2183 char buf[PROC_NUMBUF];
2208 unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; 2184 unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2209 unsigned int nr_tgids, i; 2185 struct task_struct *task;
2210 int next_tgid; 2186 int tgid;
2211 2187
2212 if (!nr) { 2188 if (!nr) {
2213 ino_t ino = fake_ino(0,PROC_TGID_INO); 2189 ino_t ino = fake_ino(0,PROC_TGID_INO);
@@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2216 filp->f_pos++; 2192 filp->f_pos++;
2217 nr++; 2193 nr++;
2218 } 2194 }
2195 nr -= 1;
2219 2196
2220 /* f_version caches the tgid value that the last readdir call couldn't 2197 /* f_version caches the tgid value that the last readdir call couldn't
2221 * return. lseek aka telldir automagically resets f_version to 0. 2198 * return. lseek aka telldir automagically resets f_version to 0.
2222 */ 2199 */
2223 next_tgid = filp->f_version; 2200 tgid = filp->f_version;
2224 filp->f_version = 0; 2201 filp->f_version = 0;
2225 for (;;) { 2202 for (task = first_tgid(tgid, nr);
2226 nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); 2203 task;
2227 if (!nr_tgids) { 2204 task = next_tgid(task), filp->f_pos++) {
2228 /* no more entries ! */ 2205 int len;
2206 ino_t ino;
2207 tgid = task->pid;
2208 len = snprintf(buf, sizeof(buf), "%d", tgid);
2209 ino = fake_ino(tgid, PROC_TGID_INO);
2210 if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
2211 /* returning this tgid failed, save it as the first
2212 * pid for the next readir call */
2213 filp->f_version = tgid;
2214 put_task_struct(task);
2229 break; 2215 break;
2230 } 2216 }
2231 next_tgid = 0; 2217 }
2218 return 0;
2219}
2232 2220
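The loop above turns filp->f_pos into a plain count and uses filp->f_version as a resume cookie: when filldir() fails because the user buffer is full, the tgid that did not fit is stashed, and the next readdir call restarts from it via find_task_by_pid() in first_tgid(); an lseek (telldir) clears f_version and forces the slower walk-nr-tasks path. A compact userspace analogue of the cookie (hypothetical and heavily simplified):

#include <stdio.h>

/* emit() stands in for filldir(): it fails when the "buffer" is full. */
static int emit(int id, int *budget)
{
        if (*budget == 0)
                return -1;
        --*budget;
        printf("%d ", id);
        return 0;
}

int main(void)
{
        int ids[] = { 10, 20, 30, 40, 50 };
        int cookie = 0, done = 0;

        while (!done) {
                int budget = 2;         /* tiny buffer: two per call */
                done = 1;
                for (int i = 0; i < 5; i++) {
                        if (cookie && ids[i] != cookie)
                                continue;       /* seek to resume point */
                        cookie = 0;
                        if (emit(ids[i], &budget) < 0) {
                                cookie = ids[i];        /* save cursor */
                                done = 0;
                                break;
                        }
                }
                printf("| ");           /* one "readdir call" done */
        }
        printf("\n");                   /* prints: 10 20 | 30 40 | 50 | */
        return 0;
}
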
2233 /* do not use the last found pid, reserve it for next_tgid */ 2221/*
2234 if (nr_tgids == PROC_MAXPIDS) { 2222 * Find the first tid of a thread group to return to user space.
2235 nr_tgids--; 2223 *
2236			next_tgid = tgid_array[nr_tgids]; 2224 * Usually this is just the thread group leader, but if the user's
2237		} 2225 * buffer was too small or there was a seek into the middle of the
2226 * directory we have more work to do.
2227 *
2228 * In the case of a short read we start with find_task_by_pid.
2229 *
2230 * In the case of a seek we start with the leader and walk nr
2231 * threads past it.
2232 */
2233static struct task_struct *first_tid(struct task_struct *leader,
2234 int tid, int nr)
2235{
2236 struct task_struct *pos;
2238 2237
2239 for (i=0;i<nr_tgids;i++) { 2238 rcu_read_lock();
2240 int tgid = tgid_array[i]; 2239 /* Attempt to start with the pid of a thread */
2241 ino_t ino = fake_ino(tgid,PROC_TGID_INO); 2240 if (tid && (nr > 0)) {
2242 unsigned long j = PROC_NUMBUF; 2241 pos = find_task_by_pid(tid);
2242 if (pos && (pos->group_leader == leader))
2243 goto found;
2244 }
2243 2245
2244			do 2246	/* If nr exceeds the number of threads there is nothing to do */
2245 buf[--j] = '0' + (tgid % 10); 2247 pos = NULL;
2246 while ((tgid /= 10) != 0); 2248 if (nr && nr >= get_nr_threads(leader))
2249 goto out;
2247 2250
2248 if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) { 2251 /* If we haven't found our starting place yet start
2249 /* returning this tgid failed, save it as the first 2252 * with the leader and walk nr threads forward.
2250 * pid for the next readir call */ 2253 */
2251 filp->f_version = tgid_array[i]; 2254 for (pos = leader; nr > 0; --nr) {
2252 goto out; 2255 pos = next_thread(pos);
2253 } 2256 if (pos == leader) {
2254 filp->f_pos++; 2257 pos = NULL;
2255 nr++; 2258 goto out;
2256 } 2259 }
2257 } 2260 }
2261found:
2262 get_task_struct(pos);
2258out: 2263out:
2259 return 0; 2264 rcu_read_unlock();
2265 return pos;
2266}
2267
2268/*
2269 * Find the next thread in the thread list.
2270 * Return NULL if there is an error or no next thread.
2271 *
2272 * The reference to the input task_struct is released.
2273 */
2274static struct task_struct *next_tid(struct task_struct *start)
2275{
2276 struct task_struct *pos = NULL;
2277 rcu_read_lock();
2278 if (pid_alive(start)) {
2279 pos = next_thread(start);
2280 if (thread_group_leader(pos))
2281 pos = NULL;
2282 else
2283 get_task_struct(pos);
2284 }
2285 rcu_read_unlock();
2286 put_task_struct(start);
2287 return pos;
2260} 2288}
2261 2289
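first_tid()/next_tid(), like first_tgid()/next_tgid() above, follow a hand-off contract: each call consumes the reference on the task passed in and returns a fresh reference on the next task, or NULL at the end. The readdir loops therefore hold exactly one task reference at a time and may sleep in filldir() without holding rcu_read_lock(). A hedged caller-side sketch (walk_threads() and must_stop() are hypothetical):

static void walk_threads(struct task_struct *leader, int tid, int nr)
{
        struct task_struct *t;

        for (t = first_tid(leader, tid, nr); t; t = next_tid(t)) {
                if (must_stop(t)) {
                        put_task_struct(t);     /* we still own this ref */
                        break;
                }
                /* otherwise the next next_tid(t) releases it for us */
        }
}
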
2262/* for the /proc/TGID/task/ directories */ 2290/* for the /proc/TGID/task/ directories */
2263static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) 2291static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
2264{ 2292{
2265 unsigned int tid_array[PROC_MAXPIDS];
2266 char buf[PROC_NUMBUF]; 2293 char buf[PROC_NUMBUF];
2267 unsigned int nr_tids, i;
2268 struct dentry *dentry = filp->f_dentry; 2294 struct dentry *dentry = filp->f_dentry;
2269 struct inode *inode = dentry->d_inode; 2295 struct inode *inode = dentry->d_inode;
2296 struct task_struct *leader = get_proc_task(inode);
2297 struct task_struct *task;
2270 int retval = -ENOENT; 2298 int retval = -ENOENT;
2271 ino_t ino; 2299 ino_t ino;
2300 int tid;
2272 unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ 2301 unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */
2273 2302
2274 if (!pid_alive(proc_task(inode))) 2303 if (!leader)
2275 goto out; 2304 goto out_no_task;
2276 retval = 0; 2305 retval = 0;
2277 2306
2278 switch (pos) { 2307 switch (pos) {
@@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
2290 /* fall through */ 2319 /* fall through */
2291 } 2320 }
2292 2321
2293	nr_tids = get_tid_list(pos, tid_array, inode); 2322	/* f_version caches the tid value that the last readdir call couldn't
2294 inode->i_nlink = pos + nr_tids; 2323 * return. lseek aka telldir automagically resets f_version to 0.
2295 2324 */
2296 for (i = 0; i < nr_tids; i++) { 2325 tid = filp->f_version;
2297 unsigned long j = PROC_NUMBUF; 2326 filp->f_version = 0;
2298 int tid = tid_array[i]; 2327 for (task = first_tid(leader, tid, pos - 2);
2299 2328 task;
2300 ino = fake_ino(tid,PROC_TID_INO); 2329 task = next_tid(task), pos++) {
2301 2330 int len;
2302 do 2331 tid = task->pid;
2303 buf[--j] = '0' + (tid % 10); 2332 len = snprintf(buf, sizeof(buf), "%d", tid);
2304 while ((tid /= 10) != 0); 2333 ino = fake_ino(tid, PROC_TID_INO);
2305 2334		if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) {
2306		if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) 2335			/* returning this tid failed, save it as the first
2336			 * pid for the next readdir call */
2337 filp->f_version = tid;
2338 put_task_struct(task);
2307 break; 2339 break;
2308 pos++; 2340 }
2309 } 2341 }
2310out: 2342out:
2311 filp->f_pos = pos; 2343 filp->f_pos = pos;
2344 put_task_struct(leader);
2345out_no_task:
2312 return retval; 2346 return retval;
2313} 2347}
2348
2349static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
2350{
2351 struct inode *inode = dentry->d_inode;
2352 struct task_struct *p = get_proc_task(inode);
2353 generic_fillattr(inode, stat);
2354
2355 if (p) {
2356 rcu_read_lock();
2357 stat->nlink += get_nr_threads(p);
2358 rcu_read_unlock();
2359 put_task_struct(p);
2360 }
2361
2362 return 0;
2363}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 722b9c463111..6dcef089e18e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
58static void proc_delete_inode(struct inode *inode) 58static void proc_delete_inode(struct inode *inode)
59{ 59{
60 struct proc_dir_entry *de; 60 struct proc_dir_entry *de;
61 struct task_struct *tsk;
62 61
63 truncate_inode_pages(&inode->i_data, 0); 62 truncate_inode_pages(&inode->i_data, 0);
64 63
65 /* Let go of any associated process */ 64 /* Stop tracking associated processes */
66 tsk = PROC_I(inode)->task; 65 put_pid(PROC_I(inode)->pid);
67 if (tsk)
68 put_task_struct(tsk);
69 66
70 /* Let go of any associated proc directory entry */ 67 /* Let go of any associated proc directory entry */
71 de = PROC_I(inode)->pde; 68 de = PROC_I(inode)->pde;
@@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
94 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); 91 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
95 if (!ei) 92 if (!ei)
96 return NULL; 93 return NULL;
97 ei->task = NULL; 94 ei->pid = NULL;
98 ei->type = 0; 95 ei->fd = 0;
99 ei->op.proc_get_link = NULL; 96 ei->op.proc_get_link = NULL;
100 ei->pde = NULL; 97 ei->pde = NULL;
101 inode = &ei->vfs_inode; 98 inode = &ei->vfs_inode;
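With this change a /proc inode pins a struct pid instead of a task_struct, so a dead task's much larger task_struct can be freed even while its /proc dentries and inodes linger; the put_pid() above is the whole teardown. A hedged sketch of the round trip, using only helpers that appear in this patch:

static void pid_round_trip(struct task_struct *task)
{
        /* at inode setup: take a pid reference, not a task reference */
        struct pid *pid = get_pid(task->pids[PIDTYPE_PID].pid);
        struct task_struct *t;

        rcu_read_lock();
        t = pid_task(pid, PIDTYPE_PID);         /* NULL once the task exits */
        if (t)
                get_task_struct(t);             /* pin across sleeping code */
        rcu_read_unlock();

        if (t) {
                /* ... operate on t ... */
                put_task_struct(t);
        }

        put_pid(pid);   /* what proc_delete_inode() now does at teardown */
}
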
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0502f17b860d..146a434ba944 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -37,16 +37,30 @@ extern int proc_tgid_stat(struct task_struct *, char *);
37extern int proc_pid_status(struct task_struct *, char *); 37extern int proc_pid_status(struct task_struct *, char *);
38extern int proc_pid_statm(struct task_struct *, char *); 38extern int proc_pid_statm(struct task_struct *, char *);
39 39
40extern struct file_operations proc_maps_operations;
41extern struct file_operations proc_numa_maps_operations;
42extern struct file_operations proc_smaps_operations;
43
40void free_proc_entry(struct proc_dir_entry *de); 49void free_proc_entry(struct proc_dir_entry *de);
41 50
42int proc_init_inodecache(void); 51int proc_init_inodecache(void);
43 52
44static inline struct task_struct *proc_task(struct inode *inode) 53static inline struct pid *proc_pid(struct inode *inode)
54{
55 return PROC_I(inode)->pid;
56}
57
58static inline struct task_struct *get_proc_task(struct inode *inode)
45{ 59{
46 return PROC_I(inode)->task; 60 return get_pid_task(proc_pid(inode), PIDTYPE_PID);
47} 61}
48 62
49static inline int proc_type(struct inode *inode) 63static inline int proc_fd(struct inode *inode)
50{ 64{
51 return PROC_I(inode)->type; 65 return PROC_I(inode)->fd;
52} 66}
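get_proc_task() replaces the old proc_task() accessor: instead of dereferencing a cached task pointer, callers resolve the pinned struct pid to a task on demand, must handle NULL (the task has already exited), and must drop the reference when finished. The caller pattern, sketched as a hypothetical handler:

static ssize_t example_op(struct inode *inode)  /* hypothetical */
{
        struct task_struct *task = get_proc_task(inode);
        ssize_t ret = -ESRCH;

        if (!task)
                return ret;             /* task already gone */
        /* ... operate on task; may sleep ... */
        ret = 0;
        put_task_struct(task);
        return ret;
}
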
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 91b7c15ab373..0a163a4f7764 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
75{ 75{
76 struct vm_area_struct * vma; 76 struct vm_area_struct * vma;
77 int result = -ENOENT; 77 int result = -ENOENT;
78 struct task_struct *task = proc_task(inode); 78 struct task_struct *task = get_proc_task(inode);
79 struct mm_struct * mm = get_task_mm(task); 79 struct mm_struct * mm = NULL;
80 80
81 if (task) {
82 mm = get_task_mm(task);
83 put_task_struct(task);
84 }
81 if (!mm) 85 if (!mm)
82 goto out; 86 goto out;
83 down_read(&mm->mmap_sem); 87 down_read(&mm->mmap_sem);
@@ -118,9 +122,15 @@ struct mem_size_stats
118 unsigned long private_dirty; 122 unsigned long private_dirty;
119}; 123};
120 124
125__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
126{
127 return NULL;
128}
129
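The __attribute__((weak)) definition above supplies a default arch_vma_name() that an architecture can override simply by linking a strong definition of the same symbol; no header or Kconfig plumbing is needed. A small userspace demonstration of the mechanism (the names are hypothetical):

/* weak_demo.c; build with: cc weak_demo.c [strong.c] */
#include <stdio.h>

__attribute__((weak)) const char *impl_name(void)
{
        return "default";
}

int main(void)
{
        puts(impl_name());      /* "default" unless a strong impl_name()
                                 * is linked in, e.g. from strong.c:
                                 * const char *impl_name(void)
                                 * { return "arch specific"; } */
        return 0;
}
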
121static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) 130static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
122{ 131{
123 struct task_struct *task = m->private; 132 struct proc_maps_private *priv = m->private;
133 struct task_struct *task = priv->task;
124 struct vm_area_struct *vma = v; 134 struct vm_area_struct *vma = v;
125 struct mm_struct *mm = vma->vm_mm; 135 struct mm_struct *mm = vma->vm_mm;
126 struct file *file = vma->vm_file; 136 struct file *file = vma->vm_file;
@@ -153,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
153 pad_len_spaces(m, len); 163 pad_len_spaces(m, len);
154 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n"); 164 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
155 } else { 165 } else {
156 if (mm) { 166 const char *name = arch_vma_name(vma);
157 if (vma->vm_start <= mm->start_brk && 167 if (!name) {
168 if (mm) {
169 if (vma->vm_start <= mm->start_brk &&
158 vma->vm_end >= mm->brk) { 170 vma->vm_end >= mm->brk) {
159 pad_len_spaces(m, len); 171 name = "[heap]";
160 seq_puts(m, "[heap]"); 172 } else if (vma->vm_start <= mm->start_stack &&
161 } else { 173 vma->vm_end >= mm->start_stack) {
162 if (vma->vm_start <= mm->start_stack && 174 name = "[stack]";
163 vma->vm_end >= mm->start_stack) {
164
165 pad_len_spaces(m, len);
166 seq_puts(m, "[stack]");
167 } 175 }
176 } else {
177 name = "[vdso]";
168 } 178 }
169 } else { 179 }
180 if (name) {
170 pad_len_spaces(m, len); 181 pad_len_spaces(m, len);
171 seq_puts(m, "[vdso]"); 182 seq_puts(m, name);
172 } 183 }
173 } 184 }
174 seq_putc(m, '\n'); 185 seq_putc(m, '\n');
@@ -295,12 +306,16 @@ static int show_smap(struct seq_file *m, void *v)
295 306
296static void *m_start(struct seq_file *m, loff_t *pos) 307static void *m_start(struct seq_file *m, loff_t *pos)
297{ 308{
298 struct task_struct *task = m->private; 309 struct proc_maps_private *priv = m->private;
299 unsigned long last_addr = m->version; 310 unsigned long last_addr = m->version;
300 struct mm_struct *mm; 311 struct mm_struct *mm;
301 struct vm_area_struct *vma, *tail_vma; 312 struct vm_area_struct *vma, *tail_vma = NULL;
302 loff_t l = *pos; 313 loff_t l = *pos;
303 314
315 /* Clear the per syscall fields in priv */
316 priv->task = NULL;
317 priv->tail_vma = NULL;
318
304 /* 319 /*
305 * We remember last_addr rather than next_addr to hit with 320 * We remember last_addr rather than next_addr to hit with
306 * mmap_cache most of the time. We have zero last_addr at 321 * mmap_cache most of the time. We have zero last_addr at
@@ -311,11 +326,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
311 if (last_addr == -1UL) 326 if (last_addr == -1UL)
312 return NULL; 327 return NULL;
313 328
314 mm = get_task_mm(task); 329 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
330 if (!priv->task)
331 return NULL;
332
333 mm = get_task_mm(priv->task);
315 if (!mm) 334 if (!mm)
316 return NULL; 335 return NULL;
317 336
318 tail_vma = get_gate_vma(task); 337 priv->tail_vma = tail_vma = get_gate_vma(priv->task);
319 down_read(&mm->mmap_sem); 338 down_read(&mm->mmap_sem);
320 339
321 /* Start with last addr hint */ 340 /* Start with last addr hint */
@@ -350,11 +369,9 @@ out:
350 return tail_vma; 369 return tail_vma;
351} 370}
352 371
353static void m_stop(struct seq_file *m, void *v) 372static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
354{ 373{
355 struct task_struct *task = m->private; 374 if (vma && vma != priv->tail_vma) {
356 struct vm_area_struct *vma = v;
357 if (vma && vma != get_gate_vma(task)) {
358 struct mm_struct *mm = vma->vm_mm; 375 struct mm_struct *mm = vma->vm_mm;
359 up_read(&mm->mmap_sem); 376 up_read(&mm->mmap_sem);
360 mmput(mm); 377 mmput(mm);
@@ -363,38 +380,103 @@ static void m_stop(struct seq_file *m, void *v)
363 380
364static void *m_next(struct seq_file *m, void *v, loff_t *pos) 381static void *m_next(struct seq_file *m, void *v, loff_t *pos)
365{ 382{
366 struct task_struct *task = m->private; 383 struct proc_maps_private *priv = m->private;
367 struct vm_area_struct *vma = v; 384 struct vm_area_struct *vma = v;
368 struct vm_area_struct *tail_vma = get_gate_vma(task); 385 struct vm_area_struct *tail_vma = priv->tail_vma;
369 386
370 (*pos)++; 387 (*pos)++;
371 if (vma && (vma != tail_vma) && vma->vm_next) 388 if (vma && (vma != tail_vma) && vma->vm_next)
372 return vma->vm_next; 389 return vma->vm_next;
373 m_stop(m, v); 390 vma_stop(priv, vma);
374 return (vma != tail_vma)? tail_vma: NULL; 391 return (vma != tail_vma)? tail_vma: NULL;
375} 392}
376 393
377struct seq_operations proc_pid_maps_op = { 394static void m_stop(struct seq_file *m, void *v)
395{
396 struct proc_maps_private *priv = m->private;
397 struct vm_area_struct *vma = v;
398
399 vma_stop(priv, vma);
400 if (priv->task)
401 put_task_struct(priv->task);
402}
403
404static struct seq_operations proc_pid_maps_op = {
378 .start = m_start, 405 .start = m_start,
379 .next = m_next, 406 .next = m_next,
380 .stop = m_stop, 407 .stop = m_stop,
381 .show = show_map 408 .show = show_map
382}; 409};
383 410
384struct seq_operations proc_pid_smaps_op = { 411static struct seq_operations proc_pid_smaps_op = {
385 .start = m_start, 412 .start = m_start,
386 .next = m_next, 413 .next = m_next,
387 .stop = m_stop, 414 .stop = m_stop,
388 .show = show_smap 415 .show = show_smap
389}; 416};
390 417
418static int do_maps_open(struct inode *inode, struct file *file,
419 struct seq_operations *ops)
420{
421 struct proc_maps_private *priv;
422 int ret = -ENOMEM;
423 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
424 if (priv) {
425 priv->pid = proc_pid(inode);
426 ret = seq_open(file, ops);
427 if (!ret) {
428 struct seq_file *m = file->private_data;
429 m->private = priv;
430 } else {
431 kfree(priv);
432 }
433 }
434 return ret;
435}
436
437static int maps_open(struct inode *inode, struct file *file)
438{
439 return do_maps_open(inode, file, &proc_pid_maps_op);
440}
441
442struct file_operations proc_maps_operations = {
443 .open = maps_open,
444 .read = seq_read,
445 .llseek = seq_lseek,
446 .release = seq_release_private,
447};
448
391#ifdef CONFIG_NUMA 449#ifdef CONFIG_NUMA
392extern int show_numa_map(struct seq_file *m, void *v); 450extern int show_numa_map(struct seq_file *m, void *v);
393 451
394struct seq_operations proc_pid_numa_maps_op = { 452static struct seq_operations proc_pid_numa_maps_op = {
395 .start = m_start, 453 .start = m_start,
396 .next = m_next, 454 .next = m_next,
397 .stop = m_stop, 455 .stop = m_stop,
398 .show = show_numa_map 456 .show = show_numa_map
399}; 457};
458
459static int numa_maps_open(struct inode *inode, struct file *file)
460{
461 return do_maps_open(inode, file, &proc_pid_numa_maps_op);
462}
463
464struct file_operations proc_numa_maps_operations = {
465 .open = numa_maps_open,
466 .read = seq_read,
467 .llseek = seq_lseek,
468 .release = seq_release_private,
469};
400#endif 470#endif
471
472static int smaps_open(struct inode *inode, struct file *file)
473{
474 return do_maps_open(inode, file, &proc_pid_smaps_op);
475}
476
477struct file_operations proc_smaps_operations = {
478 .open = smaps_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release_private,
482};
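
The new do_maps_open() above follows the standard seq_file pattern for per-open state: allocate a private structure, call seq_open(), and stash the structure in m->private so m_start()/m_next()/m_stop() can reach the pid and the cached task. The matching release hook is seq_release_private(), which frees that allocation; a simplified sketch of what it does, modelled on fs/seq_file.c of this era and shown here only for orientation:

/* Sketch, modelled on fs/seq_file.c: free the kzalloc()ed private
 * data installed by do_maps_open() before normal seq_file teardown. */
int seq_release_private(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;

	kfree(seq->private);
	seq->private = NULL;
	return seq_release(inode, file);
}

Holding a struct pid rather than a task_struct in that private data means the task is re-looked-up with get_pid_task() on each traversal and dropped again in m_stop(), so a long-lived open file descriptor no longer pins the task structure itself.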
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f68827ed10e..af69f28277b6 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
156{ 156{
157 return NULL; 157 return NULL;
158} 158}
159struct seq_operations proc_pid_maps_op = { 159static struct seq_operations proc_pid_maps_op = {
160 .start = m_start, 160 .start = m_start,
161 .next = m_next, 161 .next = m_next,
162 .stop = m_stop, 162 .stop = m_stop,
163 .show = show_map 163 .show = show_map
164}; 164};
165
166static int maps_open(struct inode *inode, struct file *file)
167{
168 int ret;
169 ret = seq_open(file, &proc_pid_maps_op);
170 if (!ret) {
171 struct seq_file *m = file->private_data;
172 m->private = NULL;
173 }
174 return ret;
175}
176
177struct file_operations proc_maps_operations = {
178 .open = maps_open,
179 .read = seq_read,
180 .llseek = seq_lseek,
181 .release = seq_release,
182};
183
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index cf6e1cf40351..752cea12e30f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1560 return res; 1560 return res;
1561} 1561}
1562 1562
1563static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
1564 size_t count, loff_t pos)
1565{
1566 return generic_file_aio_write(iocb, buf, count, pos);
1567}
1568
1569const struct file_operations reiserfs_file_operations = { 1563const struct file_operations reiserfs_file_operations = {
1570 .read = generic_file_read, 1564 .read = generic_file_read,
1571 .write = reiserfs_file_write, 1565 .write = reiserfs_file_write,
@@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = {
1575 .fsync = reiserfs_sync_file, 1569 .fsync = reiserfs_sync_file,
1576 .sendfile = generic_file_sendfile, 1570 .sendfile = generic_file_sendfile,
1577 .aio_read = generic_file_aio_read, 1571 .aio_read = generic_file_aio_read,
1578 .aio_write = reiserfs_aio_write, 1572 .aio_write = generic_file_aio_write,
1579 .splice_read = generic_file_splice_read, 1573 .splice_read = generic_file_splice_read,
1580 .splice_write = generic_file_splice_write, 1574 .splice_write = generic_file_splice_write,
1581}; 1575};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1b73529b8099..49d1a53dbef0 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock,
834 get_bh(bh); 834 get_bh(bh);
835 if (test_set_buffer_locked(bh)) { 835 if (test_set_buffer_locked(bh)) {
836 if (!buffer_dirty(bh)) { 836 if (!buffer_dirty(bh)) {
837 list_del_init(&jh->list); 837 list_move(&jh->list, &tmp);
838 list_add(&jh->list, &tmp);
839 goto loop_next; 838 goto loop_next;
840 } 839 }
841 spin_unlock(lock); 840 spin_unlock(lock);
@@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock,
855 ret = -EIO; 854 ret = -EIO;
856 } 855 }
857 if (buffer_dirty(bh)) { 856 if (buffer_dirty(bh)) {
858 list_del_init(&jh->list); 857 list_move(&jh->list, &tmp);
859 list_add(&jh->list, &tmp);
860 add_to_chunk(&chunk, bh, lock, write_ordered_chunk); 858 add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
861 } else { 859 } else {
862 reiserfs_free_jh(bh); 860 reiserfs_free_jh(bh);
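
This hunk, like the fs/smbfs and fs/sysfs hunks further down, replaces an open-coded list_del()/list_del_init() plus list_add() pair with list_move() (or list_move_tail() for tail insertion). The two forms are equivalent because list_add() rewrites the entry's link pointers anyway. A self-contained userspace sketch of the helpers, simplified from include/linux/list.h with debug poisoning omitted:

#include <assert.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void __list_del(struct list_head *prev, struct list_head *next)
{
	next->prev = prev;
	prev->next = next;
}

static void list_add(struct list_head *new, struct list_head *head)
{
	new->prev = head;
	new->next = head->next;
	head->next->prev = new;
	head->next = new;
}

/* unlink @entry from its current list and splice it after @head */
static void list_move(struct list_head *entry, struct list_head *head)
{
	__list_del(entry->prev, entry->next);
	list_add(entry, head);
}

int main(void)
{
	struct list_head a = LIST_HEAD_INIT(a), b = LIST_HEAD_INIT(b);
	struct list_head node;

	list_add(&node, &a);	/* node lives on list a */
	list_move(&node, &b);	/* one call instead of del + add */
	assert(a.next == &a && b.next == &node);
	return 0;
}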
diff --git a/fs/select.c b/fs/select.c
index 9c4f0f2604f1..33b72ba0f86f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -746,9 +746,9 @@ out_fds:
746asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, 746asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
747 long timeout_msecs) 747 long timeout_msecs)
748{ 748{
749 s64 timeout_jiffies = 0; 749 s64 timeout_jiffies;
750 750
751 if (timeout_msecs) { 751 if (timeout_msecs > 0) {
752#if HZ > 1000 752#if HZ > 1000
753 /* We can only overflow if HZ > 1000 */ 753 /* We can only overflow if HZ > 1000 */
754 if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ) 754 if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
@@ -756,6 +756,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
756 else 756 else
757#endif 757#endif
758 timeout_jiffies = msecs_to_jiffies(timeout_msecs); 758 timeout_jiffies = msecs_to_jiffies(timeout_msecs);
759 } else {
760 /* Infinite (< 0) or no (0) timeout */
761 timeout_jiffies = timeout_msecs;
759 } 762 }
760 763
761 return do_sys_poll(ufds, nfds, &timeout_jiffies); 764 return do_sys_poll(ufds, nfds, &timeout_jiffies);
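
Before this fix, any non-zero timeout_msecs, including the negative values userspace passes to request an infinite poll, fell into the conversion branch and was mangled by msecs_to_jiffies(); now only positive timeouts are converted, while 0 (return immediately) and negative values (block indefinitely) reach do_sys_poll() unchanged. A userspace model of the corrected branch; the HZ value and the simplified msecs_to_jiffies() below are stand-ins, not the kernel's exact definitions:

#include <stdio.h>
#include <stdint.h>

#define HZ 1000	/* stand-in tick rate */

/* stand-in for the kernel helper: round milliseconds up to ticks */
static int64_t msecs_to_jiffies(long msecs)
{
	return ((int64_t)msecs * HZ + 999) / 1000;
}

/* mirrors the fixed sys_poll() timeout handling */
static int64_t poll_timeout_jiffies(long timeout_msecs)
{
	int64_t timeout_jiffies;

	if (timeout_msecs > 0)
		timeout_jiffies = msecs_to_jiffies(timeout_msecs);
	else
		/* infinite (< 0) or no (0) timeout: pass through as-is */
		timeout_jiffies = timeout_msecs;
	return timeout_jiffies;
}

int main(void)
{
	printf("%lld\n", (long long)poll_timeout_jiffies(250));	/* 250 */
	printf("%lld\n", (long long)poll_timeout_jiffies(0));	/* 0 */
	printf("%lld\n", (long long)poll_timeout_jiffies(-1));	/* -1 */
	return 0;
}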
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c71dd2760d32..c8e96195b96e 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req)
400 if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) 400 if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
401 goto out; 401 goto out;
402 402
403 list_del_init(&req->rq_queue); 403 list_move_tail(&req->rq_queue, &server->recvq);
404 list_add_tail(&req->rq_queue, &server->recvq);
405 result = 1; 404 result = 1;
406out: 405out:
407 return result; 406 return result;
@@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server)
435 result = smb_request_send_req(req); 434 result = smb_request_send_req(req);
436 if (result < 0) { 435 if (result < 0) {
437 server->conn_error = result; 436 server->conn_error = result;
438 list_del_init(&req->rq_queue); 437 list_move(&req->rq_queue, &server->xmitq);
439 list_add(&req->rq_queue, &server->xmitq);
440 result = -EIO; 438 result = -EIO;
441 goto out; 439 goto out;
442 } 440 }
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 481a97a423fa..24577e2c489b 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -20,6 +20,7 @@
20#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/net.h> 22#include <linux/net.h>
23#include <linux/kthread.h>
23#include <net/ip.h> 24#include <net/ip.h>
24 25
25#include <linux/smb_fs.h> 26#include <linux/smb_fs.h>
@@ -40,7 +41,7 @@ enum smbiod_state {
40}; 41};
41 42
42static enum smbiod_state smbiod_state = SMBIOD_DEAD; 43static enum smbiod_state smbiod_state = SMBIOD_DEAD;
43static pid_t smbiod_pid; 44static struct task_struct *smbiod_thread;
44static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait); 45static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
45static LIST_HEAD(smb_servers); 46static LIST_HEAD(smb_servers);
46static DEFINE_SPINLOCK(servers_lock); 47static DEFINE_SPINLOCK(servers_lock);
@@ -67,20 +68,29 @@ void smbiod_wake_up(void)
67 */ 68 */
68static int smbiod_start(void) 69static int smbiod_start(void)
69{ 70{
70 pid_t pid; 71 struct task_struct *tsk;
72 int err = 0;
73
71 if (smbiod_state != SMBIOD_DEAD) 74 if (smbiod_state != SMBIOD_DEAD)
72 return 0; 75 return 0;
73 smbiod_state = SMBIOD_STARTING; 76 smbiod_state = SMBIOD_STARTING;
74 __module_get(THIS_MODULE); 77 __module_get(THIS_MODULE);
75 spin_unlock(&servers_lock); 78 spin_unlock(&servers_lock);
76 pid = kernel_thread(smbiod, NULL, 0); 79 tsk = kthread_run(smbiod, NULL, "smbiod");
77 if (pid < 0) 80 if (IS_ERR(tsk)) {
81 err = PTR_ERR(tsk);
78 module_put(THIS_MODULE); 82 module_put(THIS_MODULE);
83 }
79 84
80 spin_lock(&servers_lock); 85 spin_lock(&servers_lock);
81 smbiod_state = pid < 0 ? SMBIOD_DEAD : SMBIOD_RUNNING; 86 if (err < 0) {
82 smbiod_pid = pid; 87 smbiod_state = SMBIOD_DEAD;
83 return pid; 88 smbiod_thread = NULL;
89 } else {
90 smbiod_state = SMBIOD_RUNNING;
91 smbiod_thread = tsk;
92 }
93 return err;
84} 94}
85 95
86/* 96/*
@@ -183,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server)
183 if (req->rq_flags & SMB_REQ_RETRY) { 193 if (req->rq_flags & SMB_REQ_RETRY) {
184 /* must move the request to the xmitq */ 194 /* must move the request to the xmitq */
185 VERBOSE("retrying request %p on recvq\n", req); 195 VERBOSE("retrying request %p on recvq\n", req);
186 list_del(&req->rq_queue); 196 list_move(&req->rq_queue, &server->xmitq);
187 list_add(&req->rq_queue, &server->xmitq);
188 continue; 197 continue;
189 } 198 }
190#endif 199#endif
@@ -290,8 +299,6 @@ out:
290 */ 299 */
291static int smbiod(void *unused) 300static int smbiod(void *unused)
292{ 301{
293 daemonize("smbiod");
294
295 allow_signal(SIGKILL); 302 allow_signal(SIGKILL);
296 303
297 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid); 304 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
diff --git a/fs/super.c b/fs/super.c
index 057b5325b7ef..8a669f6f3f52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -871,8 +871,6 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
871 return mnt; 871 return mnt;
872} 872}
873 873
874EXPORT_SYMBOL_GPL(do_kern_mount);
875
876struct vfsmount *kern_mount(struct file_system_type *type) 874struct vfsmount *kern_mount(struct file_system_type *type)
877{ 875{
878 return vfs_kern_mount(type, 0, type->name, NULL); 876 return vfs_kern_mount(type, 0, type->name, NULL);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 610b5bdbe75b..61c42430cba3 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
430 i++; 430 i++;
431 /* fallthrough */ 431 /* fallthrough */
432 default: 432 default:
433 if (filp->f_pos == 2) { 433 if (filp->f_pos == 2)
434 list_del(q); 434 list_move(q, &parent_sd->s_children);
435 list_add(q, &parent_sd->s_children); 435
436 }
437 for (p=q->next; p!= &parent_sd->s_children; p=p->next) { 436 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
438 struct sysfs_dirent *next; 437 struct sysfs_dirent *next;
439 const char * name; 438 const char * name;
@@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
455 dt_type(next)) < 0) 454 dt_type(next)) < 0)
456 return 0; 455 return 0;
457 456
458 list_del(q); 457 list_move(q, p);
459 list_add(q, p);
460 p = q; 458 p = q;
461 filp->f_pos++; 459 filp->f_pos++;
462 } 460 }
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 3ada9dcf55b8..95b878e5c7a0 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -21,14 +21,6 @@
21#include "swab.h" 21#include "swab.h"
22#include "util.h" 22#include "util.h"
23 23
24#undef UFS_BALLOC_DEBUG
25
26#ifdef UFS_BALLOC_DEBUG
27#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
28#else
29#define UFSD(x)
30#endif
31
32static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *); 24static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
33static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *); 25static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
34static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *); 26static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *);
@@ -39,7 +31,8 @@ static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *,
39/* 31/*
40 * Free 'count' fragments from fragment number 'fragment' 32 * Free 'count' fragments from fragment number 'fragment'
41 */ 33 */
42void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count) { 34void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
35{
43 struct super_block * sb; 36 struct super_block * sb;
44 struct ufs_sb_private_info * uspi; 37 struct ufs_sb_private_info * uspi;
45 struct ufs_super_block_first * usb1; 38 struct ufs_super_block_first * usb1;
@@ -51,7 +44,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
51 uspi = UFS_SB(sb)->s_uspi; 44 uspi = UFS_SB(sb)->s_uspi;
52 usb1 = ubh_get_usb_first(uspi); 45 usb1 = ubh_get_usb_first(uspi);
53 46
54 UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) 47 UFSD("ENTER, fragment %u, count %u\n", fragment, count);
55 48
56 if (ufs_fragnum(fragment) + count > uspi->s_fpg) 49 if (ufs_fragnum(fragment) + count > uspi->s_fpg)
57 ufs_error (sb, "ufs_free_fragments", "internal error"); 50 ufs_error (sb, "ufs_free_fragments", "internal error");
@@ -68,7 +61,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
68 ucpi = ufs_load_cylinder (sb, cgno); 61 ucpi = ufs_load_cylinder (sb, cgno);
69 if (!ucpi) 62 if (!ucpi)
70 goto failed; 63 goto failed;
71 ucg = ubh_get_ucg (UCPI_UBH); 64 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
72 if (!ufs_cg_chkmagic(sb, ucg)) { 65 if (!ufs_cg_chkmagic(sb, ucg)) {
73 ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno); 66 ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno);
74 goto failed; 67 goto failed;
@@ -76,11 +69,11 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
76 69
77 end_bit = bit + count; 70 end_bit = bit + count;
78 bbase = ufs_blknum (bit); 71 bbase = ufs_blknum (bit);
79 blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase); 72 blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
80 ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1); 73 ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1);
81 for (i = bit; i < end_bit; i++) { 74 for (i = bit; i < end_bit; i++) {
82 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i)) 75 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i))
83 ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i); 76 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i);
84 else 77 else
85 ufs_error (sb, "ufs_free_fragments", 78 ufs_error (sb, "ufs_free_fragments",
86 "bit already cleared for fragment %u", i); 79 "bit already cleared for fragment %u", i);
@@ -90,51 +83,52 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
90 83
91 84
92 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 85 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
93 fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count); 86 uspi->cs_total.cs_nffree += count;
94 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 87 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
95 blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase); 88 blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
96 ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1); 89 ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
97 90
98 /* 91 /*
99 * Try to reassemble free fragments into a block 92 * Try to reassemble free fragments into a block
100 */ 93 */
101 blkno = ufs_fragstoblks (bbase); 94 blkno = ufs_fragstoblks (bbase);
102 if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) { 95 if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
103 fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); 96 fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
104 fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb); 97 uspi->cs_total.cs_nffree -= uspi->s_fpb;
105 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); 98 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
106 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 99 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
107 ufs_clusteracct (sb, ucpi, blkno, 1); 100 ufs_clusteracct (sb, ucpi, blkno, 1);
108 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 101 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
109 fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1); 102 uspi->cs_total.cs_nbfree++;
110 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); 103 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
111 cylno = ufs_cbtocylno (bbase); 104 cylno = ufs_cbtocylno (bbase);
112 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1); 105 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1);
113 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); 106 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
114 } 107 }
115 108
116 ubh_mark_buffer_dirty (USPI_UBH); 109 ubh_mark_buffer_dirty (USPI_UBH(uspi));
117 ubh_mark_buffer_dirty (UCPI_UBH); 110 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
118 if (sb->s_flags & MS_SYNCHRONOUS) { 111 if (sb->s_flags & MS_SYNCHRONOUS) {
119 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 112 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
120 ubh_wait_on_buffer (UCPI_UBH); 113 ubh_wait_on_buffer (UCPI_UBH(ucpi));
121 } 114 }
122 sb->s_dirt = 1; 115 sb->s_dirt = 1;
123 116
124 unlock_super (sb); 117 unlock_super (sb);
125 UFSD(("EXIT\n")) 118 UFSD("EXIT\n");
126 return; 119 return;
127 120
128failed: 121failed:
129 unlock_super (sb); 122 unlock_super (sb);
130 UFSD(("EXIT (FAILED)\n")) 123 UFSD("EXIT (FAILED)\n");
131 return; 124 return;
132} 125}
133 126
134/* 127/*
135 * Free 'count' fragments from fragment number 'fragment' (free whole blocks) 128 * Free 'count' fragments from fragment number 'fragment' (free whole blocks)
136 */ 129 */
137void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) { 130void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
131{
138 struct super_block * sb; 132 struct super_block * sb;
139 struct ufs_sb_private_info * uspi; 133 struct ufs_sb_private_info * uspi;
140 struct ufs_super_block_first * usb1; 134 struct ufs_super_block_first * usb1;
@@ -146,7 +140,7 @@ void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
146 uspi = UFS_SB(sb)->s_uspi; 140 uspi = UFS_SB(sb)->s_uspi;
147 usb1 = ubh_get_usb_first(uspi); 141 usb1 = ubh_get_usb_first(uspi);
148 142
149 UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) 143 UFSD("ENTER, fragment %u, count %u\n", fragment, count);
150 144
151 if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) { 145 if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) {
152 ufs_error (sb, "ufs_free_blocks", "internal error, " 146 ufs_error (sb, "ufs_free_blocks", "internal error, "
@@ -162,7 +156,7 @@ do_more:
162 bit = ufs_dtogd (fragment); 156 bit = ufs_dtogd (fragment);
163 if (cgno >= uspi->s_ncg) { 157 if (cgno >= uspi->s_ncg) {
164 ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device"); 158 ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device");
165 goto failed; 159 goto failed_unlock;
166 } 160 }
167 end_bit = bit + count; 161 end_bit = bit + count;
168 if (end_bit > uspi->s_fpg) { 162 if (end_bit > uspi->s_fpg) {
@@ -173,36 +167,36 @@ do_more:
173 167
174 ucpi = ufs_load_cylinder (sb, cgno); 168 ucpi = ufs_load_cylinder (sb, cgno);
175 if (!ucpi) 169 if (!ucpi)
176 goto failed; 170 goto failed_unlock;
177 ucg = ubh_get_ucg (UCPI_UBH); 171 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
178 if (!ufs_cg_chkmagic(sb, ucg)) { 172 if (!ufs_cg_chkmagic(sb, ucg)) {
179 ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno); 173 ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
180 goto failed; 174 goto failed_unlock;
181 } 175 }
182 176
183 for (i = bit; i < end_bit; i += uspi->s_fpb) { 177 for (i = bit; i < end_bit; i += uspi->s_fpb) {
184 blkno = ufs_fragstoblks(i); 178 blkno = ufs_fragstoblks(i);
185 if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) { 179 if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
186 ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); 180 ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
187 } 181 }
188 ubh_setblock(UCPI_UBH, ucpi->c_freeoff, blkno); 182 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
189 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 183 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
190 ufs_clusteracct (sb, ucpi, blkno, 1); 184 ufs_clusteracct (sb, ucpi, blkno, 1);
191 DQUOT_FREE_BLOCK(inode, uspi->s_fpb); 185 DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
192 186
193 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 187 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
194 fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1); 188 uspi->cs_total.cs_nbfree++;
195 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); 189 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
196 cylno = ufs_cbtocylno(i); 190 cylno = ufs_cbtocylno(i);
197 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1); 191 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1);
198 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); 192 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
199 } 193 }
200 194
201 ubh_mark_buffer_dirty (USPI_UBH); 195 ubh_mark_buffer_dirty (USPI_UBH(uspi));
202 ubh_mark_buffer_dirty (UCPI_UBH); 196 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
203 if (sb->s_flags & MS_SYNCHRONOUS) { 197 if (sb->s_flags & MS_SYNCHRONOUS) {
204 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 198 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
205 ubh_wait_on_buffer (UCPI_UBH); 199 ubh_wait_on_buffer (UCPI_UBH(ucpi));
206 } 200 }
207 201
208 if (overflow) { 202 if (overflow) {
@@ -213,38 +207,127 @@ do_more:
213 207
214 sb->s_dirt = 1; 208 sb->s_dirt = 1;
215 unlock_super (sb); 209 unlock_super (sb);
216 UFSD(("EXIT\n")) 210 UFSD("EXIT\n");
217 return; 211 return;
218 212
219failed: 213failed_unlock:
220 unlock_super (sb); 214 unlock_super (sb);
221 UFSD(("EXIT (FAILED)\n")) 215failed:
216 UFSD("EXIT (FAILED)\n");
222 return; 217 return;
223} 218}
224 219
220static struct page *ufs_get_locked_page(struct address_space *mapping,
221 unsigned long index)
222{
223 struct page *page;
224
225try_again:
226 page = find_lock_page(mapping, index);
227 if (!page) {
228 page = read_cache_page(mapping, index,
229 (filler_t*)mapping->a_ops->readpage,
230 NULL);
231 if (IS_ERR(page)) {
232 printk(KERN_ERR "ufs_change_blocknr: "
233 "read_cache_page error: ino %lu, index: %lu\n",
234 mapping->host->i_ino, index);
235 goto out;
236 }
225 237
238 lock_page(page);
226 239
227#define NULLIFY_FRAGMENTS \ 240 if (!PageUptodate(page) || PageError(page)) {
228 for (i = oldcount; i < newcount; i++) { \ 241 unlock_page(page);
229 bh = sb_getblk(sb, result + i); \ 242 page_cache_release(page);
230 memset (bh->b_data, 0, sb->s_blocksize); \ 243
231 set_buffer_uptodate(bh); \ 244 printk(KERN_ERR "ufs_change_blocknr: "
232 mark_buffer_dirty (bh); \ 245 "can not read page: ino %lu, index: %lu\n",
233 if (IS_SYNC(inode)) \ 246 mapping->host->i_ino, index);
234 sync_dirty_buffer(bh); \ 247
235 brelse (bh); \ 248 page = ERR_PTR(-EIO);
249 goto out;
250 }
236 } 251 }
237 252
238unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, 253 if (unlikely(!page->mapping || !page_has_buffers(page))) {
239 unsigned goal, unsigned count, int * err ) 254 unlock_page(page);
255 page_cache_release(page);
256 goto try_again; /* we really need these buffers */
257 }
258out:
259 return page;
260}
261
262/*
263 * Modify the inode page cache so that we:
264 * have - blocks with b_blocknr equal to oldb...oldb+count-1
265 * get - blocks with b_blocknr equal to newb...newb+count-1
266 * We also assume that blocks oldb...oldb+count-1 are
267 * situated at the end of the file.
268 *
269 * We can get here from ufs_writepage or ufs_prepare_write;
270 * locked_page is an argument of those functions, so it is already locked.
271 */
272static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
273 unsigned int count, unsigned int oldb,
274 unsigned int newb, struct page *locked_page)
275{
276 unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
277 struct address_space *mapping = inode->i_mapping;
278 pgoff_t index, cur_index = locked_page->index;
279 unsigned int i, j;
280 struct page *page;
281 struct buffer_head *head, *bh;
282
283 UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
284 inode->i_ino, count, oldb, newb);
285
286 BUG_ON(!PageLocked(locked_page));
287
288 for (i = 0; i < count; i += blk_per_page) {
289 index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
290
291 if (likely(cur_index != index)) {
292 page = ufs_get_locked_page(mapping, index);
293 if (IS_ERR(page))
294 continue;
295 } else
296 page = locked_page;
297
298 j = i;
299 head = page_buffers(page);
300 bh = head;
301 do {
302 if (likely(bh->b_blocknr == j + oldb && j < count)) {
303 unmap_underlying_metadata(bh->b_bdev,
304 bh->b_blocknr);
305 bh->b_blocknr = newb + j++;
306 mark_buffer_dirty(bh);
307 }
308
309 bh = bh->b_this_page;
310 } while (bh != head);
311
312 set_page_dirty(page);
313
314 if (likely(cur_index != index)) {
315 unlock_page(page);
316 page_cache_release(page);
317 }
318 }
319 UFSD("EXIT\n");
320}
321
322unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
323 unsigned goal, unsigned count, int * err, struct page *locked_page)
240{ 324{
241 struct super_block * sb; 325 struct super_block * sb;
242 struct ufs_sb_private_info * uspi; 326 struct ufs_sb_private_info * uspi;
243 struct ufs_super_block_first * usb1; 327 struct ufs_super_block_first * usb1;
244 struct buffer_head * bh; 328 unsigned cgno, oldcount, newcount, tmp, request, result;
245 unsigned cgno, oldcount, newcount, tmp, request, i, result;
246 329
247 UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count)) 330 UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count);
248 331
249 sb = inode->i_sb; 332 sb = inode->i_sb;
250 uspi = UFS_SB(sb)->s_uspi; 333 uspi = UFS_SB(sb)->s_uspi;
@@ -273,14 +356,14 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
273 return (unsigned)-1; 356 return (unsigned)-1;
274 } 357 }
275 if (fragment < UFS_I(inode)->i_lastfrag) { 358 if (fragment < UFS_I(inode)->i_lastfrag) {
276 UFSD(("EXIT (ALREADY ALLOCATED)\n")) 359 UFSD("EXIT (ALREADY ALLOCATED)\n");
277 unlock_super (sb); 360 unlock_super (sb);
278 return 0; 361 return 0;
279 } 362 }
280 } 363 }
281 else { 364 else {
282 if (tmp) { 365 if (tmp) {
283 UFSD(("EXIT (ALREADY ALLOCATED)\n")) 366 UFSD("EXIT (ALREADY ALLOCATED)\n");
284 unlock_super(sb); 367 unlock_super(sb);
285 return 0; 368 return 0;
286 } 369 }
@@ -289,9 +372,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
289 /* 372 /*
290 * There is not enough space for user on the device 373 * There is not enough space for user on the device
291 */ 374 */
292 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) { 375 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
293 unlock_super (sb); 376 unlock_super (sb);
294 UFSD(("EXIT (FAILED)\n")) 377 UFSD("EXIT (FAILED)\n");
295 return 0; 378 return 0;
296 } 379 }
297 380
@@ -310,12 +393,10 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
310 if (result) { 393 if (result) {
311 *p = cpu_to_fs32(sb, result); 394 *p = cpu_to_fs32(sb, result);
312 *err = 0; 395 *err = 0;
313 inode->i_blocks += count << uspi->s_nspfshift;
314 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 396 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
315 NULLIFY_FRAGMENTS
316 } 397 }
317 unlock_super(sb); 398 unlock_super(sb);
318 UFSD(("EXIT, result %u\n", result)) 399 UFSD("EXIT, result %u\n", result);
319 return result; 400 return result;
320 } 401 }
321 402
@@ -325,11 +406,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
325 result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); 406 result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
326 if (result) { 407 if (result) {
327 *err = 0; 408 *err = 0;
328 inode->i_blocks += count << uspi->s_nspfshift;
329 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 409 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
330 NULLIFY_FRAGMENTS
331 unlock_super(sb); 410 unlock_super(sb);
332 UFSD(("EXIT, result %u\n", result)) 411 UFSD("EXIT, result %u\n", result);
333 return result; 412 return result;
334 } 413 }
335 414
@@ -339,8 +418,8 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
339 switch (fs32_to_cpu(sb, usb1->fs_optim)) { 418 switch (fs32_to_cpu(sb, usb1->fs_optim)) {
340 case UFS_OPTSPACE: 419 case UFS_OPTSPACE:
341 request = newcount; 420 request = newcount;
342 if (uspi->s_minfree < 5 || fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) 421 if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
343 > uspi->s_dsize * uspi->s_minfree / (2 * 100) ) 422 > uspi->s_dsize * uspi->s_minfree / (2 * 100))
344 break; 423 break;
345 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); 424 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
346 break; 425 break;
@@ -349,7 +428,7 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
349 428
350 case UFS_OPTTIME: 429 case UFS_OPTTIME:
351 request = uspi->s_fpb; 430 request = uspi->s_fpb;
352 if (fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) < uspi->s_dsize * 431 if (uspi->cs_total.cs_nffree < uspi->s_dsize *
353 (uspi->s_minfree - 2) / 100) 432 (uspi->s_minfree - 2) / 100)
354 break; 433 break;
355 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); 434 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
@@ -357,39 +436,22 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
357 } 436 }
358 result = ufs_alloc_fragments (inode, cgno, goal, request, err); 437 result = ufs_alloc_fragments (inode, cgno, goal, request, err);
359 if (result) { 438 if (result) {
360 for (i = 0; i < oldcount; i++) { 439 ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp,
361 bh = sb_bread(sb, tmp + i); 440 result, locked_page);
362 if(bh) 441
363 {
364 clear_buffer_dirty(bh);
365 bh->b_blocknr = result + i;
366 mark_buffer_dirty (bh);
367 if (IS_SYNC(inode))
368 sync_dirty_buffer(bh);
369 brelse (bh);
370 }
371 else
372 {
373 printk(KERN_ERR "ufs_new_fragments: bread fail\n");
374 unlock_super(sb);
375 return 0;
376 }
377 }
378 *p = cpu_to_fs32(sb, result); 442 *p = cpu_to_fs32(sb, result);
379 *err = 0; 443 *err = 0;
380 inode->i_blocks += count << uspi->s_nspfshift;
381 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 444 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
382 NULLIFY_FRAGMENTS
383 unlock_super(sb); 445 unlock_super(sb);
384 if (newcount < request) 446 if (newcount < request)
385 ufs_free_fragments (inode, result + newcount, request - newcount); 447 ufs_free_fragments (inode, result + newcount, request - newcount);
386 ufs_free_fragments (inode, tmp, oldcount); 448 ufs_free_fragments (inode, tmp, oldcount);
387 UFSD(("EXIT, result %u\n", result)) 449 UFSD("EXIT, result %u\n", result);
388 return result; 450 return result;
389 } 451 }
390 452
391 unlock_super(sb); 453 unlock_super(sb);
392 UFSD(("EXIT (FAILED)\n")) 454 UFSD("EXIT (FAILED)\n");
393 return 0; 455 return 0;
394} 456}
395 457
@@ -404,7 +466,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
404 struct ufs_cylinder_group * ucg; 466 struct ufs_cylinder_group * ucg;
405 unsigned cgno, fragno, fragoff, count, fragsize, i; 467 unsigned cgno, fragno, fragoff, count, fragsize, i;
406 468
407 UFSD(("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount)) 469 UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount);
408 470
409 sb = inode->i_sb; 471 sb = inode->i_sb;
410 uspi = UFS_SB(sb)->s_uspi; 472 uspi = UFS_SB(sb)->s_uspi;
@@ -419,7 +481,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
419 ucpi = ufs_load_cylinder (sb, cgno); 481 ucpi = ufs_load_cylinder (sb, cgno);
420 if (!ucpi) 482 if (!ucpi)
421 return 0; 483 return 0;
422 ucg = ubh_get_ucg (UCPI_UBH); 484 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
423 if (!ufs_cg_chkmagic(sb, ucg)) { 485 if (!ufs_cg_chkmagic(sb, ucg)) {
424 ufs_panic (sb, "ufs_add_fragments", 486 ufs_panic (sb, "ufs_add_fragments",
425 "internal error, bad magic number on cg %u", cgno); 487 "internal error, bad magic number on cg %u", cgno);
@@ -429,14 +491,14 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
429 fragno = ufs_dtogd (fragment); 491 fragno = ufs_dtogd (fragment);
430 fragoff = ufs_fragnum (fragno); 492 fragoff = ufs_fragnum (fragno);
431 for (i = oldcount; i < newcount; i++) 493 for (i = oldcount; i < newcount; i++)
432 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i)) 494 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
433 return 0; 495 return 0;
434 /* 496 /*
435 * Block can be extended 497 * Block can be extended
436 */ 498 */
437 ucg->cg_time = cpu_to_fs32(sb, get_seconds()); 499 ucg->cg_time = cpu_to_fs32(sb, get_seconds());
438 for (i = newcount; i < (uspi->s_fpb - fragoff); i++) 500 for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
439 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i)) 501 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
440 break; 502 break;
441 fragsize = i - oldcount; 503 fragsize = i - oldcount;
442 if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize])) 504 if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize]))
@@ -446,7 +508,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
446 if (fragsize != count) 508 if (fragsize != count)
447 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 509 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
448 for (i = oldcount; i < newcount; i++) 510 for (i = oldcount; i < newcount; i++)
449 ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, fragno + i); 511 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
450 if(DQUOT_ALLOC_BLOCK(inode, count)) { 512 if(DQUOT_ALLOC_BLOCK(inode, count)) {
451 *err = -EDQUOT; 513 *err = -EDQUOT;
452 return 0; 514 return 0;
@@ -454,17 +516,17 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
454 516
455 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 517 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
456 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 518 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
457 fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count); 519 uspi->cs_total.cs_nffree -= count;
458 520
459 ubh_mark_buffer_dirty (USPI_UBH); 521 ubh_mark_buffer_dirty (USPI_UBH(uspi));
460 ubh_mark_buffer_dirty (UCPI_UBH); 522 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
461 if (sb->s_flags & MS_SYNCHRONOUS) { 523 if (sb->s_flags & MS_SYNCHRONOUS) {
462 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 524 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
463 ubh_wait_on_buffer (UCPI_UBH); 525 ubh_wait_on_buffer (UCPI_UBH(ucpi));
464 } 526 }
465 sb->s_dirt = 1; 527 sb->s_dirt = 1;
466 528
467 UFSD(("EXIT, fragment %u\n", fragment)) 529 UFSD("EXIT, fragment %u\n", fragment);
468 530
469 return fragment; 531 return fragment;
470} 532}
@@ -487,7 +549,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
487 struct ufs_cylinder_group * ucg; 549 struct ufs_cylinder_group * ucg;
488 unsigned oldcg, i, j, k, result, allocsize; 550 unsigned oldcg, i, j, k, result, allocsize;
489 551
490 UFSD(("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count)) 552 UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count);
491 553
492 sb = inode->i_sb; 554 sb = inode->i_sb;
493 uspi = UFS_SB(sb)->s_uspi; 555 uspi = UFS_SB(sb)->s_uspi;
@@ -521,14 +583,14 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
521 UFS_TEST_FREE_SPACE_CG 583 UFS_TEST_FREE_SPACE_CG
522 } 584 }
523 585
524 UFSD(("EXIT (FAILED)\n")) 586 UFSD("EXIT (FAILED)\n");
525 return 0; 587 return 0;
526 588
527cg_found: 589cg_found:
528 ucpi = ufs_load_cylinder (sb, cgno); 590 ucpi = ufs_load_cylinder (sb, cgno);
529 if (!ucpi) 591 if (!ucpi)
530 return 0; 592 return 0;
531 ucg = ubh_get_ucg (UCPI_UBH); 593 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
532 if (!ufs_cg_chkmagic(sb, ucg)) 594 if (!ufs_cg_chkmagic(sb, ucg))
533 ufs_panic (sb, "ufs_alloc_fragments", 595 ufs_panic (sb, "ufs_alloc_fragments",
534 "internal error, bad magic number on cg %u", cgno); 596 "internal error, bad magic number on cg %u", cgno);
@@ -551,12 +613,12 @@ cg_found:
551 return 0; 613 return 0;
552 goal = ufs_dtogd (result); 614 goal = ufs_dtogd (result);
553 for (i = count; i < uspi->s_fpb; i++) 615 for (i = count; i < uspi->s_fpb; i++)
554 ubh_setbit (UCPI_UBH, ucpi->c_freeoff, goal + i); 616 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
555 i = uspi->s_fpb - count; 617 i = uspi->s_fpb - count;
556 DQUOT_FREE_BLOCK(inode, i); 618 DQUOT_FREE_BLOCK(inode, i);
557 619
558 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 620 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
559 fs32_add(sb, &usb1->fs_cstotal.cs_nffree, i); 621 uspi->cs_total.cs_nffree += i;
560 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); 622 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
561 fs32_add(sb, &ucg->cg_frsum[i], 1); 623 fs32_add(sb, &ucg->cg_frsum[i], 1);
562 goto succed; 624 goto succed;
@@ -570,10 +632,10 @@ cg_found:
570 return 0; 632 return 0;
571 } 633 }
572 for (i = 0; i < count; i++) 634 for (i = 0; i < count; i++)
573 ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, result + i); 635 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
574 636
575 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 637 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
576 fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count); 638 uspi->cs_total.cs_nffree -= count;
577 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 639 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
578 fs32_sub(sb, &ucg->cg_frsum[allocsize], 1); 640 fs32_sub(sb, &ucg->cg_frsum[allocsize], 1);
579 641
@@ -581,16 +643,16 @@ cg_found:
581 fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1); 643 fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1);
582 644
583succed: 645succed:
584 ubh_mark_buffer_dirty (USPI_UBH); 646 ubh_mark_buffer_dirty (USPI_UBH(uspi));
585 ubh_mark_buffer_dirty (UCPI_UBH); 647 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
586 if (sb->s_flags & MS_SYNCHRONOUS) { 648 if (sb->s_flags & MS_SYNCHRONOUS) {
587 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 649 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
588 ubh_wait_on_buffer (UCPI_UBH); 650 ubh_wait_on_buffer (UCPI_UBH(ucpi));
589 } 651 }
590 sb->s_dirt = 1; 652 sb->s_dirt = 1;
591 653
592 result += cgno * uspi->s_fpg; 654 result += cgno * uspi->s_fpg;
593 UFSD(("EXIT3, result %u\n", result)) 655 UFSD("EXIT3, result %u\n", result);
594 return result; 656 return result;
595} 657}
596 658
@@ -603,12 +665,12 @@ static unsigned ufs_alloccg_block (struct inode * inode,
603 struct ufs_cylinder_group * ucg; 665 struct ufs_cylinder_group * ucg;
604 unsigned result, cylno, blkno; 666 unsigned result, cylno, blkno;
605 667
606 UFSD(("ENTER, goal %u\n", goal)) 668 UFSD("ENTER, goal %u\n", goal);
607 669
608 sb = inode->i_sb; 670 sb = inode->i_sb;
609 uspi = UFS_SB(sb)->s_uspi; 671 uspi = UFS_SB(sb)->s_uspi;
610 usb1 = ubh_get_usb_first(uspi); 672 usb1 = ubh_get_usb_first(uspi);
611 ucg = ubh_get_ucg(UCPI_UBH); 673 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
612 674
613 if (goal == 0) { 675 if (goal == 0) {
614 goal = ucpi->c_rotor; 676 goal = ucpi->c_rotor;
@@ -620,7 +682,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
620 /* 682 /*
621 * If the requested block is available, use it. 683 * If the requested block is available, use it.
622 */ 684 */
623 if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, ufs_fragstoblks(goal))) { 685 if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
624 result = goal; 686 result = goal;
625 goto gotit; 687 goto gotit;
626 } 688 }
@@ -632,7 +694,7 @@ norot:
632 ucpi->c_rotor = result; 694 ucpi->c_rotor = result;
633gotit: 695gotit:
634 blkno = ufs_fragstoblks(result); 696 blkno = ufs_fragstoblks(result);
635 ubh_clrblock (UCPI_UBH, ucpi->c_freeoff, blkno); 697 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
636 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 698 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
637 ufs_clusteracct (sb, ucpi, blkno, -1); 699 ufs_clusteracct (sb, ucpi, blkno, -1);
638 if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) { 700 if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
@@ -641,31 +703,76 @@ gotit:
641 } 703 }
642 704
643 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 705 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
644 fs32_sub(sb, &usb1->fs_cstotal.cs_nbfree, 1); 706 uspi->cs_total.cs_nbfree--;
645 fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); 707 fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
646 cylno = ufs_cbtocylno(result); 708 cylno = ufs_cbtocylno(result);
647 fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1); 709 fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
648 fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); 710 fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
649 711
650 UFSD(("EXIT, result %u\n", result)) 712 UFSD("EXIT, result %u\n", result);
651 713
652 return result; 714 return result;
653} 715}
654 716
655static unsigned ufs_bitmap_search (struct super_block * sb, 717static unsigned ubh_scanc(struct ufs_sb_private_info *uspi,
656 struct ufs_cg_private_info * ucpi, unsigned goal, unsigned count) 718 struct ufs_buffer_head *ubh,
719 unsigned begin, unsigned size,
720 unsigned char *table, unsigned char mask)
657{ 721{
658 struct ufs_sb_private_info * uspi; 722 unsigned rest, offset;
659 struct ufs_super_block_first * usb1; 723 unsigned char *cp;
660 struct ufs_cylinder_group * ucg; 724
661 unsigned start, length, location, result; 725
662 unsigned possition, fragsize, blockmap, mask; 726 offset = begin & ~uspi->s_fmask;
663 727 begin >>= uspi->s_fshift;
664 UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count)) 728 for (;;) {
729 if ((offset + size) < uspi->s_fsize)
730 rest = size;
731 else
732 rest = uspi->s_fsize - offset;
733 size -= rest;
734 cp = ubh->bh[begin]->b_data + offset;
735 while ((table[*cp++] & mask) == 0 && --rest)
736 ;
737 if (rest || !size)
738 break;
739 begin++;
740 offset = 0;
741 }
742 return (size + rest);
743}
744
745/*
746 * Find a block of the specified size in the specified cylinder group.
747 * @sb: pointer to the super block
748 * @ucpi: pointer to cylinder group info
749 * @goal: near which block we want to find a new one
750 * @count: specified size
751 */
752static unsigned ufs_bitmap_search(struct super_block *sb,
753 struct ufs_cg_private_info *ucpi,
754 unsigned goal, unsigned count)
755{
756 /*
757 * Bit patterns for identifying fragments in the block map
758 * used as ((map & mask_arr) == want_arr)
759 */
760 static const int mask_arr[9] = {
761 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
762 };
763 static const int want_arr[9] = {
764 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
765 };
766 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
767 struct ufs_super_block_first *usb1;
768 struct ufs_cylinder_group *ucg;
769 unsigned start, length, loc, result;
770 unsigned pos, want, blockmap, mask, end;
771
772 UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count);
665 773
666 uspi = UFS_SB(sb)->s_uspi;
667 usb1 = ubh_get_usb_first (uspi); 774 usb1 = ubh_get_usb_first (uspi);
668 ucg = ubh_get_ucg(UCPI_UBH); 775 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
669 776
670 if (goal) 777 if (goal)
671 start = ufs_dtogd(goal) >> 3; 778 start = ufs_dtogd(goal) >> 3;
@@ -673,53 +780,50 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
673 start = ucpi->c_frotor >> 3; 780 start = ucpi->c_frotor >> 3;
674 781
675 length = ((uspi->s_fpg + 7) >> 3) - start; 782 length = ((uspi->s_fpg + 7) >> 3) - start;
676 location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length, 783 loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
677 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, 784 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
678 1 << (count - 1 + (uspi->s_fpb & 7))); 785 1 << (count - 1 + (uspi->s_fpb & 7)));
679 if (location == 0) { 786 if (loc == 0) {
680 length = start + 1; 787 length = start + 1;
681 location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff, length, 788 loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length,
682 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, 789 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb :
683 1 << (count - 1 + (uspi->s_fpb & 7))); 790 ufs_fragtable_other,
684 if (location == 0) { 791 1 << (count - 1 + (uspi->s_fpb & 7)));
685 ufs_error (sb, "ufs_bitmap_search", 792 if (loc == 0) {
686 "bitmap corrupted on cg %u, start %u, length %u, count %u, freeoff %u\n", 793 ufs_error(sb, "ufs_bitmap_search",
687 ucpi->c_cgx, start, length, count, ucpi->c_freeoff); 794 "bitmap corrupted on cg %u, start %u,"
795 " length %u, count %u, freeoff %u\n",
796 ucpi->c_cgx, start, length, count,
797 ucpi->c_freeoff);
688 return (unsigned)-1; 798 return (unsigned)-1;
689 } 799 }
690 start = 0; 800 start = 0;
691 } 801 }
692 result = (start + length - location) << 3; 802 result = (start + length - loc) << 3;
693 ucpi->c_frotor = result; 803 ucpi->c_frotor = result;
694 804
695 /* 805 /*
696 * found the byte in the map 806 * found the byte in the map
697 */ 807 */
698 blockmap = ubh_blkmap(UCPI_UBH, ucpi->c_freeoff, result); 808
699 fragsize = 0; 809 for (end = result + 8; result < end; result += uspi->s_fpb) {
700 for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) { 810 blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
701 if (blockmap & mask) { 811 blockmap <<= 1;
702 if (!(possition & uspi->s_fpbmask)) 812 mask = mask_arr[count];
703 fragsize = 1; 813 want = want_arr[count];
704 else 814 for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
705 fragsize++; 815 if ((blockmap & mask) == want) {
706 } 816 UFSD("EXIT, result %u\n", result);
707 else { 817 return result + pos;
708 if (fragsize == count) { 818 }
709 result += possition - count; 819 mask <<= 1;
710 UFSD(("EXIT, result %u\n", result)) 820 want <<= 1;
711 return result; 821 }
712 } 822 }
713 fragsize = 0; 823
714 } 824 ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
715 } 825 ucpi->c_cgx);
716 if (fragsize == count) { 826 UFSD("EXIT (FAILED)\n");
717 result += possition - count;
718 UFSD(("EXIT, result %u\n", result))
719 return result;
720 }
721 ufs_error (sb, "ufs_bitmap_search", "block not in map on cg %u\n", ucpi->c_cgx);
722 UFSD(("EXIT (FAILED)\n"))
723 return (unsigned)-1; 827 return (unsigned)-1;
724} 828}
725 829
@@ -734,9 +838,9 @@ static void ufs_clusteracct(struct super_block * sb,
734 return; 838 return;
735 839
736 if (cnt > 0) 840 if (cnt > 0)
737 ubh_setbit(UCPI_UBH, ucpi->c_clusteroff, blkno); 841 ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
738 else 842 else
739 ubh_clrbit(UCPI_UBH, ucpi->c_clusteroff, blkno); 843 ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
740 844
741 /* 845 /*
742 * Find the size of the cluster going forward. 846 * Find the size of the cluster going forward.
@@ -745,7 +849,7 @@ static void ufs_clusteracct(struct super_block * sb,
745 end = start + uspi->s_contigsumsize; 849 end = start + uspi->s_contigsumsize;
746 if ( end >= ucpi->c_nclusterblks) 850 if ( end >= ucpi->c_nclusterblks)
747 end = ucpi->c_nclusterblks; 851 end = ucpi->c_nclusterblks;
748 i = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_clusteroff, end, start); 852 i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start);
749 if (i > end) 853 if (i > end)
750 i = end; 854 i = end;
751 forw = i - start; 855 forw = i - start;
@@ -757,7 +861,7 @@ static void ufs_clusteracct(struct super_block * sb,
757 end = start - uspi->s_contigsumsize; 861 end = start - uspi->s_contigsumsize;
758 if (end < 0 ) 862 if (end < 0 )
759 end = -1; 863 end = -1;
760 i = ubh_find_last_zero_bit (UCPI_UBH, ucpi->c_clusteroff, start, end); 864 i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end);
761 if ( i < end) 865 if ( i < end)
762 i = end; 866 i = end;
763 back = start - i; 867 back = start - i;
@@ -769,11 +873,11 @@ static void ufs_clusteracct(struct super_block * sb,
769 i = back + forw + 1; 873 i = back + forw + 1;
770 if (i > uspi->s_contigsumsize) 874 if (i > uspi->s_contigsumsize)
771 i = uspi->s_contigsumsize; 875 i = uspi->s_contigsumsize;
772 fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (i << 2)), cnt); 876 fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt);
773 if (back > 0) 877 if (back > 0)
774 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (back << 2)), cnt); 878 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt);
775 if (forw > 0) 879 if (forw > 0)
776 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (forw << 2)), cnt); 880 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt);
777} 881}
778 882
779 883
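
The rewritten ufs_bitmap_search() above replaces the old byte-at-a-time run counter with a sliding pattern match: mask_arr[count]/want_arr[count] describe a run of exactly count free (set) fragments bounded by used (clear) ones, and shifting the map byte left by one supplies a zero sentinel so a run starting at fragment 0 still sees a clear lower neighbour. A self-contained userspace model of that inner match, with fragments-per-block fixed at 8 for illustration:

#include <stdio.h>

/* Bit patterns for an exact run of `count` free (set) fragments
 * bounded by used (clear) ones, as in the patched ufs_bitmap_search(). */
static const int mask_arr[9] = {
	0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
};
static const int want_arr[9] = {
	0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
};

/* Return the bit offset of a run of exactly `count` free fragments
 * in the 8-bit `blockmap`, or -1 if there is none (count must be 1..8). */
static int find_fragment_run(unsigned blockmap, unsigned count)
{
	unsigned mask = mask_arr[count];
	unsigned want = want_arr[count];
	unsigned pos;

	blockmap <<= 1;	/* zero sentinel below bit 0 */
	for (pos = 0; pos <= 8 - count; pos++) {
		if ((blockmap & mask) == want)
			return pos;
		mask <<= 1;
		want <<= 1;
	}
	return -1;
}

int main(void)
{
	/* 0x3a = 0b00111010: free fragments at bits 1, 3, 4, 5 */
	printf("%d\n", find_fragment_run(0x3a, 3));	/* 3  */
	printf("%d\n", find_fragment_run(0x3a, 1));	/* 1  */
	printf("%d\n", find_fragment_run(0x3a, 2));	/* -1 */
	return 0;
}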
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 14abb8b835f7..09c39e5e6386 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -20,15 +20,6 @@
20#include "swab.h" 20#include "swab.h"
21#include "util.h" 21#include "util.h"
22 22
23#undef UFS_CYLINDER_DEBUG
24
25#ifdef UFS_CYLINDER_DEBUG
26#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
27#else
28#define UFSD(x)
29#endif
30
31
32/* 23/*
33 * Read cylinder group into cache. The memory space for ufs_cg_private_info 24 * Read cylinder group into cache. The memory space for ufs_cg_private_info
34 * structure is already allocated during ufs_read_super. 25 * structure is already allocated during ufs_read_super.
@@ -42,19 +33,19 @@ static void ufs_read_cylinder (struct super_block * sb,
42 struct ufs_cylinder_group * ucg; 33 struct ufs_cylinder_group * ucg;
43 unsigned i, j; 34 unsigned i, j;
44 35
45 UFSD(("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr)) 36 UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr);
46 uspi = sbi->s_uspi; 37 uspi = sbi->s_uspi;
47 ucpi = sbi->s_ucpi[bitmap_nr]; 38 ucpi = sbi->s_ucpi[bitmap_nr];
48 ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data; 39 ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
49 40
50 UCPI_UBH->fragment = ufs_cgcmin(cgno); 41 UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno);
51 UCPI_UBH->count = uspi->s_cgsize >> sb->s_blocksize_bits; 42 UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits;
52 /* 43 /*
53 * We already have the first fragment of the cylinder group block in the buffer 44 * We already have the first fragment of the cylinder group block in the buffer
54 */ 45 */
55 UCPI_UBH->bh[0] = sbi->s_ucg[cgno]; 46 UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
56 for (i = 1; i < UCPI_UBH->count; i++) 47 for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
57 if (!(UCPI_UBH->bh[i] = sb_bread(sb, UCPI_UBH->fragment + i))) 48 if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
58 goto failed; 49 goto failed;
59 sbi->s_cgno[bitmap_nr] = cgno; 50 sbi->s_cgno[bitmap_nr] = cgno;
60 51
@@ -73,7 +64,7 @@ static void ufs_read_cylinder (struct super_block * sb,
73 ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff); 64 ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff);
74 ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); 65 ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
75 ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); 66 ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
76 UFSD(("EXIT\n")) 67 UFSD("EXIT\n");
77 return; 68 return;
78 69
79failed: 70failed:
@@ -95,15 +86,15 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
95 struct ufs_cylinder_group * ucg; 86 struct ufs_cylinder_group * ucg;
96 unsigned i; 87 unsigned i;
97 88
98 UFSD(("ENTER, bitmap_nr %u\n", bitmap_nr)) 89 UFSD("ENTER, bitmap_nr %u\n", bitmap_nr);
99 90
100 uspi = sbi->s_uspi; 91 uspi = sbi->s_uspi;
101 if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) { 92 if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) {
102 UFSD(("EXIT\n")) 93 UFSD("EXIT\n");
103 return; 94 return;
104 } 95 }
105 ucpi = sbi->s_ucpi[bitmap_nr]; 96 ucpi = sbi->s_ucpi[bitmap_nr];
106 ucg = ubh_get_ucg(UCPI_UBH); 97 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
107 98
108 if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) { 99 if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) {
109 ufs_panic (sb, "ufs_put_cylinder", "internal error"); 100 ufs_panic (sb, "ufs_put_cylinder", "internal error");
@@ -116,13 +107,13 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
116 ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor); 107 ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor);
117 ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor); 108 ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor);
118 ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor); 109 ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor);
119 ubh_mark_buffer_dirty (UCPI_UBH); 110 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
120 for (i = 1; i < UCPI_UBH->count; i++) { 111 for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
121 brelse (UCPI_UBH->bh[i]); 112 brelse (UCPI_UBH(ucpi)->bh[i]);
122 } 113 }
123 114
124 sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; 115 sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
125 UFSD(("EXIT\n")) 116 UFSD("EXIT\n");
126} 117}
127 118
128/* 119/*
@@ -139,7 +130,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
139 struct ufs_cg_private_info * ucpi; 130 struct ufs_cg_private_info * ucpi;
140 unsigned cg, i, j; 131 unsigned cg, i, j;
141 132
142 UFSD(("ENTER, cgno %u\n", cgno)) 133 UFSD("ENTER, cgno %u\n", cgno);
143 134
144 uspi = sbi->s_uspi; 135 uspi = sbi->s_uspi;
145 if (cgno >= uspi->s_ncg) { 136 if (cgno >= uspi->s_ncg) {
@@ -150,7 +141,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
150 * Cylinder group number cg is in cache and it was last used 141 * Cylinder group number cg is in cache and it was last used
151 */ 142 */
152 if (sbi->s_cgno[0] == cgno) { 143 if (sbi->s_cgno[0] == cgno) {
153 UFSD(("EXIT\n")) 144 UFSD("EXIT\n");
154 return sbi->s_ucpi[0]; 145 return sbi->s_ucpi[0];
155 } 146 }
156 /* 147 /*
@@ -160,16 +151,16 @@ struct ufs_cg_private_info * ufs_load_cylinder (
160 if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) { 151 if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) {
161 if (sbi->s_cgno[cgno] != cgno) { 152 if (sbi->s_cgno[cgno] != cgno) {
162 ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache"); 153 ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache");
163 UFSD(("EXIT (FAILED)\n")) 154 UFSD("EXIT (FAILED)\n");
164 return NULL; 155 return NULL;
165 } 156 }
166 else { 157 else {
167 UFSD(("EXIT\n")) 158 UFSD("EXIT\n");
168 return sbi->s_ucpi[cgno]; 159 return sbi->s_ucpi[cgno];
169 } 160 }
170 } else { 161 } else {
171 ufs_read_cylinder (sb, cgno, cgno); 162 ufs_read_cylinder (sb, cgno, cgno);
172 UFSD(("EXIT\n")) 163 UFSD("EXIT\n");
173 return sbi->s_ucpi[cgno]; 164 return sbi->s_ucpi[cgno];
174 } 165 }
175 } 166 }
@@ -204,6 +195,6 @@ struct ufs_cg_private_info * ufs_load_cylinder (
204 sbi->s_ucpi[0] = ucpi; 195 sbi->s_ucpi[0] = ucpi;
205 ufs_read_cylinder (sb, cgno, 0); 196 ufs_read_cylinder (sb, cgno, 0);
206 } 197 }
207 UFSD(("EXIT\n")) 198 UFSD("EXIT\n");
208 return sbi->s_ucpi[0]; 199 return sbi->s_ucpi[0];
209} 200}
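
The per-file UFSD debug macros deleted from balloc.c, cylinder.c, and dir.c in this series give way to a single variadic definition in a shared UFS header, which is why every call site loses its double parentheses. The consolidated macro presumably looks like the following; the header location and the UFS_DEBUG guard name are assumptions, as neither appears in this diff:

/* Assumed consolidated definition; the double-parenthesis workaround
 * is no longer needed once the macro itself is variadic. */
#ifdef UFS_DEBUG
#define UFSD(f, a...)	do {						\
	printk(KERN_DEBUG "UFSD (%s, %d): %s:",				\
	       __FILE__, __LINE__, __FUNCTION__);			\
	printk(f, ## a);						\
} while (0)
#else
#define UFSD(f, a...)	/* */
#endif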
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 1a561202d3f4..7f0a0aa63584 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -11,31 +11,20 @@
11 * 4.4BSD (FreeBSD) support added on February 1st 1998 by 11 * 4.4BSD (FreeBSD) support added on February 1st 1998 by
12 * Niels Kristian Bech Jensen <nkbj@image.dk> partially based 12 * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
13 * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>. 13 * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
14 *
15 * Migration to the page cache in May 2006 by
16 * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
14 */ 17 */
15 18
16#include <linux/time.h> 19#include <linux/time.h>
17#include <linux/fs.h> 20#include <linux/fs.h>
18#include <linux/ufs_fs.h> 21#include <linux/ufs_fs.h>
19#include <linux/smp_lock.h> 22#include <linux/smp_lock.h>
20#include <linux/buffer_head.h>
21#include <linux/sched.h> 23#include <linux/sched.h>
22 24
23#include "swab.h" 25#include "swab.h"
24#include "util.h" 26#include "util.h"
25 27
26#undef UFS_DIR_DEBUG
27
28#ifdef UFS_DIR_DEBUG
29#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
30#else
31#define UFSD(x)
32#endif
33
34static int
35ufs_check_dir_entry (const char *, struct inode *, struct ufs_dir_entry *,
36 struct buffer_head *, unsigned long);
37
38
39/* 28/*
40 * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure. 29 * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
41 * 30 *
@@ -51,495 +40,541 @@ static inline int ufs_match(struct super_block *sb, int len,
51 return !memcmp(name, de->d_name, len); 40 return !memcmp(name, de->d_name, len);
52} 41}
53 42
54/* 43static int ufs_commit_chunk(struct page *page, unsigned from, unsigned to)
55 * This is blatantly stolen from ext2fs
56 */
57static int
58ufs_readdir (struct file * filp, void * dirent, filldir_t filldir)
59{ 44{
60 struct inode *inode = filp->f_dentry->d_inode; 45 struct inode *dir = page->mapping->host;
61 int error = 0; 46 int err = 0;
62 unsigned long offset, lblk; 47 dir->i_version++;
63 int i, stored; 48 page->mapping->a_ops->commit_write(NULL, page, from, to);
64 struct buffer_head * bh; 49 if (IS_DIRSYNC(dir))
65 struct ufs_dir_entry * de; 50 err = write_one_page(page, 1);
66 struct super_block * sb; 51 else
67 int de_reclen; 52 unlock_page(page);
68 unsigned flags; 53 return err;
69 u64 blk= 0L; 54}
70
71 lock_kernel();
72
73 sb = inode->i_sb;
74 flags = UFS_SB(sb)->s_flags;
75
76 UFSD(("ENTER, ino %lu f_pos %lu\n", inode->i_ino, (unsigned long) filp->f_pos))
77
78 stored = 0;
79 bh = NULL;
80 offset = filp->f_pos & (sb->s_blocksize - 1);
81
82 while (!error && !stored && filp->f_pos < inode->i_size) {
83 lblk = (filp->f_pos) >> sb->s_blocksize_bits;
84 blk = ufs_frag_map(inode, lblk);
85 if (!blk || !(bh = sb_bread(sb, blk))) {
86 /* XXX - error - skip to the next block */
87 printk("ufs_readdir: "
88 "dir inode %lu has a hole at offset %lu\n",
89 inode->i_ino, (unsigned long int)filp->f_pos);
90 filp->f_pos += sb->s_blocksize - offset;
91 continue;
92 }
93
94revalidate:
95 /* If the dir block has changed since the last call to
96 * readdir(2), then we might be pointing to an invalid
97 * dirent right now. Scan from the start of the block
98 * to make sure. */
99 if (filp->f_version != inode->i_version) {
100 for (i = 0; i < sb->s_blocksize && i < offset; ) {
101 de = (struct ufs_dir_entry *)(bh->b_data + i);
102 /* It's too expensive to do a full
103 * dirent test each time round this
104 * loop, but we do have to test at
105 * least that it is non-zero. A
106 * failure will be detected in the
107 * dirent test below. */
108 de_reclen = fs16_to_cpu(sb, de->d_reclen);
109 if (de_reclen < 1)
110 break;
111 i += de_reclen;
112 }
113 offset = i;
114 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
115 | offset;
116 filp->f_version = inode->i_version;
117 }
118 55
119 while (!error && filp->f_pos < inode->i_size 56static inline void ufs_put_page(struct page *page)
120 && offset < sb->s_blocksize) { 57{
121 de = (struct ufs_dir_entry *) (bh->b_data + offset); 58 kunmap(page);
122 /* XXX - put in a real ufs_check_dir_entry() */ 59 page_cache_release(page);
123 if ((de->d_reclen == 0) || (ufs_get_de_namlen(sb, de) == 0)) { 60}
124 filp->f_pos = (filp->f_pos &
125 (sb->s_blocksize - 1)) +
126 sb->s_blocksize;
127 brelse(bh);
128 unlock_kernel();
129 return stored;
130 }
131 if (!ufs_check_dir_entry ("ufs_readdir", inode, de,
132 bh, offset)) {
133 /* On error, skip the f_pos to the
134 next block. */
135 filp->f_pos = (filp->f_pos |
136 (sb->s_blocksize - 1)) +
137 1;
138 brelse (bh);
139 unlock_kernel();
140 return stored;
141 }
142 offset += fs16_to_cpu(sb, de->d_reclen);
143 if (de->d_ino) {
144 /* We might block in the next section
145 * if the data destination is
146 * currently swapped out. So, use a
147 * version stamp to detect whether or
148 * not the directory has been modified
149 * during the copy operation. */
150 unsigned long version = filp->f_version;
151 unsigned char d_type = DT_UNKNOWN;
152 61
153 UFSD(("filldir(%s,%u)\n", de->d_name, 62static inline unsigned long ufs_dir_pages(struct inode *inode)
154 fs32_to_cpu(sb, de->d_ino))) 63{
155 UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de))) 64 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
65}
156 66
157 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) 67ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
158 d_type = de->d_u.d_44.d_type; 68{
159 error = filldir(dirent, de->d_name, 69 ino_t res = 0;
160 ufs_get_de_namlen(sb, de), filp->f_pos, 70 struct ufs_dir_entry *de;
161 fs32_to_cpu(sb, de->d_ino), d_type); 71 struct page *page;
162 if (error) 72
163 break; 73 de = ufs_find_entry(dir, dentry, &page);
164 if (version != filp->f_version) 74 if (de) {
165 goto revalidate; 75 res = fs32_to_cpu(dir->i_sb, de->d_ino);
166 stored ++; 76 ufs_put_page(page);
167 }
168 filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
169 }
170 offset = 0;
171 brelse (bh);
172 } 77 }
173 unlock_kernel(); 78 return res;
174 return 0;
175} 79}
176 80
177/*
178 * define how far ahead to read directories while searching them.
179 */
180#define NAMEI_RA_CHUNKS 2
181#define NAMEI_RA_BLOCKS 4
182#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
183#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
184 81
185/* 82/* Releases the page */
186 * ufs_find_entry() 83void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
187 * 84 struct page *page, struct inode *inode)
188 * finds an entry in the specified directory with the wanted name. It
189 * returns the cache buffer in which the entry was found, and the entry
190 * itself (as a parameter - res_bh). It does NOT read the inode of the
191 * entry - you'll have to do that yourself if you want to.
192 */
193struct ufs_dir_entry * ufs_find_entry (struct dentry *dentry,
194 struct buffer_head ** res_bh)
195{ 85{
196 struct super_block * sb; 86 unsigned from = (char *) de - (char *) page_address(page);
197 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 87 unsigned to = from + fs16_to_cpu(dir->i_sb, de->d_reclen);
198 struct buffer_head * bh_read[NAMEI_RA_SIZE]; 88 int err;
199 unsigned long offset;
200 int block, toread, i, err;
201 struct inode *dir = dentry->d_parent->d_inode;
202 const char *name = dentry->d_name.name;
203 int namelen = dentry->d_name.len;
204 89
205 UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen)) 90 lock_page(page);
206 91 err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
207 *res_bh = NULL; 92 BUG_ON(err);
208 93 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
209 sb = dir->i_sb; 94 ufs_set_de_type(dir->i_sb, de, inode->i_mode);
210 95 err = ufs_commit_chunk(page, from, to);
211 if (namelen > UFS_MAXNAMLEN) 96 ufs_put_page(page);
212 return NULL; 97 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
98 mark_inode_dirty(dir);
99}
213 100
214 memset (bh_use, 0, sizeof (bh_use));
215 toread = 0;
216 for (block = 0; block < NAMEI_RA_SIZE; ++block) {
217 struct buffer_head * bh;
218 101
219 if ((block << sb->s_blocksize_bits) >= dir->i_size) 102static void ufs_check_page(struct page *page)
220 break; 103{
221 bh = ufs_getfrag (dir, block, 0, &err); 104 struct inode *dir = page->mapping->host;
222 bh_use[block] = bh; 105 struct super_block *sb = dir->i_sb;
223 if (bh && !buffer_uptodate(bh)) 106 char *kaddr = page_address(page);
224 bh_read[toread++] = bh; 107 unsigned offs, rec_len;
108 unsigned limit = PAGE_CACHE_SIZE;
109 struct ufs_dir_entry *p;
110 char *error;
111
112 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
113 limit = dir->i_size & ~PAGE_CACHE_MASK;
114 if (limit & (UFS_SECTOR_SIZE - 1))
115 goto Ebadsize;
116 if (!limit)
117 goto out;
225 } 118 }
119 for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) {
120 p = (struct ufs_dir_entry *)(kaddr + offs);
121 rec_len = fs16_to_cpu(sb, p->d_reclen);
122
123 if (rec_len < UFS_DIR_REC_LEN(1))
124 goto Eshort;
125 if (rec_len & 3)
126 goto Ealign;
127 if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p)))
128 goto Enamelen;
129 if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1))
130 goto Espan;
131 if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
132 UFS_SB(sb)->s_uspi->s_ncg))
133 goto Einumber;
134 }
135 if (offs != limit)
136 goto Eend;
137out:
138 SetPageChecked(page);
139 return;
140
141 /* Too bad, we had an error */
142
143Ebadsize:
144 ufs_error(sb, "ufs_check_page",
145 "size of directory #%lu is not a multiple of chunk size",
146 dir->i_ino
147 );
148 goto fail;
149Eshort:
150 error = "rec_len is smaller than minimal";
151 goto bad_entry;
152Ealign:
153 error = "unaligned directory entry";
154 goto bad_entry;
155Enamelen:
156 error = "rec_len is too small for name_len";
157 goto bad_entry;
158Espan:
159 error = "directory entry across blocks";
160 goto bad_entry;
161Einumber:
162 error = "inode out of bounds";
163bad_entry:
164 ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
165 "offset=%lu, rec_len=%d, name_len=%d",
166 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
167 rec_len, ufs_get_de_namlen(sb, p));
168 goto fail;
169Eend:
170 p = (struct ufs_dir_entry *)(kaddr + offs);
171 ufs_error (sb, "ufs_check_page",
172 "entry in directory #%lu spans the page boundary, "
173 "offset=%lu",
174 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
175fail:
176 SetPageChecked(page);
177 SetPageError(page);
178}
226 179
227 for (block = 0, offset = 0; offset < dir->i_size; block++) { 180static struct page *ufs_get_page(struct inode *dir, unsigned long n)
228 struct buffer_head * bh; 181{
229 struct ufs_dir_entry * de; 182 struct address_space *mapping = dir->i_mapping;
230 char * dlimit; 183 struct page *page = read_cache_page(mapping, n,
231 184 (filler_t*)mapping->a_ops->readpage, NULL);
232 if ((block % NAMEI_RA_BLOCKS) == 0 && toread) { 185 if (!IS_ERR(page)) {
233 ll_rw_block (READ, toread, bh_read); 186 wait_on_page_locked(page);
234 toread = 0; 187 kmap(page);
235 } 188 if (!PageUptodate(page))
236 bh = bh_use[block % NAMEI_RA_SIZE]; 189 goto fail;
237 if (!bh) { 190 if (!PageChecked(page))
238 ufs_error (sb, "ufs_find_entry", 191 ufs_check_page(page);
239 "directory #%lu contains a hole at offset %lu", 192 if (PageError(page))
240 dir->i_ino, offset); 193 goto fail;
241 offset += sb->s_blocksize;
242 continue;
243 }
244 wait_on_buffer (bh);
245 if (!buffer_uptodate(bh)) {
246 /*
247 * read error: all bets are off
248 */
249 break;
250 }
251
252 de = (struct ufs_dir_entry *) bh->b_data;
253 dlimit = bh->b_data + sb->s_blocksize;
254 while ((char *) de < dlimit && offset < dir->i_size) {
255 /* this code is executed quadratically often */
256 /* do minimal checking by hand */
257 int de_len;
258
259 if ((char *) de + namelen <= dlimit &&
260 ufs_match(sb, namelen, name, de)) {
261 /* found a match -
262 just to be sure, do a full check */
263 if (!ufs_check_dir_entry("ufs_find_entry",
264 dir, de, bh, offset))
265 goto failed;
266 for (i = 0; i < NAMEI_RA_SIZE; ++i) {
267 if (bh_use[i] != bh)
268 brelse (bh_use[i]);
269 }
270 *res_bh = bh;
271 return de;
272 }
273 /* prevent looping on a bad block */
274 de_len = fs16_to_cpu(sb, de->d_reclen);
275 if (de_len <= 0)
276 goto failed;
277 offset += de_len;
278 de = (struct ufs_dir_entry *) ((char *) de + de_len);
279 }
280
281 brelse (bh);
282 if (((block + NAMEI_RA_SIZE) << sb->s_blocksize_bits ) >=
283 dir->i_size)
284 bh = NULL;
285 else
286 bh = ufs_getfrag (dir, block + NAMEI_RA_SIZE, 0, &err);
287 bh_use[block % NAMEI_RA_SIZE] = bh;
288 if (bh && !buffer_uptodate(bh))
289 bh_read[toread++] = bh;
290 } 194 }
195 return page;
291 196
292failed: 197fail:
293 for (i = 0; i < NAMEI_RA_SIZE; ++i) brelse (bh_use[i]); 198 ufs_put_page(page);
294 UFSD(("EXIT\n")) 199 return ERR_PTR(-EIO);
295 return NULL;
296} 200}
297 201
298static int 202/*
299ufs_check_dir_entry (const char *function, struct inode *dir, 203 * Return the offset into page `page_nr' of the last valid
300 struct ufs_dir_entry *de, struct buffer_head *bh, 204 * byte in that page, plus one.
301 unsigned long offset) 205 */
206static unsigned
207ufs_last_byte(struct inode *inode, unsigned long page_nr)
302{ 208{
303 struct super_block *sb = dir->i_sb; 209 unsigned last_byte = inode->i_size;
304 const char *error_msg = NULL; 210
305 int rlen = fs16_to_cpu(sb, de->d_reclen); 211 last_byte -= page_nr << PAGE_CACHE_SHIFT;
306 212 if (last_byte > PAGE_CACHE_SIZE)
307 if (rlen < UFS_DIR_REC_LEN(1)) 213 last_byte = PAGE_CACHE_SIZE;
308 error_msg = "reclen is smaller than minimal"; 214 return last_byte;
309 else if (rlen % 4 != 0)
310 error_msg = "reclen % 4 != 0";
311 else if (rlen < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)))
312 error_msg = "reclen is too small for namlen";
313 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
314 error_msg = "directory entry across blocks";
315 else if (fs32_to_cpu(sb, de->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
316 UFS_SB(sb)->s_uspi->s_ncg))
317 error_msg = "inode out of bounds";
318
319 if (error_msg != NULL)
320 ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - "
321 "offset=%lu, inode=%lu, reclen=%d, namlen=%d",
322 dir->i_ino, dir->i_size, error_msg, offset,
323 (unsigned long)fs32_to_cpu(sb, de->d_ino),
324 rlen, ufs_get_de_namlen(sb, de));
325
326 return (error_msg == NULL ? 1 : 0);
327} 215}
328 216
329struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct buffer_head **p) 217static inline struct ufs_dir_entry *
218ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
330{ 219{
331 int err; 220 return (struct ufs_dir_entry *)((char *)p +
332 struct buffer_head *bh = ufs_bread (dir, 0, 0, &err); 221 fs16_to_cpu(sb, p->d_reclen));
333 struct ufs_dir_entry *res = NULL;
334
335 if (bh) {
336 res = (struct ufs_dir_entry *) bh->b_data;
337 res = (struct ufs_dir_entry *)((char *)res +
338 fs16_to_cpu(dir->i_sb, res->d_reclen));
339 }
340 *p = bh;
341 return res;
342} 222}
343ino_t ufs_inode_by_name(struct inode * dir, struct dentry *dentry) 223
224struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
344{ 225{
345 ino_t res = 0; 226 struct page *page = ufs_get_page(dir, 0);
346 struct ufs_dir_entry * de; 227 struct ufs_dir_entry *de = NULL;
347 struct buffer_head *bh;
348 228
349 de = ufs_find_entry (dentry, &bh); 229 if (!IS_ERR(page)) {
350 if (de) { 230 de = ufs_next_entry(dir->i_sb,
351 res = fs32_to_cpu(dir->i_sb, de->d_ino); 231 (struct ufs_dir_entry *)page_address(page));
352 brelse(bh); 232 *p = page;
353 } 233 }
354 return res; 234 return de;
355} 235}
356 236
357void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, 237/*
358 struct buffer_head *bh, struct inode *inode) 238 * ufs_find_entry()
239 *
240 * finds an entry in the specified directory with the wanted name. It
241 * returns the page in which the entry was found, and the entry itself
242 * (as a parameter - res_dir). Page is returned mapped and unlocked.
243 * Entry is guaranteed to be valid.
244 */
245struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
246 struct page **res_page)
359{ 247{
360 dir->i_version++; 248 struct super_block *sb = dir->i_sb;
361 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); 249 const char *name = dentry->d_name.name;
362 mark_buffer_dirty(bh); 250 int namelen = dentry->d_name.len;
363 if (IS_DIRSYNC(dir)) 251 unsigned reclen = UFS_DIR_REC_LEN(namelen);
364 sync_dirty_buffer(bh); 252 unsigned long start, n;
365 brelse (bh); 253 unsigned long npages = ufs_dir_pages(dir);
254 struct page *page = NULL;
255 struct ufs_inode_info *ui = UFS_I(dir);
256 struct ufs_dir_entry *de;
257
258 UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
259
260 if (npages == 0 || namelen > UFS_MAXNAMLEN)
261 goto out;
262
263 /* OFFSET_CACHE */
264 *res_page = NULL;
265
266 start = ui->i_dir_start_lookup;
267
268 if (start >= npages)
269 start = 0;
270 n = start;
271 do {
272 char *kaddr;
273 page = ufs_get_page(dir, n);
274 if (!IS_ERR(page)) {
275 kaddr = page_address(page);
276 de = (struct ufs_dir_entry *) kaddr;
277 kaddr += ufs_last_byte(dir, n) - reclen;
278 while ((char *) de <= kaddr) {
279 if (de->d_reclen == 0) {
280 ufs_error(dir->i_sb, __FUNCTION__,
281 "zero-length directory entry");
282 ufs_put_page(page);
283 goto out;
284 }
285 if (ufs_match(sb, namelen, name, de))
286 goto found;
287 de = ufs_next_entry(sb, de);
288 }
289 ufs_put_page(page);
290 }
291 if (++n >= npages)
292 n = 0;
293 } while (n != start);
294out:
295 return NULL;
296
297found:
298 *res_page = page;
299 ui->i_dir_start_lookup = n;
300 return de;
366} 301}
367 302
368/* 303/*
369 * ufs_add_entry() 304 * Parent is locked.
370 *
371 * adds a file entry to the specified directory, using the same
372 * semantics as ufs_find_entry(). It returns NULL if it failed.
373 */ 305 */
374int ufs_add_link(struct dentry *dentry, struct inode *inode) 306int ufs_add_link(struct dentry *dentry, struct inode *inode)
375{ 307{
376 struct super_block * sb;
377 struct ufs_sb_private_info * uspi;
378 unsigned long offset;
379 unsigned fragoff;
380 unsigned short rec_len;
381 struct buffer_head * bh;
382 struct ufs_dir_entry * de, * de1;
383 struct inode *dir = dentry->d_parent->d_inode; 308 struct inode *dir = dentry->d_parent->d_inode;
384 const char *name = dentry->d_name.name; 309 const char *name = dentry->d_name.name;
385 int namelen = dentry->d_name.len; 310 int namelen = dentry->d_name.len;
311 struct super_block *sb = dir->i_sb;
312 unsigned reclen = UFS_DIR_REC_LEN(namelen);
313 unsigned short rec_len, name_len;
314 struct page *page = NULL;
315 struct ufs_dir_entry *de;
316 unsigned long npages = ufs_dir_pages(dir);
317 unsigned long n;
318 char *kaddr;
319 unsigned from, to;
386 int err; 320 int err;
387 321
388 UFSD(("ENTER, name %s, namelen %u\n", name, namelen)) 322 UFSD("ENTER, name %s, namelen %u\n", name, namelen);
389 323
390 sb = dir->i_sb; 324 /*
391 uspi = UFS_SB(sb)->s_uspi; 325 * We take care of directory expansion in the same loop.
392 326 * This code plays outside i_size, so it locks the page
393 if (!namelen) 327 * to protect that region.
394 return -EINVAL; 328 */
395 bh = ufs_bread (dir, 0, 0, &err); 329 for (n = 0; n <= npages; n++) {
396 if (!bh) 330 char *dir_end;
397 return err; 331
398 rec_len = UFS_DIR_REC_LEN(namelen); 332 page = ufs_get_page(dir, n);
399 offset = 0; 333 err = PTR_ERR(page);
400 de = (struct ufs_dir_entry *) bh->b_data; 334 if (IS_ERR(page))
401 while (1) { 335 goto out;
402 if ((char *)de >= UFS_SECTOR_SIZE + bh->b_data) { 336 lock_page(page);
403 fragoff = offset & ~uspi->s_fmask; 337 kaddr = page_address(page);
404 if (fragoff != 0 && fragoff != UFS_SECTOR_SIZE) 338 dir_end = kaddr + ufs_last_byte(dir, n);
405 ufs_error (sb, "ufs_add_entry", "internal error" 339 de = (struct ufs_dir_entry *)kaddr;
406 " fragoff %u", fragoff); 340 kaddr += PAGE_CACHE_SIZE - reclen;
407 if (!fragoff) { 341 while ((char *)de <= kaddr) {
408 brelse (bh); 342 if ((char *)de == dir_end) {
409 bh = ufs_bread (dir, offset >> sb->s_blocksize_bits, 1, &err); 343 /* We hit i_size */
410 if (!bh) 344 name_len = 0;
411 return err; 345 rec_len = UFS_SECTOR_SIZE;
412 }
413 if (dir->i_size <= offset) {
414 if (dir->i_size == 0) {
415 brelse(bh);
416 return -ENOENT;
417 }
418 de = (struct ufs_dir_entry *) (bh->b_data + fragoff);
419 de->d_ino = 0;
420 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE); 346 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE);
421 ufs_set_de_namlen(sb, de, 0); 347 de->d_ino = 0;
422 dir->i_size = offset + UFS_SECTOR_SIZE; 348 goto got_it;
423 mark_inode_dirty(dir);
424 } else {
425 de = (struct ufs_dir_entry *) bh->b_data;
426 } 349 }
350 if (de->d_reclen == 0) {
351 ufs_error(dir->i_sb, __FUNCTION__,
352 "zero-length directory entry");
353 err = -EIO;
354 goto out_unlock;
355 }
356 err = -EEXIST;
357 if (ufs_match(sb, namelen, name, de))
358 goto out_unlock;
359 name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de));
360 rec_len = fs16_to_cpu(sb, de->d_reclen);
361 if (!de->d_ino && rec_len >= reclen)
362 goto got_it;
363 if (rec_len >= name_len + reclen)
364 goto got_it;
365 de = (struct ufs_dir_entry *) ((char *) de + rec_len);
427 } 366 }
428 if (!ufs_check_dir_entry ("ufs_add_entry", dir, de, bh, offset)) { 367 unlock_page(page);
429 brelse (bh); 368 ufs_put_page(page);
430 return -ENOENT;
431 }
432 if (ufs_match(sb, namelen, name, de)) {
433 brelse (bh);
434 return -EEXIST;
435 }
436 if (de->d_ino == 0 && fs16_to_cpu(sb, de->d_reclen) >= rec_len)
437 break;
438
439 if (fs16_to_cpu(sb, de->d_reclen) >=
440 UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)) + rec_len)
441 break;
442 offset += fs16_to_cpu(sb, de->d_reclen);
443 de = (struct ufs_dir_entry *) ((char *) de + fs16_to_cpu(sb, de->d_reclen));
444 } 369 }
445 370 BUG();
371 return -EINVAL;
372
373got_it:
374 from = (char*)de - (char*)page_address(page);
375 to = from + rec_len;
376 err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
377 if (err)
378 goto out_unlock;
446 if (de->d_ino) { 379 if (de->d_ino) {
447 de1 = (struct ufs_dir_entry *) ((char *) de + 380 struct ufs_dir_entry *de1 =
448 UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))); 381 (struct ufs_dir_entry *) ((char *) de + name_len);
449 de1->d_reclen = 382 de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len);
450 cpu_to_fs16(sb, fs16_to_cpu(sb, de->d_reclen) - 383 de->d_reclen = cpu_to_fs16(sb, name_len);
451 UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))); 384
452 de->d_reclen =
453 cpu_to_fs16(sb, UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
454 de = de1; 385 de = de1;
455 } 386 }
456 de->d_ino = 0; 387
457 ufs_set_de_namlen(sb, de, namelen); 388 ufs_set_de_namlen(sb, de, namelen);
458 memcpy (de->d_name, name, namelen + 1); 389 memcpy(de->d_name, name, namelen + 1);
459 de->d_ino = cpu_to_fs32(sb, inode->i_ino); 390 de->d_ino = cpu_to_fs32(sb, inode->i_ino);
460 ufs_set_de_type(sb, de, inode->i_mode); 391 ufs_set_de_type(sb, de, inode->i_mode);
461 mark_buffer_dirty(bh); 392
462 if (IS_DIRSYNC(dir)) 393 err = ufs_commit_chunk(page, from, to);
463 sync_dirty_buffer(bh);
464 brelse (bh);
465 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 394 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
466 dir->i_version++; 395
467 mark_inode_dirty(dir); 396 mark_inode_dirty(dir);
397 /* OFFSET_CACHE */
398out_put:
399 ufs_put_page(page);
400out:
401 return err;
402out_unlock:
403 unlock_page(page);
404 goto out_put;
405}
468 406
469 UFSD(("EXIT\n")) 407static inline unsigned
408ufs_validate_entry(struct super_block *sb, char *base,
409 unsigned offset, unsigned mask)
410{
411 struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
412 struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
413 while ((char*)p < (char*)de) {
414 if (p->d_reclen == 0)
415 break;
416 p = ufs_next_entry(sb, p);
417 }
418 return (char *)p - base;
419}
420
421
422/*
423 * This is blatantly stolen from ext2fs
424 */
425static int
426ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
427{
428 loff_t pos = filp->f_pos;
429 struct inode *inode = filp->f_dentry->d_inode;
430 struct super_block *sb = inode->i_sb;
431 unsigned int offset = pos & ~PAGE_CACHE_MASK;
432 unsigned long n = pos >> PAGE_CACHE_SHIFT;
433 unsigned long npages = ufs_dir_pages(inode);
434 unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1);
435 int need_revalidate = filp->f_version != inode->i_version;
436 unsigned flags = UFS_SB(sb)->s_flags;
437
438 UFSD("BEGIN\n");
439
440 if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
441 return 0;
442
443 for ( ; n < npages; n++, offset = 0) {
444 char *kaddr, *limit;
445 struct ufs_dir_entry *de;
446
447 struct page *page = ufs_get_page(inode, n);
448
449 if (IS_ERR(page)) {
450 ufs_error(sb, __FUNCTION__,
451 "bad page in #%lu",
452 inode->i_ino);
453 filp->f_pos += PAGE_CACHE_SIZE - offset;
454 return -EIO;
455 }
456 kaddr = page_address(page);
457 if (unlikely(need_revalidate)) {
458 if (offset) {
459 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
460 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
461 }
462 filp->f_version = inode->i_version;
463 need_revalidate = 0;
464 }
465 de = (struct ufs_dir_entry *)(kaddr+offset);
466 limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
467 for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
468 if (de->d_reclen == 0) {
469 ufs_error(sb, __FUNCTION__,
470 "zero-length directory entry");
471 ufs_put_page(page);
472 return -EIO;
473 }
474 if (de->d_ino) {
475 int over;
476 unsigned char d_type = DT_UNKNOWN;
477
478 offset = (char *)de - kaddr;
479
480 UFSD("filldir(%s,%u)\n", de->d_name,
481 fs32_to_cpu(sb, de->d_ino));
482 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
483
484 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
485 d_type = de->d_u.d_44.d_type;
486
487 over = filldir(dirent, de->d_name,
488 ufs_get_de_namlen(sb, de),
489 (n<<PAGE_CACHE_SHIFT) | offset,
490 fs32_to_cpu(sb, de->d_ino), d_type);
491 if (over) {
492 ufs_put_page(page);
493 return 0;
494 }
495 }
496 filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
497 }
498 ufs_put_page(page);
499 }
470 return 0; 500 return 0;
471} 501}
472 502
503
473/* 504/*
474 * ufs_delete_entry deletes a directory entry by merging it with the 505 * ufs_delete_entry deletes a directory entry by merging it with the
475 * previous entry. 506 * previous entry.
476 */ 507 */
477int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir, 508int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
478 struct buffer_head * bh ) 509 struct page * page)
479
480{ 510{
481 struct super_block * sb; 511 struct super_block *sb = inode->i_sb;
482 struct ufs_dir_entry * de, * pde; 512 struct address_space *mapping = page->mapping;
483 unsigned i; 513 char *kaddr = page_address(page);
484 514 unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1);
485 UFSD(("ENTER\n")) 515 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
516 struct ufs_dir_entry *pde = NULL;
517 struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
518 int err;
486 519
487 sb = inode->i_sb; 520 UFSD("ENTER\n");
488 i = 0; 521
489 pde = NULL; 522 UFSD("ino %u, reclen %u, namlen %u, name %s\n",
490 de = (struct ufs_dir_entry *) bh->b_data; 523 fs32_to_cpu(sb, de->d_ino),
491 524 fs16_to_cpu(sb, de->d_reclen),
492 UFSD(("ino %u, reclen %u, namlen %u, name %s\n", 525 ufs_get_de_namlen(sb, de), de->d_name);
493 fs32_to_cpu(sb, de->d_ino), 526
494 fs16_to_cpu(sb, de->d_reclen), 527 while ((char*)de < (char*)dir) {
495 ufs_get_de_namlen(sb, de), de->d_name)) 528 if (de->d_reclen == 0) {
496 529 ufs_error(inode->i_sb, __FUNCTION__,
497 while (i < bh->b_size) { 530 "zero-length directory entry");
498 if (!ufs_check_dir_entry ("ufs_delete_entry", inode, de, bh, i)) { 531 err = -EIO;
499 brelse(bh); 532 goto out;
500 return -EIO;
501 }
502 if (de == dir) {
503 if (pde)
504 fs16_add(sb, &pde->d_reclen,
505 fs16_to_cpu(sb, dir->d_reclen));
506 dir->d_ino = 0;
507 inode->i_version++;
508 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
509 mark_inode_dirty(inode);
510 mark_buffer_dirty(bh);
511 if (IS_DIRSYNC(inode))
512 sync_dirty_buffer(bh);
513 brelse(bh);
514 UFSD(("EXIT\n"))
515 return 0;
516 } 533 }
517 i += fs16_to_cpu(sb, de->d_reclen); 534 pde = de;
518 if (i == UFS_SECTOR_SIZE) pde = NULL; 535 de = ufs_next_entry(sb, de);
519 else pde = de;
520 de = (struct ufs_dir_entry *)
521 ((char *) de + fs16_to_cpu(sb, de->d_reclen));
522 if (i == UFS_SECTOR_SIZE && de->d_reclen == 0)
523 break;
524 } 536 }
525 UFSD(("EXIT\n")) 537 if (pde)
526 brelse(bh); 538 from = (char*)pde - (char*)page_address(page);
527 return -ENOENT; 539 lock_page(page);
540 err = mapping->a_ops->prepare_write(NULL, page, from, to);
541 BUG_ON(err);
542 if (pde)
543 pde->d_reclen = cpu_to_fs16(sb, to-from);
544 dir->d_ino = 0;
545 err = ufs_commit_chunk(page, from, to);
546 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
547 mark_inode_dirty(inode);
548out:
549 ufs_put_page(page);
550 UFSD("EXIT\n");
551 return err;
528} 552}
529 553
530int ufs_make_empty(struct inode * inode, struct inode *dir) 554int ufs_make_empty(struct inode * inode, struct inode *dir)
531{ 555{
532 struct super_block * sb = dir->i_sb; 556 struct super_block * sb = dir->i_sb;
533 struct buffer_head * dir_block; 557 struct address_space *mapping = inode->i_mapping;
558 struct page *page = grab_cache_page(mapping, 0);
534 struct ufs_dir_entry * de; 559 struct ufs_dir_entry * de;
560 char *base;
535 int err; 561 int err;
536 562
537 dir_block = ufs_bread (inode, 0, 1, &err); 563 if (!page)
538 if (!dir_block) 564 return -ENOMEM;
539 return err; 565 kmap(page);
566 err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE);
567 if (err) {
568 unlock_page(page);
569 goto fail;
570 }
571
572
573 base = (char*)page_address(page);
574 memset(base, 0, PAGE_CACHE_SIZE);
575
576 de = (struct ufs_dir_entry *) base;
540 577
541 inode->i_blocks = sb->s_blocksize / UFS_SECTOR_SIZE;
542 de = (struct ufs_dir_entry *) dir_block->b_data;
543 de->d_ino = cpu_to_fs32(sb, inode->i_ino); 578 de->d_ino = cpu_to_fs32(sb, inode->i_ino);
544 ufs_set_de_type(sb, de, inode->i_mode); 579 ufs_set_de_type(sb, de, inode->i_mode);
545 ufs_set_de_namlen(sb, de, 1); 580 ufs_set_de_namlen(sb, de, 1);
@@ -552,72 +587,65 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
552 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1)); 587 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1));
553 ufs_set_de_namlen(sb, de, 2); 588 ufs_set_de_namlen(sb, de, 2);
554 strcpy (de->d_name, ".."); 589 strcpy (de->d_name, "..");
555 mark_buffer_dirty(dir_block); 590
556 brelse (dir_block); 591 err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE);
557 mark_inode_dirty(inode); 592fail:
558 return 0; 593 kunmap(page);
594 page_cache_release(page);
595 return err;
559} 596}
560 597
561/* 598/*
562 * routine to check that the specified directory is empty (for rmdir) 599 * routine to check that the specified directory is empty (for rmdir)
563 */ 600 */
564int ufs_empty_dir (struct inode * inode) 601int ufs_empty_dir(struct inode * inode)
565{ 602{
566 struct super_block * sb; 603 struct super_block *sb = inode->i_sb;
567 unsigned long offset; 604 struct page *page = NULL;
568 struct buffer_head * bh; 605 unsigned long i, npages = ufs_dir_pages(inode);
569 struct ufs_dir_entry * de, * de1; 606
570 int err; 607 for (i = 0; i < npages; i++) {
571 608 char *kaddr;
572 sb = inode->i_sb; 609 struct ufs_dir_entry *de;
573 610 page = ufs_get_page(inode, i);
574 if (inode->i_size < UFS_DIR_REC_LEN(1) + UFS_DIR_REC_LEN(2) || 611
575 !(bh = ufs_bread (inode, 0, 0, &err))) { 612 if (IS_ERR(page))
576 ufs_warning (inode->i_sb, "empty_dir", 613 continue;
577 "bad directory (dir #%lu) - no data block", 614
578 inode->i_ino); 615 kaddr = page_address(page);
579 return 1; 616 de = (struct ufs_dir_entry *)kaddr;
580 } 617 kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
581 de = (struct ufs_dir_entry *) bh->b_data; 618
582 de1 = (struct ufs_dir_entry *) 619 while ((char *)de <= kaddr) {
583 ((char *)de + fs16_to_cpu(sb, de->d_reclen)); 620 if (de->d_reclen == 0) {
584 if (fs32_to_cpu(sb, de->d_ino) != inode->i_ino || de1->d_ino == 0 || 621 ufs_error(inode->i_sb, __FUNCTION__,
585 strcmp (".", de->d_name) || strcmp ("..", de1->d_name)) { 622 "zero-length directory entry: "
586 ufs_warning (inode->i_sb, "empty_dir", 623 "kaddr=%p, de=%p\n", kaddr, de);
587 "bad directory (dir #%lu) - no `.' or `..'", 624 goto not_empty;
588 inode->i_ino);
589 return 1;
590 }
591 offset = fs16_to_cpu(sb, de->d_reclen) + fs16_to_cpu(sb, de1->d_reclen);
592 de = (struct ufs_dir_entry *)
593 ((char *)de1 + fs16_to_cpu(sb, de1->d_reclen));
594 while (offset < inode->i_size ) {
595 if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) {
596 brelse (bh);
597 bh = ufs_bread (inode, offset >> sb->s_blocksize_bits, 1, &err);
598 if (!bh) {
599 ufs_error (sb, "empty_dir",
600 "directory #%lu contains a hole at offset %lu",
601 inode->i_ino, offset);
602 offset += sb->s_blocksize;
603 continue;
604 } 625 }
605 de = (struct ufs_dir_entry *) bh->b_data; 626 if (de->d_ino) {
606 } 627 u16 namelen=ufs_get_de_namlen(sb, de);
607 if (!ufs_check_dir_entry ("empty_dir", inode, de, bh, offset)) { 628 /* check for . and .. */
608 brelse (bh); 629 if (de->d_name[0] != '.')
609 return 1; 630 goto not_empty;
610 } 631 if (namelen > 2)
611 if (de->d_ino) { 632 goto not_empty;
612 brelse (bh); 633 if (namelen < 2) {
613 return 0; 634 if (inode->i_ino !=
635 fs32_to_cpu(sb, de->d_ino))
636 goto not_empty;
637 } else if (de->d_name[1] != '.')
638 goto not_empty;
639 }
640 de = ufs_next_entry(sb, de);
614 } 641 }
615 offset += fs16_to_cpu(sb, de->d_reclen); 642 ufs_put_page(page);
616 de = (struct ufs_dir_entry *)
617 ((char *)de + fs16_to_cpu(sb, de->d_reclen));
618 } 643 }
619 brelse (bh);
620 return 1; 644 return 1;
645
646not_empty:
647 ufs_put_page(page);
648 return 0;
621} 649}
622 650
623const struct file_operations ufs_dir_operations = { 651const struct file_operations ufs_dir_operations = {
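The hunk above replaces every buffer_head-based directory walk with the page-cache idiom built from ufs_dir_pages()/ufs_get_page()/ufs_last_byte()/ufs_next_entry(). To make the shape of that idiom explicit, here is a hypothetical helper (not part of the patch) that counts the live entries of a directory using only the helpers introduced above:

	/* Illustrative only: count in-use entries in a UFS directory. */
	static unsigned long ufs_count_entries(struct inode *dir)
	{
		struct super_block *sb = dir->i_sb;
		unsigned long n, count = 0;
		unsigned long npages = ufs_dir_pages(dir);

		for (n = 0; n < npages; n++) {
			char *kaddr, *limit;
			struct ufs_dir_entry *de;
			struct page *page = ufs_get_page(dir, n);

			if (IS_ERR(page))
				continue;	/* skip unreadable pages */
			kaddr = page_address(page);
			limit = kaddr + ufs_last_byte(dir, n) - UFS_DIR_REC_LEN(1);
			for (de = (struct ufs_dir_entry *)kaddr;
			     (char *)de <= limit; de = ufs_next_entry(sb, de)) {
				if (de->d_reclen == 0)
					break;	/* corrupt page; stop here */
				if (de->d_ino)
					count++;
			}
			ufs_put_page(page);	/* kunmap + page_cache_release */
		}
		return count;
	}

Every walker in the new dir.c (ufs_find_entry, ufs_readdir, ufs_empty_dir) is this same loop with a different body; ufs_get_page() guarantees the page is kmapped and already validated by ufs_check_page() before the caller sees it.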
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 312fd3f86313..0e5001512a9d 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -25,6 +25,26 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/ufs_fs.h> 27#include <linux/ufs_fs.h>
28#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
29
30static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 struct inode *inode = dentry->d_inode;
33 int err;
34 int ret;
35
36 ret = sync_mapping_buffers(inode->i_mapping);
37 if (!(inode->i_state & I_DIRTY))
38 return ret;
39 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
40 return ret;
41
42 err = ufs_sync_inode(inode);
43 if (ret == 0)
44 ret = err;
45 return ret;
46}
47
28 48
29/* 49/*
30 * We have mostly NULL's here: the current defaults are ok for 50 * We have mostly NULL's here: the current defaults are ok for
@@ -37,6 +57,7 @@ const struct file_operations ufs_file_operations = {
37 .write = generic_file_write, 57 .write = generic_file_write,
38 .mmap = generic_file_mmap, 58 .mmap = generic_file_mmap,
39 .open = generic_file_open, 59 .open = generic_file_open,
60 .fsync = ufs_sync_file,
40 .sendfile = generic_file_sendfile, 61 .sendfile = generic_file_sendfile,
41}; 62};
42 63
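The new ufs_sync_file() gives UFS a real .fsync, following the usual pattern of this era: sync_mapping_buffers() flushes the file's dirty data buffers, and the inode itself is written only when its dirty state requires it. The decision reduces to a predicate like the following (hypothetical helper for illustration; the patch inlines these tests):

	static int ufs_inode_needs_sync(struct inode *inode, int datasync)
	{
		if (!(inode->i_state & I_DIRTY))
			return 0;	/* inode not dirty at all */
		if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
			return 0;	/* fdatasync(): no data-critical inode state dirty */
		return 1;
	}

so fdatasync() can skip the inode write when only I_DIRTY_SYNC-class state (timestamps, for example) is pending.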
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c7a47ed4f430..9501dcd3b213 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -34,14 +34,6 @@
34#include "swab.h" 34#include "swab.h"
35#include "util.h" 35#include "util.h"
36 36
37#undef UFS_IALLOC_DEBUG
38
39#ifdef UFS_IALLOC_DEBUG
40#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
41#else
42#define UFSD(x)
43#endif
44
45/* 37/*
46 * NOTE! When we get the inode, we're the only people 38 * NOTE! When we get the inode, we're the only people
47 * that have access to it, and as such there are no 39 * that have access to it, and as such there are no
@@ -68,7 +60,7 @@ void ufs_free_inode (struct inode * inode)
68 int is_directory; 60 int is_directory;
69 unsigned ino, cg, bit; 61 unsigned ino, cg, bit;
70 62
71 UFSD(("ENTER, ino %lu\n", inode->i_ino)) 63 UFSD("ENTER, ino %lu\n", inode->i_ino);
72 64
73 sb = inode->i_sb; 65 sb = inode->i_sb;
74 uspi = UFS_SB(sb)->s_uspi; 66 uspi = UFS_SB(sb)->s_uspi;
@@ -91,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
91 unlock_super (sb); 83 unlock_super (sb);
92 return; 84 return;
93 } 85 }
94 ucg = ubh_get_ucg(UCPI_UBH); 86 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
95 if (!ufs_cg_chkmagic(sb, ucg)) 87 if (!ufs_cg_chkmagic(sb, ucg))
96 ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); 88 ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
97 89
@@ -104,33 +96,33 @@ void ufs_free_inode (struct inode * inode)
104 96
105 clear_inode (inode); 97 clear_inode (inode);
106 98
107 if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
108 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); 100 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
109 else { 101 else {
110 ubh_clrbit (UCPI_UBH, ucpi->c_iusedoff, bit); 102 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
111 if (ino < ucpi->c_irotor) 103 if (ino < ucpi->c_irotor)
112 ucpi->c_irotor = ino; 104 ucpi->c_irotor = ino;
113 fs32_add(sb, &ucg->cg_cs.cs_nifree, 1); 105 fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
114 fs32_add(sb, &usb1->fs_cstotal.cs_nifree, 1); 106 uspi->cs_total.cs_nifree++;
115 fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1); 107 fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1);
116 108
117 if (is_directory) { 109 if (is_directory) {
118 fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1); 110 fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1);
119 fs32_sub(sb, &usb1->fs_cstotal.cs_ndir, 1); 111 uspi->cs_total.cs_ndir--;
120 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1); 112 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1);
121 } 113 }
122 } 114 }
123 115
124 ubh_mark_buffer_dirty (USPI_UBH); 116 ubh_mark_buffer_dirty (USPI_UBH(uspi));
125 ubh_mark_buffer_dirty (UCPI_UBH); 117 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
126 if (sb->s_flags & MS_SYNCHRONOUS) { 118 if (sb->s_flags & MS_SYNCHRONOUS) {
127 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi); 119 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
128 ubh_wait_on_buffer (UCPI_UBH); 120 ubh_wait_on_buffer (UCPI_UBH(ucpi));
129 } 121 }
130 122
131 sb->s_dirt = 1; 123 sb->s_dirt = 1;
132 unlock_super (sb); 124 unlock_super (sb);
133 UFSD(("EXIT\n")) 125 UFSD("EXIT\n");
134} 126}
135 127
136/* 128/*
@@ -155,7 +147,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode)
155 unsigned cg, bit, i, j, start; 147 unsigned cg, bit, i, j, start;
156 struct ufs_inode_info *ufsi; 148 struct ufs_inode_info *ufsi;
157 149
158 UFSD(("ENTER\n")) 150 UFSD("ENTER\n");
159 151
160 /* Cannot create files in a deleted directory */ 152 /* Cannot create files in a deleted directory */
161 if (!dir || !dir->i_nlink) 153 if (!dir || !dir->i_nlink)
@@ -213,43 +205,43 @@ cg_found:
213 ucpi = ufs_load_cylinder (sb, cg); 205 ucpi = ufs_load_cylinder (sb, cg);
214 if (!ucpi) 206 if (!ucpi)
215 goto failed; 207 goto failed;
216 ucg = ubh_get_ucg(UCPI_UBH); 208 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
217 if (!ufs_cg_chkmagic(sb, ucg)) 209 if (!ufs_cg_chkmagic(sb, ucg))
218 ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number"); 210 ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number");
219 211
220 start = ucpi->c_irotor; 212 start = ucpi->c_irotor;
221 bit = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_iusedoff, uspi->s_ipg, start); 213 bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start);
222 if (!(bit < uspi->s_ipg)) { 214 if (!(bit < uspi->s_ipg)) {
223 bit = ubh_find_first_zero_bit (UCPI_UBH, ucpi->c_iusedoff, start); 215 bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start);
224 if (!(bit < start)) { 216 if (!(bit < start)) {
225 ufs_error (sb, "ufs_new_inode", 217 ufs_error (sb, "ufs_new_inode",
226 "cylinder group %u corrupted - error in inode bitmap\n", cg); 218 "cylinder group %u corrupted - error in inode bitmap\n", cg);
227 goto failed; 219 goto failed;
228 } 220 }
229 } 221 }
230 UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg)) 222 UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg);
231 if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit)) 223 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
232 ubh_setbit (UCPI_UBH, ucpi->c_iusedoff, bit); 224 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
233 else { 225 else {
234 ufs_panic (sb, "ufs_new_inode", "internal error"); 226 ufs_panic (sb, "ufs_new_inode", "internal error");
235 goto failed; 227 goto failed;
236 } 228 }
237 229
238 fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1); 230 fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1);
239 fs32_sub(sb, &usb1->fs_cstotal.cs_nifree, 1); 231 uspi->cs_total.cs_nifree--;
240 fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1); 232 fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1);
241 233
242 if (S_ISDIR(mode)) { 234 if (S_ISDIR(mode)) {
243 fs32_add(sb, &ucg->cg_cs.cs_ndir, 1); 235 fs32_add(sb, &ucg->cg_cs.cs_ndir, 1);
244 fs32_add(sb, &usb1->fs_cstotal.cs_ndir, 1); 236 uspi->cs_total.cs_ndir++;
245 fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1); 237 fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
246 } 238 }
247 239
248 ubh_mark_buffer_dirty (USPI_UBH); 240 ubh_mark_buffer_dirty (USPI_UBH(uspi));
249 ubh_mark_buffer_dirty (UCPI_UBH); 241 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
250 if (sb->s_flags & MS_SYNCHRONOUS) { 242 if (sb->s_flags & MS_SYNCHRONOUS) {
251 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi); 243 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
252 ubh_wait_on_buffer (UCPI_UBH); 244 ubh_wait_on_buffer (UCPI_UBH(ucpi));
253 } 245 }
254 sb->s_dirt = 1; 246 sb->s_dirt = 1;
255 247
@@ -272,6 +264,7 @@ cg_found:
272 ufsi->i_shadow = 0; 264 ufsi->i_shadow = 0;
273 ufsi->i_osync = 0; 265 ufsi->i_osync = 0;
274 ufsi->i_oeftflag = 0; 266 ufsi->i_oeftflag = 0;
267 ufsi->i_dir_start_lookup = 0;
275 memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); 268 memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
276 269
277 insert_inode_hash(inode); 270 insert_inode_hash(inode);
@@ -287,14 +280,14 @@ cg_found:
287 return ERR_PTR(-EDQUOT); 280 return ERR_PTR(-EDQUOT);
288 } 281 }
289 282
290 UFSD(("allocating inode %lu\n", inode->i_ino)) 283 UFSD("allocating inode %lu\n", inode->i_ino);
291 UFSD(("EXIT\n")) 284 UFSD("EXIT\n");
292 return inode; 285 return inode;
293 286
294failed: 287failed:
295 unlock_super (sb); 288 unlock_super (sb);
296 make_bad_inode(inode); 289 make_bad_inode(inode);
297 iput (inode); 290 iput (inode);
298 UFSD(("EXIT (FAILED)\n")) 291 UFSD("EXIT (FAILED)\n");
299 return ERR_PTR(-ENOSPC); 292 return ERR_PTR(-ENOSPC);
300} 293}
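For reference, the allocation in ufs_new_inode() above is a classic rotor search over the cylinder-group inode bitmap: scan forward from c_irotor, and wrap around once if the tail of the bitmap is full. Distilled into a hypothetical helper (the ubh_* accessors and fields are exactly those used in the hunk):

	static unsigned ufs_find_free_inode_bit(struct ufs_cg_private_info *ucpi,
						struct ufs_sb_private_info *uspi)
	{
		unsigned start = ucpi->c_irotor;
		unsigned bit = ubh_find_next_zero_bit(UCPI_UBH(ucpi),
						      ucpi->c_iusedoff,
						      uspi->s_ipg, start);
		if (bit >= uspi->s_ipg) {
			/* wrap: search the range [0, start) from the beginning */
			bit = ubh_find_first_zero_bit(UCPI_UBH(ucpi),
						      ucpi->c_iusedoff, start);
			if (bit >= start)
				return uspi->s_ipg;	/* no free inode in this cg */
		}
		return bit;
	}

Note that ufs_free_inode() above pulls the rotor back whenever a lower-numbered inode is freed, so the next search starts near known-free territory and stays O(1) in the common case.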
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3c3f62ce2ad9..259bd196099d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,14 +41,7 @@
41#include "swab.h" 41#include "swab.h"
42#include "util.h" 42#include "util.h"
43 43
44#undef UFS_INODE_DEBUG 44static u64 ufs_frag_map(struct inode *inode, sector_t frag);
45#undef UFS_INODE_DEBUG_MORE
46
47#ifdef UFS_INODE_DEBUG
48#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
49#else
50#define UFSD(x)
51#endif
52 45
53static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 46static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
54{ 47{
@@ -61,7 +54,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
61 int n = 0; 54 int n = 0;
62 55
63 56
64 UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks)); 57 UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
65 if (i_block < 0) { 58 if (i_block < 0) {
66 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0"); 59 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
67 } else if (i_block < direct_blocks) { 60 } else if (i_block < direct_blocks) {
@@ -89,7 +82,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
89 * the begining of the filesystem. 82 * the begining of the filesystem.
90 */ 83 */
91 84
92u64 ufs_frag_map(struct inode *inode, sector_t frag) 85static u64 ufs_frag_map(struct inode *inode, sector_t frag)
93{ 86{
94 struct ufs_inode_info *ufsi = UFS_I(inode); 87 struct ufs_inode_info *ufsi = UFS_I(inode);
95 struct super_block *sb = inode->i_sb; 88 struct super_block *sb = inode->i_sb;
@@ -104,8 +97,8 @@ u64 ufs_frag_map(struct inode *inode, sector_t frag)
104 unsigned flags = UFS_SB(sb)->s_flags; 97 unsigned flags = UFS_SB(sb)->s_flags;
105 u64 temp = 0L; 98 u64 temp = 0L;
106 99
107 UFSD((": frag = %llu depth = %d\n", (unsigned long long)frag, depth)); 100 UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth);
108 UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask)); 101 UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask);
109 102
110 if (depth == 0) 103 if (depth == 0)
111 return 0; 104 return 0;
@@ -161,26 +154,64 @@ out:
161 return ret; 154 return ret;
162} 155}
163 156
164static struct buffer_head * ufs_inode_getfrag (struct inode *inode, 157static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
165 unsigned int fragment, unsigned int new_fragment, 158{
166 unsigned int required, int *err, int metadata, long *phys, int *new) 159 lock_buffer(bh);
160 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
161 set_buffer_uptodate(bh);
162 mark_buffer_dirty(bh);
163 unlock_buffer(bh);
164 if (IS_SYNC(inode))
165 sync_dirty_buffer(bh);
166}
167
168static struct buffer_head *
169ufs_clear_frags(struct inode *inode, sector_t beg,
170 unsigned int n)
171{
172 struct buffer_head *res, *bh;
173 sector_t end = beg + n;
174
175 res = sb_getblk(inode->i_sb, beg);
176 ufs_clear_frag(inode, res);
177 for (++beg; beg < end; ++beg) {
178 bh = sb_getblk(inode->i_sb, beg);
179 ufs_clear_frag(inode, bh);
180 brelse(bh);
181 }
182 return res;
183}
184
185/**
186 * ufs_inode_getfrag() - allocate new fragment(s)
187 * @inode - pointer to inode
188 * @fragment - number of the `fragment' which holds the pointer
189 * to the newly allocated fragment(s)
190 * @new_fragment - number of the newly allocated fragment(s)
191 * @required - how many fragments we require
192 * @err - set if something goes wrong
193 * @phys - pointer to where we save the physical number of newly allocated fragments;
194 * NULL if we are not allocating data (indirect blocks, for example).
195 * @new - set if we allocate a new block
196 * @locked_page - for ufs_new_fragments()
197 */
198static struct buffer_head *
199ufs_inode_getfrag(struct inode *inode, unsigned int fragment,
200 sector_t new_fragment, unsigned int required, int *err,
201 long *phys, int *new, struct page *locked_page)
167{ 202{
168 struct ufs_inode_info *ufsi = UFS_I(inode); 203 struct ufs_inode_info *ufsi = UFS_I(inode);
169 struct super_block * sb; 204 struct super_block *sb = inode->i_sb;
170 struct ufs_sb_private_info * uspi; 205 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
171 struct buffer_head * result; 206 struct buffer_head * result;
172 unsigned block, blockoff, lastfrag, lastblock, lastblockoff; 207 unsigned block, blockoff, lastfrag, lastblock, lastblockoff;
173 unsigned tmp, goal; 208 unsigned tmp, goal;
174 __fs32 * p, * p2; 209 __fs32 * p, * p2;
175 unsigned flags = 0;
176 210
177 UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n", 211 UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, required %u, "
178 inode->i_ino, fragment, new_fragment, required)) 212 "metadata %d\n", inode->i_ino, fragment,
213 (unsigned long long)new_fragment, required, !phys);
179 214
180 sb = inode->i_sb;
181 uspi = UFS_SB(sb)->s_uspi;
182
183 flags = UFS_SB(sb)->s_flags;
184 /* TODO : to be done for write support 215 /* TODO : to be done for write support
185 if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 216 if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
186 goto ufs2; 217 goto ufs2;
@@ -195,16 +226,16 @@ repeat:
195 tmp = fs32_to_cpu(sb, *p); 226 tmp = fs32_to_cpu(sb, *p);
196 lastfrag = ufsi->i_lastfrag; 227 lastfrag = ufsi->i_lastfrag;
197 if (tmp && fragment < lastfrag) { 228 if (tmp && fragment < lastfrag) {
198 if (metadata) { 229 if (!phys) {
199 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); 230 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
200 if (tmp == fs32_to_cpu(sb, *p)) { 231 if (tmp == fs32_to_cpu(sb, *p)) {
201 UFSD(("EXIT, result %u\n", tmp + blockoff)) 232 UFSD("EXIT, result %u\n", tmp + blockoff);
202 return result; 233 return result;
203 } 234 }
204 brelse (result); 235 brelse (result);
205 goto repeat; 236 goto repeat;
206 } else { 237 } else {
207 *phys = tmp; 238 *phys = tmp + blockoff;
208 return NULL; 239 return NULL;
209 } 240 }
210 } 241 }
@@ -221,7 +252,8 @@ repeat:
221 if (lastblockoff) { 252 if (lastblockoff) {
222 p2 = ufsi->i_u1.i_data + lastblock; 253 p2 = ufsi->i_u1.i_data + lastblock;
223 tmp = ufs_new_fragments (inode, p2, lastfrag, 254 tmp = ufs_new_fragments (inode, p2, lastfrag,
224 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, err); 255 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff,
256 err, locked_page);
225 if (!tmp) { 257 if (!tmp) {
226 if (lastfrag != ufsi->i_lastfrag) 258 if (lastfrag != ufsi->i_lastfrag)
227 goto repeat; 259 goto repeat;
@@ -233,14 +265,16 @@ repeat:
233 } 265 }
234 goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb; 266 goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb;
235 tmp = ufs_new_fragments (inode, p, fragment - blockoff, 267 tmp = ufs_new_fragments (inode, p, fragment - blockoff,
236 goal, required + blockoff, err); 268 goal, required + blockoff,
269 err, locked_page);
237 } 270 }
238 /* 271 /*
239 * We will extend last allocated block 272 * We will extend last allocated block
240 */ 273 */
241 else if (lastblock == block) { 274 else if (lastblock == block) {
242 tmp = ufs_new_fragments (inode, p, fragment - (blockoff - lastblockoff), 275 tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
243 fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), err); 276 fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff),
277 err, locked_page);
244 } 278 }
245 /* 279 /*
246 * We will allocate new block before last allocated block 280 * We will allocate new block before last allocated block
@@ -248,8 +282,8 @@ repeat:
248 else /* (lastblock > block) */ { 282 else /* (lastblock > block) */ {
249 if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1]))) 283 if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1])))
250 goal = tmp + uspi->s_fpb; 284 goal = tmp + uspi->s_fpb;
251 tmp = ufs_new_fragments (inode, p, fragment - blockoff, 285 tmp = ufs_new_fragments(inode, p, fragment - blockoff,
252 goal, uspi->s_fpb, err); 286 goal, uspi->s_fpb, err, locked_page);
253 } 287 }
254 if (!tmp) { 288 if (!tmp) {
255 if ((!blockoff && *p) || 289 if ((!blockoff && *p) ||
@@ -259,14 +293,10 @@ repeat:
259 return NULL; 293 return NULL;
260 } 294 }
261 295
262 /* The nullification of framgents done in ufs/balloc.c is 296 if (!phys) {
263 * something I don't have the stomache to move into here right 297 result = ufs_clear_frags(inode, tmp + blockoff, required);
264 * now. -DaveM
265 */
266 if (metadata) {
267 result = sb_getblk(inode->i_sb, tmp + blockoff);
268 } else { 298 } else {
269 *phys = tmp; 299 *phys = tmp + blockoff;
270 result = NULL; 300 result = NULL;
271 *err = 0; 301 *err = 0;
272 *new = 1; 302 *new = 1;
@@ -276,7 +306,7 @@ repeat:
276 if (IS_SYNC(inode)) 306 if (IS_SYNC(inode))
277 ufs_sync_inode (inode); 307 ufs_sync_inode (inode);
278 mark_inode_dirty(inode); 308 mark_inode_dirty(inode);
279 UFSD(("EXIT, result %u\n", tmp + blockoff)) 309 UFSD("EXIT, result %u\n", tmp + blockoff);
280 return result; 310 return result;
281 311
282 /* This part : To be implemented .... 312 /* This part : To be implemented ....
@@ -295,22 +325,35 @@ repeat2:
295 */ 325 */
296} 326}
297 327
298static struct buffer_head * ufs_block_getfrag (struct inode *inode, 328/**
299 struct buffer_head *bh, unsigned int fragment, unsigned int new_fragment, 329 * ufs_inode_getblock() - allocate new block
300 unsigned int blocksize, int * err, int metadata, long *phys, int *new) 330 * @inode - pointer to inode
331 * @bh - pointer to the block which holds the "pointer" to the newly allocated block
332 * @fragment - number of the `fragment' which holds the pointer
333 * to the newly allocated block
334 * @new_fragment - number of the newly allocated fragment
335 * (the block will hold this fragment plus the following uspi->s_fpb-1 fragments)
336 * @err - see ufs_inode_getfrag()
337 * @phys - see ufs_inode_getfrag()
338 * @new - see ufs_inode_getfrag()
339 * @locked_page - see ufs_inode_getfrag()
340 */
341static struct buffer_head *
342ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
343 unsigned int fragment, sector_t new_fragment, int *err,
344 long *phys, int *new, struct page *locked_page)
301{ 345{
302 struct super_block * sb; 346 struct super_block *sb = inode->i_sb;
303 struct ufs_sb_private_info * uspi; 347 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
304 struct buffer_head * result; 348 struct buffer_head * result;
305 unsigned tmp, goal, block, blockoff; 349 unsigned tmp, goal, block, blockoff;
306 __fs32 * p; 350 __fs32 * p;
307 351
308 sb = inode->i_sb;
309 uspi = UFS_SB(sb)->s_uspi;
310 block = ufs_fragstoblks (fragment); 352 block = ufs_fragstoblks (fragment);
311 blockoff = ufs_fragnum (fragment); 353 blockoff = ufs_fragnum (fragment);
312 354
313 UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment)) 355 UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, metadata %d\n",
356 inode->i_ino, fragment, (unsigned long long)new_fragment, !phys);
314 357
315 result = NULL; 358 result = NULL;
316 if (!bh) 359 if (!bh)
@@ -326,14 +369,14 @@ static struct buffer_head * ufs_block_getfrag (struct inode *inode,
326repeat: 369repeat:
327 tmp = fs32_to_cpu(sb, *p); 370 tmp = fs32_to_cpu(sb, *p);
328 if (tmp) { 371 if (tmp) {
329 if (metadata) { 372 if (!phys) {
330 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); 373 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
331 if (tmp == fs32_to_cpu(sb, *p)) 374 if (tmp == fs32_to_cpu(sb, *p))
332 goto out; 375 goto out;
333 brelse (result); 376 brelse (result);
334 goto repeat; 377 goto repeat;
335 } else { 378 } else {
336 *phys = tmp; 379 *phys = tmp + blockoff;
337 goto out; 380 goto out;
338 } 381 }
339 } 382 }
@@ -342,21 +385,19 @@ repeat:
342 goal = tmp + uspi->s_fpb; 385 goal = tmp + uspi->s_fpb;
343 else 386 else
344 goal = bh->b_blocknr + uspi->s_fpb; 387 goal = bh->b_blocknr + uspi->s_fpb;
345 tmp = ufs_new_fragments (inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err); 388 tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
389 uspi->s_fpb, err, locked_page);
346 if (!tmp) { 390 if (!tmp) {
347 if (fs32_to_cpu(sb, *p)) 391 if (fs32_to_cpu(sb, *p))
348 goto repeat; 392 goto repeat;
349 goto out; 393 goto out;
350 } 394 }
351 395
352 /* The nullification of framgents done in ufs/balloc.c is 396
353 * something I don't have the stomache to move into here right 397 if (!phys) {
354 * now. -DaveM 398 result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb);
355 */
356 if (metadata) {
357 result = sb_getblk(sb, tmp + blockoff);
358 } else { 399 } else {
359 *phys = tmp; 400 *phys = tmp + blockoff;
360 *new = 1; 401 *new = 1;
361 } 402 }
362 403
@@ -365,18 +406,19 @@ repeat:
365 sync_dirty_buffer(bh); 406 sync_dirty_buffer(bh);
366 inode->i_ctime = CURRENT_TIME_SEC; 407 inode->i_ctime = CURRENT_TIME_SEC;
367 mark_inode_dirty(inode); 408 mark_inode_dirty(inode);
368 UFSD(("result %u\n", tmp + blockoff)); 409 UFSD("result %u\n", tmp + blockoff);
369out: 410out:
370 brelse (bh); 411 brelse (bh);
371 UFSD(("EXIT\n")); 412 UFSD("EXIT\n");
372 return result; 413 return result;
373} 414}
374 415
375/* 416/**
376 * This function gets the block which contains the fragment. 417 * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
418 * readpage, writepage and so on
377 */ 419 */
378 420
379int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 421int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
380{ 422{
381 struct super_block * sb = inode->i_sb; 423 struct super_block * sb = inode->i_sb;
382 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 424 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
@@ -387,7 +429,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
387 429
388 if (!create) { 430 if (!create) {
389 phys64 = ufs_frag_map(inode, fragment); 431 phys64 = ufs_frag_map(inode, fragment);
390 UFSD(("phys64 = %llu \n",phys64)); 432 UFSD("phys64 = %llu \n",phys64);
391 if (phys64) 433 if (phys64)
392 map_bh(bh_result, sb, phys64); 434 map_bh(bh_result, sb, phys64);
393 return 0; 435 return 0;
@@ -402,7 +444,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
402 444
403 lock_kernel(); 445 lock_kernel();
404 446
405 UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment)) 447 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
406 if (fragment < 0) 448 if (fragment < 0)
407 goto abort_negative; 449 goto abort_negative;
408 if (fragment > 450 if (fragment >
@@ -418,15 +460,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
418 * it much more readable: 460 * it much more readable:
419 */ 461 */
420#define GET_INODE_DATABLOCK(x) \ 462#define GET_INODE_DATABLOCK(x) \
421 ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new) 463 ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page)
422#define GET_INODE_PTR(x) \ 464#define GET_INODE_PTR(x) \
423 ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL) 465 ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page)
424#define GET_INDIRECT_DATABLOCK(x) \ 466#define GET_INDIRECT_DATABLOCK(x) \
425 ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \ 467 ufs_inode_getblock(inode, bh, x, fragment, \
426 &err, 0, &phys, &new); 468 &err, &phys, &new, bh_result->b_page);
427#define GET_INDIRECT_PTR(x) \ 469#define GET_INDIRECT_PTR(x) \
428 ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \ 470 ufs_inode_getblock(inode, bh, x, fragment, \
429 &err, 1, NULL, NULL); 471 &err, NULL, NULL, bh_result->b_page);
430 472
431 if (ptr < UFS_NDIR_FRAGMENT) { 473 if (ptr < UFS_NDIR_FRAGMENT) {
432 bh = GET_INODE_DATABLOCK(ptr); 474 bh = GET_INODE_DATABLOCK(ptr);
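The macros above encode a calling convention worth spelling out (this note and sketch are editorial, not part of the patch): a NULL `phys` asks the helper for the buffer_head of a metadata block, while a non-NULL `phys` asks only for the physical fragment number of a data block, with `*new` flagging a fresh allocation. A minimal sketch, assuming only the signatures shown in the macros:

	/* Editorial sketch; names and signatures taken from the macros above. */
	struct buffer_head *bh;
	unsigned phys = 0;
	int err = 0, new = 0;

	/* metadata: NULL phys, get the indirect block's buffer_head back */
	bh = ufs_inode_getfrag(inode, ptr, fragment, uspi->s_fpb,
			       &err, NULL, NULL, bh_result->b_page);

	/* data: non-NULL phys, only the fragment number and the "new" flag */
	ufs_inode_getfrag(inode, ptr, fragment, 1,
			  &err, &phys, &new, bh_result->b_page);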
@@ -474,8 +516,9 @@ abort_too_big:
474 goto abort; 516 goto abort;
475} 517}
476 518
477struct buffer_head *ufs_getfrag(struct inode *inode, unsigned int fragment, 519static struct buffer_head *ufs_getfrag(struct inode *inode,
478 int create, int *err) 520 unsigned int fragment,
521 int create, int *err)
479{ 522{
480 struct buffer_head dummy; 523 struct buffer_head dummy;
481 int error; 524 int error;
@@ -502,7 +545,7 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
502{ 545{
503 struct buffer_head * bh; 546 struct buffer_head * bh;
504 547
505 UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment)) 548 UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
506 bh = ufs_getfrag (inode, fragment, create, err); 549 bh = ufs_getfrag (inode, fragment, create, err);
507 if (!bh || buffer_uptodate(bh)) 550 if (!bh || buffer_uptodate(bh))
508 return bh; 551 return bh;
@@ -540,39 +583,34 @@ struct address_space_operations ufs_aops = {
540 .bmap = ufs_bmap 583 .bmap = ufs_bmap
541}; 584};
542 585
543void ufs_read_inode (struct inode * inode) 586static void ufs_set_inode_ops(struct inode *inode)
587{
588 if (S_ISREG(inode->i_mode)) {
589 inode->i_op = &ufs_file_inode_operations;
590 inode->i_fop = &ufs_file_operations;
591 inode->i_mapping->a_ops = &ufs_aops;
592 } else if (S_ISDIR(inode->i_mode)) {
593 inode->i_op = &ufs_dir_inode_operations;
594 inode->i_fop = &ufs_dir_operations;
595 inode->i_mapping->a_ops = &ufs_aops;
596 } else if (S_ISLNK(inode->i_mode)) {
597 if (!inode->i_blocks)
598 inode->i_op = &ufs_fast_symlink_inode_operations;
599 else {
600 inode->i_op = &page_symlink_inode_operations;
601 inode->i_mapping->a_ops = &ufs_aops;
602 }
603 } else
604 init_special_inode(inode, inode->i_mode,
605 ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
606}
607
608static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
544{ 609{
545 struct ufs_inode_info *ufsi = UFS_I(inode); 610 struct ufs_inode_info *ufsi = UFS_I(inode);
546 struct super_block * sb; 611 struct super_block *sb = inode->i_sb;
547 struct ufs_sb_private_info * uspi;
548 struct ufs_inode * ufs_inode;
549 struct ufs2_inode *ufs2_inode;
550 struct buffer_head * bh;
551 mode_t mode; 612 mode_t mode;
552 unsigned i; 613 unsigned i;
553 unsigned flags;
554
555 UFSD(("ENTER, ino %lu\n", inode->i_ino))
556
557 sb = inode->i_sb;
558 uspi = UFS_SB(sb)->s_uspi;
559 flags = UFS_SB(sb)->s_flags;
560
561 if (inode->i_ino < UFS_ROOTINO ||
562 inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
563 ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino);
564 goto bad_inode;
565 }
566
567 bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
568 if (!bh) {
569 ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino);
570 goto bad_inode;
571 }
572 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
573 goto ufs2_inode;
574
575 ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino));
576 614
577 /* 615 /*
578 * Copy data to the in-core inode. 616 * Copy data to the in-core inode.
@@ -596,56 +634,29 @@ void ufs_read_inode (struct inode * inode)
596 inode->i_atime.tv_nsec = 0; 634 inode->i_atime.tv_nsec = 0;
597 inode->i_ctime.tv_nsec = 0; 635 inode->i_ctime.tv_nsec = 0;
598 inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); 636 inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
599 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */
600 inode->i_version++;
601 ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); 637 ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
602 ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen); 638 ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen);
603 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); 639 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
604 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); 640 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
605 ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; 641
606 642
607 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { 643 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
608 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 644 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
609 ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i]; 645 ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
610 } 646 } else {
611 else {
612 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 647 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
613 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; 648 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
614 } 649 }
615 ufsi->i_osync = 0; 650}
616
617 if (S_ISREG(inode->i_mode)) {
618 inode->i_op = &ufs_file_inode_operations;
619 inode->i_fop = &ufs_file_operations;
620 inode->i_mapping->a_ops = &ufs_aops;
621 } else if (S_ISDIR(inode->i_mode)) {
622 inode->i_op = &ufs_dir_inode_operations;
623 inode->i_fop = &ufs_dir_operations;
624 } else if (S_ISLNK(inode->i_mode)) {
625 if (!inode->i_blocks)
626 inode->i_op = &ufs_fast_symlink_inode_operations;
627 else {
628 inode->i_op = &page_symlink_inode_operations;
629 inode->i_mapping->a_ops = &ufs_aops;
630 }
631 } else
632 init_special_inode(inode, inode->i_mode,
633 ufs_get_inode_dev(sb, ufsi));
634
635 brelse (bh);
636
637 UFSD(("EXIT\n"))
638 return;
639
640bad_inode:
641 make_bad_inode(inode);
642 return;
643
644ufs2_inode :
645 UFSD(("Reading ufs2 inode, ino %lu\n", inode->i_ino))
646 651
647 ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino)); 652static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
653{
654 struct ufs_inode_info *ufsi = UFS_I(inode);
655 struct super_block *sb = inode->i_sb;
656 mode_t mode;
657 unsigned i;
648 658
659 UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
649 /* 660 /*
650 * Copy data to the in-core inode. 661 * Copy data to the in-core inode.
651 */ 662 */
@@ -668,50 +679,75 @@ ufs2_inode :
668 inode->i_atime.tv_nsec = 0; 679 inode->i_atime.tv_nsec = 0;
669 inode->i_ctime.tv_nsec = 0; 680 inode->i_ctime.tv_nsec = 0;
670 inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); 681 inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
671 inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/
672
673 inode->i_version++;
674 ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); 682 ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
675 ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen); 683 ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen);
676 /* 684 /*
677 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); 685 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
678 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); 686 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
679 */ 687 */
680 ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift;
681 688
682 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { 689 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
683 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 690 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
684 ufsi->i_u1.u2_i_data[i] = 691 ufsi->i_u1.u2_i_data[i] =
685 ufs2_inode->ui_u2.ui_addr.ui_db[i]; 692 ufs2_inode->ui_u2.ui_addr.ui_db[i];
686 } 693 } else {
687 else {
688 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 694 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
689 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; 695 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
690 } 696 }
697}
698
699void ufs_read_inode(struct inode * inode)
700{
701 struct ufs_inode_info *ufsi = UFS_I(inode);
702 struct super_block * sb;
703 struct ufs_sb_private_info * uspi;
704 struct buffer_head * bh;
705
706 UFSD("ENTER, ino %lu\n", inode->i_ino);
707
708 sb = inode->i_sb;
709 uspi = UFS_SB(sb)->s_uspi;
710
711 if (inode->i_ino < UFS_ROOTINO ||
712 inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
713 ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n",
714 inode->i_ino);
715 goto bad_inode;
716 }
717
718 bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
719 if (!bh) {
720 ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n",
721 inode->i_ino);
722 goto bad_inode;
723 }
724 if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
725 struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
726
727 ufs2_read_inode(inode,
728 ufs2_inode + ufs_inotofsbo(inode->i_ino));
729 } else {
730 struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
731
732 ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
733 }
734
735 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */
736 inode->i_version++;
737 ufsi->i_lastfrag =
738 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
739 ufsi->i_dir_start_lookup = 0;
691 ufsi->i_osync = 0; 740 ufsi->i_osync = 0;
692 741
693 if (S_ISREG(inode->i_mode)) { 742 ufs_set_inode_ops(inode);
694 inode->i_op = &ufs_file_inode_operations;
695 inode->i_fop = &ufs_file_operations;
696 inode->i_mapping->a_ops = &ufs_aops;
697 } else if (S_ISDIR(inode->i_mode)) {
698 inode->i_op = &ufs_dir_inode_operations;
699 inode->i_fop = &ufs_dir_operations;
700 } else if (S_ISLNK(inode->i_mode)) {
701 if (!inode->i_blocks)
702 inode->i_op = &ufs_fast_symlink_inode_operations;
703 else {
704 inode->i_op = &page_symlink_inode_operations;
705 inode->i_mapping->a_ops = &ufs_aops;
706 }
707 } else /* TODO : here ...*/
708 init_special_inode(inode, inode->i_mode,
709 ufs_get_inode_dev(sb, ufsi));
710 743
711 brelse(bh); 744 brelse(bh);
712 745
713 UFSD(("EXIT\n")) 746 UFSD("EXIT\n");
714 return; 747 return;
748
749bad_inode:
750 make_bad_inode(inode);
715} 751}
716 752
717static int ufs_update_inode(struct inode * inode, int do_sync) 753static int ufs_update_inode(struct inode * inode, int do_sync)
@@ -724,7 +760,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
724 unsigned i; 760 unsigned i;
725 unsigned flags; 761 unsigned flags;
726 762
727 UFSD(("ENTER, ino %lu\n", inode->i_ino)) 763 UFSD("ENTER, ino %lu\n", inode->i_ino);
728 764
729 sb = inode->i_sb; 765 sb = inode->i_sb;
730 uspi = UFS_SB(sb)->s_uspi; 766 uspi = UFS_SB(sb)->s_uspi;
@@ -785,7 +821,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
785 sync_dirty_buffer(bh); 821 sync_dirty_buffer(bh);
786 brelse (bh); 822 brelse (bh);
787 823
788 UFSD(("EXIT\n")) 824 UFSD("EXIT\n");
789 return 0; 825 return 0;
790} 826}
791 827
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 8d5f98a01c74..abd5f23a426d 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -1,6 +1,9 @@
1/* 1/*
2 * linux/fs/ufs/namei.c 2 * linux/fs/ufs/namei.c
3 * 3 *
4 * Migration to usage of "page cache" in May 2006 by
5 * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
6 *
4 * Copyright (C) 1998 7 * Copyright (C) 1998
5 * Daniel Pirkl <daniel.pirkl@email.cz> 8 * Daniel Pirkl <daniel.pirkl@email.cz>
6 * Charles University, Faculty of Mathematics and Physics 9 * Charles University, Faculty of Mathematics and Physics
@@ -28,21 +31,9 @@
28#include <linux/fs.h> 31#include <linux/fs.h>
29#include <linux/ufs_fs.h> 32#include <linux/ufs_fs.h>
30#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
31#include <linux/buffer_head.h>
32#include "swab.h" /* will go away - see comment in mknod() */ 34#include "swab.h" /* will go away - see comment in mknod() */
33#include "util.h" 35#include "util.h"
34 36
35/*
36#undef UFS_NAMEI_DEBUG
37*/
38#define UFS_NAMEI_DEBUG
39
40#ifdef UFS_NAMEI_DEBUG
41#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
42#else
43#define UFSD(x)
44#endif
45
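The per-file UFSD((...)) macro removed here (and in super.c and truncate.c below) is replaced throughout the patch by a single variadic UFSD(...) call. Its definition lands in "util.h", which this diff does not show, so the following is a reconstruction under that assumption, not quoted from the patch:

	/* Assumed shape of the shared macro in util.h (not shown in this diff): */
	#ifdef CONFIG_UFS_DEBUG
	#define UFSD(f, a...) do { \
		printk("UFSD (%s, %d): %s:", __FILE__, __LINE__, __FUNCTION__); \
		printk(f, ## a); \
	} while (0)
	#else
	#define UFSD(f, a...) /**/
	#endif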
46static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) 37static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
47{ 38{
48 int err = ufs_add_link(dentry, inode); 39 int err = ufs_add_link(dentry, inode);
@@ -88,8 +79,13 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
88static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, 79static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
89 struct nameidata *nd) 80 struct nameidata *nd)
90{ 81{
91 struct inode * inode = ufs_new_inode(dir, mode); 82 struct inode *inode;
92 int err = PTR_ERR(inode); 83 int err;
84
85 UFSD("BEGIN\n");
86 inode = ufs_new_inode(dir, mode);
87 err = PTR_ERR(inode);
88
93 if (!IS_ERR(inode)) { 89 if (!IS_ERR(inode)) {
94 inode->i_op = &ufs_file_inode_operations; 90 inode->i_op = &ufs_file_inode_operations;
95 inode->i_fop = &ufs_file_operations; 91 inode->i_fop = &ufs_file_operations;
@@ -99,6 +95,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
99 err = ufs_add_nondir(dentry, inode); 95 err = ufs_add_nondir(dentry, inode);
100 unlock_kernel(); 96 unlock_kernel();
101 } 97 }
98 UFSD("END: err=%d\n", err);
102 return err; 99 return err;
103} 100}
104 101
@@ -205,6 +202,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
205 202
206 inode->i_op = &ufs_dir_inode_operations; 203 inode->i_op = &ufs_dir_inode_operations;
207 inode->i_fop = &ufs_dir_operations; 204 inode->i_fop = &ufs_dir_operations;
205 inode->i_mapping->a_ops = &ufs_aops;
208 206
209 inode_inc_link_count(inode); 207 inode_inc_link_count(inode);
210 208
@@ -231,19 +229,18 @@ out_dir:
231 goto out; 229 goto out;
232} 230}
233 231
234static int ufs_unlink(struct inode * dir, struct dentry *dentry) 232static int ufs_unlink(struct inode *dir, struct dentry *dentry)
235{ 233{
236 struct inode * inode = dentry->d_inode; 234 struct inode * inode = dentry->d_inode;
237 struct buffer_head * bh; 235 struct ufs_dir_entry *de;
238 struct ufs_dir_entry * de; 236 struct page *page;
239 int err = -ENOENT; 237 int err = -ENOENT;
240 238
241 lock_kernel(); 239 de = ufs_find_entry(dir, dentry, &page);
242 de = ufs_find_entry (dentry, &bh);
243 if (!de) 240 if (!de)
244 goto out; 241 goto out;
245 242
246 err = ufs_delete_entry (dir, de, bh); 243 err = ufs_delete_entry(dir, de, page);
247 if (err) 244 if (err)
248 goto out; 245 goto out;
249 246
@@ -251,7 +248,6 @@ static int ufs_unlink(struct inode * dir, struct dentry *dentry)
251 inode_dec_link_count(inode); 248 inode_dec_link_count(inode);
252 err = 0; 249 err = 0;
253out: 250out:
254 unlock_kernel();
255 return err; 251 return err;
256} 252}
257 253
@@ -273,42 +269,42 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
273 return err; 269 return err;
274} 270}
275 271
276static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry, 272static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
277 struct inode * new_dir, struct dentry * new_dentry ) 273 struct inode *new_dir, struct dentry *new_dentry)
278{ 274{
279 struct inode *old_inode = old_dentry->d_inode; 275 struct inode *old_inode = old_dentry->d_inode;
280 struct inode *new_inode = new_dentry->d_inode; 276 struct inode *new_inode = new_dentry->d_inode;
281 struct buffer_head *dir_bh = NULL; 277 struct page *dir_page = NULL;
282 struct ufs_dir_entry *dir_de = NULL; 278 struct ufs_dir_entry * dir_de = NULL;
283 struct buffer_head *old_bh; 279 struct page *old_page;
284 struct ufs_dir_entry *old_de; 280 struct ufs_dir_entry *old_de;
285 int err = -ENOENT; 281 int err = -ENOENT;
286 282
287 lock_kernel(); 283 old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
288 old_de = ufs_find_entry (old_dentry, &old_bh);
289 if (!old_de) 284 if (!old_de)
290 goto out; 285 goto out;
291 286
292 if (S_ISDIR(old_inode->i_mode)) { 287 if (S_ISDIR(old_inode->i_mode)) {
293 err = -EIO; 288 err = -EIO;
294 dir_de = ufs_dotdot(old_inode, &dir_bh); 289 dir_de = ufs_dotdot(old_inode, &dir_page);
295 if (!dir_de) 290 if (!dir_de)
296 goto out_old; 291 goto out_old;
297 } 292 }
298 293
299 if (new_inode) { 294 if (new_inode) {
300 struct buffer_head *new_bh; 295 struct page *new_page;
301 struct ufs_dir_entry *new_de; 296 struct ufs_dir_entry *new_de;
302 297
303 err = -ENOTEMPTY; 298 err = -ENOTEMPTY;
304 if (dir_de && !ufs_empty_dir (new_inode)) 299 if (dir_de && !ufs_empty_dir(new_inode))
305 goto out_dir; 300 goto out_dir;
301
306 err = -ENOENT; 302 err = -ENOENT;
307 new_de = ufs_find_entry (new_dentry, &new_bh); 303 new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
308 if (!new_de) 304 if (!new_de)
309 goto out_dir; 305 goto out_dir;
310 inode_inc_link_count(old_inode); 306 inode_inc_link_count(old_inode);
311 ufs_set_link(new_dir, new_de, new_bh, old_inode); 307 ufs_set_link(new_dir, new_de, new_page, old_inode);
312 new_inode->i_ctime = CURRENT_TIME_SEC; 308 new_inode->i_ctime = CURRENT_TIME_SEC;
313 if (dir_de) 309 if (dir_de)
314 new_inode->i_nlink--; 310 new_inode->i_nlink--;
@@ -329,24 +325,32 @@ static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
329 inode_inc_link_count(new_dir); 325 inode_inc_link_count(new_dir);
330 } 326 }
331 327
332 ufs_delete_entry (old_dir, old_de, old_bh); 328 /*
329 * Like most other Unix systems, set the ctime for inodes on a
330 * rename.
331 * inode_dec_link_count() will mark the inode dirty.
332 */
333 old_inode->i_ctime = CURRENT_TIME_SEC;
333 334
335 ufs_delete_entry(old_dir, old_de, old_page);
334 inode_dec_link_count(old_inode); 336 inode_dec_link_count(old_inode);
335 337
336 if (dir_de) { 338 if (dir_de) {
337 ufs_set_link(old_inode, dir_de, dir_bh, new_dir); 339 ufs_set_link(old_inode, dir_de, dir_page, new_dir);
338 inode_dec_link_count(old_dir); 340 inode_dec_link_count(old_dir);
339 } 341 }
340 unlock_kernel();
341 return 0; 342 return 0;
342 343
344
343out_dir: 345out_dir:
344 if (dir_de) 346 if (dir_de) {
345 brelse(dir_bh); 347 kunmap(dir_page);
348 page_cache_release(dir_page);
349 }
346out_old: 350out_old:
347 brelse (old_bh); 351 kunmap(old_page);
352 page_cache_release(old_page);
348out: 353out:
349 unlock_kernel();
350 return err; 354 return err;
351} 355}
352 356
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index fe5ab2aa2899..74ef5e9bedff 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -90,95 +90,84 @@
90#include "swab.h" 90#include "swab.h"
91#include "util.h" 91#include "util.h"
92 92
93#undef UFS_SUPER_DEBUG 93#ifdef CONFIG_UFS_DEBUG
94#undef UFS_SUPER_DEBUG_MORE
95
96
97#undef UFS_SUPER_DEBUG_MORE
98#ifdef UFS_SUPER_DEBUG
99#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
100#else
101#define UFSD(x)
102#endif
103
104#ifdef UFS_SUPER_DEBUG_MORE
105/* 94/*
106 * Print contents of ufs_super_block, useful for debugging 95 * Print contents of ufs_super_block, useful for debugging
107 */ 96 */
108void ufs_print_super_stuff(struct super_block *sb, 97static void ufs_print_super_stuff(struct super_block *sb, unsigned flags,
109 struct ufs_super_block_first * usb1, 98 struct ufs_super_block_first *usb1,
110 struct ufs_super_block_second * usb2, 99 struct ufs_super_block_second *usb2,
111 struct ufs_super_block_third * usb3) 100 struct ufs_super_block_third *usb3)
112{ 101{
113 printk("ufs_print_super_stuff\n"); 102 printk("ufs_print_super_stuff\n");
114 printk("size of usb: %u\n", sizeof(struct ufs_super_block)); 103 printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
115 printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb3->fs_magic)); 104 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
116 printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); 105 printk(" fs_size: %llu\n", (unsigned long long)
117 printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); 106 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
118 printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); 107 printk(" fs_dsize: %llu\n", (unsigned long long)
119 printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); 108 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
120 printk(" cgoffset: %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset)); 109 printk(" bsize: %u\n",
121 printk(" ~cgmask: 0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask)); 110 fs32_to_cpu(sb, usb1->fs_bsize));
122 printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); 111 printk(" fsize: %u\n",
123 printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); 112 fs32_to_cpu(sb, usb1->fs_fsize));
124 printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); 113 printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname);
125 printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); 114 printk(" fs_sblockloc: %llu\n", (unsigned long long)
126 printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); 115 fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
127 printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); 116 printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long)
128 printk(" fragshift: %u\n", fs32_to_cpu(sb, usb1->fs_fragshift)); 117 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
129 printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); 118 printk(" cs_nbfree(No of free blocks): %llu\n",
130 printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); 119 (unsigned long long)
131 printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); 120 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
132 printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); 121 } else {
133 printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); 122 printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
134 printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); 123 printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
135 printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); 124 printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
136 printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); 125 printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
137 printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); 126 printk(" cgoffset: %u\n",
138 printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); 127 fs32_to_cpu(sb, usb1->fs_cgoffset));
139 printk(" fstodb: %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb)); 128 printk(" ~cgmask: 0x%x\n",
140 printk(" contigsumsize: %d\n", fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize)); 129 ~fs32_to_cpu(sb, usb1->fs_cgmask));
141 printk(" postblformat: %u\n", fs32_to_cpu(sb, usb3->fs_postblformat)); 130 printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size));
142 printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); 131 printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
143 printk(" ndir %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); 132 printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
144 printk(" nifree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); 133 printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
145 printk(" nbfree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); 134 printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
146 printk(" nffree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); 135 printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag));
147 printk("\n"); 136 printk(" fragshift: %u\n",
148} 137 fs32_to_cpu(sb, usb1->fs_fragshift));
149 138 printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
150/* 139 printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
151 * Print contents of ufs2 ufs_super_block, useful for debugging 140 printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
152 */ 141 printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc));
153void ufs2_print_super_stuff( 142 printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
154 struct super_block *sb, 143 printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
155 struct ufs_super_block *usb) 144 printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
156{ 145 printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
157 printk("ufs_print_super_stuff\n"); 146 printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
158 printk("size of usb: %u\n", sizeof(struct ufs_super_block)); 147 printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
159 printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb->fs_magic)); 148 printk(" fstodb: %u\n",
160 printk(" fs_size: %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size)); 149 fs32_to_cpu(sb, usb1->fs_fsbtodb));
161 printk(" fs_dsize: %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize)); 150 printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
162 printk(" bsize: %u\n", fs32_to_cpu(usb, usb->fs_bsize)); 151 printk(" ndir %u\n",
163 printk(" fsize: %u\n", fs32_to_cpu(usb, usb->fs_fsize)); 152 fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
164 printk(" fs_volname: %s\n", usb->fs_u11.fs_u2.fs_volname); 153 printk(" nifree %u\n",
165 printk(" fs_fsmnt: %s\n", usb->fs_u11.fs_u2.fs_fsmnt); 154 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
166 printk(" fs_sblockloc: %u\n",fs64_to_cpu(sb, 155 printk(" nbfree %u\n",
167 usb->fs_u11.fs_u2.fs_sblockloc)); 156 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
168 printk(" cs_ndir(No of dirs): %u\n",fs64_to_cpu(sb, 157 printk(" nffree %u\n",
169 usb->fs_u11.fs_u2.fs_cstotal.cs_ndir)); 158 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
170 printk(" cs_nbfree(No of free blocks): %u\n",fs64_to_cpu(sb, 159 }
171 usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
172 printk("\n"); 160 printk("\n");
173} 161}
174 162
175/* 163/*
176 * Print contents of ufs_cylinder_group, useful for debugging 164 * Print contents of ufs_cylinder_group, useful for debugging
177 */ 165 */
178void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) 166static void ufs_print_cylinder_stuff(struct super_block *sb,
167 struct ufs_cylinder_group *cg)
179{ 168{
180 printk("\nufs_print_cylinder_stuff\n"); 169 printk("\nufs_print_cylinder_stuff\n");
181 printk("size of ucg: %u\n", sizeof(struct ufs_cylinder_group)); 170 printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
182 printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); 171 printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic));
183 printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); 172 printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time));
184 printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); 173 printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx));
@@ -202,12 +191,18 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
202 printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); 191 printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
203 printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); 192 printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
204 printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); 193 printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
205 printk(" clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); 194 printk(" clustersumoff %u\n",
206 printk(" clusteroff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); 195 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
207 printk(" nclusterblks %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); 196 printk(" clusteroff %u\n",
197 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
198 printk(" nclusterblks %u\n",
199 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
208 printk("\n"); 200 printk("\n");
209} 201}
210#endif /* UFS_SUPER_DEBUG_MORE */ 202#else
203# define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/
204# define ufs_print_cylinder_stuff(sb, cg) /**/
205#endif /* CONFIG_UFS_DEBUG */
211 206
212static struct super_operations ufs_super_ops; 207static struct super_operations ufs_super_ops;
213 208
@@ -225,7 +220,7 @@ void ufs_error (struct super_block * sb, const char * function,
225 220
226 if (!(sb->s_flags & MS_RDONLY)) { 221 if (!(sb->s_flags & MS_RDONLY)) {
227 usb1->fs_clean = UFS_FSBAD; 222 usb1->fs_clean = UFS_FSBAD;
228 ubh_mark_buffer_dirty(USPI_UBH); 223 ubh_mark_buffer_dirty(USPI_UBH(uspi));
229 sb->s_dirt = 1; 224 sb->s_dirt = 1;
230 sb->s_flags |= MS_RDONLY; 225 sb->s_flags |= MS_RDONLY;
231 } 226 }
@@ -257,7 +252,7 @@ void ufs_panic (struct super_block * sb, const char * function,
257 252
258 if (!(sb->s_flags & MS_RDONLY)) { 253 if (!(sb->s_flags & MS_RDONLY)) {
259 usb1->fs_clean = UFS_FSBAD; 254 usb1->fs_clean = UFS_FSBAD;
260 ubh_mark_buffer_dirty(USPI_UBH); 255 ubh_mark_buffer_dirty(USPI_UBH(uspi));
261 sb->s_dirt = 1; 256 sb->s_dirt = 1;
262 } 257 }
263 va_start (args, fmt); 258 va_start (args, fmt);
@@ -309,7 +304,7 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
309{ 304{
310 char * p; 305 char * p;
311 306
312 UFSD(("ENTER\n")) 307 UFSD("ENTER\n");
313 308
314 if (!options) 309 if (!options)
315 return 1; 310 return 1;
@@ -386,27 +381,57 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
386} 381}
387 382
388/* 383/*
384 * Different types of UFS hold fs_cstotal in different
385 * places, and use different data structures for it.
386 * To make things simpler we just copy fs_cstotal to ufs_sb_private_info
387 */
388static void ufs_setup_cstotal(struct super_block *sb)
389{
390 struct ufs_sb_info *sbi = UFS_SB(sb);
391 struct ufs_sb_private_info *uspi = sbi->s_uspi;
392 struct ufs_super_block_first *usb1;
393 struct ufs_super_block_second *usb2;
394 struct ufs_super_block_third *usb3;
395 unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
396
397 UFSD("ENTER, mtype=%u\n", mtype);
398 usb1 = ubh_get_usb_first(uspi);
399 usb2 = ubh_get_usb_second(uspi);
400 usb3 = ubh_get_usb_third(uspi);
401
402 if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
403 (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
404 mtype == UFS_MOUNT_UFSTYPE_UFS2) {
405 /* the statistics live in a different place than usual */
406 uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
407 uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree);
408 uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
409 uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
410 } else {
411 uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir);
412 uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree);
413 uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
414 uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
415 }
416 UFSD("EXIT\n");
417}
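With fs_cstotal mirrored into uspi->cs_total at mount time, the allocators (balloc.c and ialloc.c, which are outside this diff) can presumably maintain the in-core copy and let ufs_put_cstotal() below fold it back into whichever on-disk layout applies. An illustrative fragment, offered only as an assumption about those callers:

	/* Illustrative assumption; the allocator changes are not in this diff. */
	uspi->cs_total.cs_nbfree--;	/* one block allocated */
	sb->s_dirt = 1;			/* ufs_write_super() -> ufs_put_cstotal() */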
418
419/*
389 * Read on-disk structures associated with cylinder groups 420 * Read on-disk structures associated with cylinder groups
390 */ 421 */
391static int ufs_read_cylinder_structures (struct super_block *sb) 422static int ufs_read_cylinder_structures(struct super_block *sb)
392{ 423{
393 struct ufs_sb_info * sbi = UFS_SB(sb); 424 struct ufs_sb_info *sbi = UFS_SB(sb);
394 struct ufs_sb_private_info * uspi; 425 struct ufs_sb_private_info *uspi = sbi->s_uspi;
395 struct ufs_super_block *usb; 426 unsigned flags = sbi->s_flags;
396 struct ufs_buffer_head * ubh; 427 struct ufs_buffer_head * ubh;
397 unsigned char * base, * space; 428 unsigned char * base, * space;
398 unsigned size, blks, i; 429 unsigned size, blks, i;
399 unsigned flags = 0; 430 struct ufs_super_block_third *usb3;
400
401 UFSD(("ENTER\n"))
402
403 uspi = sbi->s_uspi;
404 431
405 usb = (struct ufs_super_block *) 432 UFSD("ENTER\n");
406 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data;
407 433
408 flags = UFS_SB(sb)->s_flags; 434 usb3 = ubh_get_usb_third(uspi);
409
410 /* 435 /*
411 * Read cs structures from (usually) first data block 436 * Read cs structures from (usually) first data block
412 * on the device. 437 * on the device.
@@ -424,7 +449,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
424 449
425 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 450 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
426 ubh = ubh_bread(sb, 451 ubh = ubh_bread(sb,
427 fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size); 452 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size);
428 else 453 else
429 ubh = ubh_bread(sb, uspi->s_csaddr + i, size); 454 ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
430 455
@@ -451,14 +476,13 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
451 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 476 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
452 } 477 }
453 for (i = 0; i < uspi->s_ncg; i++) { 478 for (i = 0; i < uspi->s_ncg; i++) {
454 UFSD(("read cg %u\n", i)) 479 UFSD("read cg %u\n", i);
455 if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) 480 if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i))))
456 goto failed; 481 goto failed;
457 if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) 482 if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
458 goto failed; 483 goto failed;
459#ifdef UFS_SUPER_DEBUG_MORE 484
460 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); 485 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
461#endif
462 } 486 }
463 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { 487 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
464 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) 488 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
@@ -466,7 +490,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
466 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 490 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
467 } 491 }
468 sbi->s_cg_loaded = 0; 492 sbi->s_cg_loaded = 0;
469 UFSD(("EXIT\n")) 493 UFSD("EXIT\n");
470 return 1; 494 return 1;
471 495
472failed: 496failed:
@@ -479,26 +503,69 @@ failed:
479 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) 503 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++)
480 kfree (sbi->s_ucpi[i]); 504 kfree (sbi->s_ucpi[i]);
481 } 505 }
482 UFSD(("EXIT (FAILED)\n")) 506 UFSD("EXIT (FAILED)\n");
483 return 0; 507 return 0;
484} 508}
485 509
486/* 510/*
487 * Put on-disk structures associated with cylinder groups and 511 * Sync our internal copy of fs_cstotal with disk
488 * write them back to disk
489 */ 512 */
490static void ufs_put_cylinder_structures (struct super_block *sb) 513static void ufs_put_cstotal(struct super_block *sb)
491{ 514{
492 struct ufs_sb_info * sbi = UFS_SB(sb); 515 unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
493 struct ufs_sb_private_info * uspi; 516 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
517 struct ufs_super_block_first *usb1;
518 struct ufs_super_block_second *usb2;
519 struct ufs_super_block_third *usb3;
520
521 UFSD("ENTER\n");
522 usb1 = ubh_get_usb_first(uspi);
523 usb2 = ubh_get_usb_second(uspi);
524 usb3 = ubh_get_usb_third(uspi);
525
526 if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
527 (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
528 mtype == UFS_MOUNT_UFSTYPE_UFS2) {
529 /* the statistics live in a different place than usual */
530 usb2->fs_un.fs_u2.cs_ndir =
531 cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
532 usb2->fs_un.fs_u2.cs_nbfree =
533 cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
534 usb3->fs_un1.fs_u2.cs_nifree =
535 cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
536 usb3->fs_un1.fs_u2.cs_nffree =
537 cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
538 } else {
539 usb1->fs_cstotal.cs_ndir =
540 cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
541 usb1->fs_cstotal.cs_nbfree =
542 cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
543 usb1->fs_cstotal.cs_nifree =
544 cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
545 usb1->fs_cstotal.cs_nffree =
546 cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
547 }
548 ubh_mark_buffer_dirty(USPI_UBH(uspi));
549 UFSD("EXIT\n");
550}
551
552/**
553 * ufs_put_super_internal() - put on-disk internal structures
554 * @sb: pointer to super_block structure
555 * Put on-disk structures associated with cylinder groups
556 * and write them back to disk, also updating cs_total on disk
557 */
558static void ufs_put_super_internal(struct super_block *sb)
559{
560 struct ufs_sb_info *sbi = UFS_SB(sb);
561 struct ufs_sb_private_info *uspi = sbi->s_uspi;
494 struct ufs_buffer_head * ubh; 562 struct ufs_buffer_head * ubh;
495 unsigned char * base, * space; 563 unsigned char * base, * space;
496 unsigned blks, size, i; 564 unsigned blks, size, i;
497
498 UFSD(("ENTER\n"))
499
500 uspi = sbi->s_uspi;
501 565
566
567 UFSD("ENTER\n");
568 ufs_put_cstotal(sb);
502 size = uspi->s_cssize; 569 size = uspi->s_cssize;
503 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 570 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
504 base = space = (char*) sbi->s_csp; 571 base = space = (char*) sbi->s_csp;
@@ -523,7 +590,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
523 brelse (sbi->s_ucg[i]); 590 brelse (sbi->s_ucg[i]);
524 kfree (sbi->s_ucg); 591 kfree (sbi->s_ucg);
525 kfree (base); 592 kfree (base);
526 UFSD(("EXIT\n")) 593 UFSD("EXIT\n");
527} 594}
528 595
529static int ufs_fill_super(struct super_block *sb, void *data, int silent) 596static int ufs_fill_super(struct super_block *sb, void *data, int silent)
@@ -533,7 +600,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
533 struct ufs_super_block_first * usb1; 600 struct ufs_super_block_first * usb1;
534 struct ufs_super_block_second * usb2; 601 struct ufs_super_block_second * usb2;
535 struct ufs_super_block_third * usb3; 602 struct ufs_super_block_third * usb3;
536 struct ufs_super_block *usb;
537 struct ufs_buffer_head * ubh; 603 struct ufs_buffer_head * ubh;
538 struct inode *inode; 604 struct inode *inode;
539 unsigned block_size, super_block_size; 605 unsigned block_size, super_block_size;
@@ -544,7 +610,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
544 ubh = NULL; 610 ubh = NULL;
545 flags = 0; 611 flags = 0;
546 612
547 UFSD(("ENTER\n")) 613 UFSD("ENTER\n");
548 614
549 sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); 615 sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
550 if (!sbi) 616 if (!sbi)
@@ -552,7 +618,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
552 sb->s_fs_info = sbi; 618 sb->s_fs_info = sbi;
553 memset(sbi, 0, sizeof(struct ufs_sb_info)); 619 memset(sbi, 0, sizeof(struct ufs_sb_info));
554 620
555 UFSD(("flag %u\n", (int)(sb->s_flags & MS_RDONLY))) 621 UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
556 622
557#ifndef CONFIG_UFS_FS_WRITE 623#ifndef CONFIG_UFS_FS_WRITE
558 if (!(sb->s_flags & MS_RDONLY)) { 624 if (!(sb->s_flags & MS_RDONLY)) {
@@ -593,7 +659,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
593 the rules */ 659 the rules */
594 switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { 660 switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
595 case UFS_MOUNT_UFSTYPE_44BSD: 661 case UFS_MOUNT_UFSTYPE_44BSD:
596 UFSD(("ufstype=44bsd\n")) 662 UFSD("ufstype=44bsd\n");
597 uspi->s_fsize = block_size = 512; 663 uspi->s_fsize = block_size = 512;
598 uspi->s_fmask = ~(512 - 1); 664 uspi->s_fmask = ~(512 - 1);
599 uspi->s_fshift = 9; 665 uspi->s_fshift = 9;
@@ -602,7 +668,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
602 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; 668 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
603 break; 669 break;
604 case UFS_MOUNT_UFSTYPE_UFS2: 670 case UFS_MOUNT_UFSTYPE_UFS2:
605 UFSD(("ufstype=ufs2\n")); 671 UFSD("ufstype=ufs2\n");
606 super_block_offset=SBLOCK_UFS2; 672 super_block_offset=SBLOCK_UFS2;
607 uspi->s_fsize = block_size = 512; 673 uspi->s_fsize = block_size = 512;
608 uspi->s_fmask = ~(512 - 1); 674 uspi->s_fmask = ~(512 - 1);
@@ -617,7 +683,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
617 break; 683 break;
618 684
619 case UFS_MOUNT_UFSTYPE_SUN: 685 case UFS_MOUNT_UFSTYPE_SUN:
620 UFSD(("ufstype=sun\n")) 686 UFSD("ufstype=sun\n");
621 uspi->s_fsize = block_size = 1024; 687 uspi->s_fsize = block_size = 1024;
622 uspi->s_fmask = ~(1024 - 1); 688 uspi->s_fmask = ~(1024 - 1);
623 uspi->s_fshift = 10; 689 uspi->s_fshift = 10;
@@ -628,7 +694,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
628 break; 694 break;
629 695
630 case UFS_MOUNT_UFSTYPE_SUNx86: 696 case UFS_MOUNT_UFSTYPE_SUNx86:
631 UFSD(("ufstype=sunx86\n")) 697 UFSD("ufstype=sunx86\n");
632 uspi->s_fsize = block_size = 1024; 698 uspi->s_fsize = block_size = 1024;
633 uspi->s_fmask = ~(1024 - 1); 699 uspi->s_fmask = ~(1024 - 1);
634 uspi->s_fshift = 10; 700 uspi->s_fshift = 10;
@@ -639,7 +705,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
639 break; 705 break;
640 706
641 case UFS_MOUNT_UFSTYPE_OLD: 707 case UFS_MOUNT_UFSTYPE_OLD:
642 UFSD(("ufstype=old\n")) 708 UFSD("ufstype=old\n");
643 uspi->s_fsize = block_size = 1024; 709 uspi->s_fsize = block_size = 1024;
644 uspi->s_fmask = ~(1024 - 1); 710 uspi->s_fmask = ~(1024 - 1);
645 uspi->s_fshift = 10; 711 uspi->s_fshift = 10;
@@ -654,7 +720,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
654 break; 720 break;
655 721
656 case UFS_MOUNT_UFSTYPE_NEXTSTEP: 722 case UFS_MOUNT_UFSTYPE_NEXTSTEP:
657 UFSD(("ufstype=nextstep\n")) 723 UFSD("ufstype=nextstep\n");
658 uspi->s_fsize = block_size = 1024; 724 uspi->s_fsize = block_size = 1024;
659 uspi->s_fmask = ~(1024 - 1); 725 uspi->s_fmask = ~(1024 - 1);
660 uspi->s_fshift = 10; 726 uspi->s_fshift = 10;
@@ -669,7 +735,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
669 break; 735 break;
670 736
671 case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: 737 case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
672 UFSD(("ufstype=nextstep-cd\n")) 738 UFSD("ufstype=nextstep-cd\n");
673 uspi->s_fsize = block_size = 2048; 739 uspi->s_fsize = block_size = 2048;
674 uspi->s_fmask = ~(2048 - 1); 740 uspi->s_fmask = ~(2048 - 1);
675 uspi->s_fshift = 11; 741 uspi->s_fshift = 11;
@@ -684,7 +750,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
684 break; 750 break;
685 751
686 case UFS_MOUNT_UFSTYPE_OPENSTEP: 752 case UFS_MOUNT_UFSTYPE_OPENSTEP:
687 UFSD(("ufstype=openstep\n")) 753 UFSD("ufstype=openstep\n");
688 uspi->s_fsize = block_size = 1024; 754 uspi->s_fsize = block_size = 1024;
689 uspi->s_fmask = ~(1024 - 1); 755 uspi->s_fmask = ~(1024 - 1);
690 uspi->s_fshift = 10; 756 uspi->s_fshift = 10;
@@ -699,7 +765,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
699 break; 765 break;
700 766
701 case UFS_MOUNT_UFSTYPE_HP: 767 case UFS_MOUNT_UFSTYPE_HP:
702 UFSD(("ufstype=hp\n")) 768 UFSD("ufstype=hp\n");
703 uspi->s_fsize = block_size = 1024; 769 uspi->s_fsize = block_size = 1024;
704 uspi->s_fmask = ~(1024 - 1); 770 uspi->s_fmask = ~(1024 - 1);
705 uspi->s_fshift = 10; 771 uspi->s_fshift = 10;
@@ -737,8 +803,6 @@ again:
737 usb1 = ubh_get_usb_first(uspi); 803 usb1 = ubh_get_usb_first(uspi);
738 usb2 = ubh_get_usb_second(uspi); 804 usb2 = ubh_get_usb_second(uspi);
739 usb3 = ubh_get_usb_third(uspi); 805 usb3 = ubh_get_usb_third(uspi);
740 usb = (struct ufs_super_block *)
741 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
742 806
743 /* 807 /*
744 * Check ufs magic number 808 * Check ufs magic number
@@ -820,16 +884,12 @@ magic_found:
820 ubh = NULL; 884 ubh = NULL;
821 block_size = uspi->s_fsize; 885 block_size = uspi->s_fsize;
822 super_block_size = uspi->s_sbsize; 886 super_block_size = uspi->s_sbsize;
823 UFSD(("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size)) 887 UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size);
824 goto again; 888 goto again;
825 } 889 }
826 890
827#ifdef UFS_SUPER_DEBUG_MORE 891
828 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 892 ufs_print_super_stuff(sb, flags, usb1, usb2, usb3);
829 ufs2_print_super_stuff(sb,usb);
830 else
831 ufs_print_super_stuff(sb, usb1, usb2, usb3);
832#endif
833 893
834 /* 894 /*
835 * Check, if file system was correctly unmounted. 895 * Check, if file system was correctly unmounted.
@@ -842,13 +902,13 @@ magic_found:
842 (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { 902 (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
843 switch(usb1->fs_clean) { 903 switch(usb1->fs_clean) {
844 case UFS_FSCLEAN: 904 case UFS_FSCLEAN:
845 UFSD(("fs is clean\n")) 905 UFSD("fs is clean\n");
846 break; 906 break;
847 case UFS_FSSTABLE: 907 case UFS_FSSTABLE:
848 UFSD(("fs is stable\n")) 908 UFSD("fs is stable\n");
849 break; 909 break;
850 case UFS_FSOSF1: 910 case UFS_FSOSF1:
851 UFSD(("fs is DEC OSF/1\n")) 911 UFSD("fs is DEC OSF/1\n");
852 break; 912 break;
853 case UFS_FSACTIVE: 913 case UFS_FSACTIVE:
854 printk("ufs_read_super: fs is active\n"); 914 printk("ufs_read_super: fs is active\n");
@@ -863,8 +923,7 @@ magic_found:
863 sb->s_flags |= MS_RDONLY; 923 sb->s_flags |= MS_RDONLY;
864 break; 924 break;
865 } 925 }
866 } 926 } else {
867 else {
868 printk("ufs_read_super: fs needs fsck\n"); 927 printk("ufs_read_super: fs needs fsck\n");
869 sb->s_flags |= MS_RDONLY; 928 sb->s_flags |= MS_RDONLY;
870 } 929 }
@@ -884,10 +943,9 @@ magic_found:
884 uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); 943 uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
885 944
886 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 945 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
887 uspi->s_u2_size = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size); 946 uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
888 uspi->s_u2_dsize = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize); 947 uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
889 } 948 } else {
890 else {
891 uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); 949 uspi->s_size = fs32_to_cpu(sb, usb1->fs_size);
892 uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); 950 uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize);
893 } 951 }
@@ -901,8 +959,8 @@ magic_found:
901 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); 959 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
902 uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); 960 uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift);
903 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); 961 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
904 UFSD(("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, 962 UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
905 uspi->s_fshift)); 963 uspi->s_fshift);
906 uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); 964 uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift);
907 uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); 965 uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb);
908 /* s_sbsize already set */ 966 /* s_sbsize already set */
@@ -922,8 +980,8 @@ magic_found:
922 uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); 980 uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc);
923 uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); 981 uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg);
924 uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); 982 uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg);
925 uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_cpc); 983 uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc);
926 uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize); 984 uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize);
927 uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); 985 uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3);
928 uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); 986 uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3);
929 uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); 987 uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat);
@@ -935,12 +993,11 @@ magic_found:
935 * Compute other frequently used values 993
936 */ 994 */
937 uspi->s_fpbmask = uspi->s_fpb - 1; 995 uspi->s_fpbmask = uspi->s_fpb - 1;
938 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 996 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
939 uspi->s_apbshift = uspi->s_bshift - 3; 997 uspi->s_apbshift = uspi->s_bshift - 3;
940 } 998 else
941 else {
942 uspi->s_apbshift = uspi->s_bshift - 2; 999 uspi->s_apbshift = uspi->s_bshift - 2;
943 } 1000
944 uspi->s_2apbshift = uspi->s_apbshift * 2; 1001 uspi->s_2apbshift = uspi->s_apbshift * 2;
945 uspi->s_3apbshift = uspi->s_apbshift * 3; 1002 uspi->s_3apbshift = uspi->s_apbshift * 3;
946 uspi->s_apb = 1 << uspi->s_apbshift; 1003 uspi->s_apb = 1 << uspi->s_apbshift;
@@ -956,7 +1013,7 @@ magic_found:
956 if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == 1013 if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) ==
957 UFS_MOUNT_UFSTYPE_44BSD) 1014 UFS_MOUNT_UFSTYPE_44BSD)
958 uspi->s_maxsymlinklen = 1015 uspi->s_maxsymlinklen =
959 fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_maxsymlinklen); 1016 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
960 1017
961 sbi->s_flags = flags; 1018 sbi->s_flags = flags;
962 1019
@@ -967,7 +1024,7 @@ magic_found:
967 if (!sb->s_root) 1024 if (!sb->s_root)
968 goto dalloc_failed; 1025 goto dalloc_failed;
969 1026
970 1027 ufs_setup_cstotal(sb);
971 /* 1028 /*
972 * Read cylinder group structures 1029 * Read cylinder group structures
973 */ 1030 */
@@ -975,7 +1032,7 @@ magic_found:
975 if (!ufs_read_cylinder_structures(sb)) 1032 if (!ufs_read_cylinder_structures(sb))
976 goto failed; 1033 goto failed;
977 1034
978 UFSD(("EXIT\n")) 1035 UFSD("EXIT\n");
979 return 0; 1036 return 0;
980 1037
981dalloc_failed: 1038dalloc_failed:
@@ -986,15 +1043,16 @@ failed:
986 kfree (uspi); 1043 kfree (uspi);
987 kfree(sbi); 1044 kfree(sbi);
988 sb->s_fs_info = NULL; 1045 sb->s_fs_info = NULL;
989 UFSD(("EXIT (FAILED)\n")) 1046 UFSD("EXIT (FAILED)\n");
990 return -EINVAL; 1047 return -EINVAL;
991 1048
992failed_nomem: 1049failed_nomem:
993 UFSD(("EXIT (NOMEM)\n")) 1050 UFSD("EXIT (NOMEM)\n");
994 return -ENOMEM; 1051 return -ENOMEM;
995} 1052}
996 1053
997static void ufs_write_super (struct super_block *sb) { 1054static void ufs_write_super(struct super_block *sb)
1055{
998 struct ufs_sb_private_info * uspi; 1056 struct ufs_sb_private_info * uspi;
999 struct ufs_super_block_first * usb1; 1057 struct ufs_super_block_first * usb1;
1000 struct ufs_super_block_third * usb3; 1058 struct ufs_super_block_third * usb3;
@@ -1002,7 +1060,7 @@ static void ufs_write_super (struct super_block *sb) {
1002 1060
1003 lock_kernel(); 1061 lock_kernel();
1004 1062
1005 UFSD(("ENTER\n")) 1063 UFSD("ENTER\n");
1006 flags = UFS_SB(sb)->s_flags; 1064 flags = UFS_SB(sb)->s_flags;
1007 uspi = UFS_SB(sb)->s_uspi; 1065 uspi = UFS_SB(sb)->s_uspi;
1008 usb1 = ubh_get_usb_first(uspi); 1066 usb1 = ubh_get_usb_first(uspi);
@@ -1014,26 +1072,27 @@ static void ufs_write_super (struct super_block *sb) {
1014 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 1072 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
1015 ufs_set_fs_state(sb, usb1, usb3, 1073 ufs_set_fs_state(sb, usb1, usb3,
1016 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); 1074 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
1017 ubh_mark_buffer_dirty (USPI_UBH); 1075 ufs_put_cstotal(sb);
1018 } 1076 }
1019 sb->s_dirt = 0; 1077 sb->s_dirt = 0;
1020 UFSD(("EXIT\n")) 1078 UFSD("EXIT\n");
1021 unlock_kernel(); 1079 unlock_kernel();
1022} 1080}
1023 1081
1024static void ufs_put_super (struct super_block *sb) 1082static void ufs_put_super(struct super_block *sb)
1025{ 1083{
1026 struct ufs_sb_info * sbi = UFS_SB(sb); 1084 struct ufs_sb_info * sbi = UFS_SB(sb);
1027 1085
1028 UFSD(("ENTER\n")) 1086 UFSD("ENTER\n");
1029 1087
1030 if (!(sb->s_flags & MS_RDONLY)) 1088 if (!(sb->s_flags & MS_RDONLY))
1031 ufs_put_cylinder_structures (sb); 1089 ufs_put_super_internal(sb);
1032 1090
1033 ubh_brelse_uspi (sbi->s_uspi); 1091 ubh_brelse_uspi (sbi->s_uspi);
1034 kfree (sbi->s_uspi); 1092 kfree (sbi->s_uspi);
1035 kfree (sbi); 1093 kfree (sbi);
1036 sb->s_fs_info = NULL; 1094 sb->s_fs_info = NULL;
1095 UFSD("EXIT\n");
1037 return; 1096 return;
1038} 1097}
1039 1098
@@ -1062,8 +1121,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1062 return -EINVAL; 1121 return -EINVAL;
1063 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1122 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
1064 new_mount_opt |= ufstype; 1123 new_mount_opt |= ufstype;
1065 } 1124 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1066 else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1067 printk("ufstype can't be changed during remount\n"); 1125 printk("ufstype can't be changed during remount\n");
1068 return -EINVAL; 1126 return -EINVAL;
1069 } 1127 }
@@ -1077,20 +1135,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1077 * fs was mounted as rw, remounting ro 1135
1078 */ 1136 */
1079 if (*mount_flags & MS_RDONLY) { 1137 if (*mount_flags & MS_RDONLY) {
1080 ufs_put_cylinder_structures(sb); 1138 ufs_put_super_internal(sb);
1081 usb1->fs_time = cpu_to_fs32(sb, get_seconds()); 1139 usb1->fs_time = cpu_to_fs32(sb, get_seconds());
1082 if ((flags & UFS_ST_MASK) == UFS_ST_SUN 1140 if ((flags & UFS_ST_MASK) == UFS_ST_SUN
1083 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 1141 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
1084 ufs_set_fs_state(sb, usb1, usb3, 1142 ufs_set_fs_state(sb, usb1, usb3,
1085 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); 1143 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
1086 ubh_mark_buffer_dirty (USPI_UBH); 1144 ubh_mark_buffer_dirty (USPI_UBH(uspi));
1087 sb->s_dirt = 0; 1145 sb->s_dirt = 0;
1088 sb->s_flags |= MS_RDONLY; 1146 sb->s_flags |= MS_RDONLY;
1089 } 1147 } else {
1090 /* 1148 /*
1091 * fs was mounted as ro, remounting rw 1149 * fs was mounted as ro, remounting rw
1092 */ 1150 */
1093 else {
1094#ifndef CONFIG_UFS_FS_WRITE 1151#ifndef CONFIG_UFS_FS_WRITE
1095 printk("ufs was compiled with read-only support, " 1152 printk("ufs was compiled with read-only support, "
1096 "can't be mounted as read-write\n"); 1153 "can't be mounted as read-write\n");
@@ -1102,7 +1159,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1102 printk("this ufstype is read-only supported\n"); 1159 printk("this ufstype is read-only supported\n");
1103 return -EINVAL; 1160 return -EINVAL;
1104 } 1161 }
1105 if (!ufs_read_cylinder_structures (sb)) { 1162 if (!ufs_read_cylinder_structures(sb)) {
1106 printk("failed during remounting\n"); 1163 printk("failed during remounting\n");
1107 return -EPERM; 1164 return -EPERM;
1108 } 1165 }
@@ -1113,37 +1170,31 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1113 return 0; 1170 return 0;
1114} 1171}
1115 1172
1116static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf) 1173static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1117{ 1174{
1118 struct super_block *sb = dentry->d_sb; 1175 struct super_block *sb = dentry->d_sb;
1119 struct ufs_sb_private_info * uspi; 1176 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
1120 struct ufs_super_block_first * usb1; 1177 unsigned flags = UFS_SB(sb)->s_flags;
1121 struct ufs_super_block * usb; 1178 struct ufs_super_block_first *usb1;
1122 unsigned flags = 0; 1179 struct ufs_super_block_second *usb2;
1180 struct ufs_super_block_third *usb3;
1123 1181
1124 lock_kernel(); 1182 lock_kernel();
1125 1183
1126 uspi = UFS_SB(sb)->s_uspi; 1184 usb1 = ubh_get_usb_first(uspi);
1127 usb1 = ubh_get_usb_first (uspi); 1185 usb2 = ubh_get_usb_second(uspi);
1128 usb = (struct ufs_super_block *) 1186 usb3 = ubh_get_usb_third(uspi);
1129 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
1130 1187
1131 flags = UFS_SB(sb)->s_flags;
1132 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 1188 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
1133 buf->f_type = UFS2_MAGIC; 1189 buf->f_type = UFS2_MAGIC;
1134 buf->f_blocks = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize); 1190 buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
1135 buf->f_bfree = ufs_blkstofrags(fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)) + 1191 } else {
1136 fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nffree);
1137 buf->f_ffree = fs64_to_cpu(sb,
1138 usb->fs_u11.fs_u2.fs_cstotal.cs_nifree);
1139 }
1140 else {
1141 buf->f_type = UFS_MAGIC; 1192 buf->f_type = UFS_MAGIC;
1142 buf->f_blocks = uspi->s_dsize; 1193 buf->f_blocks = uspi->s_dsize;
1143 buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
1144 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
1145 buf->f_ffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
1146 } 1194 }
1195 buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
1196 uspi->cs_total.cs_nffree;
1197 buf->f_ffree = uspi->cs_total.cs_nifree;
1147 buf->f_bsize = sb->s_blocksize; 1198 buf->f_bsize = sb->s_blocksize;
1148 buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) 1199 buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
1149 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; 1200 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
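The f_bavail expression implements the classic BSD minfree reserve: unprivileged statfs callers only see the space above the reserved percentage. The arithmetic in isolation, as a standalone sketch with hypothetical numbers:

	#include <stdio.h>

	/* Blocks visible to ordinary users once the minfree reserve is held back. */
	static long bavail(long blocks, long bfree, int minfree_pct)
	{
		long reserved = (blocks / 100) * minfree_pct;

		return bfree > reserved ? bfree - reserved : 0;
	}

	int main(void)
	{
		/* hypothetical fs: 1000 blocks total, 80 free, 5% reserve */
		printf("%ld\n", bavail(1000, 80, 5));   /* prints 30 */
		return 0;
	}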
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 02e86291ef8a..3c3b301f8701 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -49,14 +49,6 @@
49#include "swab.h" 49#include "swab.h"
50#include "util.h" 50#include "util.h"
51 51
52#undef UFS_TRUNCATE_DEBUG
53
54#ifdef UFS_TRUNCATE_DEBUG
55#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
56#else
57#define UFSD(x)
58#endif
59
60/* 52/*
61 * Secure deletion currently doesn't work. It interacts very badly 53 * Secure deletion currently doesn't work. It interacts very badly
62 * with buffers shared with memory mappings, and for that reason 54 * with buffers shared with memory mappings, and for that reason
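The per-file UFSD/UFS_TRUNCATE_DEBUG boilerplate deleted here (and again in util.c below) is presumably replaced by a single variadic definition in a shared ufs header; bare-format call sites such as UFSD("ENTER\n") only work against something of this shape. A sketch, with the CONFIG_UFS_DEBUG switch being an assumption:

	/* One shared definition instead of a copy per file (sketch). */
	#ifdef CONFIG_UFS_DEBUG
	#define UFSD(f, a...)	do {					\
		printk(KERN_DEBUG "UFSD (%s, %d): %s: ",		\
		       __FILE__, __LINE__, __FUNCTION__);		\
		printk(f, ## a);					\
	} while (0)
	#else
	#define UFSD(f, a...)	do { } while (0)
	#endif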
@@ -82,7 +74,7 @@ static int ufs_trunc_direct (struct inode * inode)
82 unsigned i, tmp; 74 unsigned i, tmp;
83 int retry; 75 int retry;
84 76
85 UFSD(("ENTER\n")) 77 UFSD("ENTER\n");
86 78
87 sb = inode->i_sb; 79 sb = inode->i_sb;
88 uspi = UFS_SB(sb)->s_uspi; 80 uspi = UFS_SB(sb)->s_uspi;
@@ -105,7 +97,7 @@ static int ufs_trunc_direct (struct inode * inode)
105 block2 = ufs_fragstoblks (frag3); 97 block2 = ufs_fragstoblks (frag3);
106 } 98 }
107 99
108 UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4)) 100 UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4);
109 101
110 if (frag1 >= frag2) 102 if (frag1 >= frag2)
111 goto next1; 103 goto next1;
@@ -120,9 +112,8 @@ static int ufs_trunc_direct (struct inode * inode)
120 frag1 = ufs_fragnum (frag1); 112 frag1 = ufs_fragnum (frag1);
121 frag2 = ufs_fragnum (frag2); 113 frag2 = ufs_fragnum (frag2);
122 114
123 inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift;
124 mark_inode_dirty(inode);
125 ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); 115 ufs_free_fragments (inode, tmp + frag1, frag2 - frag1);
116 mark_inode_dirty(inode);
126 frag_to_free = tmp + frag1; 117 frag_to_free = tmp + frag1;
127 118
128next1: 119next1:
@@ -136,8 +127,7 @@ next1:
136 continue; 127 continue;
137 128
138 *p = 0; 129 *p = 0;
139 inode->i_blocks -= uspi->s_nspb; 130
140 mark_inode_dirty(inode);
141 if (free_count == 0) { 131 if (free_count == 0) {
142 frag_to_free = tmp; 132 frag_to_free = tmp;
143 free_count = uspi->s_fpb; 133 free_count = uspi->s_fpb;
@@ -148,6 +138,7 @@ next1:
148 frag_to_free = tmp; 138 frag_to_free = tmp;
149 free_count = uspi->s_fpb; 139 free_count = uspi->s_fpb;
150 } 140 }
141 mark_inode_dirty(inode);
151 } 142 }
152 143
153 if (free_count > 0) 144 if (free_count > 0)
@@ -166,12 +157,12 @@ next1:
166 frag4 = ufs_fragnum (frag4); 157 frag4 = ufs_fragnum (frag4);
167 158
168 *p = 0; 159 *p = 0;
169 inode->i_blocks -= frag4 << uspi->s_nspfshift; 160
170 mark_inode_dirty(inode);
171 ufs_free_fragments (inode, tmp, frag4); 161 ufs_free_fragments (inode, tmp, frag4);
162 mark_inode_dirty(inode);
172 next3: 163 next3:
173 164
174 UFSD(("EXIT\n")) 165 UFSD("EXIT\n");
175 return retry; 166 return retry;
176} 167}
177 168
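Note the pattern of the deletions in this hunk: every manual `inode->i_blocks -= ...` adjustment disappears and only mark_inode_dirty() survives, which implies the accounting moved into the free routines themselves. A hypothetical sketch of that division of labour (the real ufs_free_fragments also does the bitmap and summary work):

	/* Hypothetical: the free path owns i_blocks, callers just dirty the inode. */
	void example_free_fragments(struct inode *inode, unsigned frag, unsigned count)
	{
		struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;

		/* ... return the fragments to the cylinder-group bitmap ... */

		inode->i_blocks -= count << uspi->s_nspfshift;  /* sectors freed */
	}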
@@ -186,7 +177,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
186 unsigned frag_to_free, free_count; 177 unsigned frag_to_free, free_count;
187 int retry; 178 int retry;
188 179
189 UFSD(("ENTER\n")) 180 UFSD("ENTER\n");
190 181
191 sb = inode->i_sb; 182 sb = inode->i_sb;
192 uspi = UFS_SB(sb)->s_uspi; 183 uspi = UFS_SB(sb)->s_uspi;
@@ -227,7 +218,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
227 frag_to_free = tmp; 218 frag_to_free = tmp;
228 free_count = uspi->s_fpb; 219 free_count = uspi->s_fpb;
229 } 220 }
230 inode->i_blocks -= uspi->s_nspb; 221
231 mark_inode_dirty(inode); 222 mark_inode_dirty(inode);
232 } 223 }
233 224
@@ -238,26 +229,21 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
238 if (*ubh_get_addr32(ind_ubh,i)) 229 if (*ubh_get_addr32(ind_ubh,i))
239 break; 230 break;
240 if (i >= uspi->s_apb) { 231 if (i >= uspi->s_apb) {
241 if (ubh_max_bcount(ind_ubh) != 1) { 232 tmp = fs32_to_cpu(sb, *p);
242 retry = 1; 233 *p = 0;
243 } 234
244 else { 235 ufs_free_blocks (inode, tmp, uspi->s_fpb);
245 tmp = fs32_to_cpu(sb, *p); 236 mark_inode_dirty(inode);
246 *p = 0; 237 ubh_bforget(ind_ubh);
247 inode->i_blocks -= uspi->s_nspb; 238 ind_ubh = NULL;
248 mark_inode_dirty(inode);
249 ufs_free_blocks (inode, tmp, uspi->s_fpb);
250 ubh_bforget(ind_ubh);
251 ind_ubh = NULL;
252 }
253 } 239 }
254 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { 240 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
255 ubh_ll_rw_block (SWRITE, 1, &ind_ubh); 241 ubh_ll_rw_block(SWRITE, ind_ubh);
256 ubh_wait_on_buffer (ind_ubh); 242 ubh_wait_on_buffer (ind_ubh);
257 } 243 }
258 ubh_brelse (ind_ubh); 244 ubh_brelse (ind_ubh);
259 245
260 UFSD(("EXIT\n")) 246 UFSD("EXIT\n");
261 247
262 return retry; 248 return retry;
263} 249}
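With ubh_max_bcount() gone (deleted from util.c further down), a fully empty indirect block is no longer deferred with retry = 1 when other buffer references exist; it is unhooked and dropped in one pass. The new shape, annotated:

	/* All uspi->s_apb slots scanned and found zero: free the block itself. */
	if (i >= uspi->s_apb) {
		tmp = fs32_to_cpu(sb, *p);
		*p = 0;                         /* unhook from the parent block */
		ufs_free_blocks(inode, tmp, uspi->s_fpb);
		mark_inode_dirty(inode);
		ubh_bforget(ind_ubh);           /* discard without writeback */
		ind_ubh = NULL;
	}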
@@ -271,7 +257,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
271 __fs32 * dind; 257 __fs32 * dind;
272 int retry = 0; 258 int retry = 0;
273 259
274 UFSD(("ENTER\n")) 260 UFSD("ENTER\n");
275 261
276 sb = inode->i_sb; 262 sb = inode->i_sb;
277 uspi = UFS_SB(sb)->s_uspi; 263 uspi = UFS_SB(sb)->s_uspi;
@@ -306,25 +292,21 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
306 if (*ubh_get_addr32 (dind_bh, i)) 292 if (*ubh_get_addr32 (dind_bh, i))
307 break; 293 break;
308 if (i >= uspi->s_apb) { 294 if (i >= uspi->s_apb) {
309 if (ubh_max_bcount(dind_bh) != 1) 295 tmp = fs32_to_cpu(sb, *p);
310 retry = 1; 296 *p = 0;
311 else { 297
312 tmp = fs32_to_cpu(sb, *p); 298 ufs_free_blocks(inode, tmp, uspi->s_fpb);
313 *p = 0; 299 mark_inode_dirty(inode);
314 inode->i_blocks -= uspi->s_nspb; 300 ubh_bforget(dind_bh);
315 mark_inode_dirty(inode); 301 dind_bh = NULL;
316 ufs_free_blocks (inode, tmp, uspi->s_fpb);
317 ubh_bforget(dind_bh);
318 dind_bh = NULL;
319 }
320 } 302 }
321 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { 303 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
322 ubh_ll_rw_block (SWRITE, 1, &dind_bh); 304 ubh_ll_rw_block(SWRITE, dind_bh);
323 ubh_wait_on_buffer (dind_bh); 305 ubh_wait_on_buffer (dind_bh);
324 } 306 }
325 ubh_brelse (dind_bh); 307 ubh_brelse (dind_bh);
326 308
327 UFSD(("EXIT\n")) 309 UFSD("EXIT\n");
328 310
329 return retry; 311 return retry;
330} 312}
@@ -339,7 +321,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
339 __fs32 * tind, * p; 321 __fs32 * tind, * p;
340 int retry; 322 int retry;
341 323
342 UFSD(("ENTER\n")) 324 UFSD("ENTER\n");
343 325
344 sb = inode->i_sb; 326 sb = inode->i_sb;
345 uspi = UFS_SB(sb)->s_uspi; 327 uspi = UFS_SB(sb)->s_uspi;
@@ -370,25 +352,21 @@ static int ufs_trunc_tindirect (struct inode * inode)
370 if (*ubh_get_addr32 (tind_bh, i)) 352 if (*ubh_get_addr32 (tind_bh, i))
371 break; 353 break;
372 if (i >= uspi->s_apb) { 354 if (i >= uspi->s_apb) {
373 if (ubh_max_bcount(tind_bh) != 1) 355 tmp = fs32_to_cpu(sb, *p);
374 retry = 1; 356 *p = 0;
375 else { 357
376 tmp = fs32_to_cpu(sb, *p); 358 ufs_free_blocks(inode, tmp, uspi->s_fpb);
377 *p = 0; 359 mark_inode_dirty(inode);
378 inode->i_blocks -= uspi->s_nspb; 360 ubh_bforget(tind_bh);
379 mark_inode_dirty(inode); 361 tind_bh = NULL;
380 ufs_free_blocks (inode, tmp, uspi->s_fpb);
381 ubh_bforget(tind_bh);
382 tind_bh = NULL;
383 }
384 } 362 }
385 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { 363 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
386 ubh_ll_rw_block (SWRITE, 1, &tind_bh); 364 ubh_ll_rw_block(SWRITE, tind_bh);
387 ubh_wait_on_buffer (tind_bh); 365 ubh_wait_on_buffer (tind_bh);
388 } 366 }
389 ubh_brelse (tind_bh); 367 ubh_brelse (tind_bh);
390 368
391 UFSD(("EXIT\n")) 369 UFSD("EXIT\n");
392 return retry; 370 return retry;
393} 371}
394 372
@@ -399,7 +377,7 @@ void ufs_truncate (struct inode * inode)
399 struct ufs_sb_private_info * uspi; 377 struct ufs_sb_private_info * uspi;
400 int retry; 378 int retry;
401 379
402 UFSD(("ENTER\n")) 380 UFSD("ENTER\n");
403 sb = inode->i_sb; 381 sb = inode->i_sb;
404 uspi = UFS_SB(sb)->s_uspi; 382 uspi = UFS_SB(sb)->s_uspi;
405 383
@@ -430,5 +408,5 @@ void ufs_truncate (struct inode * inode)
430 ufsi->i_lastfrag = DIRECT_FRAGMENT; 408 ufsi->i_lastfrag = DIRECT_FRAGMENT;
431 unlock_kernel(); 409 unlock_kernel();
432 mark_inode_dirty(inode); 410 mark_inode_dirty(inode);
433 UFSD(("EXIT\n")) 411 UFSD("EXIT\n");
434} 412}
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 59acc8f073ac..a2f13f45708b 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -14,15 +14,6 @@
14#include "swab.h" 14#include "swab.h"
15#include "util.h" 15#include "util.h"
16 16
17#undef UFS_UTILS_DEBUG
18
19#ifdef UFS_UTILS_DEBUG
20#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
21#else
22#define UFSD(x)
23#endif
24
25
26struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, 17struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
27 struct super_block *sb, u64 fragment, u64 size) 18 struct super_block *sb, u64 fragment, u64 size)
28{ 19{
@@ -63,17 +54,17 @@ struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi,
63 count = size >> uspi->s_fshift; 54 count = size >> uspi->s_fshift;
64 if (count <= 0 || count > UFS_MAXFRAG) 55 if (count <= 0 || count > UFS_MAXFRAG)
65 return NULL; 56 return NULL;
66 USPI_UBH->fragment = fragment; 57 USPI_UBH(uspi)->fragment = fragment;
67 USPI_UBH->count = count; 58 USPI_UBH(uspi)->count = count;
68 for (i = 0; i < count; i++) 59 for (i = 0; i < count; i++)
69 if (!(USPI_UBH->bh[i] = sb_bread(sb, fragment + i))) 60 if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i)))
70 goto failed; 61 goto failed;
71 for (; i < UFS_MAXFRAG; i++) 62 for (; i < UFS_MAXFRAG; i++)
72 USPI_UBH->bh[i] = NULL; 63 USPI_UBH(uspi)->bh[i] = NULL;
73 return USPI_UBH; 64 return USPI_UBH(uspi);
74failed: 65failed:
75 for (j = 0; j < i; j++) 66 for (j = 0; j < i; j++)
76 brelse (USPI_UBH->bh[j]); 67 brelse (USPI_UBH(uspi)->bh[j]);
77 return NULL; 68 return NULL;
78} 69}
79 70
@@ -90,11 +81,11 @@ void ubh_brelse (struct ufs_buffer_head * ubh)
90void ubh_brelse_uspi (struct ufs_sb_private_info * uspi) 81void ubh_brelse_uspi (struct ufs_sb_private_info * uspi)
91{ 82{
92 unsigned i; 83 unsigned i;
93 if (!USPI_UBH) 84 if (!USPI_UBH(uspi))
94 return; 85 return;
95 for ( i = 0; i < USPI_UBH->count; i++ ) { 86 for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) {
96 brelse (USPI_UBH->bh[i]); 87 brelse (USPI_UBH(uspi)->bh[i]);
97 USPI_UBH->bh[i] = NULL; 88 USPI_UBH(uspi)->bh[i] = NULL;
98 } 89 }
99} 90}
100 91
@@ -121,13 +112,12 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
121 } 112 }
122} 113}
123 114
124void ubh_ll_rw_block (int rw, unsigned nr, struct ufs_buffer_head * ubh[]) 115void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
125{ 116{
126 unsigned i;
127 if (!ubh) 117 if (!ubh)
128 return; 118 return;
129 for ( i = 0; i < nr; i++ ) 119
130 ll_rw_block (rw, ubh[i]->count, ubh[i]->bh); 120 ll_rw_block(rw, ubh->count, ubh->bh);
131} 121}
132 122
133void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) 123void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
@@ -139,18 +129,6 @@ void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
139 wait_on_buffer (ubh->bh[i]); 129 wait_on_buffer (ubh->bh[i]);
140} 130}
141 131
142unsigned ubh_max_bcount (struct ufs_buffer_head * ubh)
143{
144 unsigned i;
145 unsigned max = 0;
146 if (!ubh)
147 return 0;
148 for ( i = 0; i < ubh->count; i++ )
149 if ( atomic_read(&ubh->bh[i]->b_count) > max )
150 max = atomic_read(&ubh->bh[i]->b_count);
151 return max;
152}
153
154void ubh_bforget (struct ufs_buffer_head * ubh) 132void ubh_bforget (struct ufs_buffer_head * ubh)
155{ 133{
156 unsigned i; 134 unsigned i;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 48d6d9bcc157..406981fff5e7 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -17,10 +17,16 @@
17#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) 17#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len))
18 18
19/* 19/*
20 * macros used for retyping 20 * functions used for retyping
21 */ 21 */
22#define UCPI_UBH ((struct ufs_buffer_head *)ucpi) 22static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi)
23#define USPI_UBH ((struct ufs_buffer_head *)uspi) 23{
24 return &cpi->c_ubh;
25}
26static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
27{
28 return &spi->s_ubh;
29}
24 30
25 31
26 32
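The old cast macros compiled for any pointer in scope and silently depended on the buffer head being the first member of each private struct; the inline replacements name the member, so the compiler checks the argument type and the field is free to move. Side by side:

	/* Before: blind cast, no type check, layout-dependent. */
	#define USPI_UBH ((struct ufs_buffer_head *)uspi)

	/* After: type-checked and layout-independent. */
	static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
	{
		return &spi->s_ubh;
	}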
@@ -33,12 +39,12 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
33{ 39{
34 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 40 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
35 case UFS_ST_SUN: 41 case UFS_ST_SUN:
36 return fs32_to_cpu(sb, usb3->fs_u2.fs_sun.fs_state); 42 return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
37 case UFS_ST_SUNx86: 43 case UFS_ST_SUNx86:
38 return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); 44 return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state);
39 case UFS_ST_44BSD: 45 case UFS_ST_44BSD:
40 default: 46 default:
41 return fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_state); 47 return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state);
42 } 48 }
43} 49}
44 50
@@ -48,13 +54,13 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
48{ 54{
49 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 55 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
50 case UFS_ST_SUN: 56 case UFS_ST_SUN:
51 usb3->fs_u2.fs_sun.fs_state = cpu_to_fs32(sb, value); 57 usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
52 break; 58 break;
53 case UFS_ST_SUNx86: 59 case UFS_ST_SUNx86:
54 usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); 60 usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value);
55 break; 61 break;
56 case UFS_ST_44BSD: 62 case UFS_ST_44BSD:
57 usb3->fs_u2.fs_44.fs_state = cpu_to_fs32(sb, value); 63 usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value);
58 break; 64 break;
59 } 65 }
60} 66}
@@ -64,7 +70,7 @@ ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1,
64 struct ufs_super_block_third *usb3) 70 struct ufs_super_block_third *usb3)
65{ 71{
66 if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) 72 if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
67 return fs32_to_cpu(sb, usb3->fs_u2.fs_sunx86.fs_npsect); 73 return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect);
68 else 74 else
69 return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); 75 return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect);
70} 76}
@@ -76,16 +82,16 @@ ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3)
76 82
77 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 83 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
78 case UFS_ST_SUN: 84 case UFS_ST_SUN:
79 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qbmask[0]; 85 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0];
80 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qbmask[1]; 86 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1];
81 break; 87 break;
82 case UFS_ST_SUNx86: 88 case UFS_ST_SUNx86:
83 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qbmask[0]; 89 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0];
84 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qbmask[1]; 90 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1];
85 break; 91 break;
86 case UFS_ST_44BSD: 92 case UFS_ST_44BSD:
87 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qbmask[0]; 93 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0];
88 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qbmask[1]; 94 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1];
89 break; 95 break;
90 } 96 }
91 97
@@ -99,16 +105,16 @@ ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3)
99 105
100 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 106 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
101 case UFS_ST_SUN: 107 case UFS_ST_SUN:
102 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qfmask[0]; 108 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0];
103 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qfmask[1]; 109 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1];
104 break; 110 break;
105 case UFS_ST_SUNx86: 111 case UFS_ST_SUNx86:
106 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qfmask[0]; 112 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0];
107 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qfmask[1]; 113 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1];
108 break; 114 break;
109 case UFS_ST_44BSD: 115 case UFS_ST_44BSD:
110 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qfmask[0]; 116 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0];
111 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qfmask[1]; 117 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1];
112 break; 118 break;
113 } 119 }
114 120
@@ -236,9 +242,8 @@ extern void ubh_brelse (struct ufs_buffer_head *);
236extern void ubh_brelse_uspi (struct ufs_sb_private_info *); 242extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
237extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); 243extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
238extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); 244extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
239extern void ubh_ll_rw_block (int, unsigned, struct ufs_buffer_head **); 245extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
240extern void ubh_wait_on_buffer (struct ufs_buffer_head *); 246extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
241extern unsigned ubh_max_bcount (struct ufs_buffer_head *);
242extern void ubh_bforget (struct ufs_buffer_head *); 247extern void ubh_bforget (struct ufs_buffer_head *);
243extern int ubh_buffer_dirty (struct ufs_buffer_head *); 248extern int ubh_buffer_dirty (struct ufs_buffer_head *);
244#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) 249#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
@@ -297,40 +302,26 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
297#define ubh_blkmap(ubh,begin,bit) \ 302#define ubh_blkmap(ubh,begin,bit) \
298 ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) 303 ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
299 304
300
301/*
302 * Macros for access to superblock array structures
303 */
304#define ubh_postbl(ubh,cylno,i) \
305 ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
306 ? (*(__s16*)(ubh_get_addr(ubh, \
307 (unsigned)(&((struct ufs_super_block *)0)->fs_opostbl) \
308 + (((cylno) * 16 + (i)) << 1) ) )) \
309 : (*(__s16*)(ubh_get_addr(ubh, \
310 uspi->s_postbloff + (((cylno) * uspi->s_nrpos + (i)) << 1) ))))
311
312#define ubh_rotbl(ubh,i) \
313 ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
314 ? (*(__u8*)(ubh_get_addr(ubh, \
315 (unsigned)(&((struct ufs_super_block *)0)->fs_space) + (i)))) \
316 : (*(__u8*)(ubh_get_addr(ubh, uspi->s_rotbloff + (i)))))
317
318/* 305/*
319 * Determine the number of available frags given a 306 * Determine the number of available frags given a
320 * percentage to hold in reserve. 307 * percentage to hold in reserve.
321 */ 308 */
322#define ufs_freespace(usb, percentreserved) \ 309static inline u64
323 (ufs_blkstofrags(fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nbfree)) + \ 310ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
324 fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nffree) - (uspi->s_dsize * (percentreserved) / 100)) 311{
312 return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
313 uspi->cs_total.cs_nffree -
314 (uspi->s_dsize * (percentreserved) / 100);
315}
325 316
326/* 317/*
327 * Macros to access cylinder group array structures 318 * Macros to access cylinder group array structures
328 */ 319 */
329#define ubh_cg_blktot(ucpi,cylno) \ 320#define ubh_cg_blktot(ucpi,cylno) \
330 (*((__fs32*)ubh_get_addr(UCPI_UBH, (ucpi)->c_btotoff + ((cylno) << 2)))) 321 (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2))))
331 322
332#define ubh_cg_blks(ucpi,cylno,rpos) \ 323#define ubh_cg_blks(ucpi,cylno,rpos) \
333 (*((__fs16*)ubh_get_addr(UCPI_UBH, \ 324 (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \
334 (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) 325 (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 ))))
335 326
336/* 327/*
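ufs_freespace() now reads the in-core cs_total summary rather than re-swabbing on-disk fields, and takes uspi explicitly instead of capturing sb and usb from the caller's scope. A hypothetical allocator-side use (the capable() escape hatch for privileged tasks is an assumption, not part of this patch):

	/* Refuse an unprivileged allocation once inside the minfree reserve. */
	if (ufs_freespace(uspi, uspi->s_minfree) <= 0 &&
	    !capable(CAP_SYS_RESOURCE))
		return 0;       /* no fragments for ordinary users */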
@@ -508,29 +499,3 @@ static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap,
508 if (fragsize > 0 && fragsize < uspi->s_fpb) 499 if (fragsize > 0 && fragsize < uspi->s_fpb)
509 fs32_add(sb, &fraglist[fragsize], cnt); 500 fs32_add(sb, &fraglist[fragsize], cnt);
510} 501}
511
512#define ubh_scanc(ubh,begin,size,table,mask) _ubh_scanc_(uspi,ubh,begin,size,table,mask)
513static inline unsigned _ubh_scanc_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh,
514 unsigned begin, unsigned size, unsigned char * table, unsigned char mask)
515{
516 unsigned rest, offset;
517 unsigned char * cp;
518
519
520 offset = begin & ~uspi->s_fmask;
521 begin >>= uspi->s_fshift;
522 for (;;) {
523 if ((offset + size) < uspi->s_fsize)
524 rest = size;
525 else
526 rest = uspi->s_fsize - offset;
527 size -= rest;
528 cp = ubh->bh[begin]->b_data + offset;
529 while ((table[*cp++] & mask) == 0 && --rest);
530 if (rest || !size)
531 break;
532 begin++;
533 offset = 0;
534 }
535 return (size + rest);
536}
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 70662371bb11..3d4f6dff2113 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -299,7 +299,8 @@ xfs_file_open(
299 299
300STATIC int 300STATIC int
301xfs_file_close( 301xfs_file_close(
302 struct file *filp) 302 struct file *filp,
303 fl_owner_t id)
303{ 304{
304 return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0, 305 return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0,
305 file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL); 306 file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL);
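This hunk tracks a VFS interface change: the flush file operation (which xfs wires to xfs_file_close) gained an fl_owner_t argument identifying the POSIX lock owner performing the close. Every flush implementation must grow the parameter even when, as here, it does not use it; the minimal shape is:

	/* Post-change ->flush signature; the owner id may simply be ignored. */
	static int example_flush(struct file *filp, fl_owner_t id)
	{
		return 0;       /* no per-owner state to flush in this sketch */
	}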
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 10dbf203c62f..ed7579beb6b0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1721,15 +1721,14 @@ xfs_mount_log_sbunit(
1721 * is present to prevent thrashing). 1721 * is present to prevent thrashing).
1722 */ 1722 */
1723 1723
1724#ifdef CONFIG_HOTPLUG_CPU
1724/* 1725/*
1725 * hot-plug CPU notifier support. 1726 * hot-plug CPU notifier support.
1726 * 1727 *
1727 * We cannot use the hotcpu_register() function because it does 1728 * We need a notifier per filesystem as we need to be able to identify
1728 * not allow notifier instances. We need a notifier per filesystem 1729 * the filesystem to balance the counters out. This is achieved by
1729 * as we need to be able to identify the filesystem to balance 1730 * having a notifier block embedded in the xfs_mount_t and doing pointer
1730 * the counters out. This is achieved by having a notifier block 1731 * magic to get the mount pointer from the notifier block address.
1731 * embedded in the xfs_mount_t and doing pointer magic to get the
1732 * mount pointer from the notifier block address.
1733 */ 1732 */
1734STATIC int 1733STATIC int
1735xfs_icsb_cpu_notify( 1734xfs_icsb_cpu_notify(
@@ -1779,6 +1778,7 @@ xfs_icsb_cpu_notify(
1779 1778
1780 return NOTIFY_OK; 1779 return NOTIFY_OK;
1781} 1780}
1781#endif /* CONFIG_HOTPLUG_CPU */
1782 1782
1783int 1783int
1784xfs_icsb_init_counters( 1784xfs_icsb_init_counters(
@@ -1791,9 +1791,11 @@ xfs_icsb_init_counters(
1791 if (mp->m_sb_cnts == NULL) 1791 if (mp->m_sb_cnts == NULL)
1792 return -ENOMEM; 1792 return -ENOMEM;
1793 1793
1794#ifdef CONFIG_HOTPLUG_CPU
1794 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; 1795 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1795 mp->m_icsb_notifier.priority = 0; 1796 mp->m_icsb_notifier.priority = 0;
1796 register_cpu_notifier(&mp->m_icsb_notifier); 1797 register_hotcpu_notifier(&mp->m_icsb_notifier);
1798#endif /* CONFIG_HOTPLUG_CPU */
1797 1799
1798 for_each_online_cpu(i) { 1800 for_each_online_cpu(i) {
1799 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); 1801 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
@@ -1812,7 +1814,7 @@ xfs_icsb_destroy_counters(
1812 xfs_mount_t *mp) 1814 xfs_mount_t *mp)
1813{ 1815{
1814 if (mp->m_sb_cnts) { 1816 if (mp->m_sb_cnts) {
1815 unregister_cpu_notifier(&mp->m_icsb_notifier); 1817 unregister_hotcpu_notifier(&mp->m_icsb_notifier);
1816 free_percpu(mp->m_sb_cnts); 1818 free_percpu(mp->m_sb_cnts);
1817 } 1819 }
1818} 1820}
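The "pointer magic" the rewritten comment refers to is the standard container_of idiom: embed a notifier_block in the per-mount structure and recover the mount from the block's address inside the callback. Sketched with hypothetical names (the hotcpu wrappers become no-ops when CONFIG_HOTPLUG_CPU is off, which is why the new #ifdefs can bracket the callback wholesale):

	#include <linux/cpu.h>
	#include <linux/notifier.h>
	#include <linux/kernel.h>

	struct example_mount {
		/* ... per-cpu superblock counters ... */
		struct notifier_block m_cpu_notifier;   /* one per mount */
	};

	static int example_cpu_notify(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
	{
		/* Recover the owning mount from the embedded block. */
		struct example_mount *mp =
			container_of(nfb, struct example_mount, m_cpu_notifier);

		switch (action) {
		case CPU_ONLINE:
			/* fold the new CPU (hcpu) into mp's counters */
			break;
		case CPU_DEAD:
			/* drain the dead CPU's counters into mp's totals */
			break;
		}
		return NOTIFY_OK;
	}

	static void example_register(struct example_mount *mp)
	{
		mp->m_cpu_notifier.notifier_call = example_cpu_notify;
		mp->m_cpu_notifier.priority = 0;
		register_hotcpu_notifier(&mp->m_cpu_notifier);
	}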