Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git

author: David Woodhouse <dwmw2@infradead.org> 2008-02-03 02:29:41 -0500
committer: David Woodhouse <dwmw2@infradead.org> 2008-02-03 02:30:32 -0500
commit: c1f3ee120bb61045b1c0a3ead620d1d65af47130 (patch)
tree: 908430bf2b47fe8e96ac623ae7ab6dd5698d0938 /fs
parent: e619a75ff6201b567a539e787aa9af9bc63a3187 (diff)
parent: 9135f1901ee6449dfe338adf6e40e9c2025b8150 (diff)
347 files changed, 16893 insertions, 7167 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 756f7e9beb2e..fbb12dadba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -82,7 +82,7 @@ static match_table_t tokens = {
 static void v9fs_parse_options(struct v9fs_session_info *v9ses)
 {
-        char *options = v9ses->options;
+        char *options;
        substring_t args[MAX_OPT_ARGS];
        char *p;
        int option;
@@ -96,9 +96,10 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
        v9ses->cache = 0;
        v9ses->trans = v9fs_default_trans();
-        if (!options)
+        if (!v9ses->options)
                return;
+        options = kstrdup(v9ses->options, GFP_KERNEL);
        while ((p = strsep(&options, ",")) != NULL) {
                int token;
                if (!*p)
@@ -169,6 +170,7 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
                        continue;
                }
        }
+        kfree(options);
 }
 /**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bb0cef9a6b8a..678c02f1ae23 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -119,6 +119,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        P9_DPRINTK(P9_DEBUG_VFS, " \n");
+        st = NULL;
        v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
        if (!v9ses)
                return -ENOMEM;
@@ -164,10 +165,12 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
        v9fs_stat2inode(st, root->d_inode, sb);
        v9fs_fid_add(root, fid);
+        kfree(st);
        return simple_set_mnt(mnt, sb);
 error:
+        kfree(st);
        if (fid)
                p9_client_clunk(fid);
diff --git a/fs/Kconfig b/fs/Kconfig
index cc28a69246a7..987b5d7cb21a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -236,6 +236,7 @@ config JBD_DEBUG
 config JBD2
        tristate
+        select CRC32
        help
          This is a generic journaling layer for block devices that support
          both 32-bit and 64-bit block numbers.  It is currently used by
@@ -440,14 +441,8 @@ config OCFS2_FS
          Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
          OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
-          Note: Features which OCFS2 does not support yet:
+          For more information on OCFS2, see the file
-                  - extended attributes
+          <file:Documentation/filesystems/ocfs2.txt>.
-                  - quotas
-                  - cluster aware flock
-                  - Directory change notification (F_NOTIFY)
-                  - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
-                  - POSIX ACLs
-                  - readpages / writepages (not user visible)
 config OCFS2_DEBUG_MASKLOG
        bool "OCFS2 logging support"
@@ -459,6 +454,15 @@ config OCFS2_DEBUG_MASKLOG
          This option will enlarge your kernel, but it allows debugging of
          ocfs2 filesystem issues.
+config OCFS2_DEBUG_FS
+        bool "OCFS2 expensive checks"
+        depends on OCFS2_FS
+        default n
+        help
+          This option will enable expensive consistency checks. Enable
+          this option for debugging only as it is likely to decrease
+          performance of the filesystem.
 config MINIX_FS
        tristate "Minix fs support"
        help
@@ -504,7 +508,7 @@ config INOTIFY
          including multiple file events, one-shot support, and unmount
          notification.
-          For more information, see Documentation/filesystems/inotify.txt
+          For more information, see <file:Documentation/filesystems/inotify.txt>
          If unsure, say Y.
@@ -518,7 +522,7 @@ config INOTIFY_USER
          directories via a single open fd.  Events are read from the file
          descriptor, which is also select()- and poll()-able.
-          For more information, see Documentation/filesystems/inotify.txt
+          For more information, see <file:Documentation/filesystems/inotify.txt>
          If unsure, say Y.
@@ -1019,8 +1023,8 @@ config HUGETLB_PAGE
        def_bool HUGETLBFS
 config CONFIGFS_FS
-        tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
+        tristate "Userspace-driven configuration filesystem"
-        depends on SYSFS && EXPERIMENTAL
+        depends on SYSFS
        help
          configfs is a ram-based filesystem that provides the converse
          of sysfs's functionality. Where sysfs is a filesystem-based
@@ -1089,7 +1093,7 @@ config ECRYPT_FS
        depends on EXPERIMENTAL && KEYS && CRYPTO && NET
        help
          Encrypted filesystem that operates on the VFS layer.  See
-          <file:Documentation/ecryptfs.txt> to learn more about
+          <file:Documentation/filesystems/ecryptfs.txt> to learn more about
          eCryptfs.  Userspace components are required and can be
          obtained from <http://ecryptfs.sf.net>.
@@ -1103,8 +1107,8 @@ config HFS_FS
        help
          If you say Y here, you will be able to mount Macintosh-formatted
          floppy disks and hard drive partitions with full read-write access.
-          Please read <file:fs/hfs/HFS.txt> to learn about the available mount
+          Please read <file:Documentation/filesystems/hfs.txt> to learn about
-          options.
+          the available mount options.
          To compile this file system support as a module, choose M here: the
          module will be called hfs.
@@ -1296,7 +1300,7 @@ config JFFS2_COMPRESSION_OPTIONS
        help
          Enabling this option allows you to explicitly choose which
          compression modules, if any, are enabled in JFFS2. Removing
-          compressors and mean you cannot read existing file systems,
+          compressors can mean you cannot read existing file systems,
          and enabling experimental compressors can mean that you
          write a file system which cannot be read by a standard kernel.
@@ -1670,6 +1674,8 @@ config NFSD
        select CRYPTO_MD5 if NFSD_V4
        select CRYPTO if NFSD_V4
        select FS_POSIX_ACL if NFSD_V4
+        select PROC_FS if NFSD_V4
+        select PROC_FS if SUNRPC_GSS
        help
          If you want your Linux box to act as an NFS *server*, so that other
          computers on your local network which support NFS can access certain
@@ -1896,13 +1902,15 @@ config CIFS
          file servers such as Windows 2000 (including Windows 2003, NT 4  
          and Windows XP) as well by Samba (which provides excellent CIFS
          server support for Linux and many other operating systems). Limited
-          support for OS/2 and Windows ME and similar servers is provided as well.
+          support for OS/2 and Windows ME and similar servers is provided as
+          well.
-          The intent of the cifs module is to provide an advanced
-          network file system client for mounting to CIFS compliant servers,
+          The cifs module provides an advanced network file system
-          including support for dfs (hierarchical name space), secure per-user
+          client for mounting to CIFS compliant servers.  It includes
-          session establishment, safe distributed caching (oplock), optional
+          support for DFS (hierarchical name space), secure per-user
-          packet signing, Unicode and other internationalization improvements. 
+          session establishment via Kerberos or NTLM or NTLMv2,
+          safe distributed caching (oplock), optional packet
+          signing, Unicode and other internationalization improvements.
          If you need to mount to Samba or Windows from this machine, say Y.
 config CIFS_STATS
@@ -1934,7 +1942,8 @@ config CIFS_WEAK_PW_HASH
          (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
          security mechanisms. These hash the password more securely
          than the mechanisms used in the older LANMAN version of the
-          SMB protocol needed to establish sessions with old SMB servers.
+          SMB protocol but LANMAN based authentication is needed to
+          establish sessions with some old SMB servers.
          Enabling this option allows the cifs module to mount to older
          LANMAN based servers such as OS/2 and Windows 95, but such
@@ -1942,8 +1951,8 @@ config CIFS_WEAK_PW_HASH
          security mechanisms if you are on a public network.  Unless you
          have a need to access old SMB servers (and are on a private 
          network) you probably want to say N.  Even if this support
-          is enabled in the kernel build, they will not be used
+          is enabled in the kernel build, LANMAN authentication will not be
-          automatically. At runtime LANMAN mounts are disabled but
+          used automatically. At runtime LANMAN mounts are disabled but
          can be set to required (or optional) either in
          /proc/fs/cifs (see fs/cifs/README for more detail) or via an
          option on the mount command. This support is disabled by 
@@ -2007,14 +2016,24 @@ config CIFS_EXPERIMENTAL
 config CIFS_UPCALL
          bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
          depends on CIFS_EXPERIMENTAL
-          depends on CONNECTOR
+          depends on KEYS
          help
-            Enables an upcall mechanism for CIFS which will be used to contact
+            Enables an upcall mechanism for CIFS which accesses
-            userspace helper utilities to provide SPNEGO packaged Kerberos
+            userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-            tickets which are needed to mount to certain secure servers
+            Kerberos tickets which are needed to mount to certain secure servers
            (for which more secure Kerberos authentication is required). If
            unsure, say N.
+config CIFS_DFS_UPCALL
+          bool "DFS feature support (EXPERIMENTAL)"
+          depends on CIFS_EXPERIMENTAL
+          depends on KEYS
+          help
+            Enables an upcall mechanism for CIFS which contacts userspace
+            helper utilities to provide server name resolution (host names to
+            IP addresses) which is needed for implicit mounts of DFS junction
+            points. If unsure, say N.
 config NCP_FS
        tristate "NCP file system support (to mount NetWare volumes)"
        depends on IPX!=n || INET
@@ -2121,4 +2140,3 @@ source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
 endmenu
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index d4fc6095466d..7c3d5f923da1 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -23,6 +23,10 @@ config BINFMT_ELF
          ld.so (check the file <file:Documentation/Changes> for location and
          latest version).
+config COMPAT_BINFMT_ELF
+        bool
+        depends on COMPAT && MMU
 config BINFMT_ELF_FDPIC
        bool "Kernel support for FDPIC ELF binaries"
        default y
diff --git a/fs/Makefile b/fs/Makefile
index 500cf15cdb4b..1e7a11bd4da1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
 obj-y                           += binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)        += binfmt_elf.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)  += binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_SOM)        += binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)       += binfmt_flat.o
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 33fe39ad4e03..0cc3597c1197 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -546,11 +546,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
        dentry->d_op = &afs_fs_dentry_operations;
        d_add(dentry, inode);
-        _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }",
+        _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
               fid.vnode,
               fid.unique,
               dentry->d_inode->i_ino,
-               dentry->d_inode->i_version);
+               (unsigned long long)dentry->d_inode->i_version);
        return NULL;
 }
@@ -630,9 +630,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
                 * been deleted and replaced, and the original vnode ID has
                 * been reused */
                if (fid.unique != vnode->fid.unique) {
-                        _debug("%s: file deleted (uq %u -> %u I:%lu)",
+                        _debug("%s: file deleted (uq %u -> %u I:%llu)",
                               dentry->d_name.name, fid.unique,
-                               vnode->fid.unique, dentry->d_inode->i_version);
+                               vnode->fid.unique,
+                               (unsigned long long)dentry->d_inode->i_version);
                        spin_lock(&vnode->lock);
                        set_bit(AFS_VNODE_DELETED, &vnode->flags);
                        spin_unlock(&vnode->lock);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d196840127c6..84750c8e9f95 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -301,7 +301,8 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        inode = dentry->d_inode;
-        _enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version);
+        _enter("{ ino=%lu v=%llu }", inode->i_ino,
+                (unsigned long long)inode->i_version);
        generic_fillattr(inode, stat);
        return 0;
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 7b4bbe48112d..849fc3160cb5 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -382,7 +382,7 @@ struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
               cell->name, key_serial(key),
               (int) namesz, (int) namesz, name, namesz);
-        if (namesz > sizeof(vl->vldb.name)) {
+        if (namesz >= sizeof(vl->vldb.name)) {
                _leave(" = -ENAMETOOLONG");
                return ERR_PTR(-ENAMETOOLONG);
        }
diff --git a/fs/aio.c b/fs/aio.c
index f12db415c0f6..8a37dbbf3437 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -397,7 +397,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
 * This prevents races between the aio code path referencing the
 * req (after submitting it) and aio_complete() freeing the req.
 */
-static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx));
+static struct kiocb *__aio_get_req(struct kioctx *ctx);
 static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
 {
        struct kiocb *req = NULL;
@@ -1161,7 +1161,12 @@ retry:
                        ret = 0;
                        if (to.timed_out)       /* Only check after read evt */
                                break;
-                        io_schedule();
+                        /* Try to only show up in io wait if there are ops
+                         *  in flight */
+                        if (ctx->reqs_active)
+                                io_schedule();
+                        else
+                                schedule();
                        if (signal_pending(tsk)) {
                                ret = -EINTR;
                                break;
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 130f6c66c5ba..ac7a8b1d6c3a 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -14,8 +14,6 @@ struct bfs_sb_info {
        unsigned long si_blocks;
        unsigned long si_freeb;
        unsigned long si_freei;
-        unsigned long si_lf_ioff;
-        unsigned long si_lf_sblk;
        unsigned long si_lf_eblk;
        unsigned long si_lasti;
        unsigned long * si_imap;
@@ -39,7 +37,7 @@ static inline struct bfs_sb_info *BFS_SB(struct super_block *sb)
 static inline struct bfs_inode_info *BFS_I(struct inode *inode)
 {
-        return list_entry(inode, struct bfs_inode_info, vfs_inode);
+        return container_of(inode, struct bfs_inode_info, vfs_inode);
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 097f1497f743..1fd056d0fc3d 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -21,29 +21,32 @@
 #define dprintf(x...)
 #endif
-static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino);
+static int bfs_add_entry(struct inode *dir, const unsigned char *name,
-static struct buffer_head * bfs_find_entry(struct inode * dir, 
+                                                int namelen, int ino);
-        const unsigned char * name, int namelen, struct bfs_dirent ** res_dir);
+static struct buffer_head *bfs_find_entry(struct inode *dir,
+                                const unsigned char *name, int namelen,
+                                struct bfs_dirent **res_dir);
-static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 {
-        struct inode * dir = f->f_path.dentry->d_inode;
+        struct inode *dir = f->f_path.dentry->d_inode;
-        struct buffer_head * bh;
+        struct buffer_head *bh;
-        struct bfs_dirent * de;
+        struct bfs_dirent *de;
        unsigned int offset;
        int block;
        lock_kernel();
-        if (f->f_pos & (BFS_DIRENT_SIZE-1)) {
+        if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
-                printf("Bad f_pos=%08lx for %s:%08lx\n", (unsigned long)f->f_pos, 
+                printf("Bad f_pos=%08lx for %s:%08lx\n",
-                        dir->i_sb->s_id, dir->i_ino);
+                                        (unsigned long)f->f_pos,
+                                        dir->i_sb->s_id, dir->i_ino);
                unlock_kernel();
                return -EBADF;
        }
        while (f->f_pos < dir->i_size) {
-                offset = f->f_pos & (BFS_BSIZE-1);
+                offset = f->f_pos & (BFS_BSIZE - 1);
                block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
                bh = sb_bread(dir->i_sb, block);
                if (!bh) {
@@ -54,7 +57,9 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
                        de = (struct bfs_dirent *)(bh->b_data + offset);
                        if (de->ino) {
                                int size = strnlen(de->name, BFS_NAMELEN);
-                                if (filldir(dirent, de->name, size, f->f_pos, le16_to_cpu(de->ino), DT_UNKNOWN) < 0) {
+                                if (filldir(dirent, de->name, size, f->f_pos,
+                                                le16_to_cpu(de->ino),
+                                                DT_UNKNOWN) < 0) {
                                        brelse(bh);
                                        unlock_kernel();
                                        return 0;
@@ -62,7 +67,7 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
                        }
                        offset += BFS_DIRENT_SIZE;
                        f->f_pos += BFS_DIRENT_SIZE;
-                } while (offset < BFS_BSIZE && f->f_pos < dir->i_size);
+                } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
                brelse(bh);
        }
@@ -78,13 +83,13 @@ const struct file_operations bfs_dir_operations = {
 extern void dump_imap(const char *, struct super_block *);
-static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
+static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
-                struct nameidata *nd)
+                                                struct nameidata *nd)
 {
        int err;
-        struct inode * inode;
+        struct inode *inode;
-        struct super_block * s = dir->i_sb;
+        struct super_block *s = dir->i_sb;
-        struct bfs_sb_info * info = BFS_SB(s);
+        struct bfs_sb_info *info = BFS_SB(s);
        unsigned long ino;
        inode = new_inode(s);
@@ -97,7 +102,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
                iput(inode);
                return -ENOSPC;
        }
-        set_bit(ino, info->si_imap);    
+        set_bit(ino, info->si_imap);
        info->si_freei--;
        inode->i_uid = current->fsuid;
        inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
@@ -113,9 +118,10 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
        BFS_I(inode)->i_eblock = 0;
        insert_inode_hash(inode);
        mark_inode_dirty(inode);
-        dump_imap("create",s);
+        dump_imap("create", s);
-        err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, inode->i_ino);
+        err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
+                                                        inode->i_ino);
        if (err) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -127,11 +133,12 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
        return 0;
 }
-static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
+                                                struct nameidata *nd)
 {
-        struct inode * inode = NULL;
+        struct inode *inode = NULL;
-        struct buffer_head * bh;
+        struct buffer_head *bh;
-        struct bfs_dirent * de;
+        struct bfs_dirent *de;
        if (dentry->d_name.len > BFS_NAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
@@ -152,13 +159,15 @@ static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, st
        return NULL;
 }
-static int bfs_link(struct dentry * old, struct inode * dir, struct dentry * new)
+static int bfs_link(struct dentry *old, struct inode *dir,
+                                                struct dentry *new)
 {
-        struct inode * inode = old->d_inode;
+        struct inode *inode = old->d_inode;
        int err;
        lock_kernel();
-        err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, inode->i_ino);
+        err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
+                                                        inode->i_ino);
        if (err) {
                unlock_kernel();
                return err;
@@ -172,23 +181,23 @@ static int bfs_link(struct dentry * old, struct inode * dir, struct dentry * new
        return 0;
 }
+static int bfs_unlink(struct inode *dir, struct dentry *dentry)
-static int bfs_unlink(struct inode * dir, struct dentry * dentry)
 {
        int error = -ENOENT;
-        struct inode * inode;
+        struct inode *inode;
-        struct buffer_head * bh;
+        struct buffer_head *bh;
-        struct bfs_dirent * de;
+        struct bfs_dirent *de;
        inode = dentry->d_inode;
        lock_kernel();
        bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
-        if (!bh || le16_to_cpu(de->ino) != inode->i_ino)
+        if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
                goto out_brelse;
        if (!inode->i_nlink) {
-                printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, 
+                printf("unlinking non-existent file %s:%lu (nlink=%d)\n",
-                                inode->i_ino, inode->i_nlink);
+                                        inode->i_sb->s_id, inode->i_ino,
+                                        inode->i_nlink);
                inode->i_nlink = 1;
        }
        de->ino = 0;
@@ -205,12 +214,12 @@ out_brelse:
        return error;
 }
-static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry, 
+static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                        struct inode * new_dir, struct dentry * new_dentry)
+                        struct inode *new_dir, struct dentry *new_dentry)
 {
-        struct inode * old_inode, * new_inode;
+        struct inode *old_inode, *new_inode;
-        struct buffer_head * old_bh, * new_bh;
+        struct buffer_head *old_bh, *new_bh;
-        struct bfs_dirent * old_de, * new_de;           
+        struct bfs_dirent *old_de, *new_de;
        int error = -ENOENT;
        old_bh = new_bh = NULL;
@@ -223,7 +232,7 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
                                old_dentry->d_name.name, 
                                old_dentry->d_name.len, &old_de);
-        if (!old_bh || le16_to_cpu(old_de->ino) != old_inode->i_ino)
+        if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino))
                goto end_rename;
        error = -EPERM;
@@ -239,7 +248,8 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
        if (!new_bh) {
                error = bfs_add_entry(new_dir, 
                                        new_dentry->d_name.name,
-                                        new_dentry->d_name.len, old_inode->i_ino);
+                                        new_dentry->d_name.len,
+                                        old_inode->i_ino);
                if (error)
                        goto end_rename;
        }
@@ -268,11 +278,12 @@ const struct inode_operations bfs_dir_inops = {
        .rename                 = bfs_rename,
 };
-static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino)
+static int bfs_add_entry(struct inode *dir, const unsigned char *name,
+                                                        int namelen, int ino)
 {
-        struct buffer_head * bh;
+        struct buffer_head *bh;
-        struct bfs_dirent * de;
+        struct bfs_dirent *de;
-        int block, sblock, eblock, off, eoff;
+        int block, sblock, eblock, off, pos;
        int i;
        dprintf("name=%s, namelen=%d\n", name, namelen);
@@ -284,27 +295,24 @@ static int bfs_add_entry(struct inode * dir, const unsigned char * name, int nam
        sblock = BFS_I(dir)->i_sblock;
        eblock = BFS_I(dir)->i_eblock;
-        eoff = dir->i_size % BFS_BSIZE;
+        for (block = sblock; block <= eblock; block++) {
-        for (block=sblock; block<=eblock; block++) {
                bh = sb_bread(dir->i_sb, block);
-                if(!bh) 
+                if (!bh)
                        return -ENOSPC;
-                for (off=0; off<BFS_BSIZE; off+=BFS_DIRENT_SIZE) {
+                for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) {
                        de = (struct bfs_dirent *)(bh->b_data + off);
-                        if (block==eblock && off>=eoff) {
-                                /* Do not read/interpret the garbage in the end of eblock. */
-                                de->ino = 0;
-                        }
                        if (!de->ino) {
-                                if ((block-sblock)*BFS_BSIZE + off >= dir->i_size) {
+                                pos = (block - sblock) * BFS_BSIZE + off;
+                                if (pos >= dir->i_size) {
                                        dir->i_size += BFS_DIRENT_SIZE;
                                        dir->i_ctime = CURRENT_TIME_SEC;
                                }
                                dir->i_mtime = CURRENT_TIME_SEC;
                                mark_inode_dirty(dir);
                                de->ino = cpu_to_le16((u16)ino);
-                                for (i=0; i<BFS_NAMELEN; i++)
+                                for (i = 0; i < BFS_NAMELEN; i++)
-                                        de->name[i] = (i < namelen) ? name[i] : 0;
+                                        de->name[i] =
+                                                (i < namelen) ? name[i] : 0;
                                mark_buffer_dirty(bh);
                                brelse(bh);
                                return 0;
@@ -315,25 +323,26 @@ static int bfs_add_entry(struct inode * dir, const unsigned char * name, int nam
        return -ENOSPC;
 }
-static inline int bfs_namecmp(int len, const unsigned char * name, const char * buffer)
+static inline int bfs_namecmp(int len, const unsigned char *name,
+                                                        const char *buffer)
 {
-        if (len < BFS_NAMELEN && buffer[len])
+        if ((len < BFS_NAMELEN) && buffer[len])
                return 0;
        return !memcmp(name, buffer, len);
 }
-static struct buffer_head * bfs_find_entry(struct inode * dir, 
+static struct buffer_head *bfs_find_entry(struct inode *dir,
-        const unsigned char * name, int namelen, struct bfs_dirent ** res_dir)
+                        const unsigned char *name, int namelen,
+                        struct bfs_dirent **res_dir)
 {
-        unsigned long block, offset;
+        unsigned long block = 0, offset = 0;
-        struct buffer_head * bh;
+        struct buffer_head *bh = NULL;
-        struct bfs_dirent * de;
+        struct bfs_dirent *de;
        *res_dir = NULL;
        if (namelen > BFS_NAMELEN)
                return NULL;
-        bh = NULL;
-        block = offset = 0;
        while (block * BFS_BSIZE + offset < dir->i_size) {
                if (!bh) {
                        bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block);
@@ -344,7 +353,8 @@ static struct buffer_head * bfs_find_entry(struct inode * dir,
                }
                de = (struct bfs_dirent *)(bh->b_data + offset);
                offset += BFS_DIRENT_SIZE;
-                if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) {
+                if (le16_to_cpu(de->ino) &&
+                                bfs_namecmp(namelen, name, de->name)) {
                        *res_dir = de;
                        return bh;
                }
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 911b4ccf470f..b11e63e8fbcd 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -2,6 +2,11 @@
 *      fs/bfs/file.c
 *      BFS file operations.
 *      Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ *
+ *      Make the file block allocation algorithm understand the size
+ *      of the underlying block device.
+ *      Copyright (C) 2007 Dmitri Vorobiev <dmitri.vorobiev@gmail.com>
+ *
 */
 #include <linux/fs.h>
@@ -27,7 +32,8 @@ const struct file_operations bfs_file_operations = {
        .splice_read    = generic_file_splice_read,
 };
-static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb)
+static int bfs_move_block(unsigned long from, unsigned long to,
+                                        struct super_block *sb)
 {
        struct buffer_head *bh, *new;
@@ -43,21 +49,22 @@ static int bfs_move_block(unsigned long from, unsigned long to, struct super_blo
 }
 static int bfs_move_blocks(struct super_block *sb, unsigned long start,
-                           unsigned long end, unsigned long where)
+                                unsigned long end, unsigned long where)
 {
        unsigned long i;
        dprintf("%08lx-%08lx->%08lx\n", start, end, where);
        for (i = start; i <= end; i++)
                if(bfs_move_block(i, where + i, sb)) {
-                        dprintf("failed to move block %08lx -> %08lx\n", i, where + i);
+                        dprintf("failed to move block %08lx -> %08lx\n", i,
+                                                                where + i);
                        return -EIO;
                }
        return 0;
 }
-static int bfs_get_block(struct inode * inode, sector_t block, 
+static int bfs_get_block(struct inode *inode, sector_t block,
-        struct buffer_head * bh_result, int create)
+                        struct buffer_head *bh_result, int create)
 {
        unsigned long phys;
        int err;
@@ -66,9 +73,6 @@ static int bfs_get_block(struct inode * inode, sector_t block,
        struct bfs_inode_info *bi = BFS_I(inode);
        struct buffer_head *sbh = info->si_sbh;
-        if (block > info->si_blocks)
-                return -EIO;
        phys = bi->i_sblock + block;
        if (!create) {
                if (phys <= bi->i_eblock) {
@@ -79,21 +83,29 @@ static int bfs_get_block(struct inode * inode, sector_t block,
                return 0;
        }
-        /* if the file is not empty and the requested block is within the range
+        /*
-           of blocks allocated for this file, we can grant it */
+         * If the file is not empty and the requested block is within the
-        if (inode->i_size && phys <= bi->i_eblock) {
+         * range of blocks allocated for this file, we can grant it.
+         */
+        if (bi->i_sblock && (phys <= bi->i_eblock)) {
                dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", 
                                create, (unsigned long)block, phys);
                map_bh(bh_result, sb, phys);
                return 0;
        }
-        /* the rest has to be protected against itself */
+        /* The file will be extended, so let's see if there is enough space. */
+        if (phys >= info->si_blocks)
+                return -ENOSPC;
+        /* The rest has to be protected against itself. */
        lock_kernel();
-        /* if the last data block for this file is the last allocated
+        /*
-           block, we can extend the file trivially, without moving it
+         * If the last data block for this file is the last allocated
-           anywhere */
+         * block, we can extend the file trivially, without moving it
+         * anywhere.
+         */
        if (bi->i_eblock == info->si_lf_eblk) {
                dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", 
                                create, (unsigned long)block, phys);
@@ -106,13 +118,19 @@ static int bfs_get_block(struct inode * inode, sector_t block,
                goto out;
        }
-        /* Ok, we have to move this entire file to the next free block */
+        /* Ok, we have to move this entire file to the next free block. */
        phys = info->si_lf_eblk + 1;
-        if (bi->i_sblock) { /* if data starts on block 0 then there is no data */
+        if (phys + block >= info->si_blocks) {
+                err = -ENOSPC;
+                goto out;
+        }
+        if (bi->i_sblock) {
                err = bfs_move_blocks(inode->i_sb, bi->i_sblock, 
-                                bi->i_eblock, phys);
+                                                bi->i_eblock, phys);
                if (err) {
-                        dprintf("failed to move ino=%08lx -> fs corruption\n", inode->i_ino);
+                        dprintf("failed to move ino=%08lx -> fs corruption\n",
+                                                                inode->i_ino);
                        goto out;
                }
        } else
@@ -124,8 +142,10 @@ static int bfs_get_block(struct inode * inode, sector_t block,
        phys += block;
        info->si_lf_eblk = bi->i_eblock = phys;
-        /* this assumes nothing can write the inode back while we are here
+        /*
-         * and thus update inode->i_blocks! (XXX)*/
+         * This assumes nothing can write the inode back while we are here
+         * and thus update inode->i_blocks! (XXX)
+         */
        info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks;
        mark_inode_dirty(inode);
        mark_buffer_dirty(sbh);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 7bd9c2bbe6ee..a64a71d444f5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,25 +30,26 @@ MODULE_LICENSE("GPL");
 #define dprintf(x...)
 #endif
-void dump_imap(const char *prefix, struct super_block * s);
+void dump_imap(const char *prefix, struct super_block *s);
-static void bfs_read_inode(struct inode * inode)
+static void bfs_read_inode(struct inode *inode)
 {
        unsigned long ino = inode->i_ino;
-        struct bfs_inode * di;
+        struct bfs_inode *di;
-        struct buffer_head * bh;
+        struct buffer_head *bh;
        int block, off;
-        if (ino < BFS_ROOT_INO || ino > BFS_SB(inode->i_sb)->si_lasti) {
+        if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
                printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino);
                make_bad_inode(inode);
                return;
        }
-        block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+        block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
        bh = sb_bread(inode->i_sb, block);
        if (!bh) {
-                printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino);
+                printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id,
+                                                                        ino);
                make_bad_inode(inode);
                return;
        }
@@ -56,7 +57,7 @@ static void bfs_read_inode(struct inode * inode)
        off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
        di = (struct bfs_inode *)bh->b_data + off;
-        inode->i_mode = 0x0000FFFF &  le32_to_cpu(di->i_mode);
+        inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode);
        if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
                inode->i_mode |= S_IFDIR;
                inode->i_op = &bfs_dir_inops;
@@ -70,48 +71,48 @@ static void bfs_read_inode(struct inode * inode)
        BFS_I(inode)->i_sblock =  le32_to_cpu(di->i_sblock);
        BFS_I(inode)->i_eblock =  le32_to_cpu(di->i_eblock);
+        BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
        inode->i_uid =  le32_to_cpu(di->i_uid);
        inode->i_gid =  le32_to_cpu(di->i_gid);
        inode->i_nlink =  le32_to_cpu(di->i_nlink);
        inode->i_size = BFS_FILESIZE(di);
        inode->i_blocks = BFS_FILEBLOCKS(di);
-        if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
        inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
        inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
        inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
        inode->i_atime.tv_nsec = 0;
        inode->i_mtime.tv_nsec = 0;
        inode->i_ctime.tv_nsec = 0;
-        BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); /* can be 0 so we store a copy */
        brelse(bh);
 }
-static int bfs_write_inode(struct inode * inode, int unused)
+static int bfs_write_inode(struct inode *inode, int unused)
 {
        unsigned int ino = (u16)inode->i_ino;
        unsigned long i_sblock;
-        struct bfs_inode * di;
+        struct bfs_inode *di;
-        struct buffer_head * bh;
+        struct buffer_head *bh;
        int block, off;
        dprintf("ino=%08x\n", ino);
-        if (ino < BFS_ROOT_INO || ino > BFS_SB(inode->i_sb)->si_lasti) {
+        if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
                printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino);
                return -EIO;
        }
        lock_kernel();
-        block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+        block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
        bh = sb_bread(inode->i_sb, block);
        if (!bh) {
-                printf("Unable to read inode %s:%08x\n", inode->i_sb->s_id, ino);
+                printf("Unable to read inode %s:%08x\n",
+                                inode->i_sb->s_id, ino);
                unlock_kernel();
                return -EIO;
        }
-        off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
+        off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
        di = (struct bfs_inode *)bh->b_data + off;
        if (ino == BFS_ROOT_INO)
@@ -133,27 +134,26 @@ static int bfs_write_inode(struct inode * inode, int unused)
        di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
        mark_buffer_dirty(bh);
-        dprintf("Written ino=%d into %d:%d\n",le16_to_cpu(di->i_ino),block,off);
        brelse(bh);
        unlock_kernel();
        return 0;
 }
-static void bfs_delete_inode(struct inode * inode)
+static void bfs_delete_inode(struct inode *inode)
 {
        unsigned long ino = inode->i_ino;
-        struct bfs_inode * di;
+        struct bfs_inode *di;
-        struct buffer_head * bh;
+        struct buffer_head *bh;
        int block, off;
-        struct super_block * s = inode->i_sb;
+        struct super_block *s = inode->i_sb;
-        struct bfs_sb_info * info = BFS_SB(s);
+        struct bfs_sb_info *info = BFS_SB(s);
-        struct bfs_inode_info * bi = BFS_I(inode);
+        struct bfs_inode_info *bi = BFS_I(inode);
        dprintf("ino=%08lx\n", ino);
        truncate_inode_pages(&inode->i_data, 0);
-        if (ino < BFS_ROOT_INO || ino > info->si_lasti) {
+        if ((ino < BFS_ROOT_INO) || (ino > info->si_lasti)) {
                printf("invalid ino=%08lx\n", ino);
                return;
        }
@@ -162,31 +162,36 @@ static void bfs_delete_inode(struct inode * inode)
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
        lock_kernel();
        mark_inode_dirty(inode);
-        block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+        block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
        bh = sb_bread(s, block);
        if (!bh) {
-                printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino);
+                printf("Unable to read inode %s:%08lx\n",
+                                        inode->i_sb->s_id, ino);
                unlock_kernel();
                return;
        }
-        off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
+        off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
-        di = (struct bfs_inode *) bh->b_data + off;
+        di = (struct bfs_inode *)bh->b_data + off;
+        memset((void *)di, 0, sizeof(struct bfs_inode));
+        mark_buffer_dirty(bh);
+        brelse(bh);
        if (bi->i_dsk_ino) {
-                info->si_freeb += 1 + bi->i_eblock - bi->i_sblock;
+                if (bi->i_sblock)
+                        info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
                info->si_freei++;
                clear_bit(ino, info->si_imap);
                dump_imap("delete_inode", s);
        }
-        di->i_ino = 0;
-        di->i_sblock = 0;
-        mark_buffer_dirty(bh);
-        brelse(bh);
-        /* if this was the last file, make the previous 
+        /*
-           block "last files last block" even if there is no real file there,
+         * If this was the last file, make the previous block
-           saves us 1 gap */
+         * "last block of the last file" even if there is no
-        if (info->si_lf_eblk == BFS_I(inode)->i_eblock) {
+         * real file there, saves us 1 gap.
-                info->si_lf_eblk = BFS_I(inode)->i_sblock - 1;
+         */
+        if (info->si_lf_eblk == bi->i_eblock) {
+                info->si_lf_eblk = bi->i_sblock - 1;
                mark_buffer_dirty(info->si_sbh);
        }
        unlock_kernel();
@@ -228,7 +233,7 @@ static void bfs_write_super(struct super_block *s)
        unlock_kernel();
 }
-static struct kmem_cache * bfs_inode_cachep;
+static struct kmem_cache *bfs_inode_cachep;
 static struct inode *bfs_alloc_inode(struct super_block *sb)
 {
@@ -279,7 +284,7 @@ static const struct super_operations bfs_sops = {
        .statfs         = bfs_statfs,
 };
-void dump_imap(const char *prefix, struct super_block * s)
+void dump_imap(const char *prefix, struct super_block *s)
 {
 #ifdef DEBUG
        int i;
@@ -287,25 +292,26 @@ void dump_imap(const char *prefix, struct super_block * s)
        if (!tmpbuf)
                return;
-        for (i=BFS_SB(s)->si_lasti; i>=0; i--) {
+        for (i = BFS_SB(s)->si_lasti; i >= 0; i--) {
-                if (i > PAGE_SIZE-100) break;
+                if (i > PAGE_SIZE - 100) break;
                if (test_bit(i, BFS_SB(s)->si_imap))
                        strcat(tmpbuf, "1");
                else
                        strcat(tmpbuf, "0");
        }
-        printk(KERN_ERR "BFS-fs: %s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf);
+        printf("BFS-fs: %s: lasti=%08lx <%s>\n",
+                                prefix, BFS_SB(s)->si_lasti, tmpbuf);
        free_page((unsigned long)tmpbuf);
 #endif
 }
 static int bfs_fill_super(struct super_block *s, void *data, int silent)
 {
-        struct buffer_head * bh;
+        struct buffer_head *bh;
-        struct bfs_super_block * bfs_sb;
+        struct bfs_super_block *bfs_sb;
-        struct inode * inode;
+        struct inode *inode;
        unsigned i, imap_len;
-        struct bfs_sb_info * info;
+        struct bfs_sb_info *info;
        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info)
@@ -329,14 +335,14 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
        s->s_magic = BFS_MAGIC;
        info->si_sbh = bh;
-        info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE)/sizeof(struct bfs_inode)
+        info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
-                        + BFS_ROOT_INO - 1;
+                                        sizeof(struct bfs_inode)
+                                        + BFS_ROOT_INO - 1;
-        imap_len = info->si_lasti/8 + 1;
+        imap_len = (info->si_lasti / 8) + 1;
        info->si_imap = kzalloc(imap_len, GFP_KERNEL);
        if (!info->si_imap)
                goto out;
-        for (i=0; i<BFS_ROOT_INO; i++) 
+        for (i = 0; i < BFS_ROOT_INO; i++)
                set_bit(i, info->si_imap);
        s->s_op = &bfs_sops;
@@ -352,16 +358,15 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
                goto out;
        }
-        info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
+        info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
-        info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 -  le32_to_cpu(bfs_sb->s_start))>>BFS_BSIZE_BITS;
+        info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
+                        - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
        info->si_freei = 0;
        info->si_lf_eblk = 0;
-        info->si_lf_sblk = 0;
-        info->si_lf_ioff = 0;
        bh = NULL;
-        for (i=BFS_ROOT_INO; i<=info->si_lasti; i++) {
+        for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
                struct bfs_inode *di;
-                int block = (i - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+                int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
                int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
                unsigned long sblock, eblock;
@@ -384,11 +389,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
                sblock =  le32_to_cpu(di->i_sblock);
                eblock =  le32_to_cpu(di->i_eblock);
-                if (eblock > info->si_lf_eblk) {
+                if (eblock > info->si_lf_eblk)
                        info->si_lf_eblk = eblock;
-                        info->si_lf_sblk = sblock;
-                        info->si_lf_ioff = BFS_INO2OFF(i);
-                }
        }
        brelse(bh);
        if (!(s->s_flags & MS_RDONLY)) {
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index e176d195e7e5..7596e1e94cde 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -319,7 +319,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        current->mm->free_area_cache = current->mm->mmap_base;
        current->mm->cached_hole_size = 0;
-        current->mm->mmap = NULL;
        compute_creds(bprm);
        current->flags &= ~PF_FORKNOEXEC;
 #ifdef __sparc__
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ba8de7ca260b..18ed6dd906c1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,8 @@
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
-static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
+static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
+                                int, int, unsigned long);
 /*
 * If we don't support core dumping, then supply a NULL so we
@@ -298,33 +299,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 #ifndef elf_map
+/*
+ * Map one program header's file contents at (the page start of) @addr.
+ *
+ * The mapping length is p_filesz plus the in-page offset of p_vaddr,
+ * rounded up with ELF_PAGEALIGN(); the file offset is p_offset minus
+ * that same in-page offset.
+ *
+ * @total_size: when non-zero, do_mmap() is first asked for
+ *              ELF_PAGEALIGN(total_size) bytes and, if that succeeds,
+ *              everything past this segment's own size is unmapped
+ *              again. When zero, only the segment's size is mapped.
+ *
+ * Returns the address from do_mmap() (check with BAD_ADDR()), or the
+ * page-aligned @addr without mapping anything when the computed size
+ * is zero. mmap_sem is held for writing around do_mmap()/do_munmap().
+ */
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-                struct elf_phdr *eppnt, int prot, int type)
+                struct elf_phdr *eppnt, int prot, int type,
+                unsigned long total_size)
 {
         unsigned long map_addr;
-        unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
+        unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+        unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+        addr = ELF_PAGESTART(addr);
+        size = ELF_PAGEALIGN(size);
-        down_write(&current->mm->mmap_sem);
         /* mmap() will return -EINVAL if given a zero size, but a
          * segment with zero filesize is perfectly valid */
-        if (eppnt->p_filesz + pageoffset)
+        if (!size)
-                map_addr = do_mmap(filep, ELF_PAGESTART(addr),
+                return addr;
-                                   eppnt->p_filesz + pageoffset, prot, type,
-                                   eppnt->p_offset - pageoffset);
+        down_write(&current->mm->mmap_sem);
-        else
+        /*
-                map_addr = ELF_PAGESTART(addr);
+        * total_size is the size of the ELF (interpreter) image.
+        * The _first_ mmap needs to know the full size, otherwise
+        * randomization might put this image into an overlapping
+        * position with the ELF binary image. (since size < total_size)
+        * So we first map the 'big' image - and unmap the remainder at
+        * the end. (which unmap is needed for ELF images with holes.)
+        */
+        if (total_size) {
+                total_size = ELF_PAGEALIGN(total_size);
+                map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+                if (!BAD_ADDR(map_addr))
+                        do_munmap(current->mm, map_addr+size, total_size-size);
+        } else
+                map_addr = do_mmap(filep, addr, size, prot, type, off);
         up_write(&current->mm->mmap_sem);
         return(map_addr);
 }
 #endif /* !elf_map */
+/*
+ * Compute the span covered by the PT_LOAD entries of a program header
+ * table: from the page start of the first PT_LOAD's p_vaddr up to the
+ * end (p_vaddr + p_memsz) of the last PT_LOAD.
+ *
+ * @cmds: program header array
+ * @nr:   number of entries in @cmds
+ *
+ * Returns 0 when the table contains no PT_LOAD entry.
+ *
+ * NOTE(review): "first" and "last" are chosen by table index, not by
+ * address; this presumably relies on PT_LOAD entries being sorted by
+ * ascending p_vaddr - confirm.
+ */
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+        int i, first_idx = -1, last_idx = -1;
+        /* Remember the lowest and highest index holding a PT_LOAD. */
+        for (i = 0; i < nr; i++) {
+                if (cmds[i].p_type == PT_LOAD) {
+                        last_idx = i;
+                        if (first_idx == -1)
+                                first_idx = i;
+                }
+        }
+        /* No loadable segment at all: report a zero-sized image. */
+        if (first_idx == -1)
+                return 0;
+        return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+                                ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
 /* This is much more generalized than the library routine read function,
   so we keep this separate.  Technically the library read function
   is only provided so that we can read a.out libraries that have
   an ELF header */
 static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
-                struct file *interpreter, unsigned long *interp_load_addr)
+                struct file *interpreter, unsigned long *interp_map_addr,
+                unsigned long no_base)
 {
        struct elf_phdr *elf_phdata;
        struct elf_phdr *eppnt;
@@ -332,6 +370,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
        int load_addr_set = 0;
        unsigned long last_bss = 0, elf_bss = 0;
        unsigned long error = ~0UL;
+        unsigned long total_size;
        int retval, i, size;
        /* First of all, some simple consistency checks */
@@ -370,6 +409,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                goto out_close;
        }
+        total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+        if (!total_size) {
+                error = -EINVAL;
+                goto out_close;
+        }
        eppnt = elf_phdata;
        for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
                if (eppnt->p_type == PT_LOAD) {
@@ -387,9 +432,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                        vaddr = eppnt->p_vaddr;
                        if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
                                elf_type |= MAP_FIXED;
+                        else if (no_base && interp_elf_ex->e_type == ET_DYN)
+                                load_addr = -vaddr;
                        map_addr = elf_map(interpreter, load_addr + vaddr,
-                                           eppnt, elf_prot, elf_type);
+                                        eppnt, elf_prot, elf_type, total_size);
+                        total_size = 0;
+                        if (!*interp_map_addr)
+                                *interp_map_addr = map_addr;
                        error = map_addr;
                        if (BAD_ADDR(map_addr))
                                goto out_close;
@@ -455,8 +505,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                        goto out_close;
        }
-        *interp_load_addr = load_addr;
+        error = load_addr;
-        error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
 out_close:
        kfree(elf_phdata);
@@ -546,14 +595,14 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        int load_addr_set = 0;
        char * elf_interpreter = NULL;
        unsigned int interpreter_type = INTERPRETER_NONE;
-        unsigned char ibcs2_interpreter = 0;
        unsigned long error;
        struct elf_phdr *elf_ppnt, *elf_phdata;
        unsigned long elf_bss, elf_brk;
        int elf_exec_fileno;
        int retval, i;
        unsigned int size;
-        unsigned long elf_entry, interp_load_addr = 0;
+        unsigned long elf_entry;
+        unsigned long interp_load_addr = 0;
        unsigned long start_code, end_code, start_data, end_data;
        unsigned long reloc_func_desc = 0;
        char passed_fileno[6];
@@ -663,14 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                        if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
                                goto out_free_interp;
-                        /* If the program interpreter is one of these two,
-                         * then assume an iBCS2 image. Otherwise assume
-                         * a native linux image.
-                         */
-                        if (strcmp(elf_interpreter,"/usr/lib/libc.so.1") == 0 ||
-                            strcmp(elf_interpreter,"/usr/lib/ld.so.1") == 0)
-                                ibcs2_interpreter = 1;
                        /*
                         * The early SET_PERSONALITY here is so that the lookup
                         * for the interpreter happens in the namespace of the 
@@ -690,7 +731,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                         * switch really is going to happen - do this in
                         * flush_thread().      - akpm
                         */
-                        SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+                        SET_PERSONALITY(loc->elf_ex, 0);
                        interpreter = open_exec(elf_interpreter);
                        retval = PTR_ERR(interpreter);
@@ -769,7 +810,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                        goto out_free_dentry;
        } else {
                /* Executables without an interpreter also need a personality  */
-                SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+                SET_PERSONALITY(loc->elf_ex, 0);
        }
        /* OK, we are done with that, now set up the arg stuff,
@@ -803,7 +844,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        /* Do this immediately, since STACK_TOP as used in setup_arg_pages
           may depend on the personality.  */
-        SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+        SET_PERSONALITY(loc->elf_ex, 0);
        if (elf_read_implies_exec(loc->elf_ex, executable_stack))
                current->personality |= READ_IMPLIES_EXEC;
@@ -825,9 +866,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        current->mm->start_stack = bprm->p;
        /* Now we do a little grungy work by mmaping the ELF image into
-           the correct location in memory.  At this point, we assume that
+           the correct location in memory. */
-           the image should be loaded at fixed address, not at a variable
-           address. */
        for(i = 0, elf_ppnt = elf_phdata;
            i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
                int elf_prot = 0, elf_flags;
@@ -881,11 +920,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                         * default mmap base, as well as whatever program they
                         * might try to exec.  This is because the brk will
                         * follow the loader, and is not movable.  */
+#ifdef CONFIG_X86
+                        load_bias = 0;
+#else
                        load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
+#endif
                }
                error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-                                elf_prot, elf_flags);
+                                elf_prot, elf_flags, 0);
                if (BAD_ADDR(error)) {
                        send_sig(SIGKILL, current, 0);
                        retval = IS_ERR((void *)error) ?
@@ -961,13 +1004,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        }
        if (elf_interpreter) {
-                if (interpreter_type == INTERPRETER_AOUT)
+                if (interpreter_type == INTERPRETER_AOUT) {
                        elf_entry = load_aout_interp(&loc->interp_ex,
                                                     interpreter);
-                else
+                } else {
+                        unsigned long uninitialized_var(interp_map_addr);
                        elf_entry = load_elf_interp(&loc->interp_elf_ex,
                                                    interpreter,
-                                                    &interp_load_addr);
+                                                    &interp_map_addr,
+                                                    load_bias);
+                        if (!IS_ERR((void *)elf_entry)) {
+                                /*
+                                 * load_elf_interp() returns relocation
+                                 * adjustment
+                                 */
+                                interp_load_addr = elf_entry;
+                                elf_entry += loc->interp_elf_ex.e_entry;
+                        }
+                }
                if (BAD_ADDR(elf_entry)) {
                        force_sig(SIGSEGV, current);
                        retval = IS_ERR((void *)elf_entry) ?
@@ -1021,6 +1076,12 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        current->mm->end_data = end_data;
        current->mm->start_stack = bprm->p;
+#ifdef arch_randomize_brk
+        if (current->flags & PF_RANDOMIZE)
+                current->mm->brk = current->mm->start_brk =
+                        arch_randomize_brk(current->mm);
+#endif
        if (current->personality & MMAP_PAGE_ZERO) {
                /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
                   and some applications "depend" upon this behavior.
@@ -1325,7 +1386,8 @@ static int writenote(struct memelfnote *men, struct file *file,
        if (!dump_seek(file, (off))) \
                goto end_coredump;
-static void fill_elf_header(struct elfhdr *elf, int segs)
+static void fill_elf_header(struct elfhdr *elf, int segs,
+                            u16 machine, u32 flags, u8 osabi)
 {
        memcpy(elf->e_ident, ELFMAG, SELFMAG);
        elf->e_ident[EI_CLASS] = ELF_CLASS;
@@ -1335,12 +1397,12 @@ static void fill_elf_header(struct elfhdr *elf, int segs)
        memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
        elf->e_type = ET_CORE;
-        elf->e_machine = ELF_ARCH;
+        elf->e_machine = machine;
        elf->e_version = EV_CURRENT;
        elf->e_entry = 0;
        elf->e_phoff = sizeof(struct elfhdr);
        elf->e_shoff = 0;
-        elf->e_flags = ELF_CORE_EFLAGS;
+        elf->e_flags = flags;
        elf->e_ehsize = sizeof(struct elfhdr);
        elf->e_phentsize = sizeof(struct elf_phdr);
        elf->e_phnum = segs;
@@ -1384,7 +1446,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
        prstatus->pr_sigpend = p->pending.signal.sig[0];
        prstatus->pr_sighold = p->blocked.sig[0];
        prstatus->pr_pid = task_pid_vnr(p);
-        prstatus->pr_ppid = task_pid_vnr(p->parent);
+        prstatus->pr_ppid = task_pid_vnr(p->real_parent);
        prstatus->pr_pgrp = task_pgrp_vnr(p);
        prstatus->pr_sid = task_session_vnr(p);
        if (thread_group_leader(p)) {
@@ -1430,7 +1492,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
        psinfo->pr_psargs[len] = 0;
        psinfo->pr_pid = task_pid_vnr(p);
-        psinfo->pr_ppid = task_pid_vnr(p->parent);
+        psinfo->pr_ppid = task_pid_vnr(p->real_parent);
        psinfo->pr_pgrp = task_pgrp_vnr(p);
        psinfo->pr_sid = task_session_vnr(p);
@@ -1447,6 +1509,238 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
        return 0;
 }
+/*
+ * Describe the saved auxiliary vector of @mm as a "CORE"/NT_AUXV note.
+ * mm->saved_auxv itself is handed to fill_note() as the note data, and the
+ * length covers every (type, value) pair up to and including the
+ * terminating AT_NULL pair.
+ */
+static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
+{
+        elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv;
+        int nwords;
+        /* Step over the pairs until the AT_NULL type word is found ... */
+        for (nwords = 0; auxv[nwords] != AT_NULL; nwords += 2)
+                ;
+        /* ... and count that terminating pair as well. */
+        nwords += 2;
+        fill_note(note, "CORE", NT_AUXV, nwords * sizeof(elf_addr_t), auxv);
+}
+#ifdef CORE_DUMP_USE_REGSET
+#include <linux/regset.h>
+struct elf_thread_core_info {
+        struct elf_thread_core_info *next;      /* next thread in the singly linked list */
+        struct task_struct *task;               /* thread this record describes */
+        struct elf_prstatus prstatus;           /* payload of notes[0] (NT_PRSTATUS) */
+        struct memelfnote notes[0];             /* trailing array; slot count fixed at allocation */
+};
+struct elf_note_info {
+        struct elf_thread_core_info *thread;    /* per-thread list; dumping task kept at the head */
+        struct memelfnote psinfo;               /* process-wide NT_PRPSINFO note; its data is kmalloc'ed */
+        struct memelfnote auxv;                 /* process-wide NT_AUXV note */
+        size_t size;                            /* running sum of notesize() over all notes */
+        int thread_notes;                       /* regsets with a core_note_type == slots in each notes[] */
+};
+/*
+ * fill_thread_core_info - collect the register notes for one thread
+ * @t:     per-thread record; t->task names the thread and t->notes[] must
+ *         be zero-filled with one slot per regset in @view that has a
+ *         non-zero core_note_type
+ * @view:  regset view describing the thread's register sets
+ * @signr: signal number passed on to fill_prstatus()
+ * @total: running byte count; notesize() of every note filled is added
+ *
+ * Returns 1 on success, 0 if a regset buffer could not be allocated.
+ * Buffers already attached to t->notes[] are not freed here.
+ */
+static int fill_thread_core_info(struct elf_thread_core_info *t,
+                                 const struct user_regset_view *view,
+                                 long signr, size_t *total)
+{
+        unsigned int i;
+        unsigned int note_idx = 1;
+        /*
+         * NT_PRSTATUS is the one special case, because the regset data
+         * goes into the pr_reg field inside the note contents, rather
+         * than being the whole note contents.  We fill the rest in here.
+         * We assume that regset 0 is NT_PRSTATUS.
+         */
+        fill_prstatus(&t->prstatus, t->task, signr);
+        (void) view->regsets[0].get(t->task, &view->regsets[0],
+                                    0, sizeof(t->prstatus.pr_reg),
+                                    &t->prstatus.pr_reg, NULL);
+        fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+                  sizeof(t->prstatus), &t->prstatus);
+        *total += notesize(&t->notes[0]);
+        /*
+         * Each other regset might generate a note too.  Notes are packed
+         * densely into t->notes[] using note_idx rather than the regset
+         * index: the array only has as many slots as there are regsets
+         * with a core_note_type, so indexing by regset number would run
+         * past the allocation whenever the view has a regset without a
+         * core_note_type ahead of one that has it.  Slots that end up
+         * unused stay all zero and are skipped when writing.
+         */
+        for (i = 1; i < view->n; ++i) {
+                const struct user_regset *regset = &view->regsets[i];
+                size_t size;
+                void *data;
+                if (!regset->core_note_type)
+                        continue;
+                if (regset->active && !regset->active(t->task, regset))
+                        continue;
+                size = regset->n * regset->size;
+                data = kmalloc(size, GFP_KERNEL);
+                if (unlikely(!data))
+                        return 0;
+                if (unlikely(regset->get(t->task, regset,
+                                         0, size, data, NULL))) {
+                        /* Could not fetch this regset: emit no note for it. */
+                        kfree(data);
+                        continue;
+                }
+                if (regset->core_note_type != NT_PRFPREG)
+                        fill_note(&t->notes[note_idx], "LINUX",
+                                  regset->core_note_type, size, data);
+                else {
+                        t->prstatus.pr_fpvalid = 1;
+                        fill_note(&t->notes[note_idx], "CORE",
+                                  NT_PRFPREG, size, data);
+                }
+                *total += notesize(&t->notes[note_idx]);
+                ++note_idx;
+        }
+        return 1;
+}
+/*
+ * Regset flavour of fill_note_info(): gather everything that goes into the
+ * notes segment and initialize the ELF file header from the regset view of
+ * the current task.
+ *
+ * @elf:   ELF header to initialize (via fill_elf_header())
+ * @phdrs: program header count recorded in the ELF header
+ * @info:  note state to fill in
+ * @signr: signal number handed on for each thread's prstatus
+ * @regs:  not referenced by this variant
+ *
+ * Returns 1 on success; 0 if an allocation fails or the regset view does
+ * not pass the sanity check.  Nothing is freed here on failure.
+ */
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+                          struct elf_note_info *info,
+                          long signr, struct pt_regs *regs)
+{
+        struct task_struct *dump_task = current;
+        const struct user_regset_view *view = task_user_regset_view(dump_task);
+        struct elf_thread_core_info *t;
+        struct elf_prpsinfo *psinfo;
+        struct task_struct *g, *p;
+        unsigned int i;
+        info->size = 0;
+        info->thread = NULL;
+        /*
+         * The note is filled in before the allocation is checked --
+         * presumably so that info->psinfo.data is set (even if to NULL)
+         * on every return path, since free_note_info() kfree()s it.
+         */
+        psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
+        fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+        if (psinfo == NULL)
+                return 0;
+        /*
+         * Figure out how many notes we're going to need for each thread.
+         */
+        info->thread_notes = 0;
+        for (i = 0; i < view->n; ++i)
+                if (view->regsets[i].core_note_type != 0)
+                        ++info->thread_notes;
+        /*
+         * Sanity check.  We rely on regset 0 being in NT_PRSTATUS,
+         * since it is our one special case.
+         */
+        if (unlikely(info->thread_notes == 0) ||
+            unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) {
+                WARN_ON(1);
+                return 0;
+        }
+        /*
+         * Initialize the ELF file header.
+         */
+        fill_elf_header(elf, phdrs,
+                        view->e_machine, view->e_flags, view->ei_osabi);
+        /*
+         * Allocate a structure for each thread.
+         */
+        rcu_read_lock();
+        do_each_thread(g, p)
+                if (p->mm == dump_task->mm) {
+                        /* GFP_ATOMIC: we are inside rcu_read_lock(). */
+                        t = kzalloc(offsetof(struct elf_thread_core_info,
+                                             notes[info->thread_notes]),
+                                    GFP_ATOMIC);
+                        if (unlikely(!t)) {
+                                rcu_read_unlock();
+                                return 0;
+                        }
+                        t->task = p;
+                        if (p == dump_task || !info->thread) {
+                                t->next = info->thread;
+                                info->thread = t;
+                        } else {
+                                /*
+                                 * Make sure to keep the original task at
+                                 * the head of the list.
+                                 */
+                                t->next = info->thread->next;
+                                info->thread->next = t;
+                        }
+                }
+        while_each_thread(g, p);
+        rcu_read_unlock();
+        /*
+         * Now fill in each thread's information.
+         */
+        for (t = info->thread; t != NULL; t = t->next)
+                if (!fill_thread_core_info(t, view, signr, &info->size))
+                        return 0;
+        /*
+         * Fill in the two process-wide notes.
+         */
+        fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
+        info->size += notesize(&info->psinfo);
+        fill_auxv_note(&info->auxv, current->mm);
+        info->size += notesize(&info->auxv);
+        return 1;
+}
+/* Report the byte total of all notes, as accumulated in info->size. */
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+        const size_t notes_bytes = info->size;
+        return notes_bytes;
+}
+/*
+ * Emit the notes thread by thread.  For the thread at the head of the list
+ * the two process-wide notes (psinfo, then auxv) go out right after that
+ * thread's first note.  Per-thread notes with no data attached are skipped.
+ * Returns 1 on success, 0 as soon as any writenote() call fails.
+ */
+static int write_note_info(struct elf_note_info *info,
+                           struct file *file, loff_t *foffset)
+{
+        struct elf_thread_core_info *cur = info->thread;
+        bool at_head = 1;
+        do {
+                int idx;
+                if (!writenote(&cur->notes[0], file, foffset))
+                        return 0;
+                if (at_head) {
+                        if (!writenote(&info->psinfo, file, foffset))
+                                return 0;
+                        if (!writenote(&info->auxv, file, foffset))
+                                return 0;
+                        at_head = 0;
+                }
+                for (idx = 1; idx < info->thread_notes; idx++) {
+                        if (!cur->notes[idx].data)
+                                continue;
+                        if (!writenote(&cur->notes[idx], file, foffset))
+                                return 0;
+                }
+                cur = cur->next;
+        } while (cur);
+        return 1;
+}
+/*
+ * Release everything hanging off @info: each thread record together with
+ * the data buffers of its notes[1..], and the psinfo note's data.
+ * notes[0] is expected to have no data or to point at the embedded
+ * prstatus, so it is only checked, never freed.
+ */
+static void free_note_info(struct elf_note_info *info)
+{
+        struct elf_thread_core_info *cur, *following;
+        for (cur = info->thread; cur != NULL; cur = following) {
+                unsigned int idx;
+                /* Grab the link first; cur is freed below. */
+                following = cur->next;
+                WARN_ON(cur->notes[0].data &&
+                        cur->notes[0].data != &cur->prstatus);
+                for (idx = 1; idx < info->thread_notes; idx++)
+                        kfree(cur->notes[idx].data);
+                kfree(cur);
+        }
+        kfree(info->psinfo.data);
+}
+#else
 /* Here is the structure in which status of each thread is captured. */
 struct elf_thread_status
 {
@@ -1499,6 +1793,176 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
        return sz;
 }
+struct elf_note_info {
+        struct memelfnote *notes;       /* process-wide notes; numnote entries in use */
+        struct elf_prstatus *prstatus;  /* NT_PRSTATUS */
+        struct elf_prpsinfo *psinfo;    /* NT_PRPSINFO */
+        struct list_head thread_list;   /* struct elf_thread_status entries for the other threads */
+        elf_fpregset_t *fpu;            /* buffer for the NT_PRFPREG note */
+#ifdef ELF_CORE_COPY_XFPREGS
+        elf_fpxregset_t *xfpu;          /* buffer for the ELF_CORE_XFPREG_TYPE note */
+#endif
+        int thread_status_size;         /* sum of elf_dump_thread_status() results */
+        int numnote;                    /* number of entries filled in notes[] */
+};
+/*
+ * Non-regset flavour of fill_note_info(): allocate the note buffers, record
+ * the status of the other threads sharing current->mm (only when @signr is
+ * non-zero), fill in the notes for the current task and set up the ELF
+ * file header.
+ *
+ * @elf:   ELF header to initialize (via fill_elf_header())
+ * @phdrs: program header count recorded in the ELF header
+ * @info:  note state to fill in
+ * @signr: signal number; also gates the walk over the other threads
+ * @regs:  registers of the current task, copied into its prstatus
+ *
+ * Returns 1 on success, 0 if any allocation fails.  Nothing is freed here
+ * on failure; every pointer in @info is set to NULL up front, so
+ * free_note_info() can be called on a partially filled @info.
+ */
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+                          struct elf_note_info *info,
+                          long signr, struct pt_regs *regs)
+{
+#define NUM_NOTES       6
+        struct list_head *t;
+        struct task_struct *g, *p;
+        info->notes = NULL;
+        info->prstatus = NULL;
+        info->psinfo = NULL;
+        info->fpu = NULL;
+#ifdef ELF_CORE_COPY_XFPREGS
+        info->xfpu = NULL;
+#endif
+        INIT_LIST_HEAD(&info->thread_list);
+        info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
+                              GFP_KERNEL);
+        if (!info->notes)
+                return 0;
+        info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
+        if (!info->psinfo)
+                return 0;
+        info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
+        if (!info->prstatus)
+                return 0;
+        info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
+        if (!info->fpu)
+                return 0;
+#ifdef ELF_CORE_COPY_XFPREGS
+        info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
+        if (!info->xfpu)
+                return 0;
+#endif
+        info->thread_status_size = 0;
+        if (signr) {
+                struct elf_thread_status *tmp;
+                /* Queue a status record for every other thread using this mm. */
+                rcu_read_lock();
+                do_each_thread(g, p)
+                        if (current->mm == p->mm && current != p) {
+                                /* GFP_ATOMIC: we are inside rcu_read_lock(). */
+                                tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
+                                if (!tmp) {
+                                        rcu_read_unlock();
+                                        return 0;
+                                }
+                                tmp->thread = p;
+                                list_add(&tmp->list, &info->thread_list);
+                        }
+                while_each_thread(g, p);
+                rcu_read_unlock();
+                /* Fill in each queued record and add up the sizes returned. */
+                list_for_each(t, &info->thread_list) {
+                        struct elf_thread_status *tmp;
+                        int sz;
+                        tmp = list_entry(t, struct elf_thread_status, list);
+                        sz = elf_dump_thread_status(signr, tmp);
+                        info->thread_status_size += sz;
+                }
+        }
+        /* now collect the dump for the current */
+        memset(info->prstatus, 0, sizeof(*info->prstatus));
+        fill_prstatus(info->prstatus, current, signr);
+        elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+        /* Set up header */
+        fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+        /*
+         * Set up the notes in similar form to SVR4 core dumps made
+         * with info from their /proc.
+         */
+        fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
+                  sizeof(*info->prstatus), info->prstatus);
+        fill_psinfo(info->psinfo, current->group_leader, current->mm);
+        fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
+                  sizeof(*info->psinfo), info->psinfo);
+        info->numnote = 2;
+        fill_auxv_note(&info->notes[info->numnote++], current->mm);
+        /* Try to dump the FPU. */
+        info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
+                                                               info->fpu);
+        if (info->prstatus->pr_fpvalid)
+                fill_note(info->notes + info->numnote++,
+                          "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+        if (elf_core_copy_task_xfpregs(current, info->xfpu))
+                fill_note(info->notes + info->numnote++,
+                          "LINUX", ELF_CORE_XFPREG_TYPE,
+                          sizeof(*info->xfpu), info->xfpu);
+#endif
+        return 1;
+#undef NUM_NOTES
+}
+/*
+ * Byte total of the notes segment: the size of the other threads' status
+ * notes plus notesize() of each of the numnote process-wide notes.
+ */
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+        int total = info->thread_status_size;
+        int idx;
+        for (idx = 0; idx < info->numnote; idx++)
+                total += notesize(&info->notes[idx]);
+        return total;
+}
+/*
+ * Write the process-wide notes first, then the notes of every entry on
+ * the thread list.  Returns 1 on success, 0 as soon as a writenote()
+ * call fails.
+ */
+static int write_note_info(struct elf_note_info *info,
+                           struct file *file, loff_t *foffset)
+{
+        struct elf_thread_status *ts;
+        int n;
+        for (n = 0; n < info->numnote; n++) {
+                if (!writenote(&info->notes[n], file, foffset))
+                        return 0;
+        }
+        /* write out the thread status notes section */
+        list_for_each_entry(ts, &info->thread_list, list) {
+                for (n = 0; n < ts->num_notes; n++) {
+                        if (!writenote(ts->notes + n, file, foffset))
+                                return 0;
+                }
+        }
+        return 1;
+}
+/*
+ * Unlink and free every entry on the thread list, then free the buffers
+ * that @info points at.
+ */
+static void free_note_info(struct elf_note_info *info)
+{
+        struct elf_thread_status *ts, *next_ts;
+        list_for_each_entry_safe(ts, next_ts, &info->thread_list, list) {
+                list_del(&ts->list);
+                kfree(ts);
+        }
+        kfree(info->prstatus);
+        kfree(info->psinfo);
+        kfree(info->notes);
+        kfree(info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+        kfree(info->xfpu);
+#endif
+}
+#endif
 static struct vm_area_struct *first_vma(struct task_struct *tsk,
                                        struct vm_area_struct *gate_vma)
 {
@@ -1534,29 +1998,15 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
 */
 static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
 {
-#define NUM_NOTES       6
        int has_dumped = 0;
        mm_segment_t fs;
        int segs;
        size_t size = 0;
-        int i;
        struct vm_area_struct *vma, *gate_vma;
        struct elfhdr *elf = NULL;
        loff_t offset = 0, dataoff, foffset;
-        int numnote;
-        struct memelfnote *notes = NULL;
-        struct elf_prstatus *prstatus = NULL;   /* NT_PRSTATUS */
-        struct elf_prpsinfo *psinfo = NULL;     /* NT_PRPSINFO */
-        struct task_struct *g, *p;
-        LIST_HEAD(thread_list);
-        struct list_head *t;
-        elf_fpregset_t *fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
-        elf_fpxregset_t *xfpu = NULL;
-#endif
-        int thread_status_size = 0;
-        elf_addr_t *auxv;
        unsigned long mm_flags;
+        struct elf_note_info info;
        /*
         * We no longer stop all VM operations.
@@ -1574,52 +2024,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
        elf = kmalloc(sizeof(*elf), GFP_KERNEL);
        if (!elf)
                goto cleanup;
-        prstatus = kmalloc(sizeof(*prstatus), GFP_KERNEL);
-        if (!prstatus)
-                goto cleanup;
-        psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
-        if (!psinfo)
-                goto cleanup;
-        notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL);
-        if (!notes)
-                goto cleanup;
-        fpu = kmalloc(sizeof(*fpu), GFP_KERNEL);
-        if (!fpu)
-                goto cleanup;
-#ifdef ELF_CORE_COPY_XFPREGS
-        xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL);
-        if (!xfpu)
-                goto cleanup;
-#endif
-        if (signr) {
-                struct elf_thread_status *tmp;
-                rcu_read_lock();
-                do_each_thread(g,p)
-                        if (current->mm == p->mm && current != p) {
-                                tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
-                                if (!tmp) {
-                                        rcu_read_unlock();
-                                        goto cleanup;
-                                }
-                                tmp->thread = p;
-                                list_add(&tmp->list, &thread_list);
-                        }
-                while_each_thread(g,p);
-                rcu_read_unlock();
-                list_for_each(t, &thread_list) {
-                        struct elf_thread_status *tmp;
-                        int sz;
-                        tmp = list_entry(t, struct elf_thread_status, list);
-                        sz = elf_dump_thread_status(signr, tmp);
-                        thread_status_size += sz;
-                }
-        }
-        /* now collect the dump for the current */
-        memset(prstatus, 0, sizeof(*prstatus));
-        fill_prstatus(prstatus, current, signr);
-        elf_core_copy_regs(&prstatus->pr_reg, regs);
        
        segs = current->mm->map_count;
 #ifdef ELF_CORE_EXTRA_PHDRS
@@ -1630,42 +2034,16 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
        if (gate_vma != NULL)
                segs++;
-        /* Set up header */
-        fill_elf_header(elf, segs + 1); /* including notes section */
-        has_dumped = 1;
-        current->flags |= PF_DUMPCORE;
        /*
-         * Set up the notes in similar form to SVR4 core dumps made
+         * Collect all the non-memory information about the process for the
-         * with info from their /proc.
+         * notes.  This also sets up the file header.
         */
+        if (!fill_note_info(elf, segs + 1, /* including notes section */
+                            &info, signr, regs))
+                goto cleanup;
-        fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
+        has_dumped = 1;
-        fill_psinfo(psinfo, current->group_leader, current->mm);
+        current->flags |= PF_DUMPCORE;
-        fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
-        
-        numnote = 2;
-        auxv = (elf_addr_t *)current->mm->saved_auxv;
-        i = 0;
-        do
-                i += 2;
-        while (auxv[i - 2] != AT_NULL);
-        fill_note(&notes[numnote++], "CORE", NT_AUXV,
-                  i * sizeof(elf_addr_t), auxv);
-        /* Try to dump the FPU. */
-        if ((prstatus->pr_fpvalid =
-             elf_core_copy_task_fpregs(current, regs, fpu)))
-                fill_note(notes + numnote++,
-                          "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
-#ifdef ELF_CORE_COPY_XFPREGS
-        if (elf_core_copy_task_xfpregs(current, xfpu))
-                fill_note(notes + numnote++,
-                          "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
-#endif  
  
        fs = get_fs();
        set_fs(KERNEL_DS);
@@ -1678,12 +2056,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
        /* Write notes phdr entry */
        {
                struct elf_phdr phdr;
-                int sz = 0;
+                size_t sz = get_note_info_size(&info);
-                for (i = 0; i < numnote; i++)
-                        sz += notesize(notes + i);
-                
-                sz += thread_status_size;
                sz += elf_coredump_extra_notes_size();
@@ -1728,23 +2101,12 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 #endif
        /* write out the notes section */
-        for (i = 0; i < numnote; i++)
+        if (!write_note_info(&info, file, &foffset))
-                if (!writenote(notes + i, file, &foffset))
+                goto end_coredump;
-                        goto end_coredump;
        if (elf_coredump_extra_notes_write(file, &foffset))
                goto end_coredump;
-        /* write out the thread status notes section */
-        list_for_each(t, &thread_list) {
-                struct elf_thread_status *tmp =
-                                list_entry(t, struct elf_thread_status, list);
-                for (i = 0; i < tmp->num_notes; i++)
-                        if (!writenote(&tmp->notes[i], file, &foffset))
-                                goto end_coredump;
-        }
        /* Align to page */
        DUMP_SEEK(dataoff - foffset);
@@ -1795,22 +2157,9 @@ end_coredump:
        set_fs(fs);
 cleanup:
-        while (!list_empty(&thread_list)) {
-                struct list_head *tmp = thread_list.next;
-                list_del(tmp);
-                kfree(list_entry(tmp, struct elf_thread_status, list));
-        }
        kfree(elf);
-        kfree(prstatus);
+        free_note_info(&info);
-        kfree(psinfo);
-        kfree(notes);
-        kfree(fpu);
-#ifdef ELF_CORE_COPY_XFPREGS
-        kfree(xfpu);
-#endif
        return has_dumped;
-#undef NUM_NOTES
 }
 #endif          /* USE_ELF_CORE_DUMP */
diff --git a/fs/bio.c b/fs/bio.c
index d59ddbf79626..242e409dab4b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -248,11 +248,13 @@ inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
 */
 void __bio_clone(struct bio *bio, struct bio *bio_src)
 {
-        struct request_queue *q = bdev_get_queue(bio_src->bi_bdev);
        memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
                bio_src->bi_max_vecs * sizeof(struct bio_vec));
+        /*
+         * most users will be overriding ->bi_bdev with a new target,
+         * so we don't set nor calculate new physical/hw segment counts here
+         */
        bio->bi_sector = bio_src->bi_sector;
        bio->bi_bdev = bio_src->bi_bdev;
        bio->bi_flags |= 1 << BIO_CLONED;
@@ -260,8 +262,6 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
        bio->bi_vcnt = bio_src->bi_vcnt;
        bio->bi_size = bio_src->bi_size;
        bio->bi_idx = bio_src->bi_idx;
-        bio_phys_segments(q, bio);
-        bio_hw_segments(q, bio);
 }
 /**
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 993f78c55221..e48a630ae266 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -738,9 +738,9 @@ EXPORT_SYMBOL(bd_release);
 static struct kobject *bdev_get_kobj(struct block_device *bdev)
 {
        if (bdev->bd_contains != bdev)
-                return kobject_get(&bdev->bd_part->kobj);
+                return kobject_get(&bdev->bd_part->dev.kobj);
        else
-                return kobject_get(&bdev->bd_disk->kobj);
+                return kobject_get(&bdev->bd_disk->dev.kobj);
 }
 static struct kobject *bdev_get_holder(struct block_device *bdev)
@@ -1176,7 +1176,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
                                ret = -ENXIO;
                                goto out_first;
                        }
-                        kobject_get(&p->kobj);
+                        kobject_get(&p->dev.kobj);
                        bdev->bd_part = p;
                        bd_set_size(bdev, (loff_t) p->nr_sects << 9);
                }
@@ -1299,7 +1299,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
                module_put(owner);
                if (bdev->bd_contains != bdev) {
-                        kobject_put(&bdev->bd_part->kobj);
+                        kobject_put(&bdev->bd_part->dev.kobj);
                        bdev->bd_part = NULL;
                }
                bdev->bd_disk = NULL;
diff --git a/fs/buffer.c b/fs/buffer.c
index 7249e014819e..456c9ab7705b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3213,6 +3213,50 @@ static int buffer_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
+/**
+ * bh_uptodate_or_lock: Check for an up-to-date buffer, locking it if not
+ * @bh: struct buffer_head
+ *
+ * Returns 1 with the buffer unlocked if it is up-to-date (the state is
+ * checked again after taking the lock).  Returns 0 with the buffer
+ * still locked otherwise.
+ */
+int bh_uptodate_or_lock(struct buffer_head *bh)
+{
+        if (buffer_uptodate(bh))
+                return 1;
+        lock_buffer(bh);
+        /* Still stale under the lock: hand the locked buffer back. */
+        if (!buffer_uptodate(bh))
+                return 0;
+        unlock_buffer(bh);
+        return 1;
+}
+EXPORT_SYMBOL(bh_uptodate_or_lock);
+/**
+ * bh_submit_read: Read a locked buffer and wait for the result
+ * @bh: struct buffer_head; must be locked on entry (BUG otherwise)
+ *
+ * If the buffer is already up-to-date it is simply unlocked.  Otherwise a
+ * read is submitted with end_buffer_read_sync as the completion handler
+ * and the caller waits on the buffer.
+ *
+ * Returns zero on success and -EIO if the buffer is not up-to-date
+ * after the read.
+ */
+int bh_submit_read(struct buffer_head *bh)
+{
+        int err = 0;
+        BUG_ON(!buffer_locked(bh));
+        if (buffer_uptodate(bh)) {
+                unlock_buffer(bh);
+        } else {
+                get_bh(bh);
+                bh->b_end_io = end_buffer_read_sync;
+                submit_bh(READ, bh);
+                wait_on_buffer(bh);
+                if (!buffer_uptodate(bh))
+                        err = -EIO;
+        }
+        return err;
+}
+EXPORT_SYMBOL(bh_submit_read);
 void __init buffer_init(void)
 {
        int nrpages;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c3bfa76765c4..2c7a8b5b4598 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -510,9 +510,8 @@ struct cdev *cdev_alloc(void)
 {
        struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
        if (p) {
-                p->kobj.ktype = &ktype_cdev_dynamic;
                INIT_LIST_HEAD(&p->list);
-                kobject_init(&p->kobj);
+                kobject_init(&p->kobj, &ktype_cdev_dynamic);
        }
        return p;
 }
@@ -529,8 +528,7 @@ void cdev_init(struct cdev *cdev, const struct file_operations *fops)
 {
        memset(cdev, 0, sizeof *cdev);
        INIT_LIST_HEAD(&cdev->list);
-        cdev->kobj.ktype = &ktype_cdev_default;
+        kobject_init(&cdev->kobj, &ktype_cdev_default);
-        kobject_init(&cdev->kobj);
        cdev->ops = fops;
 }
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 3d419163c3d3..edd248367b36 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
+Version 1.52
+------------
+Fix oops on second mount to server when null auth is used.
+Enable experimental Kerberos support.  Return writebehind errors on flush
+and sync so that events like out of disk space get reported properly on
+cached files. Fix setxattr failure to certain Samba versions. Fix mount
+of second share to disconnected server session (autoreconnect on this).
+Add ability to modify cifs acls for handling chmod (when mounted with
+cifsacl flag).
 Version 1.51
 ------------
 Fix memory leak in statfs when mounted to very old servers (e.g.
@@ -12,7 +22,12 @@ leak that causes cifsd not to stop and rmmod to fail to cleanup
 cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
 bigendian architectures. Fix possible memory corruption when
 EAGAIN returned on kern_recvmsg. Return better error if server
-requires packet signing but client has disabled it.
+requires packet signing but client has disabled it. When mounted
+with cifsacl mount option - mode bits are approximated based
+on the contents of the ACL of the file or directory. When the cifs
+mount helper is missing, make sure that the UNC name is converted to
+have a backslash (not forward slash) between the ip address of the
+server and the share name.
 Version 1.50
 ------------
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index ff6ba8d823f0..6ba43fb346fb 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -3,4 +3,11 @@
 #
 obj-$(CONFIG_CIFS) += cifs.o
-cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o sess.o export.o cifsacl.o
+cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
+          link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
+          md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+          readdir.o ioctl.o sess.o export.o cifsacl.o
+cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
diff --git a/fs/cifs/README b/fs/cifs/README
index b806b11b5560..c623e2f9c5db 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -56,7 +56,8 @@ the CIFS VFS web site) copy it to the same directory in which mount.smbfs and
 similar files reside (usually /sbin).  Although the helper software is not  
 required, mount.cifs is recommended.  Eventually the Samba 3.0 utility program 
 "net" may also be helpful since it may someday provide easier mount syntax for
-users who are used to Windows e.g.  net use <mount point> <UNC name or cifs URL>
+users who are used to Windows e.g.
+        net use <mount point> <UNC name or cifs URL>
 Note that running the Winbind pam/nss module (logon service) on all of your
 Linux clients is useful in mapping Uids and Gids consistently across the
 domain to the proper network user.  The mount.cifs mount helper can be
@@ -225,12 +226,9 @@ If no password is provided, mount.cifs will prompt for password entry
 Restrictions
 ============
-Servers must support the NTLM SMB dialect (which is the most recent, supported 
-by Samba and Windows NT version 4, 2000 and XP and many other SMB/CIFS servers) 
 Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC 
-1001/1002 support for "Netbios-Over-TCP/IP." Neither of these is likely to be a 
+1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a 
-problem as most servers support this.  IPv6 support is planned for the future,
+problem as most servers support this.
-and is almost complete.
 Valid filenames differ between Windows and Linux.  Windows typically restricts
 filenames which contain certain reserved characters (e.g. the character : 
@@ -251,7 +249,7 @@ A partial list of the supported mount options follows:
                the CIFS session.
  password      The user password.  If the mount helper is
                installed, the user will be prompted for password
-                if it is not supplied.
+                if not supplied.
  ip            The ip address of the target server
  unc           The target server Universal Network Name (export) to 
                mount.  
@@ -286,7 +284,7 @@ A partial list of the supported mount options follows:
                can be enabled by specifying file_mode and dir_mode on 
                the client.  Note that the mount.cifs helper must be
                at version 1.10 or higher to support specifying the uid
-                (or gid) in non-numberic form.
+                (or gid) in non-numeric form.
  gid           Set the default gid for inodes (similar to above).
  file_mode     If CIFS Unix extensions are not supported by the server
                this overrides the default mode for file inodes.
@@ -420,9 +418,10 @@ A partial list of the supported mount options follows:
  acl           Allow setfacl and getfacl to manage posix ACLs if server
                supports them.  (default)
  noacl         Do not allow setfacl and getfacl calls on this mount
-  user_xattr    Allow getting and setting user xattrs as OS/2 EAs (extended
+  user_xattr    Allow getting and setting user xattrs (those attributes whose
-                attributes) to the server (default) e.g. via setfattr 
+                name begins with "user." or "os2.") as OS/2 EAs (extended
-                and getfattr utilities. 
+                attributes) to the server.  This allows support of the
+                setfattr and getfattr utilities. (default)
  nouser_xattr  Do not allow getfattr/setfattr to get/set/list xattrs 
  mapchars      Translate six of the seven reserved characters (not backslash)
                        *?<>|:
@@ -437,6 +436,7 @@ A partial list of the supported mount options follows:
 nomapchars     Do not translate any of these seven characters (default).
 nocase         Request case insensitive path name matching (case
                sensitive is the default if the server suports it).
+                (mount option "ignorecase" is identical to "nocase")
 posixpaths     If CIFS Unix extensions are supported, attempt to
                negotiate posix path name support which allows certain
                characters forbidden in typical CIFS filenames, without
@@ -458,6 +458,8 @@ A partial list of the supported mount options follows:
                byte range locks).
 remount        remount the share (often used to change from ro to rw mounts
                or vice versa)
+ cifsacl        Report mode bits (e.g. on stat) based on the Windows ACL for
+                the file. (EXPERIMENTAL)
 servern        Specify the server 's netbios name (RFC1001 name) to use
                when attempting to setup a session to the server.  This is
                This is needed for mounting to some older servers (such
@@ -486,6 +488,9 @@ A partial list of the supported mount options follows:
                        ntlmv2i Use NTLMv2 password hashing with packet signing
                        lanman  (if configured in kernel config) use older
                                lanman hash
+hard            Retry file operations if server is not responding
+soft            Limit retries to unresponsive servers (usually only
+                one retry) before returning an error.  (default)
 The mount.cifs mount helper also accepts a few mount options before -o
 including:
@@ -536,8 +541,8 @@ SecurityFlags		Flags which control security negotiation and
                        must use NTLM                                   0x02002
                        may use NTLMv2                                  0x00004
                        must use NTLMv2                                 0x04004
-                        may use Kerberos security (not implemented yet) 0x00008
+                        may use Kerberos security                       0x00008
-                        must use Kerberos (not implemented yet)         0x08008
+                        must use Kerberos                               0x08008
                        may use lanman (weak) password hash             0x00010
                        must use lanman password hash                   0x10010
                        may use plaintext passwords                     0x00020
@@ -584,8 +589,8 @@ Experimental            When set to 1 used to enable certain experimental
                        performance enhancement was disabled when
                        signing turned on in case buffer was modified
                        just before it was sent, also this flag will
-                        be used to use the new experimental sessionsetup
+                        be used to enable the new experimental directory change
-                        code).
+                        notification code).
 These experimental features and tracing can be enabled by changing flags in 
 /proc/fs/cifs (after the cifs module has been installed or built into the 
@@ -608,7 +613,8 @@ the start of smb requests and responses can be enabled via:
 Two other experimental features are under development. To test these
 requires enabling CONFIG_CIFS_EXPERIMENTAL
-        ipv6 enablement
+        cifsacl support needed to retrieve approximated mode bits based on
+                the contents of the CIFS ACL.
        DNOTIFY fcntl: needed for support of directory change 
                            notification and perhaps later for file leases)
@@ -625,10 +631,7 @@ that they represent all for that share, not just those for which the server
 returned success.
        
 Also note that "cat /proc/fs/cifs/DebugData" will display information about 
-the active sessions and the shares that are mounted.  Note: NTLMv2 enablement 
+the active sessions and the shares that are mounted.
-will not work since its implementation is not quite complete yet. Do not alter
+Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is
-the ExtendedSecurity configuration value unless you are doing specific testing.
+on but requires a user space helper (from the Samba project). NTLM, NTLMv2 and
-Enabling extended security works to Windows 2000 Workstations and XP but not to 
+LANMAN support do not require this helper.
-Windows 2000 server or Samba since it does not usually send "raw NTLMSSP" 
-(instead it sends NTLMSSP encapsulated in SPNEGO/GSSAPI, which support is not 
-complete in the CIFS VFS yet).  
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 29d4b2715254..92c9feac440f 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
-Version 1.49 April 26, 2007
+Version 1.52 January 3, 2008
 A Partial List of Missing Features
 ==================================
@@ -16,16 +16,14 @@ SecurityDescriptors
 c) Better pam/winbind integration (e.g. to handle uid mapping
 better)
-d) Kerberos/SPNEGO session setup support - (started)
+d) Cleanup now unneeded SessSetup code in
-e) Cleanup now unneeded SessSetup code in
 fs/cifs/connect.c and add back in NTLMSSP code if any servers
 need it
-f) MD5-HMAC signing SMB PDUs when SPNEGO style SessionSetup 
+e) ms-dfs and ms-dfs host name resolution cleanup
-used (Kerberos or NTLMSSP). Signing alreadyimplemented for NTLM
-and raw NTLMSSP already. This is important when enabling
+f) fix NTLMv2 signing when two mounts with different users to same
-extended security and mounting to Windows 2003 Servers
+server.
 g) Directory entry caching relies on a 1 second timer, rather than 
 using FindNotify or equivalent.  - (started)
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 2a01f3ef96a0..bcda2c6b6a04 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -77,8 +77,12 @@
 #define SPNEGO_OID_LEN 7
 #define NTLMSSP_OID_LEN  10
+#define KRB5_OID_LEN  7
+#define MSKRB5_OID_LEN  7
 static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
 static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
+static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
+static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
 /*
 * ASN.1 context.
@@ -457,6 +461,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
        unsigned long *oid = NULL;
        unsigned int cls, con, tag, oidlen, rc;
        int use_ntlmssp = FALSE;
+        int use_kerberos = FALSE;
        *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
@@ -545,18 +550,28 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                                return 0;
                        }
                        if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
-                                rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
+                                if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
-                                if (rc) {
                                        cFYI(1,
                                          ("OID len = %d oid = 0x%lx 0x%lx "
                                           "0x%lx 0x%lx",
                                           oidlen, *oid, *(oid + 1),
                                           *(oid + 2), *(oid + 3)));
-                                        rc = compare_oid(oid, oidlen,
-                                                 NTLMSSP_OID, NTLMSSP_OID_LEN);
+                                        if (compare_oid(oid, oidlen,
-                                        kfree(oid);
+                                                        MSKRB5_OID,
-                                        if (rc)
+                                                        MSKRB5_OID_LEN))
+                                                use_kerberos = TRUE;
+                                        else if (compare_oid(oid, oidlen,
+                                                             KRB5_OID,
+                                                             KRB5_OID_LEN))
+                                                use_kerberos = TRUE;
+                                        else if (compare_oid(oid, oidlen,
+                                                             NTLMSSP_OID,
+                                                             NTLMSSP_OID_LEN))
                                                use_ntlmssp = TRUE;
+                                        kfree(oid);
                                }
                        } else {
                                cFYI(1, ("Should be an oid what is going on?"));
@@ -609,12 +624,10 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                         ctx.pointer)); /* is this UTF-8 or ASCII? */
        }
-        /* if (use_kerberos)
+        if (use_kerberos)
-           *secType = Kerberos
+                *secType = Kerberos;
-           else */
+        else if (use_ntlmssp)
-        if (use_ntlmssp) {
                *secType = NTLMSSP;
-        }
        return 1;
 }
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
new file mode 100644
index 000000000000..413ee2349d1a
--- /dev/null
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -0,0 +1,377 @@
+/*
+ *   Contains the CIFS DFS referral mounting routines used for handling
+ *   traversal via DFS junction point
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Copyright (C) International Business Machines  Corp., 2008
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version
+ *   2 of the License, or (at your option) any later version.
+ */
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/vfs.h>
+#include <linux/fs.h>
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifsfs.h"
+#include "dns_resolve.h"
+#include "cifs_debug.h"
+LIST_HEAD(cifs_dfs_automount_list);
+/*
+ * DFS functions
+*/
+/*
+ * Expire DFS automounts and shrink the submounts below @vfsmnt.
+ *
+ * Runs mark_mounts_for_expiry() over cifs_dfs_automount_list twice and then
+ * calls shrink_submounts() for @vfsmnt with the same list.
+ * NOTE(review): the back-to-back calls look deliberate (presumably the first
+ * pass marks idle mounts and the second one expires those still marked) --
+ * confirm against mark_mounts_for_expiry() before collapsing them.
+ */
+void dfs_shrink_umount_helper(struct vfsmount *vfsmnt)
+{
+        mark_mounts_for_expiry(&cifs_dfs_automount_list);
+        mark_mounts_for_expiry(&cifs_dfs_automount_list);
+        shrink_submounts(vfsmnt, &cifs_dfs_automount_list);
+}
+/**
+ * cifs_get_share_name  -       extracts share name from UNC
+ * @node_name:  pointer to UNC string
+ *
+ * Extracts the share name from a full UNC, i.e. strips any trailing path
+ * that is not part of the share name and adds the leading '\' that a DFS
+ * node referral may be missing.  A UNC that already ends at the share name
+ * (no trailing path) is returned unchanged apart from that fixup.
+ *
+ * Returns pointer to the "\\server\share" string on success, or NULL if
+ * memory could not be allocated or the UNC has no server or share part.
+ * Caller is responsible for freeing returned string.
+ */
+static char *cifs_get_share_name(const char *node_name)
+{
+        int len;
+        char *UNC;
+        char *pSep;
+        len = strlen(node_name);
+        /* node_name[1] is inspected below, so it must exist */
+        if (len < 2) {
+                cERROR(1, ("%s: node name too short: %s",
+                        __FUNCTION__, node_name));
+                return NULL;
+        }
+        UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
+                         GFP_KERNEL);
+        if (!UNC)
+                return NULL;
+        /* get share name and server name */
+        if (node_name[1] != '\\') {
+                /* referral came with a single leading '\': add the second */
+                UNC[0] = '\\';
+                memcpy(UNC+1, node_name, len);
+                len++;
+        } else {
+                memcpy(UNC, node_name, len);
+        }
+        UNC[len] = 0;
+        /* find server name end */
+        pSep = memchr(UNC+2, '\\', len-2);
+        if (!pSep) {
+                cERROR(1, ("%s: no server name end in node name: %s",
+                        __FUNCTION__, node_name));
+                kfree(UNC);
+                return NULL;
+        }
+        /* step over the separator; pSep now points at the share name */
+        pSep++;
+        if (*pSep == 0) {
+                cERROR(1, ("%s:2 cant find share name in node name: %s",
+                        __FUNCTION__, node_name));
+                kfree(UNC);
+                return NULL;
+        }
+        /* find sharename end; no further '\' means there is no path to trim */
+        pSep = memchr(pSep, '\\', len-(pSep-UNC));
+        if (pSep) {
+                /* trim path after the share name: UNC is now \\server\share */
+                *pSep = 0;
+        }
+        return UNC;
+}
+/**
+ * compose_mount_options        -       creates mount options for referral
+ * @sb_mountdata:       parent/root DFS mount options (template)
+ * @ref_unc:            referral server UNC
+ * @devname:            pointer for saving device name
+ *
+ * Creates mount options for a submount based on the template options
+ * sb_mountdata, replacing the unc, ip and prefixpath options with the ones
+ * derived from ref_unc.
+ *
+ * Returns: pointer to new mount options or ERR_PTR.
+ * On success the caller is responsible for freeing both the returned string
+ * and *devname.  On failure *devname is freed here and reset to NULL.
+ */
+static char *compose_mount_options(const char *sb_mountdata,
+                                   const char *ref_unc,
+                                   char **devname)
+{
+        int rc;
+        char *mountdata;
+        int md_len;
+        char *tkn_e;
+        char *srvIP = NULL;
+        char sep = ',';
+        int off, noff;
+        if (sb_mountdata == NULL)
+                return ERR_PTR(-EINVAL);
+        *devname = cifs_get_share_name(ref_unc);
+        /* NULL covers both allocation failure and a malformed UNC */
+        if (*devname == NULL)
+                return ERR_PTR(-EINVAL);
+        rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
+        if (rc != 0) {
+                cERROR(1, ("%s: Failed to resolve server part of %s to IP",
+                          __FUNCTION__, *devname));
+                mountdata = ERR_PTR(rc);
+                goto compose_mount_options_out;
+        }
+        /*
+         * Worst case every template option is kept and we then append
+         * ",ip=" (4) + srvIP, ",unc=" (5) + *devname and
+         * ",prefixpath=" (12) + a tail of ref_unc.
+         */
+        md_len = strlen(sb_mountdata) + 4 + strlen(srvIP) + 5 +
+                 strlen(*devname) + 12 + strlen(ref_unc);
+        mountdata = kzalloc(md_len+1, GFP_KERNEL);
+        if (mountdata == NULL) {
+                mountdata = ERR_PTR(-ENOMEM);
+                goto compose_mount_options_out;
+        }
+        /* copy all options except of unc,ip,prefixpath */
+        off = 0;
+        /* "sep=X" prefix: X is the separator; a bare "sep=" is left alone */
+        if (strncmp(sb_mountdata, "sep=", 4) == 0 && sb_mountdata[4] != '\0') {
+                sep = sb_mountdata[4];
+                strncpy(mountdata, sb_mountdata, 5);
+                off += 5;
+        }
+        while ((tkn_e = strchr(sb_mountdata+off, sep))) {
+                /* length of this option including its trailing separator */
+                noff = (tkn_e - (sb_mountdata+off)) + 1;
+                if (strnicmp(sb_mountdata+off, "unc=", 4) == 0) {
+                        off += noff;
+                        continue;
+                }
+                if (strnicmp(sb_mountdata+off, "ip=", 3) == 0) {
+                        off += noff;
+                        continue;
+                }
+                /* compare the whole keyword, not just "pre" */
+                if (strnicmp(sb_mountdata+off, "prefixpath=", 11) == 0) {
+                        off += noff;
+                        continue;
+                }
+                strncat(mountdata, sb_mountdata+off, noff);
+                off += noff;
+        }
+        /*
+         * NOTE(review): the last option (the one not followed by a
+         * separator) is copied without the unc/ip/prefixpath filter, and the
+         * options appended below always use ',' even when sep differs --
+         * confirm whether either matters for the mount option parser.
+         */
+        strcat(mountdata, sb_mountdata+off);
+        /* copy new IP and ref share name */
+        strcat(mountdata, ",ip=");
+        strcat(mountdata, srvIP);
+        strcat(mountdata, ",unc=");
+        strcat(mountdata, *devname);
+        /* find & copy prefixpath: everything from the '\' after the share */
+        tkn_e = strchr(ref_unc+2, '\\');
+        if (tkn_e) {
+                tkn_e = strchr(tkn_e+1, '\\');
+                if (tkn_e) {
+                        strcat(mountdata, ",prefixpath=");
+                        strcat(mountdata, tkn_e);
+                }
+        }
+compose_mount_options_out:
+        if (IS_ERR(mountdata)) {
+                /* do not hand a half-initialised result back to the caller */
+                kfree(*devname);
+                *devname = NULL;
+        }
+        kfree(srvIP);
+        return mountdata;
+}
+/*
+ * Mount the share referenced by @ref_unc as a new cifs mount.
+ *
+ * The mount options are derived from the options saved in the superblock of
+ * @dentry (cifs_sb->mountdata) via compose_mount_options().  @mnt_parent is
+ * currently unused.
+ *
+ * Returns the result of vfs_kern_mount(), or the ERR_PTR produced by
+ * compose_mount_options() if the options could not be built.
+ */
+static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
+                struct dentry *dentry, char *ref_unc)
+{
+        struct cifs_sb_info *cifs_sb;
+        struct vfsmount *mnt;
+        char *mountdata;
+        char *devname = NULL;
+        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+        mountdata = compose_mount_options(cifs_sb->mountdata,
+                                                ref_unc, &devname);
+        if (IS_ERR(mountdata)) {
+                /* devname may have been allocated before composing failed */
+                kfree(devname);
+                return (struct vfsmount *)mountdata;
+        }
+        mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
+        kfree(mountdata);
+        kfree(devname);
+        return mnt;
+}
+/*
+ * Build the path to use for a DFS lookup of @dentry.
+ *
+ * For a share flagged SMB_SHARE_IS_IN_DFS the tree name is prepended to the
+ * path from build_path_from_dentry(); otherwise that path is returned as is.
+ * Returns a kmalloc'ed string the caller must kfree, or NULL when the dentry
+ * has no inode, the superblock has no tcon, or an allocation fails.
+ */
+static char *build_full_dfs_path_from_dentry(struct dentry *dentry)
+{
+        struct cifs_sb_info *cifs_sb;
+        char *rel_path;
+        char *dfs_path;
+        size_t buf_len;
+        if (!dentry->d_inode)
+                return NULL;
+        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+        if (!cifs_sb->tcon)
+                return NULL;
+        rel_path = build_path_from_dentry(dentry);
+        if (!rel_path)
+                return NULL;
+        /* not in DFS: the path below the share is all that is needed */
+        if (!(cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
+                return rel_path;
+        /* we should use full path name to correct working with DFS */
+        buf_len = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE+1) +
+                                strnlen(rel_path, MAX_PATHCONF) + 1;
+        dfs_path = kmalloc(buf_len, GFP_KERNEL);
+        if (dfs_path) {
+                strncpy(dfs_path, cifs_sb->tcon->treeName, buf_len);
+                strcat(dfs_path, rel_path);
+                dfs_path[buf_len-1] = 0;
+        }
+        /* the relative path is no longer needed, with or without a result */
+        kfree(rel_path);
+        return dfs_path;
+}
+/*
+ * Attach @newmnt at the location described by @nd and put it on @mntlist.
+ *
+ * An extra reference on @newmnt is taken before do_add_mount() is called.
+ * On success @nd is switched to the root of the new mount and that extra
+ * reference is kept; on every other outcome it is dropped again.
+ *
+ * Returns 0 on success and also when do_add_mount() reported -EBUSY,
+ * otherwise the error from do_add_mount().
+ */
+static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
+                                struct list_head *mntlist)
+{
+        /* stolen from afs code */
+        int err;
+        mntget(newmnt);
+        err = do_add_mount(newmnt, nd, nd->mnt->mnt_flags, mntlist);
+        switch (err) {
+        case 0:
+                /* release the old location and point nd at the new root */
+                dput(nd->dentry);
+                mntput(nd->mnt);
+                nd->mnt = newmnt;
+                nd->dentry = dget(newmnt->mnt_root);
+                break;
+        case -EBUSY:
+                /* someone else made a mount here whilst we were busy */
+                while (d_mountpoint(nd->dentry) &&
+                       follow_down(&nd->mnt, &nd->dentry))
+                        ;
+                err = 0;
+                /* fall through: our own newmnt reference is dropped below */
+        default:
+                mntput(newmnt);
+                break;
+        }
+        return err;
+}
+/* Log the fields of one DFS referral entry through cFYI. */
+static void dump_referral(const struct dfs_info3_param *ref)
+{
+        cFYI(1, ("DFS: ref path: %s", ref->path_name));
+        cFYI(1, ("DFS: node path: %s", ref->node_name));
+        cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type));
+        cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
+                                ref->PathConsumed));
+}
+/*
+ * follow_link handler for DFS referral inodes.
+ *
+ * Asks the server for the referrals of @dentry's full DFS path, tries to
+ * mount the first referral flagged DFSREF_STORAGE_SERVER that mounts
+ * successfully, and attaches that mount at @nd via add_mount_helper().
+ *
+ * Returns ERR_PTR(rc): rc is the add_mount_helper() result when a submount
+ * was obtained, otherwise a negative error (and @nd has been released with
+ * path_release()).
+ */
+static void*
+cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+{
+        struct dfs_info3_param *referrals = NULL;
+        unsigned int num_referrals = 0;
+        struct cifs_sb_info *cifs_sb;
+        struct cifsSesInfo *ses;
+        char *full_path = NULL;
+        int xid, i;
+        int rc = 0;
+        struct vfsmount *mnt = ERR_PTR(-ENOENT);
+        cFYI(1, ("in %s", __FUNCTION__));
+        BUG_ON(IS_ROOT(dentry));
+        xid = GetXid();
+        /* make nd reference the dentry being followed */
+        dput(nd->dentry);
+        nd->dentry = dget(dentry);
+        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+        /* NOTE(review): tcon is dereferenced without a NULL check here,
+         * although build_full_dfs_path_from_dentry() does check it --
+         * confirm tcon can never be NULL on this path */
+        ses = cifs_sb->tcon->ses;
+        if (!ses) {
+                rc = -EINVAL;
+                goto out_err;
+        }
+        full_path = build_full_dfs_path_from_dentry(dentry);
+        if (full_path == NULL) {
+                rc = -ENOMEM;
+                goto out_err;
+        }
+        /* NOTE(review): this rc is never examined; it is overwritten by
+         * PTR_ERR(mnt) after the loop, so a failure here surfaces as the
+         * initial -ENOENT in mnt (assuming num_referrals stays 0) */
+        rc = get_dfs_path(xid, ses , full_path, cifs_sb->local_nls,
+                &num_referrals, &referrals,
+                cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        for (i = 0; i < num_referrals; i++) {
+                dump_referral(referrals+i);
+                /* connect to a storage node */
+                if (referrals[i].flags & DFSREF_STORAGE_SERVER) {
+                        int len;
+                        len = strlen(referrals[i].node_name);
+                        if (len < 2) {
+                                cERROR(1, ("%s: Net Address path too short: %s",
+                                        __FUNCTION__, referrals[i].node_name));
+                                rc = -EINVAL;
+                                goto out_err;
+                        }
+                        mnt = cifs_dfs_do_refmount(nd->mnt, nd->dentry,
+                                                referrals[i].node_name);
+                        cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
+                                         __FUNCTION__,
+                                        referrals[i].node_name, mnt));
+                        /* complete mount procedure if we acquired submount */
+                        if (!IS_ERR(mnt))
+                                break;
+                }
+        }
+        /* needed because the for() above can exit without a valid submount */
+        rc = PTR_ERR(mnt);
+        if (IS_ERR(mnt))
+                goto out_err;
+        nd->mnt->mnt_flags |= MNT_SHRINKABLE;
+        rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
+out:
+        /* common exit: release the xid, the referral array and the path */
+        FreeXid(xid);
+        free_dfs_info_array(referrals, num_referrals);
+        kfree(full_path);
+        cFYI(1, ("leaving %s" , __FUNCTION__));
+        return ERR_PTR(rc);
+out_err:
+        /* error exit: drop nd's references, then share the cleanup above */
+        path_release(nd);
+        goto out;
+}
+/* inode operations for DFS referral inodes: only follow_link is provided */
+struct inode_operations cifs_dfs_referral_inode_operations = {
+        .follow_link = cifs_dfs_follow_mountpoint,
+};
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 34af556cdd8d..8ad2330ba061 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,6 +43,9 @@ struct cifs_sb_info {
        mode_t  mnt_dir_mode;
        int     mnt_cifs_flags;
        int     prepathlen;
-        char   *prepath;
+        char   *prepath; /* relative path under the share to mount to */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        char   *mountdata; /* mount options received at mount time */
+#endif
 };
 #endif                          /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
new file mode 100644
index 000000000000..d543accc10dd
--- /dev/null
+++ b/fs/cifs/cifs_spnego.c
@@ -0,0 +1,136 @@
+/*
+ *   fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS
+ *
+ *   Copyright (c) 2007 Red Hat, Inc.
+ *   Author(s): Jeff Layton (jlayton@redhat.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/list.h>
+#include <linux/string.h>
+#include <keys/user-type.h>
+#include <linux/key-type.h>
+#include "cifsglob.h"
+#include "cifs_spnego.h"
+#include "cifs_debug.h"
+/*
+ * Instantiate a cifs.spnego key: store a private copy of @data (@datalen
+ * bytes) as the key payload.  Returns 0, or -ENOMEM if the copy cannot be
+ * allocated.
+ */
+static int
+cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
+{
+        char *blob_copy = kmalloc(datalen, GFP_KERNEL);
+        if (!blob_copy)
+                return -ENOMEM;
+        /* attach the data */
+        memcpy(blob_copy, data, datalen);
+        rcu_assign_pointer(key->payload.data, blob_copy);
+        return 0;
+}
+/* free the payload that cifs_spnego_key_instantiate() attached to @key */
+static void
+cifs_spnego_key_destroy(struct key *key)
+{
+        kfree(key->payload.data);
+}
+/*
+ * keytype for CIFS spnego keys
+ *
+ * instantiate/destroy are the handlers defined in this file; match and
+ * describe reuse the user key type helpers.
+ */
+struct key_type cifs_spnego_key_type = {
+        .name           = "cifs.spnego",
+        .instantiate    = cifs_spnego_key_instantiate,
+        .match          = user_match,
+        .destroy        = cifs_spnego_key_destroy,
+        .describe       = user_describe,
+};
+#define MAX_VER_STR_LEN   9 /* length of longest version string e.g.
+                                strlen(";ver=0xFF") */
+#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg
+                               in future could have strlen(";sec=ntlmsspi") */
+#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+/*
+ * Get a key struct with a SPNEGO security blob, suitable for session setup.
+ *
+ * Builds a description string of the form
+ *   "ver=0x<n>;host=<hostname>;ip4=<addr>;sec=krb5;uid=0x<uid>"
+ * (ip6=<addr> for AF_INET6) and passes it to request_key() with
+ * cifs_spnego_key_type.
+ *
+ * Returns the result of request_key(), ERR_PTR(-ENOMEM) if the description
+ * cannot be allocated, or ERR_PTR(-EINVAL) for an unsupported address family
+ * or a security type other than Kerberos.
+ */
+struct key *
+cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
+{
+        struct TCP_Server_Info *server = sesInfo->server;
+        char *description, *dp;
+        size_t desc_len;
+        struct key *spnego_key;
+        const char *hostname = server->hostname;
+        /* BB: come up with better scheme for determining length */
+        /* length of fields (with semicolons): ver=0xyz ip4= ipaddress host=
+           hostname sec=mechanism uid=0x uid */
+        desc_len = MAX_VER_STR_LEN + 5 + MAX_IPV6_ADDR_LEN + 1 + 6 +
+                  strlen(hostname) + MAX_MECH_STR_LEN + 8 + (sizeof(uid_t) * 2);
+        spnego_key = ERR_PTR(-ENOMEM);
+        description = kzalloc(desc_len, GFP_KERNEL);
+        if (description == NULL)
+                goto out;
+        dp = description;
+        /* start with version and hostname portion of UNC string */
+        /* from here on, any early exit means an unsupported configuration */
+        spnego_key = ERR_PTR(-EINVAL);
+        sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+                hostname);
+        dp = description + strlen(description);
+        /* add the server address */
+        if (server->addr.sockAddr.sin_family == AF_INET)
+                sprintf(dp, "ip4=" NIPQUAD_FMT,
+                        NIPQUAD(server->addr.sockAddr.sin_addr));
+        else if (server->addr.sockAddr.sin_family == AF_INET6)
+                sprintf(dp, "ip6=" NIP6_SEQFMT,
+                        NIP6(server->addr.sockAddr6.sin6_addr));
+        else
+                goto out;
+        dp = description + strlen(description);
+        /* for now, only sec=krb5 is valid */
+        if (server->secType == Kerberos)
+                sprintf(dp, ";sec=krb5");
+        else
+                goto out;
+        dp = description + strlen(description);
+        sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
+        cFYI(1, ("key description = %s", description));
+        spnego_key = request_key(&cifs_spnego_key_type, description, "");
+#ifdef CONFIG_CIFS_DEBUG2
+        if (cifsFYI && !IS_ERR(spnego_key)) {
+                struct cifs_spnego_msg *msg = spnego_key->payload.data;
+                /* 1024U: both min() operands must have the same (unsigned)
+                   type as the uint32_t length sum */
+                cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U,
+                                msg->secblob_len + msg->sesskey_len));
+        }
+#endif /* CONFIG_CIFS_DEBUG2 */
+out:
+        kfree(description);
+        return spnego_key;
+}
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
new file mode 100644
index 000000000000..05a34b17a1ab
--- /dev/null
+++ b/fs/cifs/cifs_spnego.h
@@ -0,0 +1,47 @@
+/*
+ *   fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS
+ *
+ *   Copyright (c) 2007 Red Hat, Inc.
+ *   Author(s): Jeff Layton (jlayton@redhat.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _CIFS_SPNEGO_H
+#define _CIFS_SPNEGO_H
+#define CIFS_SPNEGO_UPCALL_VERSION 1
+/*
+ * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
+ * The flags field is for future use. The request-key callout should set
+ * sesskey_len and secblob_len, and then concatenate the SessKey+SecBlob
+ * and stuff it in the data field.
+ */
+struct cifs_spnego_msg {
+        uint32_t        version;        /* CIFS_SPNEGO_UPCALL_VERSION */
+        uint32_t        flags;          /* for future use */
+        uint32_t        sesskey_len;    /* set by the request-key callout */
+        uint32_t        secblob_len;    /* set by the request-key callout */
+        uint8_t         data[1];        /* SessKey followed by SecBlob */
+};
+#ifdef __KERNEL__
+extern struct key_type cifs_spnego_key_type;
+extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo);
+#endif /* KERNEL */
+#endif /* _CIFS_SPNEGO_H */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index e8e56353f5a1..a7035bd18e4e 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -38,13 +38,13 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
        {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
        {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"},
        {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"},
-        {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"}
+        {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} }
-};
+;
 /* security id for everyone */
-static const struct cifs_sid sid_everyone =
+static const struct cifs_sid sid_everyone = {
-                {1, 1, {0, 0, 0, 0, 0, 0}, {} };
+        1, 1, {0, 0, 0, 0, 0, 1}, {0} };
 /* group users */
 static const struct cifs_sid sid_user =
                {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
@@ -97,7 +97,7 @@ int match_sid(struct cifs_sid *ctsid)
 /* if the two SIDs (roughly equivalent to a UUID for a user or group) are
   the same returns 1, if they do not match returns 0 */
-int compare_sids(struct cifs_sid *ctsid, struct cifs_sid *cwsid)
+int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 {
        int i;
        int num_subauth, num_sat, num_saw;
@@ -130,69 +130,228 @@ int compare_sids(struct cifs_sid *ctsid, struct cifs_sid *cwsid)
 }
-static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
+/*
+ * Copy the control portion, owner SID and group SID of the security
+ * descriptor pntsd into the new descriptor pnntsd.
+ *
+ * pntsd:      security descriptor to copy from
+ * pnntsd:     descriptor being built; its DACL offset is set to directly
+ *             follow the header and its SACL offset is set to 0
+ * sidsoffset: byte offset within pnntsd at which the owner SID is written;
+ *             the group SID follows at sidsoffset + sizeof(struct cifs_sid)
+ *
+ * All multi-byte SID fields are already little-endian in the source
+ * descriptor, so they are copied verbatim without any byte-order conversion.
+ */
+static void copy_sec_desc(const struct cifs_ntsd *pntsd,
+                                struct cifs_ntsd *pnntsd, __u32 sidsoffset)
+{
+        int i;
+        const struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+        struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+        /* copy security descriptor control portion */
+        pnntsd->revision = pntsd->revision;
+        pnntsd->type = pntsd->type;
+        pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd));
+        pnntsd->sacloffset = 0;
+        pnntsd->osidoffset = cpu_to_le32(sidsoffset);
+        pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid));
+        /* copy owner sid */
+        owner_sid_ptr = (const struct cifs_sid *)((const char *)pntsd +
+                                le32_to_cpu(pntsd->osidoffset));
+        nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
+        nowner_sid_ptr->revision = owner_sid_ptr->revision;
+        nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
+        for (i = 0; i < 6; i++)
+                nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
+        for (i = 0; i < 5; i++)
+                nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
+        /* copy group sid */
+        group_sid_ptr = (const struct cifs_sid *)((const char *)pntsd +
+                                le32_to_cpu(pntsd->gsidoffset));
+        ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
+                                        sizeof(struct cifs_sid));
+        ngroup_sid_ptr->revision = group_sid_ptr->revision;
+        ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
+        for (i = 0; i < 6; i++)
+                ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
+        /* sub_auth[] is __le32 on both sides: copy as-is, exactly like the
+           owner SID above (converting again would byte-swap on big-endian) */
+        for (i = 0; i < 5; i++)
+                ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
+        return;
+}
+/*
+   Fold the access mask of one ACE into a posix mode.
+   ace_flags is the ACE access mask (little-endian, as stored in the ACE),
+   type is ACCESS_ALLOWED or ACCESS_DENIED (anything else is logged and
+   ignored), pmode is the existing mode (we only want to overwrite part of
+   this), and pbits_to_set is the mask of bits that may still be turned on:
+   S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007
+*/
+static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
+                                 umode_t *pbits_to_set)
+{
+        const __u32 flags = le32_to_cpu(ace_flags);
+        const int all_access = (flags & GENERIC_ALL) != 0;
+        const int can_write = (flags & GENERIC_WRITE) ||
+                        ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS);
+        const int can_read = (flags & GENERIC_READ) ||
+                        ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS);
+        const int can_exec = (flags & GENERIC_EXECUTE) ||
+                        ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS);
+        umode_t affected = 0;
+        /* the order of ACEs is important.  The canonical order is to begin with
+           DENY entries followed by ALLOW, otherwise an allow entry could be
+           encountered first, making the subsequent deny entry like "dead code"
+           which would be superfluous since Windows stops when a match is made
+           for the operation you are trying to perform for your user */
+        switch (type) {
+        case ACCESS_DENIED:
+                /* For deny ACEs we shrink the mask so that subsequent allow
+                   access control entries do not turn on the bits we are
+                   denying; the mode itself is left alone */
+                if (all_access)
+                        affected |= S_IRWXUGO;
+                if (can_write)
+                        affected |= S_IWUGO;
+                if (can_read)
+                        affected |= S_IRUGO;
+                if (can_exec)
+                        affected |= S_IXUGO;
+                *pbits_to_set &= ~affected;
+                return;
+        case ACCESS_ALLOWED:
+                break;
+        default:
+                cERROR(1, ("unknown access control type %d", type));
+                return;
+        }
+        /* ACCESS_ALLOWED: turn on whatever the mask still permits */
+        if (all_access) {
+                *pmode |= (S_IRWXUGO & (*pbits_to_set));
+#ifdef CONFIG_CIFS_DEBUG2
+                cFYI(1, ("all perms"));
+#endif
+                return;
+        }
+        if (can_write)
+                affected |= S_IWUGO;
+        if (can_read)
+                affected |= S_IRUGO;
+        if (can_exec)
+                affected |= S_IXUGO;
+        *pmode |= (affected & (*pbits_to_set));
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("access flags 0x%x mode now 0x%x", flags, *pmode));
+#endif
+        return;
+}
+/*
+   Generate the access flags of an ACE from permission bits.
+   mode is the mode to translate, bits_to_use selects which part of it
+   applies (S_IRWXU, S_IRWXG or S_IRWXO) and *pace_flags receives the
+   resulting access mask (0 when none of the selected bits are set).
+   fill_ace_for_sid calls this once for each ACE it builds.
+*/
+static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
+                                __u32 *pace_flags)
+{
+        /* keep only the rwx bits of the class we were asked about */
+        const umode_t relevant = mode & bits_to_use;
+        /* start from an empty access mask */
+        __u32 rights = 0x0;
+        /* test against R/W/X for user, group and other at once: we do not
+           know whose flags these are, but every bit outside bits_to_use
+           has already been cleared */
+        if (relevant & S_IRUGO)
+                rights |= SET_FILE_READ_RIGHTS;
+        if (relevant & S_IWUGO)
+                rights |= SET_FILE_WRITE_RIGHTS;
+        if (relevant & S_IXUGO)
+                rights |= SET_FILE_EXEC_RIGHTS;
+        *pace_flags = rights;
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("mode: 0x%x, access flags now 0x%x", relevant, *pace_flags));
+#endif
+        return;
+}
+/*
+ * Write an ACCESS_ALLOWED ACE for psid at pntace.
+ *
+ * pntace: where the ACE is written
+ * psid:   SID the ACE applies to; num_subauth sub-authorities are copied
+ * nmode:  mode whose permission bits are being translated
+ * bits:   which class of nmode to use (S_IRWXU, S_IRWXG or S_IRWXO)
+ *
+ * If the selected bits map to no rights at all, SET_MINIMUM_RIGHTS is used
+ * for the access mask instead of 0.
+ *
+ * Returns the size of the ACE in bytes in CPU byte order (the caller adds it
+ * to a running offset); the little-endian form is stored in pntace->size.
+ */
+static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
+                        const struct cifs_sid *psid, __u64 nmode, umode_t bits)
+{
+        int i;
+        __u16 size = 0;
+        __u32 access_req = 0;
+        pntace->type = ACCESS_ALLOWED;
+        pntace->flags = 0x0;
+        mode_to_access_flags(nmode, bits, &access_req);
+        if (!access_req)
+                access_req = SET_MINIMUM_RIGHTS;
+        pntace->access_req = cpu_to_le32(access_req);
+        pntace->sid.revision = psid->revision;
+        pntace->sid.num_subauth = psid->num_subauth;
+        for (i = 0; i < 6; i++)
+                pntace->sid.authority[i] = psid->authority[i];
+        for (i = 0; i < psid->num_subauth; i++)
+                pntace->sid.sub_auth[i] = psid->sub_auth[i];
+        /* ACE header: type(1) + flags(1) + size(2) + access_req(4), then the
+           SID: revision(1) + num_subauth(1) + authority(6) + 4 per sub_auth */
+        size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4);
+        pntace->size = cpu_to_le16(size);
+        return size;
+}
+#ifdef CONFIG_CIFS_DEBUG2
+static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
 {
        int num_subauth;
        /* validate that we do not go past end of acl */
-        /* XXX this if statement can be removed
+        if (le16_to_cpu(pace->size) < 16) {
-        if (end_of_acl < (char *)pace + sizeof(struct cifs_ace)) {
+                cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size)));
+                return;
+        }
+        if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
                cERROR(1, ("ACL too small to parse ACE"));
                return;
-        } */
+        }
-        num_subauth = pace->num_subauth;
+        num_subauth = pace->sid.num_subauth;
        if (num_subauth) {
-#ifdef CONFIG_CIFS_DEBUG2
                int i;
-                cFYI(1, ("ACE revision %d num_subauth %d",
+                cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d",
-                        pace->revision, pace->num_subauth));
+                        pace->sid.revision, pace->sid.num_subauth, pace->type,
+                        pace->flags, le16_to_cpu(pace->size)));
                for (i = 0; i < num_subauth; ++i) {
                        cFYI(1, ("ACE sub_auth[%d]: 0x%x", i,
-                                le32_to_cpu(pace->sub_auth[i])));
+                                le32_to_cpu(pace->sid.sub_auth[i])));
                }
                /* BB add length check to make sure that we do not have huge
                        num auths and therefore go off the end */
-                cFYI(1, ("RID %d", le32_to_cpu(pace->sub_auth[num_subauth-1])));
-#endif
        }
        return;
 }
-static void parse_ntace(struct cifs_ntace *pntace, char *end_of_acl)
-{
-        /* validate that we do not go past end of acl */
-        if (end_of_acl < (char *)pntace + sizeof(struct cifs_ntace)) {
-                cERROR(1, ("ACL too small to parse NT ACE"));
-                return;
-        }
-#ifdef CONFIG_CIFS_DEBUG2
-        cFYI(1, ("NTACE type %d flags 0x%x size %d, access Req 0x%x",
-                pntace->type, pntace->flags, pntace->size,
-                pntace->access_req));
 #endif
-        return;
-}
 static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
-                       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid)
+                       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
+                       struct inode *inode)
 {
        int i;
        int num_aces = 0;
        int acl_size;
        char *acl_base;
-        struct cifs_ntace **ppntace;
        struct cifs_ace **ppace;
        /* BB need to add parm so we can store the SID BB */
+        if (!pdacl) {
+                /* no DACL in the security descriptor, set
+                   all the permissions for user/group/other */
+                inode->i_mode |= S_IRWXUGO;
+                return;
+        }
        /* validate that we do not go past end of acl */
        if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
                cERROR(1, ("ACL too small to parse DACL"));
@@ -205,72 +364,101 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
                le32_to_cpu(pdacl->num_aces)));
 #endif
+        /* reset rwx permissions for user/group/other.
+           Also, if num_aces is 0 i.e. DACL has no ACEs,
+           user/group/other have no permissions */
+        inode->i_mode &= ~(S_IRWXUGO);
        acl_base = (char *)pdacl;
        acl_size = sizeof(struct cifs_acl);
        num_aces = le32_to_cpu(pdacl->num_aces);
        if (num_aces  > 0) {
-                ppntace = kmalloc(num_aces * sizeof(struct cifs_ntace *),
+                umode_t user_mask = S_IRWXU;
-                                GFP_KERNEL);
+                umode_t group_mask = S_IRWXG;
+                umode_t other_mask = S_IRWXO;
                ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
                                GFP_KERNEL);
 /*              cifscred->cecount = pdacl->num_aces;
-                cifscred->ntaces = kmalloc(num_aces *
-                        sizeof(struct cifs_ntace *), GFP_KERNEL);
                cifscred->aces = kmalloc(num_aces *
                        sizeof(struct cifs_ace *), GFP_KERNEL);*/
                for (i = 0; i < num_aces; ++i) {
-                        ppntace[i] = (struct cifs_ntace *)
+                        ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
-                                        (acl_base + acl_size);
+#ifdef CONFIG_CIFS_DEBUG2
-                        ppace[i] = (struct cifs_ace *) ((char *)ppntace[i] +
+                        dump_ace(ppace[i], end_of_acl);
-                                        sizeof(struct cifs_ntace));
+#endif
+                        if (compare_sids(&(ppace[i]->sid), pownersid))
-                        parse_ntace(ppntace[i], end_of_acl);
+                                access_flags_to_mode(ppace[i]->access_req,
-                        if (end_of_acl < ((char *)ppace[i] +
+                                                     ppace[i]->type,
-                                        (le16_to_cpu(ppntace[i]->size) -
+                                                     &(inode->i_mode),
-                                        sizeof(struct cifs_ntace)))) {
+                                                     &user_mask);
-                                cERROR(1, ("ACL too small to parse ACE"));
+                        if (compare_sids(&(ppace[i]->sid), pgrpsid))
-                                break;
+                                access_flags_to_mode(ppace[i]->access_req,
-                        } else
+                                                     ppace[i]->type,
-                                parse_ace(ppace[i], end_of_acl);
+                                                     &(inode->i_mode),
+                                                     &group_mask);
-/*                      memcpy((void *)(&(cifscred->ntaces[i])),
+                        if (compare_sids(&(ppace[i]->sid), &sid_everyone))
-                                (void *)ppntace[i],
+                                access_flags_to_mode(ppace[i]->access_req,
-                                sizeof(struct cifs_ntace));
+                                                     ppace[i]->type,
-                        memcpy((void *)(&(cifscred->aces[i])),
+                                                     &(inode->i_mode),
+                                                     &other_mask);
+/*                      memcpy((void *)(&(cifscred->aces[i])),
                                (void *)ppace[i],
                                sizeof(struct cifs_ace)); */
-                        acl_base = (char *)ppntace[i];
+                        acl_base = (char *)ppace[i];
-                        acl_size = le16_to_cpu(ppntace[i]->size);
+                        acl_size = le16_to_cpu(ppace[i]->size);
                }
                kfree(ppace);
-                kfree(ppntace);
        }
        return;
 }
-static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
+/*
+ * Build the DACL used for chmod: three ACCESS_ALLOWED ACEs, for the owner
+ * SID, the group SID and sid_everyone, derived from the user, group and
+ * other bits of nmode respectively.
+ *
+ * pndacl:    DACL header; the ACEs are written directly after it and its
+ *            size and num_aces fields are filled in (little-endian)
+ * pownersid: owner SID for the first ACE
+ * pgrpsid:   group SID for the second ACE
+ * nmode:     mode to translate
+ *
+ * Always returns 0.
+ */
+static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
+                        struct cifs_sid *pgrpsid, __u64 nmode)
 {
+        __u16 size = 0; /* running byte count of ACEs, CPU byte order */
+        struct cifs_acl *pnndacl;
+        pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl));
+        size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size),
+                                        pownersid, nmode, S_IRWXU);
+        size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
+                                        pgrpsid, nmode, S_IRWXG);
+        size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
+                                         &sid_everyone, nmode, S_IRWXO);
+        pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
+        /* num_aces is __le32 on the wire, so convert like size above */
+        pndacl->num_aces = cpu_to_le32(3);
+        return (0);
+}
+static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
+{
        /* BB need to add parm so we can store the SID BB */
-        /* validate that we do not go past end of acl */
+        /* validate that we do not go past end of ACL - sid must be at least 8
-        if (end_of_acl < (char *)psid + sizeof(struct cifs_sid)) {
+           bytes long (assuming no sub-auths - e.g. the null SID */
-                cERROR(1, ("ACL too small to parse SID"));
+        if (end_of_acl < (char *)psid + 8) {
+                cERROR(1, ("ACL too small to parse SID %p", psid));
                return -EINVAL;
        }
        if (psid->num_subauth) {
 #ifdef CONFIG_CIFS_DEBUG2
                int i;
-                cFYI(1, ("SID revision %d num_auth %d First subauth 0x%x",
+                cFYI(1, ("SID revision %d num_auth %d",
-                        psid->revision, psid->num_subauth, psid->sub_auth[0]));
+                        psid->revision, psid->num_subauth));
                for (i = 0; i < psid->num_subauth; i++) {
                        cFYI(1, ("SID sub_auth[%d]: 0x%x ", i,
@@ -289,27 +477,32 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 /* Convert CIFS ACL to POSIX form */
-int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len)
+static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
+                          struct inode *inode)
 {
        int rc;
        struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
        struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
        char *end_of_acl = ((char *)pntsd) + acl_len;
+        __u32 dacloffset;
+        if ((inode == NULL) || (pntsd == NULL))
+                return -EIO;
        owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
                                le32_to_cpu(pntsd->osidoffset));
        group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
                                le32_to_cpu(pntsd->gsidoffset));
-        dacl_ptr = (struct cifs_acl *)((char *)pntsd +
+        dacloffset = le32_to_cpu(pntsd->dacloffset);
-                                le32_to_cpu(pntsd->dacloffset));
+        dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
 #ifdef CONFIG_CIFS_DEBUG2
        cFYI(1, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x "
                 "sacloffset 0x%x dacloffset 0x%x",
                 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
                 le32_to_cpu(pntsd->gsidoffset),
-                 le32_to_cpu(pntsd->sacloffset),
+                 le32_to_cpu(pntsd->sacloffset), dacloffset));
-                 le32_to_cpu(pntsd->dacloffset)));
 #endif
+/*      cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
        rc = parse_sid(owner_sid_ptr, end_of_acl);
        if (rc)
                return rc;
@@ -318,16 +511,247 @@ int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len)
        if (rc)
                return rc;
-        parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr);
+        if (dacloffset)
+                parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
+                           group_sid_ptr, inode);
+        else
+                cFYI(1, ("no ACL")); /* BB grant all or default perms? */
 /*      cifscred->uid = owner_sid_ptr->rid;
        cifscred->gid = group_sid_ptr->rid;
        memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr,
-                        sizeof (struct cifs_sid));
+                        sizeof(struct cifs_sid));
        memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
-                        sizeof (struct cifs_sid)); */
+                        sizeof(struct cifs_sid)); */
        return (0);
 }
+/*
+ * Convert permission bits from mode to equivalent CIFS ACL.
+ *
+ * pntsd is the existing security descriptor and pnntsd the buffer in which
+ * the new one is built: header, then a DACL generated from nmode, then the
+ * owner and group SIDs copied from pntsd.  inode is only checked for NULL
+ * and acl_len is not used here.
+ *
+ * Returns -EIO if inode, pntsd or pnntsd is NULL, otherwise the result of
+ * set_chmod_dacl.
+ */
+static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
+                                int acl_len, struct inode *inode, __u64 nmode)
+{
+        /* the new DACL always sits directly behind the descriptor header */
+        const __u32 new_dacl_off = sizeof(struct cifs_ntsd);
+        struct cifs_sid *owner, *group;
+        struct cifs_acl *old_dacl; /* no need for SACL ptr */
+        struct cifs_acl *new_dacl; /* no need for SACL ptr */
+        __u32 sids_off;
+        int rc;
+        if (!inode || !pntsd || !pnntsd)
+                return -EIO;
+        owner = (struct cifs_sid *)((char *)pntsd +
+                                le32_to_cpu(pntsd->osidoffset));
+        group = (struct cifs_sid *)((char *)pntsd +
+                                le32_to_cpu(pntsd->gsidoffset));
+        old_dacl = (struct cifs_acl *)((char *)pntsd +
+                                le32_to_cpu(pntsd->dacloffset));
+        new_dacl = (struct cifs_acl *)((char *)pnntsd + new_dacl_off);
+        /* start from an empty DACL carrying the old revision */
+        new_dacl->revision = old_dacl->revision;
+        new_dacl->size = 0;
+        new_dacl->num_aces = 0;
+        rc = set_chmod_dacl(new_dacl, owner, group, nmode);
+        /* the SIDs go right behind the DACL that was just generated */
+        sids_off = new_dacl_off + le16_to_cpu(new_dacl->size);
+        /* copy security descriptor control portion and owner and group sid */
+        copy_sec_desc(pntsd, pnntsd, sids_off);
+        return rc;
+}
+/*
+ * Retrieve an ACL from the server.
+ *
+ * pacllen: receives the length reported by CIFSSMBGetCIFSACL
+ * inode:   inode whose security descriptor is wanted
+ * path:    path used to open the file when no readable handle already exists
+ *
+ * An already open readable handle is used when find_readable_file returns
+ * one; otherwise the file is opened with READ_CONTROL and closed again.
+ *
+ * Returns the descriptor pointer filled in by CIFSSMBGetCIFSACL, or NULL if
+ * inode or its superblock is NULL or the file could not be opened.  The
+ * callers in this file kfree() the result.
+ */
+static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
+                                       const char *path)
+{
+        struct cifsFileInfo *open_file;
+        int unlock_file = FALSE;
+        int xid;
+        int rc = -EIO;
+        __u16 fid;
+        struct super_block *sb;
+        struct cifs_sb_info *cifs_sb;
+        struct cifs_ntsd *pntsd = NULL;
+        cFYI(1, ("get mode from ACL for %s", path));
+        if (inode == NULL)
+                return NULL;
+        /* validate the superblock before taking an xid or looking up an open
+           handle, so that bailing out here leaves nothing to release (the
+           wrtPending count dropped at the end would otherwise be leaked) */
+        sb = inode->i_sb;
+        if (sb == NULL)
+                return NULL;
+        cifs_sb = CIFS_SB(sb);
+        xid = GetXid();
+        open_file = find_readable_file(CIFS_I(inode));
+        if (open_file) {
+                unlock_file = TRUE;
+                fid = open_file->netfid;
+        } else {
+                int oplock = FALSE;
+                /* open file */
+                rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
+                                READ_CONTROL, 0, &fid, &oplock, NULL,
+                                cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+                if (rc != 0) {
+                        cERROR(1, ("Unable to open file to get ACL"));
+                        FreeXid(xid);
+                        return NULL;
+                }
+        }
+        rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+        cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+        /* release the borrowed handle, or close the one opened above */
+        if (unlock_file == TRUE)
+                atomic_dec(&open_file->wrtPending);
+        else
+                CIFSSMBClose(xid, cifs_sb->tcon, fid);
+        FreeXid(xid);
+        return pntsd;
+}
+/*
+ * Set an ACL on the server.
+ *
+ * pnntsd: security descriptor to send
+ * acllen: length passed to CIFSSMBSetCIFSACL along with pnntsd
+ * inode:  inode whose security descriptor is being replaced
+ * path:   path used to open the file when no open handle is found
+ *
+ * An already open handle is used when find_readable_file returns one;
+ * otherwise the file is opened with WRITE_DAC and closed again.
+ *
+ * Returns -EIO if inode or its superblock is NULL, the CIFSSMBOpen error if
+ * the file could not be opened, otherwise the result of CIFSSMBSetCIFSACL.
+ */
+static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+                                struct inode *inode, const char *path)
+{
+        struct cifsFileInfo *open_file;
+        int unlock_file = FALSE;
+        int xid;
+        int rc = -EIO;
+        __u16 fid;
+        struct super_block *sb;
+        struct cifs_sb_info *cifs_sb;
+        if (!inode)
+                return (rc);
+        /* trace only after the NULL check: the message reads inode->i_mode */
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+#endif
+        sb = inode->i_sb;
+        if (sb == NULL)
+                return (rc);
+        cifs_sb = CIFS_SB(sb);
+        xid = GetXid();
+        open_file = find_readable_file(CIFS_I(inode));
+        if (open_file) {
+                unlock_file = TRUE;
+                fid = open_file->netfid;
+        } else {
+                int oplock = FALSE;
+                /* open file */
+                rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
+                                WRITE_DAC, 0, &fid, &oplock, NULL,
+                                cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+                if (rc != 0) {
+                        cERROR(1, ("Unable to open file to set ACL"));
+                        FreeXid(xid);
+                        return (rc);
+                }
+        }
+        rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("SetCIFSACL rc = %d", rc));
+#endif
+        /* release the borrowed handle, or close the one opened above */
+        if (unlock_file == TRUE)
+                atomic_dec(&open_file->wrtPending);
+        else
+                CIFSSMBClose(xid, cifs_sb->tcon, fid);
+        FreeXid(xid);
+        return (rc);
+}
+/*
+ * Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits.
+ * The security descriptor is fetched with get_cifs_acl and handed to
+ * parse_sec_desc together with inode; a parse failure is only logged.
+ * Nothing is done when the descriptor cannot be retrieved.
+ */
+void acl_to_uid_mode(struct inode *inode, const char *path)
+{
+        struct cifs_ntsd *sec_desc;
+        u32 desc_len = 0;
+        int rc;
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("converting ACL to mode for %s", path));
+#endif
+        sec_desc = get_cifs_acl(&desc_len, inode, path);
+        if (sec_desc == NULL)
+                return;
+        /* we retrieved the ACL, now parse Access Control Entries, ACEs */
+        rc = parse_sec_desc(sec_desc, desc_len, inode);
+        if (rc != 0)
+                cFYI(1, ("parse sec desc failed rc = %d", rc));
+        kfree(sec_desc);
+}
+/*
+ * Convert mode bits to an ACL so we can update the ACL on the server.
+ *
+ * The current security descriptor is read with get_cifs_acl, a new one with
+ * three ACEs (owner, group, everyone) is built from nmode, replacing all
+ * other ACEs as chmod disables them, and the result is sent with
+ * set_cifs_acl.
+ *
+ * Returns 0 when the current descriptor could not be retrieved, -ENOMEM if
+ * the new descriptor cannot be allocated, otherwise the result of
+ * build_sec_desc or, when that succeeds, of set_cifs_acl.
+ */
+int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
+{
+        int rc;
+        __u32 desc_len = 0;
+        struct cifs_ntsd *cur_desc; /* acl obtained from server */
+        struct cifs_ntsd *new_desc; /* modified acl to be sent to server */
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("set ACL from mode for %s", path));
+#endif
+        /* Get the security descriptor */
+        cur_desc = get_cifs_acl(&desc_len, inode, path);
+        if (cur_desc == NULL)
+                return 0;
+        /* allocate memory for the new security descriptor */
+        /* NOTE(review): the buffer is sized to the length of the descriptor
+           read from the server; confirm the rebuilt descriptor (header, DACL
+           with three ACEs, two SIDs) can never be larger than that */
+        new_desc = kmalloc(desc_len, GFP_KERNEL);
+        if (new_desc == NULL) {
+                cERROR(1, ("Unable to allocate security descriptor"));
+                rc = -ENOMEM;
+                goto out_free_cur;
+        }
+        rc = build_sec_desc(cur_desc, new_desc, desc_len, inode, nmode);
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("build_sec_desc rc: %d", rc));
+#endif
+        if (rc == 0) {
+                /* Set the security descriptor */
+                rc = set_cifs_acl(new_desc, desc_len, inode, path);
+#ifdef CONFIG_CIFS_DEBUG2
+                cFYI(1, ("set_cifs_acl rc: %d", rc));
+#endif
+        }
+        kfree(new_desc);
+out_free_cur:
+        kfree(cur_desc);
+        return rc;
+}
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 420f87813647..93a7c3462ea2 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -35,6 +35,9 @@
 #define UBITSHIFT       6
 #define GBITSHIFT       3
+#define ACCESS_ALLOWED  0
+#define ACCESS_DENIED   1
 struct cifs_ntsd {
        __le16 revision; /* revision level */
        __le16 type;
@@ -48,7 +51,7 @@ struct cifs_sid {
        __u8 revision; /* revision level */
        __u8 num_subauth;
        __u8 authority[6];
-        __le32 sub_auth[5]; /* sub_auth[num_subauth] */ /* BB FIXME endianness BB */
+        __le32 sub_auth[5]; /* sub_auth[num_subauth] */
 } __attribute__((packed));
 struct cifs_acl {
@@ -57,18 +60,12 @@ struct cifs_acl {
        __le32 num_aces;
 } __attribute__((packed));
-struct cifs_ntace { /* first part of ACE which contains perms */
+struct cifs_ace {
        __u8 type;
        __u8 flags;
        __le16 size;
        __le32 access_req;
-} __attribute__((packed));
+        struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
-struct cifs_ace { /* last part of ACE which includes user info */
-        __u8 revision; /* revision level */
-        __u8 num_subauth;
-        __u8 authority[6];
-        __le32 sub_auth[5];
 } __attribute__((packed));
 struct cifs_wksid {
@@ -79,7 +76,7 @@ struct cifs_wksid {
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 extern int match_sid(struct cifs_sid *);
-extern int compare_sids(struct cifs_sid *, struct cifs_sid *);
+extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
 #endif /*  CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 632070b4275d..4ff8939c6cc7 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -99,15 +99,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
        MD5Init(&context);
        MD5Update(&context, (char *)&key->data, key->len);
        for (i = 0; i < n_vec; i++) {
+                if (iov[i].iov_len == 0)
+                        continue;
                if (iov[i].iov_base == NULL) {
                        cERROR(1, ("null iovec entry"));
                        return -EIO;
-                } else if (iov[i].iov_len == 0)
+                }
-                        break; /* bail out if we are sent nothing to sign */
                /* The first entry includes a length field (which does not get
                   signed that occupies the first 4 bytes before the header */
                if (i == 0) {
-                        if (iov[0].iov_len <= 8 ) /* cmd field at offset 9 */
+                        if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
                                break; /* nothing to sign or corrupt header */
                        MD5Update(&context, iov[0].iov_base+4,
                                  iov[0].iov_len-4);
@@ -122,7 +123,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
-                   __u32 * pexpected_response_sequence_number)
+                   __u32 *pexpected_response_sequence_number)
 {
        int rc = 0;
        char smb_signature[20];
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a6fbea57c4b1..e9f4ec701092 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -43,6 +43,9 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
+#include <linux/key-type.h>
+#include "dns_resolve.h"
+#include "cifs_spnego.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42    /* the first four bytes of SMB PDUs */
 #ifdef CONFIG_CIFS_QUOTA
@@ -94,6 +97,9 @@ cifs_read_super(struct super_block *sb, void *data,
 {
        struct inode *inode;
        struct cifs_sb_info *cifs_sb;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        int len;
+#endif
        int rc = 0;
        /* BB should we make this contingent on mount parm? */
@@ -103,6 +109,25 @@ cifs_read_super(struct super_block *sb, void *data,
        if (cifs_sb == NULL)
                return -ENOMEM;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        /* copy mount params to sb for use in submounts */
+        /* BB: should we move this after the mount so we
+         * do not have to do the copy on failed mounts?
+         * BB: May be it is better to do simple copy before
+         * complex operation (mount), and in case of fail
+         * just exit instead of doing mount and attempting
+         * undo it if this copy fails?*/
+        len = strlen(data);
+        cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
+        if (cifs_sb->mountdata == NULL) {
+                kfree(sb->s_fs_info);
+                sb->s_fs_info = NULL;
+                return -ENOMEM;
+        }
+        strncpy(cifs_sb->mountdata, data, len + 1);
+        cifs_sb->mountdata[len] = '\0';
+#endif
        rc = cifs_mount(sb, cifs_sb, data, devname);
        if (rc) {
@@ -152,6 +177,12 @@ out_no_root:
 out_mount_failed:
        if (cifs_sb) {
+#ifdef CONFIG_CIFS_DFS_UPCALL
+                if (cifs_sb->mountdata) {
+                        kfree(cifs_sb->mountdata);
+                        cifs_sb->mountdata = NULL;
+                }
+#endif
                if (cifs_sb->local_nls)
                        unload_nls(cifs_sb->local_nls);
                kfree(cifs_sb);
@@ -175,6 +206,13 @@ cifs_put_super(struct super_block *sb)
        if (rc) {
                cERROR(1, ("cifs_umount failed with return code %d", rc));
        }
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        if (cifs_sb->mountdata) {
+                kfree(cifs_sb->mountdata);
+                cifs_sb->mountdata = NULL;
+        }
+#endif
        unload_nls(cifs_sb->local_nls);
        kfree(cifs_sb);
        return;
@@ -264,6 +302,7 @@ cifs_alloc_inode(struct super_block *sb)
        cifs_inode->cifsAttrs = 0x20;   /* default */
        atomic_set(&cifs_inode->inUse, 0);
        cifs_inode->time = 0;
+        cifs_inode->write_behind_rc = 0;
        /* Until the file is open and we have gotten oplock
        info back from the server, can not assume caching of
        file data or metadata */
@@ -432,6 +471,10 @@ static void cifs_umount_begin(struct vfsmount *vfsmnt, int flags)
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *tcon;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        dfs_shrink_umount_helper(vfsmnt);
+#endif /* CONFIG CIFS_DFS_UPCALL */
        if (!(flags & MNT_FORCE))
                return;
        cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
@@ -549,7 +592,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
        return remote_llseek(file, offset, origin);
 }
-static struct file_system_type cifs_fs_type = {
+struct file_system_type cifs_fs_type = {
        .owner = THIS_MODULE,
        .name = "cifs",
        .get_sb = cifs_get_sb,
@@ -850,7 +893,7 @@ static int cifs_oplock_thread(void *dummyarg)
        struct cifsTconInfo *pTcon;
        struct inode *inode;
        __u16  netfid;
-        int rc;
+        int rc, waitrc = 0;
        set_freezable();
        do {
@@ -882,9 +925,11 @@ static int cifs_oplock_thread(void *dummyarg)
                                           filemap_fdatawrite(inode->i_mapping);
                                        if (CIFS_I(inode)->clientCanCacheRead
                                                                         == 0) {
-                                                filemap_fdatawait(inode->i_mapping);
+                                                waitrc = filemap_fdatawait(inode->i_mapping);
                                                invalidate_remote_inode(inode);
                                        }
+                                        if (rc == 0)
+                                                rc = waitrc;
                                } else
                                        rc = 0;
                                /* mutex_unlock(&inode->i_mutex);*/
@@ -1005,12 +1050,21 @@ init_cifs(void)
        rc = register_filesystem(&cifs_fs_type);
        if (rc)
                goto out_destroy_request_bufs;
+#ifdef CONFIG_CIFS_UPCALL
+        rc = register_key_type(&cifs_spnego_key_type);
+        if (rc)
+                goto out_unregister_filesystem;
+#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        rc = register_key_type(&key_type_dns_resolver);
+        if (rc)
+                goto out_unregister_key_type;
+#endif
        oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
        if (IS_ERR(oplockThread)) {
                rc = PTR_ERR(oplockThread);
                cERROR(1, ("error %d create oplock thread", rc));
-                goto out_unregister_filesystem;
+                goto out_unregister_dfs_key_type;
        }
        dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
@@ -1024,7 +1078,15 @@ init_cifs(void)
 out_stop_oplock_thread:
        kthread_stop(oplockThread);
+ out_unregister_dfs_key_type:
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        unregister_key_type(&key_type_dns_resolver);
+ out_unregister_key_type:
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+        unregister_key_type(&cifs_spnego_key_type);
 out_unregister_filesystem:
+#endif
        unregister_filesystem(&cifs_fs_type);
 out_destroy_request_bufs:
        cifs_destroy_request_bufs();
@@ -1046,6 +1108,12 @@ exit_cifs(void)
 #ifdef CONFIG_PROC_FS
        cifs_proc_clean();
 #endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        unregister_key_type(&key_type_dns_resolver);
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+        unregister_key_type(&cifs_spnego_key_type);
+#endif
        unregister_filesystem(&cifs_fs_type);
        cifs_destroy_inodecache();
        cifs_destroy_mids();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 5574ba3ab1f9..195b14de5567 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -32,6 +32,7 @@
 #define TRUE 1
 #endif
+extern struct file_system_type cifs_fs_type;
 extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
@@ -60,6 +61,10 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
 extern const struct inode_operations cifs_file_inode_ops;
 extern const struct inode_operations cifs_symlink_inode_ops;
+extern struct list_head cifs_dfs_automount_list;
+extern struct inode_operations cifs_dfs_referral_inode_operations;
 /* Functions related to files and directories */
 extern const struct file_operations cifs_file_ops;
@@ -106,5 +111,5 @@ extern int cifs_ioctl(struct inode *inode, struct file *filep,
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.51"
+#define CIFS_VERSION   "1.52"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 87f51f23276f..5d32d8ddc82e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/cifsglob.h
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *              Jeremy Allison (jra@samba.org)
 *
@@ -70,14 +70,6 @@
 #endif
 /*
- * This information is kept on every Server we know about.
- *
- * Some things to note:
- *
- */
-#define SERVER_NAME_LEN_WITH_NULL       (SERVER_NAME_LENGTH + 1)
-/*
 * CIFS vfs client Status information (based on what we know.)
 */
@@ -110,6 +102,7 @@ struct mac_key {
        unsigned int len;
        union {
                char ntlm[CIFS_SESS_KEY_SIZE + 16];
+                char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */
                struct {
                        char key[16];
                        struct ntlmv2_resp resp;
@@ -139,6 +132,7 @@ struct TCP_Server_Info {
        /* 15 character server name + 0x20 16th byte indicating type = srv */
        char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
        char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
+        char *hostname; /* hostname portion of UNC string */
        struct socket *ssocket;
        union {
                struct sockaddr_in sockAddr;
@@ -458,6 +452,37 @@ struct dir_notify_req {
       struct file *pfile;
 };
+/*
+ * Describes one DFS referral; get_dfs_path() hands these back through
+ * its preferrals argument.  path_name and node_name are pointers that
+ * free_dfs_info_param() and free_dfs_info_array() release with kfree().
+ */
+struct dfs_info3_param {
+        int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/
+        int PathConsumed;
+        int server_type;
+        int ref_flag;
+        char *path_name;
+        char *node_name;
+};
+/*
+ * Free a single dfs_info3_param: first the buffers referenced by
+ * path_name and node_name, then the structure itself.
+ * Does nothing when param is NULL.
+ */
+static inline void free_dfs_info_param(struct dfs_info3_param *param)
+{
+        if (param) {
+                kfree(param->path_name);
+                kfree(param->node_name);
+                kfree(param);
+        }
+}
+/*
+ * Free an array of dfs_info3_param entries: the path_name and node_name
+ * buffers of each of the first number_of_items elements are released,
+ * then the array is released with a single kfree().
+ *
+ * Returns without freeing anything when param is NULL or when
+ * number_of_items is 0.
+ * NOTE(review): in the number_of_items == 0 case a non-NULL param is not
+ * freed here; presumably callers only pass 0 together with NULL - confirm.
+ */
+static inline void free_dfs_info_array(struct dfs_info3_param *param,
+                                       int number_of_items)
+{
+        int i;
+        if ((number_of_items == 0) || (param == NULL))
+                return;
+        for (i = 0; i < number_of_items; i++) {
+                kfree(param[i].path_name);
+                kfree(param[i].node_name);
+        }
+        kfree(param);
+}
 #define   MID_FREE 0
 #define   MID_REQUEST_ALLOCATED 1
 #define   MID_REQUEST_SUBMITTED 2
@@ -471,6 +496,17 @@ struct dir_notify_req {
 #define   CIFS_LARGE_BUFFER     2
 #define   CIFS_IOVEC            4    /* array of response buffers */
+/* Type of Request to SendReceive2 */
+#define   CIFS_STD_OP           0    /* normal request timeout */
+#define   CIFS_LONG_OP          1    /* long op (up to 45 sec, oplock time) */
+#define   CIFS_VLONG_OP         2    /* sloow op - can take up to 180 seconds */
+#define   CIFS_BLOCKING_OP      4    /* operation can block */
+#define   CIFS_ASYNC_OP         8    /* do not wait for response */
+#define   CIFS_TIMEOUT_MASK 0x00F    /* only one of 5 above set in req */
+#define   CIFS_LOG_ERROR    0x010    /* log NT STATUS if non-zero */
+#define   CIFS_LARGE_BUF_OP 0x020    /* large request buffer */
+#define   CIFS_NO_RESP      0x040    /* no response buffer required */
 /* Security Flags: indicate type of session setup needed */
 #define   CIFSSEC_MAY_SIGN      0x00001
 #define   CIFSSEC_MAY_NTLM      0x00002
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index c41ff74e9128..47f79504f57b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -220,6 +220,26 @@
                                | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
 #define FILE_EXEC_RIGHTS (FILE_EXECUTE)
+#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_WRITE_EA \
+                                | FILE_READ_ATTRIBUTES \
+                                | FILE_WRITE_ATTRIBUTES \
+                                | DELETE | READ_CONTROL | WRITE_DAC \
+                                | WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+                                | FILE_READ_EA | FILE_WRITE_EA \
+                                | FILE_DELETE_CHILD | FILE_READ_ATTRIBUTES \
+                                | FILE_WRITE_ATTRIBUTES \
+                                | DELETE | READ_CONTROL | WRITE_DAC \
+                                | WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \
+                                | FILE_READ_ATTRIBUTES \
+                                | FILE_WRITE_ATTRIBUTES \
+                                | DELETE | READ_CONTROL | WRITE_DAC \
+                                | WRITE_OWNER | SYNCHRONIZE)
+#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \
+                                | READ_CONTROL | SYNCHRONIZE)
 /*
 * Invalid readdir handle
@@ -1211,6 +1231,29 @@ typedef struct smb_com_transaction_qsec_req {
        __le32 AclFlags;
 } __attribute__((packed)) QUERY_SEC_DESC_REQ;
+/*
+ * Packed request structure for the SET_SECURITY_DESC subcommand (see
+ * the SubCommand field below).  The three fields following Pad (Fid,
+ * Reserved2, AclFlags) occupy 8 bytes, which together with the 3 pad
+ * bytes gives the "3 + 8" noted on ByteCount.
+ */
+typedef struct smb_com_transaction_ssec_req {
+        struct smb_hdr hdr;     /* wct = 19 */
+        __u8 MaxSetupCount;
+        __u16 Reserved;
+        __le32 TotalParameterCount;
+        __le32 TotalDataCount;
+        __le32 MaxParameterCount;
+        __le32 MaxDataCount;
+        __le32 ParameterCount;
+        __le32 ParameterOffset;
+        __le32 DataCount;
+        __le32 DataOffset;
+        __u8 SetupCount; /* no setup words follow subcommand */
+        /* SNIA spec incorrectly included spurious pad here */
+        __le16 SubCommand; /* 3 = SET_SECURITY_DESC */
+        __le16 ByteCount; /* bcc = 3 + 8 */
+        __u8 Pad[3];
+        __u16 Fid;
+        __u16 Reserved2;
+        __le32 AclFlags;
+} __attribute__((packed)) SET_SEC_DESC_REQ;
 typedef struct smb_com_transaction_change_notify_req {
        struct smb_hdr hdr;     /* wct = 23 */
        __u8 MaxSetupCount;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1a883663b22d..2f09f565a3d9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/cifsproto.h
 *
- *   Copyright (c) International Business Machines  Corp., 2002,2007
+ *   Copyright (c) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -48,10 +48,11 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
                        struct smb_hdr * /* input */ ,
                        struct smb_hdr * /* out */ ,
                        int * /* bytes returned */ , const int long_op);
+extern int SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
+                        struct smb_hdr *in_buf, int flags);
 extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
                        struct kvec *, int /* nvec to send */,
-                        int * /* type of buf returned */ , const int long_op,
+                        int * /* type of buf returned */ , const int flags);
-                        const int logError /* whether to log status code*/ );
 extern int SendReceiveBlockingLock(const unsigned int /* xid */ ,
                                        struct cifsTconInfo *,
                                struct smb_hdr * /* input */ ,
@@ -61,6 +62,9 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
 extern int is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
+#endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -92,10 +96,15 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
                        const unsigned char *search_path,
                        struct super_block *sb, int xid);
+extern void acl_to_uid_mode(struct inode *inode, const char *search_path);
+extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
                        const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+extern void dfs_shrink_umount_helper(struct vfsmount *vfsmnt);
+#endif
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
@@ -147,7 +156,7 @@ extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
                        const char *old_path,
                        const struct nls_table *nls_codepage,
                        unsigned int *pnum_referrals,
-                        unsigned char **preferrals,
+                        struct dfs_info3_param **preferrals,
                        int remap);
 extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
                                 struct super_block *sb, struct smb_vol *vol);
@@ -241,15 +250,15 @@ extern int CIFSSMBQueryReparseLinkInfo(const int xid,
 extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
                        const char *fileName, const int disposition,
                        const int access_flags, const int omode,
-                        __u16 * netfid, int *pOplock, FILE_ALL_INFO *,
+                        __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
                        const struct nls_table *nls_codepage, int remap);
 extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
                        const char *fileName, const int disposition,
                        const int access_flags, const int omode,
-                        __u16 * netfid, int *pOplock, FILE_ALL_INFO *,
+                        __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
                        const struct nls_table *nls_codepage, int remap);
 extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
-                        u32 posix_flags, __u64 mode, __u16 * netfid,
+                        u32 posix_flags, __u64 mode, __u16 *netfid,
                        FILE_UNIX_BASIC_INFO *pRetData,
                        __u32 *pOplock, const char *name,
                        const struct nls_table *nls_codepage, int remap);
@@ -270,7 +279,7 @@ extern int CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
                        const __u64 offset, unsigned int *nbytes,
                        struct kvec *iov, const int nvec, const int long_op);
 extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
-                        const unsigned char *searchName, __u64 * inode_number,
+                        const unsigned char *searchName, __u64 *inode_number,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
 extern int cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
@@ -311,7 +320,6 @@ extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key);
 #endif /* CIFS_WEAK_PW_HASH */
-extern int parse_sec_desc(struct cifs_ntsd *, int);
 extern int CIFSSMBCopy(int xid,
                        struct cifsTconInfo *source_tcon,
                        const char *fromName,
@@ -336,8 +344,9 @@ extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
                const void *ea_value, const __u16 ea_value_len,
                const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon,
-                        __u16 fid, char *acl_inf, const int buflen,
+                        __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
-                        const int acl_type /* ACCESS vs. DEFAULT */);
+extern int CIFSSMBSetCIFSACL(const int, struct cifsTconInfo *, __u16,
+                        struct cifs_ntsd *, __u32);
 extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
                const unsigned char *searchName,
                char *acl_inf, const int buflen, const int acl_type,
@@ -347,5 +356,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
                const char *local_acl, const int buflen, const int acl_type,
                const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
-                        const int netfid, __u64 * pExtAttrBits, __u64 *pMask);
+                        const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 #endif                  /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f0d9a485d095..9409524e4bf8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -647,8 +647,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                                                 count - 16,
                                                 &server->secType);
                        if (rc == 1) {
-                        /* BB Need to fill struct for sessetup here */
+                                rc = 0;
-                                rc = -EOPNOTSUPP;
                        } else {
                                rc = -EINVAL;
                        }
@@ -699,9 +698,7 @@ int
 CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 {
        struct smb_hdr *smb_buffer;
-        struct smb_hdr *smb_buffer_response; /* BB removeme BB */
        int rc = 0;
-        int length;
        cFYI(1, ("In tree disconnect"));
        /*
@@ -738,16 +735,12 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
        if (rc) {
                up(&tcon->tconSem);
                return rc;
-        } else {
-                smb_buffer_response = smb_buffer; /* BB removeme BB */
        }
-        rc = SendReceive(xid, tcon->ses, smb_buffer, smb_buffer_response,
-                         &length, 0);
+        rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
        if (rc)
                cFYI(1, ("Tree disconnect failed %d", rc));
-        if (smb_buffer)
-                cifs_small_buf_release(smb_buffer);
        up(&tcon->tconSem);
        /* No need to return error on this operation if tid invalidated and
@@ -761,10 +754,8 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 int
 CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 {
-        struct smb_hdr *smb_buffer_response;
        LOGOFF_ANDX_REQ *pSMB;
        int rc = 0;
-        int length;
        cFYI(1, ("In SMBLogoff for session disconnect"));
        if (ses)
@@ -783,8 +774,6 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
                return rc;
        }
-        smb_buffer_response = (struct smb_hdr *)pSMB; /* BB removeme BB */
        if (ses->server) {
                pSMB->hdr.Mid = GetNextMid(ses->server);
@@ -796,8 +785,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
        pSMB->hdr.Uid = ses->Suid;
        pSMB->AndXCommand = 0xFF;
-        rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
+        rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
-                         smb_buffer_response, &length, 0);
        if (ses->server) {
                atomic_dec(&ses->server->socketUseCount);
                if (atomic_read(&ses->server->socketUseCount) == 0) {
@@ -808,7 +796,6 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
                }
        }
        up(&ses->sesSem);
-        cifs_small_buf_release(pSMB);
        /* if session dead then we do not need to do ulogoff,
                since server closed smb session, no sense reporting
@@ -1256,7 +1243,7 @@ OldOpenRetry:
        pSMB->ByteCount = cpu_to_le16(count);
        /* long_op set to 1 to allow for oplock break timeouts */
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-                         (struct smb_hdr *) pSMBr, &bytes_returned, 1);
+                        (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
        cifs_stats_inc(&tcon->num_opens);
        if (rc) {
                cFYI(1, ("Error in Open = %d", rc));
@@ -1369,7 +1356,7 @@ openRetry:
        pSMB->ByteCount = cpu_to_le16(count);
        /* long_op set to 1 to allow for oplock break timeouts */
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-                         (struct smb_hdr *) pSMBr, &bytes_returned, 1);
+                        (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
        cifs_stats_inc(&tcon->num_opens);
        if (rc) {
                cFYI(1, ("Error in Open = %d", rc));
@@ -1447,7 +1434,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
        iov[0].iov_base = (char *)pSMB;
        iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
        rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
-                         &resp_buf_type, 0 /* not long op */, 1 /* log err */ );
+                         &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR);
        cifs_stats_inc(&tcon->num_reads);
        pSMBr = (READ_RSP *)iov[0].iov_base;
        if (rc) {
@@ -1666,7 +1653,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
        rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type,
-                          long_op, 0 /* do not log STATUS code */ );
+                          long_op);
        cifs_stats_inc(&tcon->num_writes);
        if (rc) {
                cFYI(1, ("Send error Write2 = %d", rc));
@@ -1708,7 +1695,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
        int timeout = 0;
        __u16 count;
-        cFYI(1, ("In CIFSSMBLock - timeout %d numLock %d", waitFlag, numLock));
+        cFYI(1, ("CIFSSMBLock timeout %d numLock %d", waitFlag, numLock));
        rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
        if (rc)
@@ -1717,10 +1704,10 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
        pSMBr = (LOCK_RSP *)pSMB; /* BB removeme BB */
        if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) {
-                timeout = -1; /* no response expected */
+                timeout = CIFS_ASYNC_OP; /* no response expected */
                pSMB->Timeout = 0;
        } else if (waitFlag == TRUE) {
-                timeout = 3;  /* blocking operation, no timeout */
+                timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
                pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */
        } else {
                pSMB->Timeout = 0;
@@ -1750,15 +1737,16 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
        if (waitFlag) {
                rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
                        (struct smb_hdr *) pSMBr, &bytes_returned);
+                cifs_small_buf_release(pSMB);
        } else {
-                rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+                rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB,
-                         (struct smb_hdr *) pSMBr, &bytes_returned, timeout);
+                                      timeout);
+                /* SMB buffer freed by function above */
        }
        cifs_stats_inc(&tcon->num_locks);
        if (rc) {
                cFYI(1, ("Send error in Lock = %d", rc));
        }
-        cifs_small_buf_release(pSMB);
        /* Note: On -EAGAIN error only caller can retry on handle based calls
        since file handle passed in no longer valid */
@@ -1777,7 +1765,9 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
        int rc = 0;
        int timeout = 0;
        int bytes_returned = 0;
+        int resp_buf_type = 0;
        __u16 params, param_offset, offset, byte_count, count;
+        struct kvec iov[1];
        cFYI(1, ("Posix Lock"));
@@ -1819,7 +1809,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
        parm_data->lock_type = cpu_to_le16(lock_type);
        if (waitFlag) {
-                timeout = 3;  /* blocking operation, no timeout */
+                timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
                parm_data->lock_flags = cpu_to_le16(1);
                pSMB->Timeout = cpu_to_le32(-1);
        } else
@@ -1839,8 +1829,13 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
                rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
                        (struct smb_hdr *) pSMBr, &bytes_returned);
        } else {
-                rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+                iov[0].iov_base = (char *)pSMB;
-                        (struct smb_hdr *) pSMBr, &bytes_returned, timeout);
+                iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+                rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
+                                &resp_buf_type, timeout);
+                pSMB = NULL; /* request buf already freed by SendReceive2. Do
+                                not try to free it twice below on exit */
+                pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base;
        }
        if (rc) {
@@ -1875,6 +1870,11 @@ plk_err_exit:
        if (pSMB)
                cifs_small_buf_release(pSMB);
+        if (resp_buf_type == CIFS_SMALL_BUFFER)
+                cifs_small_buf_release(iov[0].iov_base);
+        else if (resp_buf_type == CIFS_LARGE_BUFFER)
+                cifs_buf_release(iov[0].iov_base);
        /* Note: On -EAGAIN error only caller can retry on handle based calls
           since file handle passed in no longer valid */
@@ -1887,8 +1887,6 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
        int rc = 0;
        CLOSE_REQ *pSMB = NULL;
-        CLOSE_RSP *pSMBr = NULL;
-        int bytes_returned;
        cFYI(1, ("In CIFSSMBClose"));
 /* do not retry on dead session on close */
@@ -1898,13 +1896,10 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
        if (rc)
                return rc;
-        pSMBr = (CLOSE_RSP *)pSMB; /* BB removeme BB */
        pSMB->FileID = (__u16) smb_file_id;
        pSMB->LastWriteTime = 0xFFFFFFFF;
        pSMB->ByteCount = 0;
-        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+        rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
-                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_closes);
        if (rc) {
                if (rc != -EINTR) {
@@ -1913,8 +1908,6 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
                }
        }
-        cifs_small_buf_release(pSMB);
        /* Since session is dead, file will be closed on server already */
        if (rc == -EAGAIN)
                rc = 0;
@@ -2486,6 +2479,7 @@ querySymLinkRetry:
        return rc;
 }
+#ifdef CONFIG_CIFS_EXPERIMENTAL
 /* Initialize NT TRANSACT SMB into small smb request buffer.
   This assumes that all NT TRANSACTS that we init here have
   total parm and data under about 400 bytes (to fit in small cifs
@@ -2494,7 +2488,7 @@ querySymLinkRetry:
        MaxSetupCount (size of returned setup area) and
        MaxParameterCount (returned parms size) must be set by caller */
 static int
-smb_init_ntransact(const __u16 sub_command, const int setup_count,
+smb_init_nttransact(const __u16 sub_command, const int setup_count,
                   const int parm_len, struct cifsTconInfo *tcon,
                   void **ret_buf)
 {
@@ -2525,12 +2519,15 @@ smb_init_ntransact(const __u16 sub_command, const int setup_count,
 static int
 validate_ntransact(char *buf, char **ppparm, char **ppdata,
-                   int *pdatalen, int *pparmlen)
+                   __u32 *pparmlen, __u32 *pdatalen)
 {
        char *end_of_smb;
        __u32 data_count, data_offset, parm_count, parm_offset;
        struct smb_com_ntransact_rsp *pSMBr;
+        *pdatalen = 0;
+        *pparmlen = 0;
        if (buf == NULL)
                return -EINVAL;
@@ -2567,8 +2564,11 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
                cFYI(1, ("parm count and data count larger than SMB"));
                return -EINVAL;
        }
+        *pdatalen = data_count;
+        *pparmlen = parm_count;
        return 0;
 }
+#endif /* CIFS_EXPERIMENTAL */
 int
 CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3067,8 +3067,7 @@ GetExtAttrOut:
 /* Get Security Descriptor (by handle) from remote server for a file or dir */
 int
 CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
-                /* BB fix up return info */ char *acl_inf, const int buflen,
+                  struct cifs_ntsd **acl_inf, __u32 *pbuflen)
-                  const int acl_type)
 {
        int rc = 0;
        int buf_type = 0;
@@ -3077,7 +3076,10 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
        cFYI(1, ("GetCifsACL"));
-        rc = smb_init_ntransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0,
+        *pbuflen = 0;
+        *acl_inf = NULL;
+        rc = smb_init_nttransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0,
                        8 /* parm len */, tcon, (void **) &pSMB);
        if (rc)
                return rc;
@@ -3094,39 +3096,57 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
        iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
        rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
-                         0 /* not long op */, 0 /* do not log STATUS codes */ );
+                         CIFS_STD_OP);
        cifs_stats_inc(&tcon->num_acl_get);
        if (rc) {
                cFYI(1, ("Send error in QuerySecDesc = %d", rc));
        } else {                /* decode response */
-                struct cifs_ntsd *psec_desc;
                __le32 * parm;
-                int parm_len;
+                __u32 parm_len;
-                int data_len;
+                __u32 acl_len;
-                int acl_len;
                struct smb_com_ntransact_rsp *pSMBr;
+                char *pdata;
 /* validate_nttransact */
                rc = validate_ntransact(iov[0].iov_base, (char **)&parm,
-                                        (char **)&psec_desc,
+                                        &pdata, &parm_len, pbuflen);
-                                        &parm_len, &data_len);
                if (rc)
                        goto qsec_out;
                pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
-                cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, psec_desc));
+                cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf));
                if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
                        rc = -EIO;      /* bad smb */
+                        *pbuflen = 0;
                        goto qsec_out;
                }
 /* BB check that data area is minimum length and as big as acl_len */
                acl_len = le32_to_cpu(*parm);
-                /* BB check if (acl_len > bufsize) */
+                if (acl_len != *pbuflen) {
+                        cERROR(1, ("acl length %d does not match %d",
+                                   acl_len, *pbuflen));
+                        if (*pbuflen > acl_len)
+                                *pbuflen = acl_len;
+                }
-                parse_sec_desc(psec_desc, acl_len);
+                /* check if buffer is big enough for the acl
+                   header followed by the smallest SID */
+                if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
+                    (*pbuflen >= 64 * 1024)) {
+                        cERROR(1, ("bad acl length %d", *pbuflen));
+                        rc = -EINVAL;
+                        *pbuflen = 0;
+                } else {
+                        *acl_inf = kmalloc(*pbuflen, GFP_KERNEL);
+                        if (*acl_inf == NULL) {
+                                *pbuflen = 0;
+                                rc = -ENOMEM;
+                        }
+                        memcpy(*acl_inf, pdata, *pbuflen);
+                }
        }
 qsec_out:
        if (buf_type == CIFS_SMALL_BUFFER)
@@ -3136,6 +3156,71 @@ qsec_out:
 /*      cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
        return rc;
 }
+/*
+ * Send an NT_TRANSACT_SET_SECURITY_DESC request for the file handle fid on
+ * tcon, with acllen bytes copied from pntsd as the data area.  AclFlags is
+ * set to CIFS_ACL_DACL.
+ *
+ * xid is handed through to SendReceive.  Returns the rc of smb_init if that
+ * fails, otherwise the rc of SendReceive; while SendReceive returns -EAGAIN
+ * the request is rebuilt from scratch and sent again.
+ */
+int
+CIFSSMBSetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
+                        struct cifs_ntsd *pntsd, __u32 acllen)
+{
+        __u16 byte_count, param_count, data_count, param_offset, data_offset;
+        int rc = 0;
+        int bytes_returned = 0;
+        SET_SEC_DESC_REQ *pSMB = NULL;
+        NTRANSACT_RSP *pSMBr = NULL;
+setCifsAclRetry:
+        /* (re)acquire the buffers; reached again from the -EAGAIN retry at
+           the bottom, after the previous buffer has been released */
+        rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB,
+                        (void **) &pSMBr);
+        if (rc)
+                        return (rc);
+        pSMB->MaxSetupCount = 0;
+        pSMB->Reserved = 0;
+        /* presumably Fid + Reserved2 + AclFlags - confirm against
+           SET_SEC_DESC_REQ */
+        param_count = 8;
+        /* NOTE(review): the "- 4" presumably discounts a 4 byte length
+           field that precedes the SMB header - confirm */
+        param_offset = offsetof(struct smb_com_transaction_ssec_req, Fid) - 4;
+        /* NOTE(review): acllen is __u32 but data_count is __u16, so the
+           value is narrowed here */
+        data_count = acllen;
+        /* the data area is placed directly after the parameter area */
+        data_offset = param_offset + param_count;
+        byte_count = 3 /* pad */  + param_count;
+        pSMB->DataCount = cpu_to_le32(data_count);
+        pSMB->TotalDataCount = pSMB->DataCount;
+        pSMB->MaxParameterCount = cpu_to_le32(4);
+        pSMB->MaxDataCount = cpu_to_le32(16384);
+        pSMB->ParameterCount = cpu_to_le32(param_count);
+        pSMB->ParameterOffset = cpu_to_le32(param_offset);
+        pSMB->TotalParameterCount = pSMB->ParameterCount;
+        pSMB->DataOffset = cpu_to_le32(data_offset);
+        pSMB->SetupCount = 0;
+        pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_SET_SECURITY_DESC);
+        pSMB->ByteCount = cpu_to_le16(byte_count+data_count);
+        pSMB->Fid = fid; /* file handle always le */
+        pSMB->Reserved2 = 0;
+        pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL);
+        if (pntsd && acllen) {
+                /* NOTE(review): the destination is addressed through pSMBr,
+                   not pSMB; presumably smb_init points both at the same
+                   buffer - confirm.  The copy length is the full __u32
+                   acllen and is not bounded against the buffer size here */
+                memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
+                        (char *) pntsd,
+                        acllen);
+                pSMB->hdr.smb_buf_length += (byte_count + data_count);
+        } else
+                /* NOTE(review): DataCount and ByteCount above were already
+                   set from data_count, even though no data is counted in
+                   smb_buf_length on this path */
+                pSMB->hdr.smb_buf_length += byte_count;
+        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+                (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+        cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc));
+        if (rc)
+                cFYI(1, ("Set CIFS ACL returned %d", rc));
+        /* release before a possible retry; smb_init allocates afresh */
+        cifs_buf_release(pSMB);
+        if (rc == -EAGAIN)
+                goto setCifsAclRetry;
+        return (rc);
+}
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 /* Legacy Query Path Information call for lookup to old servers such
@@ -3381,7 +3466,7 @@ UnixQPathInfoRetry:
                        memcpy((char *) pFindData,
                               (char *) &pSMBr->hdr.Protocol +
                               data_offset,
-                               sizeof (FILE_UNIX_BASIC_INFO));
+                               sizeof(FILE_UNIX_BASIC_INFO));
                }
        }
        cifs_buf_release(pSMB);
@@ -3649,7 +3734,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
        pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_NEXT);
        pSMB->SearchHandle = searchHandle;      /* always kept as le */
        pSMB->SearchCount =
-                cpu_to_le16(CIFSMaxBufSize / sizeof (FILE_UNIX_INFO));
+                cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO));
        pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
        pSMB->ResumeKey = psrch_inf->resume_key;
        pSMB->SearchFlags =
@@ -3737,8 +3822,6 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 {
        int rc = 0;
        FINDCLOSE_REQ *pSMB = NULL;
-        CLOSE_RSP *pSMBr = NULL; /* BB removeme BB */
-        int bytes_returned;
        cFYI(1, ("In CIFSSMBFindClose"));
        rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
@@ -3750,16 +3833,13 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
        if (rc)
                return rc;
-        pSMBr = (CLOSE_RSP *)pSMB;  /* BB removeme BB */
        pSMB->FileID = searchHandle;
        pSMB->ByteCount = 0;
-        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+        rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
-                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        if (rc) {
                cERROR(1, ("Send error in FindClose = %d", rc));
        }
        cifs_stats_inc(&tcon->num_fclose);
-        cifs_small_buf_release(pSMB);
        /* Since session is dead, search handle closed on server already */
        if (rc == -EAGAIN)
@@ -4331,7 +4411,7 @@ QFSDeviceRetry:
        } else {                /* decode response */
                rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-                if (rc || (pSMBr->ByteCount < sizeof (FILE_SYSTEM_DEVICE_INFO)))
+                if (rc || (pSMBr->ByteCount < sizeof(FILE_SYSTEM_DEVICE_INFO)))
                        rc = -EIO;      /* bad smb */
                else {
                        __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4681,11 +4761,9 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
                   __u16 fid, __u32 pid_of_opener, int SetAllocation)
 {
        struct smb_com_transaction2_sfi_req *pSMB  = NULL;
-        struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
        char *data_offset;
        struct file_end_of_file_info *parm_data;
        int rc = 0;
-        int bytes_returned = 0;
        __u16 params, param_offset, offset, byte_count, count;
        cFYI(1, ("SetFileSize (via SetFileInfo) %lld",
@@ -4695,8 +4773,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
        if (rc)
                return rc;
-        pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB;
        pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
        pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
@@ -4747,17 +4823,13 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
        pSMB->Reserved4 = 0;
        pSMB->hdr.smb_buf_length += byte_count;
        pSMB->ByteCount = cpu_to_le16(byte_count);
-        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+        rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
-                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        if (rc) {
                cFYI(1,
                     ("Send error in SetFileInfo (SetFileSize) = %d",
                      rc));
        }
-        if (pSMB)
-                cifs_small_buf_release(pSMB);
        /* Note: On -EAGAIN error only caller can retry on handle based calls
                since file handle passed in no longer valid */
@@ -4775,10 +4847,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
                    const FILE_BASIC_INFO *data, __u16 fid)
 {
        struct smb_com_transaction2_sfi_req *pSMB  = NULL;
-        struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
        char *data_offset;
        int rc = 0;
-        int bytes_returned = 0;
        __u16 params, param_offset, offset, byte_count, count;
        cFYI(1, ("Set Times (via SetFileInfo)"));
@@ -4787,8 +4857,6 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
        if (rc)
                return rc;
-        pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB;
        /* At this point there is no need to override the current pid
        with the pid of the opener, but that could change if we someday
        use an existing handle (rather than opening one on the fly) */
@@ -4828,14 +4896,11 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
        pSMB->hdr.smb_buf_length += byte_count;
        pSMB->ByteCount = cpu_to_le16(byte_count);
        memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
-        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+        rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
-                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        if (rc) {
                cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
        }
-        cifs_small_buf_release(pSMB);
        /* Note: On -EAGAIN error only caller can retry on handle based calls
                since file handle passed in no longer valid */
@@ -5126,7 +5191,8 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
        pSMB->ByteCount = 0;
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-                        (struct smb_hdr *) pSMBr, &bytes_returned, -1);
+                         (struct smb_hdr *)pSMBr, &bytes_returned,
+                         CIFS_ASYNC_OP);
        if (rc) {
                cFYI(1, ("Error in Notify = %d", rc));
        } else {
@@ -5498,7 +5564,7 @@ SetEARetry:
        else
                name_len = strnlen(ea_name, 255);
-        count = sizeof(*parm_data) + ea_value_len + name_len + 1;
+        count = sizeof(*parm_data) + ea_value_len + name_len;
        pSMB->MaxParameterCount = cpu_to_le16(2);
        pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB size from sess */
        pSMB->MaxSetupCount = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 19ee11f7f35a..65d0ba72e78f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/connect.c
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -160,7 +160,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
        if (server->ssocket) {
                cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state,
                        server->ssocket->flags));
-                server->ssocket->ops->shutdown(server->ssocket, SEND_SHUTDOWN);
+                kernel_sock_shutdown(server->ssocket, SHUT_WR);
                cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx",
                        server->ssocket->state,
                        server->ssocket->flags));
@@ -438,9 +438,9 @@ incomplete_rcv:
                        csocket = server->ssocket;
                        wake_up(&server->response_q);
                        continue;
-                } else if (length < 4) {
+                } else if (length < pdu_length) {
-                        cFYI(1, ("less than four bytes received (%d bytes)",
+                        cFYI(1, ("requested %d bytes but only got %d bytes",
-                              length));
+                                  pdu_length, length));
                        pdu_length -= length;
                        msleep(1);
                        goto incomplete_rcv;
@@ -752,6 +752,7 @@ multi_t2_fnd:
        }
        write_unlock(&GlobalSMBSeslock);
+        kfree(server->hostname);
        kfree(server);
        if (length  > 0)
                mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
@@ -760,6 +761,34 @@ multi_t2_fnd:
        return 0;
 }
+/*
+ * Extract the host portion of a UNC string.
+ *
+ * The first two characters of unc are skipped and everything up to the next
+ * '\\' is taken as the host name.
+ *
+ * Returns a kmalloc'ed, NUL-terminated copy of the host name, which the
+ * caller must kfree.  Returns ERR_PTR(-EINVAL) if unc is NULL, is too short
+ * to contain the two leading characters, or has no '\\' after them, and
+ * ERR_PTR(-ENOMEM) if the copy cannot be allocated.
+ */
+static char *
+extract_hostname(const char *unc)
+{
+        const char *src;
+        char *dst, *delim;
+        unsigned int len;
+        /* refuse strings that cannot hold the two leading chars, so that
+           unc + 2 never points past the terminating NUL */
+        if (unc == NULL || unc[0] == '\0' || unc[1] == '\0')
+                return ERR_PTR(-EINVAL);
+        /* skip double chars at beginning of string */
+        /* BB: check validity of these bytes? */
+        src = unc + 2;
+        /* delimiter between hostname and sharename is always '\\' now */
+        delim = strchr(src, '\\');
+        if (!delim)
+                return ERR_PTR(-EINVAL);
+        len = delim - src;
+        dst = kmalloc((len + 1), GFP_KERNEL);
+        if (dst == NULL)
+                return ERR_PTR(-ENOMEM);
+        memcpy(dst, src, len);
+        dst[len] = '\0';
+        return dst;
+}
 static int
 cifs_parse_mount_options(char *options, const char *devname,
                         struct smb_vol *vol)
@@ -793,7 +822,7 @@ cifs_parse_mount_options(char *options, const char *devname,
        vol->linux_gid = current->gid;
        vol->dir_mode = S_IRWXUGO;
        /* 2767 perms indicate mandatory locking support */
-        vol->file_mode = S_IALLUGO & ~(S_ISUID | S_IXGRP);
+        vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
        /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
        vol->rw = TRUE;
@@ -1381,7 +1410,7 @@ connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
                    const char *old_path, const struct nls_table *nls_codepage,
                    int remap)
 {
-        unsigned char *referrals = NULL;
+        struct dfs_info3_param *referrals = NULL;
        unsigned int num_referrals;
        int rc = 0;
@@ -1400,12 +1429,14 @@ connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
             const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
-             unsigned char **preferrals, int remap)
+             struct dfs_info3_param **preferrals, int remap)
 {
        char *temp_unc;
        int rc = 0;
+        unsigned char *targetUNCs;
        *pnum_referrals = 0;
+        *preferrals = NULL;
        if (pSesInfo->ipc_tid == 0) {
                temp_unc = kmalloc(2 /* for slashes */ +
@@ -1425,8 +1456,10 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
                kfree(temp_unc);
        }
        if (rc == 0)
-                rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals,
+                rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, &targetUNCs,
                                     pnum_referrals, nls_codepage, remap);
+        /* BB map targetUNCs to dfs_info3 structures, here or
+                in CIFSGetDFSRefer BB */
        return rc;
 }
@@ -1781,16 +1814,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        memset(&volume_info, 0, sizeof(struct smb_vol));
        if (cifs_parse_mount_options(mount_data, devname, &volume_info)) {
-                kfree(volume_info.UNC);
+                rc = -EINVAL;
-                kfree(volume_info.password);
+                goto out;
-                kfree(volume_info.prepath);
-                FreeXid(xid);
-                return -EINVAL;
        }
        if (volume_info.nullauth) {
                cFYI(1, ("null user"));
-                volume_info.username = NULL;
+                volume_info.username = "";
        } else if (volume_info.username) {
                /* BB fixme parse for domain name here */
                cFYI(1, ("Username: %s", volume_info.username));
@@ -1798,11 +1828,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                cifserror("No username specified");
        /* In userspace mount helper we can get user name from alternate
           locations such as env variables and files on disk */
-                kfree(volume_info.UNC);
+                rc = -EINVAL;
-                kfree(volume_info.password);
+                goto out;
-                kfree(volume_info.prepath);
-                FreeXid(xid);
-                return -EINVAL;
        }
        if (volume_info.UNCip && volume_info.UNC) {
@@ -1821,11 +1848,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                if (rc <= 0) {
                        /* we failed translating address */
-                        kfree(volume_info.UNC);
+                        rc = -EINVAL;
-                        kfree(volume_info.password);
+                        goto out;
-                        kfree(volume_info.prepath);
-                        FreeXid(xid);
-                        return -EINVAL;
                }
                cFYI(1, ("UNC: %s ip: %s", volume_info.UNC, volume_info.UNCip));
@@ -1835,20 +1859,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                /* BB using ip addr as server name to connect to the
                   DFS root below */
                cERROR(1, ("Connecting to DFS root not implemented yet"));
-                kfree(volume_info.UNC);
+                rc = -EINVAL;
-                kfree(volume_info.password);
+                goto out;
-                kfree(volume_info.prepath);
-                FreeXid(xid);
-                return -EINVAL;
        } else /* which servers DFS root would we conect to */ {
                cERROR(1,
                       ("CIFS mount error: No UNC path (e.g. -o "
                        "unc=//192.168.1.100/public) specified"));
-                kfree(volume_info.UNC);
+                rc = -EINVAL;
-                kfree(volume_info.password);
+                goto out;
-                kfree(volume_info.prepath);
-                FreeXid(xid);
-                return -EINVAL;
        }
        /* this is needed for ASCII cp to Unicode converts */
@@ -1860,11 +1878,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                if (cifs_sb->local_nls == NULL) {
                        cERROR(1, ("CIFS mount error: iocharset %s not found",
                                 volume_info.iocharset));
-                        kfree(volume_info.UNC);
+                        rc = -ELIBACC;
-                        kfree(volume_info.password);
+                        goto out;
-                        kfree(volume_info.prepath);
-                        FreeXid(xid);
-                        return -ELIBACC;
                }
        }
@@ -1878,11 +1893,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                        &sin_server6.sin6_addr,
                        volume_info.username, &srvTcp);
        } else {
-                kfree(volume_info.UNC);
+                rc = -EINVAL;
-                kfree(volume_info.password);
+                goto out;
-                kfree(volume_info.prepath);
-                FreeXid(xid);
-                return -EINVAL;
        }
        if (srvTcp) {
@@ -1906,22 +1918,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                                   "Aborting operation"));
                        if (csocket != NULL)
                                sock_release(csocket);
-                        kfree(volume_info.UNC);
+                        goto out;
-                        kfree(volume_info.password);
-                        kfree(volume_info.prepath);
-                        FreeXid(xid);
-                        return rc;
                }
                srvTcp = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL);
                if (!srvTcp) {
                        rc = -ENOMEM;
                        sock_release(csocket);
-                        kfree(volume_info.UNC);
+                        goto out;
-                        kfree(volume_info.password);
-                        kfree(volume_info.prepath);
-                        FreeXid(xid);
-                        return rc;
                } else {
                        memcpy(&srvTcp->addr.sockAddr, &sin_server,
                                sizeof(struct sockaddr_in));
@@ -1929,6 +1933,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                        /* BB Add code for ipv6 case too */
                        srvTcp->ssocket = csocket;
                        srvTcp->protocolType = IPV4;
+                        srvTcp->hostname = extract_hostname(volume_info.UNC);
+                        if (IS_ERR(srvTcp->hostname)) {
+                                rc = PTR_ERR(srvTcp->hostname);
+                                sock_release(csocket);
+                                goto out;
+                        }
                        init_waitqueue_head(&srvTcp->response_q);
                        init_waitqueue_head(&srvTcp->request_q);
                        INIT_LIST_HEAD(&srvTcp->pending_mid_q);
@@ -1938,16 +1948,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                        srvTcp->tcpStatus = CifsNew;
                        init_MUTEX(&srvTcp->tcpSem);
                        srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
-                        if ( IS_ERR(srvTcp->tsk) ) {
+                        if (IS_ERR(srvTcp->tsk)) {
                                rc = PTR_ERR(srvTcp->tsk);
                                cERROR(1, ("error %d create cifsd thread", rc));
                                srvTcp->tsk = NULL;
                                sock_release(csocket);
-                                kfree(volume_info.UNC);
+                                kfree(srvTcp->hostname);
-                                kfree(volume_info.password);
+                                goto out;
-                                kfree(volume_info.prepath);
-                                FreeXid(xid);
-                                return rc;
                        }
                        wait_for_completion(&cifsd_complete);
                        rc = 0;
@@ -1961,9 +1968,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        if (existingCifsSes) {
                pSesInfo = existingCifsSes;
-                cFYI(1, ("Existing smb sess found"));
+                cFYI(1, ("Existing smb sess found (status=%d)",
-                kfree(volume_info.password);
+                        pSesInfo->status));
-                /* volume_info.UNC freed at end of function */
+                down(&pSesInfo->sesSem);
+                if (pSesInfo->status == CifsNeedReconnect) {
+                        cFYI(1, ("Session needs reconnect"));
+                        rc = cifs_setup_session(xid, pSesInfo,
+                                                cifs_sb->local_nls);
+                }
+                up(&pSesInfo->sesSem);
        } else if (!rc) {
                cFYI(1, ("Existing smb sess not found"));
                pSesInfo = sesInfoAlloc();
@@ -1977,8 +1990,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                if (!rc) {
                        /* volume_info.password freed at unmount */
-                        if (volume_info.password)
+                        if (volume_info.password) {
                                pSesInfo->password = volume_info.password;
+                                /* set to NULL to prevent freeing on exit */
+                                volume_info.password = NULL;
+                        }
                        if (volume_info.username)
                                strncpy(pSesInfo->userName,
                                        volume_info.username,
@@ -2000,8 +2016,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                        up(&pSesInfo->sesSem);
                        if (!rc)
                                atomic_inc(&srvTcp->socketUseCount);
-                } else
+                }
-                        kfree(volume_info.password);
        }
        /* search for existing tcon to this server share */
@@ -2106,9 +2121,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                                                "", cifs_sb->local_nls,
                                                cifs_sb->mnt_cifs_flags &
                                                  CIFS_MOUNT_MAP_SPECIAL_CHR);
-                                        kfree(volume_info.UNC);
+                                        rc = -ENODEV;
-                                        FreeXid(xid);
+                                        goto out;
-                                        return -ENODEV;
                                } else {
                                        /* BB Do we need to wrap sesSem around
                                         * this TCon call and Unix SetFS as
@@ -2231,6 +2245,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        (in which case it is not needed anymore) but when new sesion is created
        the password ptr is put in the new session structure (in which case the
        password will be freed at unmount time) */
+out:
+        /* zero out password before freeing */
+        if (volume_info.password != NULL) {
+                memset(volume_info.password, 0, strlen(volume_info.password));
+                kfree(volume_info.password);
+        }
        kfree(volume_info.UNC);
        kfree(volume_info.prepath);
        FreeXid(xid);
@@ -2374,7 +2394,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
        pSMB->req_no_secext.ByteCount = cpu_to_le16(count);
        rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-                         &bytes_returned, 1);
+                         &bytes_returned, CIFS_LONG_OP);
        if (rc) {
 /* rc = map_smb_to_linux_error(smb_buffer_response); now done in SendReceive */
        } else if ((smb_buffer_response->WordCount == 3)
@@ -2678,7 +2698,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
        pSMB->req.ByteCount = cpu_to_le16(count);
        rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-                         &bytes_returned, 1);
+                         &bytes_returned, CIFS_LONG_OP);
        if (smb_buffer_response->Status.CifsError ==
            cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
@@ -3105,7 +3125,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
        pSMB->req.ByteCount = cpu_to_le16(count);
        rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-                         &bytes_returned, 1);
+                         &bytes_returned, CIFS_LONG_OP);
        if (rc) {
 /*   rc = map_smb_to_linux_error(smb_buffer_response) done in SendReceive now */
        } else if ((smb_buffer_response->WordCount == 3) ||
@@ -3381,7 +3401,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
        pSMB->hdr.smb_buf_length += count;
        pSMB->ByteCount = cpu_to_le16(count);
-        rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 0);
+        rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
+                         CIFS_STD_OP);
        /* if (rc) rc = map_smb_to_linux_error(smb_buffer_response); */
        /* above now done in SendReceive */
@@ -3505,7 +3526,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
                sesInfoFree(ses);
        FreeXid(xid);
-        return rc;      /* BB check if we should always return zero here */
+        return rc;
 }
 int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 793404b10925..699ec1198409 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -517,12 +517,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
                d_add(direntry, NULL);
        /*      if it was once a directory (but how can we tell?) we could do
                shrink_dcache_parent(direntry); */
-        } else {
+        } else if (rc != -EACCES) {
-                cERROR(1, ("Error 0x%x on cifs_get_inode_info in lookup of %s",
+                cERROR(1, ("Unexpected lookup error %d", rc));
-                           rc, full_path));
+                /* We special case check for Access Denied - since that
-                /* BB special case check for Access Denied - watch security
+                is a common return code */
-                exposure of returning dir info implicitly via different rc
-                if file exists or not but no access BB */
        }
        kfree(full_path);
@@ -593,7 +591,7 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
                 * case take precedence.  If a is not a negative dentry, this
                 * should have no side effects
                 */
-                memcpy((unsigned char *)a->name, b->name, a->len);
+                memcpy(a->name, b->name, a->len);
                return 0;
        }
        return 1;
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
new file mode 100644
index 000000000000..ef7f43824347
--- /dev/null
+++ b/fs/cifs/dns_resolve.c
@@ -0,0 +1,124 @@
+/*
+ *  fs/cifs/dns_resolve.c
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *
+ *   Contains the CIFS DFS upcall routines used for hostname to
+ *   IP address translation.
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <keys/user-type.h>
+#include "dns_resolve.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+/*
+ * Instantiate a dns_resolver key: attach a NUL-terminated copy of the
+ * datalen bytes at data to the key as its payload.
+ *
+ * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
+ */
+static int dns_resolver_instantiate(struct key *key, const void *data,
+                size_t datalen)
+{
+        /* one extra byte so the payload can be used as a C string */
+        char *addr = kmalloc(datalen + 1, GFP_KERNEL);
+        if (addr == NULL)
+                return -ENOMEM;
+        memcpy(addr, data, datalen);
+        addr[datalen] = '\0';
+        rcu_assign_pointer(key->payload.data, addr);
+        return 0;
+}
+/*
+ * Release the address string that dns_resolver_instantiate() attached to
+ * the key.  Without a destroy hook that kmalloc'ed payload would never be
+ * freed when the key goes away.
+ */
+static void dns_resolver_destroy(struct key *key)
+{
+        kfree(key->payload.data);
+}
+/*
+ * Key type behind the hostname to IP address upcall.  The payload is the
+ * NUL-terminated string stored by dns_resolver_instantiate() and freed by
+ * dns_resolver_destroy().
+ */
+struct key_type key_type_dns_resolver = {
+        .name        = "dns_resolver",
+        .def_datalen = sizeof(struct in_addr),
+        .describe    = user_describe,
+        .instantiate = dns_resolver_instantiate,
+        .destroy     = dns_resolver_destroy,
+        .match       = user_match,
+};
+/*
+ * Resolve the server named in a UNC string to an IP address string by
+ * requesting a key of type dns_resolver.
+ *
+ * unc:     UNC string; the server name starts after its two leading
+ *          characters and ends at the next '\\', or at the end of the
+ *          string when there is none.
+ * ip_addr: on success receives a kmalloc'ed, NUL-terminated copy of the
+ *          address; the caller is responsible for freeing it.
+ *
+ * Returns 0 on success, -EINVAL for a NULL argument or a unc shorter than
+ * three characters, -ENOMEM if an allocation fails, and -EAGAIN if the key
+ * request fails.
+ */
+int
+dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
+{
+        struct key *rkey;
+        const char *host, *sep;
+        char *name;
+        int len;
+        int rc;
+        if (!unc || !ip_addr)
+                return -EINVAL;
+        len = strlen(unc);
+        if (len < 3) {
+                cFYI(1, ("%s: unc is too short: %s", __FUNCTION__, unc));
+                return -EINVAL;
+        }
+        /* the server name starts right after the two leading characters */
+        host = unc + 2;
+        len -= 2;
+        /* ... and stops at the first '\\', when there is one */
+        sep = memchr(host, '\\', len);
+        if (sep) {
+                len = sep - host;
+        } else {
+                cFYI(1, ("%s: probably server name is whole unc: %s",
+                                        __FUNCTION__, unc));
+        }
+        /* private NUL-terminated copy of the name to use as description */
+        name = kmalloc(len + 1, GFP_KERNEL);
+        if (name == NULL)
+                return -ENOMEM;
+        memcpy(name, host, len);
+        name[len] = '\0';
+        rc = -EAGAIN;
+        rkey = request_key(&key_type_dns_resolver, name, "");
+        if (IS_ERR(rkey)) {
+                cERROR(1, ("%s: unable to resolve: %s", __FUNCTION__, name));
+                goto out_free_name;
+        }
+        /* hand the caller its own copy of the key's address string */
+        len = strlen(rkey->payload.data);
+        *ip_addr = kmalloc(len + 1, GFP_KERNEL);
+        if (*ip_addr == NULL) {
+                rc = -ENOMEM;
+        } else {
+                memcpy(*ip_addr, rkey->payload.data, len);
+                (*ip_addr)[len] = '\0';
+                cFYI(1, ("%s: resolved: %s to %s", __FUNCTION__,
+                                        rkey->description,
+                                        *ip_addr));
+                rc = 0;
+        }
+        key_put(rkey);
+out_free_name:
+        kfree(name);
+        return rc;
+}
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
new file mode 100644
index 000000000000..073fdc3db419
--- /dev/null
+++ b/fs/cifs/dns_resolve.h
@@ -0,0 +1,32 @@
+/*
+ *   fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS
+ *                            Handles host name to IP address resolution
+ * 
+ *   Copyright (c) International Business Machines  Corp., 2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _DNS_RESOLVE_H
+#define _DNS_RESOLVE_H
+#ifdef __KERNEL__
+#include <linux/key-type.h>
+extern struct key_type key_type_dns_resolver; /* passed to request_key() */
+extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
+#endif /* __KERNEL__ */
+#endif /* _DNS_RESOLVE_H */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1e7e4c06d9e3..5f7c374ae89c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -130,7 +130,9 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
                if (file->f_path.dentry->d_inode->i_mapping) {
                /* BB no need to lock inode until after invalidate
                   since namei code should already have it locked? */
-                        filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+                        rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+                        if (rc != 0)
+                                CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
                }
                cFYI(1, ("invalidating remote inode since open detected it "
                         "changed"));
@@ -425,7 +427,9 @@ reopen_error_exit:
                pCifsInode = CIFS_I(inode);
                if (pCifsInode) {
                        if (can_flush) {
-                                filemap_write_and_wait(inode->i_mapping);
+                                rc = filemap_write_and_wait(inode->i_mapping);
+                                if (rc != 0)
+                                        CIFS_I(inode)->write_behind_rc = rc;
                        /* temporarily disable caching while we
                           go to server to get inode info */
                                pCifsInode->clientCanCacheAll = FALSE;
@@ -835,9 +839,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        xid = GetXid();
        if (*poffset > file->f_path.dentry->d_inode->i_size)
-                long_op = 2; /* writes past end of file can take a long time */
+                long_op = CIFS_VLONG_OP; /* writes past EOF take long time */
        else
-                long_op = 1;
+                long_op = CIFS_LONG_OP;
        for (total_written = 0; write_size > total_written;
             total_written += bytes_written) {
@@ -884,7 +888,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
                        }
                } else
                        *poffset += bytes_written;
-                long_op = FALSE; /* subsequent writes fast -
+                long_op = CIFS_STD_OP; /* subsequent writes fast -
                                    15 seconds is plenty */
        }
@@ -934,9 +938,9 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
        xid = GetXid();
        if (*poffset > file->f_path.dentry->d_inode->i_size)
-                long_op = 2; /* writes past end of file can take a long time */
+                long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */
        else
-                long_op = 1;
+                long_op = CIFS_LONG_OP;
        for (total_written = 0; write_size > total_written;
             total_written += bytes_written) {
@@ -1002,7 +1006,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
                        }
                } else
                        *poffset += bytes_written;
-                long_op = FALSE; /* subsequent writes fast -
+                long_op = CIFS_STD_OP; /* subsequent writes fast -
                                    15 seconds is plenty */
        }
@@ -1026,6 +1030,37 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
        return total_written;
 }
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+/*
+ * Find an open handle on cifs_inode that can be used for reading.
+ *
+ * Walks cifs_inode->openFileList under GlobalSMBSeslock (taken for read).
+ * Entries with a close pending are skipped. The first entry opened O_RDONLY
+ * or O_RDWR whose handle is not marked invalid is returned with its
+ * wrtPending count incremented so that it is not closed underneath the
+ * caller (presumably the caller drops that count when done - confirm
+ * against the callers).
+ *
+ * Returns the open file, or NULL if no usable readable handle was found.
+ */
+struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
+{
+        struct cifsFileInfo *open_file;
+        unsigned int accmode;
+        read_lock(&GlobalSMBSeslock);
+        /* we could simply get the first_list_entry since write-only entries
+           are always at the end of the list but since the first entry might
+           have a close pending, we go through the whole list */
+        list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+                if (open_file->closePend)
+                        continue;
+                /* an entry without a struct file ends the scan, exactly as
+                   it did before */
+                if (!open_file->pfile)
+                        break;
+                /* O_RDONLY is 0, so it can not be tested with a bitwise and;
+                   compare the access mode field as a whole instead */
+                accmode = open_file->pfile->f_flags & O_ACCMODE;
+                if ((accmode != O_RDWR) && (accmode != O_RDONLY))
+                        break; /* write only files are last so must be done */
+                if (!open_file->invalidHandle) {
+                        /* found a good file */
+                        /* lock it so it will not be closed on us */
+                        atomic_inc(&open_file->wrtPending);
+                        read_unlock(&GlobalSMBSeslock);
+                        return open_file;
+                } /* else might as well continue, and look for
+                     another, or simply have the caller reopen it
+                     again rather than trying to fix this handle */
+        }
+        read_unlock(&GlobalSMBSeslock);
+        return NULL;
+}
+#endif
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 {
        struct cifsFileInfo *open_file;
@@ -1056,11 +1091,11 @@ refind_writable:
                                read_unlock(&GlobalSMBSeslock);
                                return open_file;
                        }
-        
                        read_unlock(&GlobalSMBSeslock);
                        /* Had to unlock since following call can block */
                        rc = cifs_reopen_file(open_file->pfile, FALSE);
-                        if (!rc) { 
+                        if (!rc) {
                                if (!open_file->closePend)
                                        return open_file;
                                else { /* start over in case this was deleted */
@@ -1083,7 +1118,7 @@ refind_writable:
                        /* can not use this handle, no write
                           pending on this one after all */
                        atomic_dec(&open_file->wrtPending);
-                        
                        if (open_file->closePend) /* list could have changed */
                                goto refind_writable;
                        /* else we simply continue to the next entry. Thus
@@ -1144,12 +1179,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
                atomic_dec(&open_file->wrtPending);
                /* Does mm or vfs already set times? */
                inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
-                if ((bytes_written > 0) && (offset)) {
+                if ((bytes_written > 0) && (offset))
                        rc = 0;
-                } else if (bytes_written < 0) {
+                else if (bytes_written < 0)
-                        if (rc != -EBADF)
+                        rc = bytes_written;
-                                rc = bytes_written;
-                }
        } else {
                cFYI(1, ("No writeable filehandles for inode"));
                rc = -EIO;
@@ -1329,14 +1362,17 @@ retry:
                                                   open_file->netfid,
                                                   bytes_to_write, offset,
                                                   &bytes_written, iov, n_iov,
-                                                   1);
+                                                   CIFS_LONG_OP);
                                atomic_dec(&open_file->wrtPending);
                                if (rc || bytes_written < bytes_to_write) {
                                        cERROR(1, ("Write2 ret %d, wrote %d",
                                                  rc, bytes_written));
                                        /* BB what if continued retry is
                                           requested via mount flags? */
-                                        set_bit(AS_EIO, &mapping->flags);
+                                        if (rc == -ENOSPC)
+                                                set_bit(AS_ENOSPC, &mapping->flags);
+                                        else
+                                                set_bit(AS_EIO, &mapping->flags);
                                } else {
                                        cifs_stats_bytes_written(cifs_sb->tcon,
                                                                 bytes_written);
@@ -1468,9 +1504,11 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
        cFYI(1, ("Sync file - name: %s datasync: 0x%x",
                dentry->d_name.name, datasync));
-        rc = filemap_fdatawrite(inode->i_mapping);
+        rc = filemap_write_and_wait(inode->i_mapping);
-        if (rc == 0)
+        if (rc == 0) {
+                rc = CIFS_I(inode)->write_behind_rc;
                CIFS_I(inode)->write_behind_rc = 0;
+        }
        FreeXid(xid);
        return rc;
 }
@@ -1522,8 +1560,11 @@ int cifs_flush(struct file *file, fl_owner_t id)
           filemapfdatawrite appears easier for the time being */
        rc = filemap_fdatawrite(inode->i_mapping);
-        if (!rc) /* reset wb rc if we were able to write out dirty pages */
+        /* reset wb rc if we were able to write out dirty pages */
+        if (!rc) {
+                rc = CIFS_I(inode)->write_behind_rc;
                CIFS_I(inode)->write_behind_rc = 0;
+        }
        cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc));
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5e8b388be3b6..d9567ba2960b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -54,9 +54,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                                            MAX_TREE_SIZE + 1) +
                                    strnlen(search_path, MAX_PATHCONF) + 1,
                                    GFP_KERNEL);
-                        if (tmp_path == NULL) {
+                        if (tmp_path == NULL)
                                return -ENOMEM;
-                        }
                        /* have to skip first of the double backslash of
                           UNC name */
                        strncpy(tmp_path, pTcon->treeName, MAX_TREE_SIZE);
@@ -289,7 +289,7 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 #define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID)  /* SETFILEBITS valid bits */
-static int get_sfu_uid_mode(struct inode *inode,
+static int get_sfu_mode(struct inode *inode,
                        const unsigned char *path,
                        struct cifs_sb_info *cifs_sb, int xid)
 {
@@ -511,7 +511,8 @@ int cifs_get_inode_info(struct inode **pinode,
                }
                spin_lock(&inode->i_lock);
-                if (is_size_safe_to_change(cifsInfo, le64_to_cpu(pfindData->EndOfFile))) {
+                if (is_size_safe_to_change(cifsInfo,
+                                           le64_to_cpu(pfindData->EndOfFile))) {
                        /* can not safely shrink the file size here if the
                           client is writing to it due to potential races */
                        i_size_write(inode, le64_to_cpu(pfindData->EndOfFile));
@@ -527,11 +528,16 @@ int cifs_get_inode_info(struct inode **pinode,
                /* BB fill in uid and gid here? with help from winbind?
                   or retrieve from NTFS stream extended attribute */
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+                /* fill in 0777 bits from ACL */
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+                        cFYI(1, ("Getting mode bits from ACL"));
+                        acl_to_uid_mode(inode, search_path);
+                }
+#endif
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-                        /* fill in uid, gid, mode from server ACL */
+                        /* fill in remaining high mode bits e.g. SUID, VTX */
-                        /* BB FIXME this should also take into account the
+                        get_sfu_mode(inode, search_path, cifs_sb, xid);
-                         * default uid specified on mount if present */
-                        get_sfu_uid_mode(inode, search_path, cifs_sb, xid);
                } else if (atomic_read(&cifsInfo->inUse) == 0) {
                        inode->i_uid = cifs_sb->mnt_uid;
                        inode->i_gid = cifs_sb->mnt_gid;
@@ -926,7 +932,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
                (CIFS_UNIX_POSIX_PATH_OPS_CAP &
                        le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
                u32 oplock = 0;
-                FILE_UNIX_BASIC_INFO * pInfo =
+                FILE_UNIX_BASIC_INFO *pInfo =
                        kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
                if (pInfo == NULL) {
                        rc = -ENOMEM;
@@ -1228,7 +1234,7 @@ cifs_rename_exit:
 int cifs_revalidate(struct dentry *direntry)
 {
        int xid;
-        int rc = 0;
+        int rc = 0, wbrc = 0;
        char *full_path;
        struct cifs_sb_info *cifs_sb;
        struct cifsInodeInfo *cifsInode;
@@ -1328,7 +1334,9 @@ int cifs_revalidate(struct dentry *direntry)
        if (direntry->d_inode->i_mapping) {
                /* do we need to lock inode until after invalidate completes
                   below? */
-                filemap_fdatawrite(direntry->d_inode->i_mapping);
+                wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
+                if (wbrc)
+                        CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
        }
        if (invalidate_inode) {
        /* shrink_dcache not necessary now that cifs dentry ops
@@ -1337,7 +1345,9 @@ int cifs_revalidate(struct dentry *direntry)
                        shrink_dcache_parent(direntry); */
                if (S_ISREG(direntry->d_inode->i_mode)) {
                        if (direntry->d_inode->i_mapping)
-                                filemap_fdatawait(direntry->d_inode->i_mapping);
+                                wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
+                                if (wbrc)
+                                        CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
                        /* may eventually have to do this for open files too */
                        if (list_empty(&(cifsInode->openFileList))) {
                                /* changed on server - flush read ahead pages */
@@ -1480,10 +1490,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
        /* BB check if we need to refresh inode from server now ? BB */
-        /* need to flush data before changing file size on server */
-        filemap_write_and_wait(direntry->d_inode->i_mapping);
        if (attrs->ia_valid & ATTR_SIZE) {
+                /*
+                   Flush data before changing file size on server. If the
+                   flush returns error, store it to report later and continue.
+                   BB: This should be smarter. Why bother flushing pages that
+                   will be truncated anyway? Also, should we error out here if
+                   the flush returns error?
+                 */
+                rc = filemap_write_and_wait(direntry->d_inode->i_mapping);
+                if (rc != 0) {
+                        CIFS_I(direntry->d_inode)->write_behind_rc = rc;
+                        rc = 0;
+                }
                /* To avoid spurious oplock breaks from server, in the case of
                   inodes that we already have open, avoid doing path based
                   setting of file size if we can do it by handle.
@@ -1588,7 +1608,14 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        else if (attrs->ia_valid & ATTR_MODE) {
                rc = 0;
-                if ((mode & S_IWUGO) == 0) /* not writeable */ {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+                        rc = mode_to_acl(direntry->d_inode, full_path, mode);
+                else if ((mode & S_IWUGO) == 0) {
+#else
+                if ((mode & S_IWUGO) == 0) {
+#endif
+                        /* not writeable */
                        if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
                                set_dosattr = TRUE;
                                time_buf.Attributes =
@@ -1607,10 +1634,10 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
                        if (time_buf.Attributes == 0)
                                time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
                }
-                /* BB to be implemented -
+#ifdef CONFIG_CIFS_EXPERIMENTAL
-                   via Windows security descriptors or streams */
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
-                /* CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, uid, gid,
+                        mode_to_acl(direntry->d_inode, full_path, mode);
-                                      cifs_sb->local_nls); */
+#endif
        }
        if (attrs->ia_valid & ATTR_ATIME) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 11f265726db7..1d6fb01b8e6d 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/link.c
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2003
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -236,8 +236,6 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
        char *full_path = NULL;
        char *tmp_path = NULL;
        char *tmpbuffer;
-        unsigned char *referrals = NULL;
-        unsigned int num_referrals = 0;
        int len;
        __u16 fid;
@@ -297,8 +295,11 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
                                cFYI(1, ("Error closing junction point "
                                         "(open for ioctl)"));
                        }
+                        /* BB unwind this long, nested function, or remove BB */
                        if (rc == -EIO) {
                                /* Query if DFS Junction */
+                                unsigned int num_referrals = 0;
+                                struct dfs_info3_param *refs = NULL;
                                tmp_path =
                                        kmalloc(MAX_TREE_SIZE + MAX_PATHCONF + 1,
                                                GFP_KERNEL);
@@ -310,7 +311,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
                                        rc = get_dfs_path(xid, pTcon->ses,
                                                tmp_path,
                                                cifs_sb->local_nls,
-                                                &num_referrals, &referrals,
+                                                &num_referrals, &refs,
                                                cifs_sb->mnt_cifs_flags &
                                                    CIFS_MOUNT_MAP_SPECIAL_CHR);
                                        cFYI(1, ("Get DFS for %s rc = %d ",
@@ -320,14 +321,13 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
                                        else {
                                                cFYI(1, ("num referral: %d",
                                                        num_referrals));
-                                                if (referrals) {
+                                                if (refs && refs->path_name) {
-                                                        cFYI(1,("referral string: %s", referrals));
                                                        strncpy(tmpbuffer,
-                                                                referrals,
+                                                                refs->path_name,
                                                                len-1);
                                                }
                                        }
-                                        kfree(referrals);
+                                        kfree(refs);
                                        kfree(tmp_path);
 }
                                /* BB add code like else decode referrals
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index e5c3e1212697..f13f96d42fcf 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -276,8 +276,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
        }
        /* start out by storing key in pads */
-        memset(ctx->k_ipad, 0, sizeof (ctx->k_ipad));
+        memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-        memset(ctx->k_opad, 0, sizeof (ctx->k_opad));
+        memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
        memcpy(ctx->k_ipad, key, key_len);
        memcpy(ctx->k_opad, key, key_len);
@@ -307,8 +307,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
        }
        /* start out by storing key in pads */
-        memset(ctx->k_ipad, 0, sizeof (ctx->k_ipad));
+        memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-        memset(ctx->k_opad, 0, sizeof (ctx->k_opad));
+        memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
        memcpy(ctx->k_ipad, key, key_len);
        memcpy(ctx->k_opad, key, key_len);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 51ec681fe74a..15546c2354c5 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -73,7 +73,7 @@ sesInfoAlloc(void)
 {
        struct cifsSesInfo *ret_buf;
-        ret_buf = kzalloc(sizeof (struct cifsSesInfo), GFP_KERNEL);
+        ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
        if (ret_buf) {
                write_lock(&GlobalSMBSeslock);
                atomic_inc(&sesInfoAllocCount);
@@ -109,7 +109,7 @@ struct cifsTconInfo *
 tconInfoAlloc(void)
 {
        struct cifsTconInfo *ret_buf;
-        ret_buf = kzalloc(sizeof (struct cifsTconInfo), GFP_KERNEL);
+        ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
        if (ret_buf) {
                write_lock(&GlobalSMBSeslock);
                atomic_inc(&tconInfoAllocCount);
@@ -298,7 +298,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
        memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
        buffer->smb_buf_length =
-            (2 * word_count) + sizeof (struct smb_hdr) -
+            (2 * word_count) + sizeof(struct smb_hdr) -
            4 /*  RFC 1001 length field does not count */  +
            2 /* for bcc field itself */ ;
        /* Note that this is the only network field that has to be converted
@@ -422,8 +422,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
        __u32 clc_len;  /* calculated length */
        cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
-        if (length < 2 + sizeof (struct smb_hdr)) {
+        if (length < 2 + sizeof(struct smb_hdr)) {
-                if ((length >= sizeof (struct smb_hdr) - 1)
+                if ((length >= sizeof(struct smb_hdr) - 1)
                            && (smb->Status.CifsError != 0)) {
                        smb->WordCount = 0;
                        /* some error cases do not return wct and bcc */
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index f06359cb22ee..646e1f06941b 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -132,6 +132,34 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
        {0, 0}
 };
+/*
+ * If the mount helper is missing, the first '/' of the UNC has to be turned
+ * into a backslash so that the UNC is formatted properly for ip address
+ * parsing and for tree connect (unless the user remembered to put the UNC
+ * name in properly). This does not have to be called twice: IPv4 addresses
+ * are checked first, so the string is already converted by the time IPv6
+ * addresses are tried.
+ *
+ * At most the first 47 characters of cp are examined. The scan gives up,
+ * leaving cp untouched, at a NUL or at a backslash; the first '/' seen
+ * before that is overwritten in place with a backslash.
+ *
+ * Returns 1 if a character was replaced, 0 otherwise.
+ */
+static int canonicalize_unc(char *cp)
+{
+        int pos;
+        for (pos = 0; pos <= 46 /* INET6_ADDRSTRLEN */ ; pos++) {
+                switch (cp[pos]) {
+                case 0:
+                case '\\':
+                        /* end of string or already well formed */
+                        return 0;
+                case '/':
+#ifdef CONFIG_CIFS_DEBUG2
+                        cFYI(1, ("change slash to backslash in malformed UNC"));
+#endif
+                        cp[pos] = '\\';
+                        return 1;
+                default:
+                        break;
+                }
+        }
+        return 0;
+}
 /* Convert string containing dotted ip address to binary form */
 /* returns 0 if invalid address */
@@ -141,11 +169,13 @@ cifs_inet_pton(int address_family, char *cp, void *dst)
        int ret = 0;
        /* calculate length by finding first slash or NULL */
-        /* BB Should we convert '/' slash to '\' here since it seems already
+        if (address_family == AF_INET) {
-         * done before this */
+                ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL);
-        if ( address_family == AF_INET ) {
+                if (ret == 0) {
-                ret = in4_pton(cp, -1 /* len */, dst , '\\', NULL);
+                        if (canonicalize_unc(cp))
-        } else if ( address_family == AF_INET6 ) {
+                                ret = in4_pton(cp, -1, dst, '\\', NULL);
+                }
+        } else if (address_family == AF_INET6) {
                ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
        }
 #ifdef CONFIG_CIFS_DEBUG2
@@ -740,7 +770,7 @@ cifs_print_status(__u32 status_code)
 static void
-ntstatus_to_dos(__u32 ntstatus, __u8 * eclass, __u16 * ecode)
+ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode)
 {
        int i;
        if (ntstatus == 0) {
@@ -793,8 +823,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
        if (smberrclass == ERRDOS) {  /* 1 byte field no need to byte reverse */
                for (i = 0;
                     i <
-                     sizeof (mapping_table_ERRDOS) /
+                     sizeof(mapping_table_ERRDOS) /
-                     sizeof (struct smb_to_posix_error); i++) {
+                     sizeof(struct smb_to_posix_error); i++) {
                        if (mapping_table_ERRDOS[i].smb_err == 0)
                                break;
                        else if (mapping_table_ERRDOS[i].smb_err ==
@@ -807,8 +837,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
        } else if (smberrclass == ERRSRV) {   /* server class of error codes */
                for (i = 0;
                     i <
-                     sizeof (mapping_table_ERRSRV) /
+                     sizeof(mapping_table_ERRSRV) /
-                     sizeof (struct smb_to_posix_error); i++) {
+                     sizeof(struct smb_to_posix_error); i++) {
                        if (mapping_table_ERRSRV[i].smb_err == 0)
                                break;
                        else if (mapping_table_ERRSRV[i].smb_err ==
@@ -837,14 +867,14 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 unsigned int
 smbCalcSize(struct smb_hdr *ptr)
 {
-        return (sizeof (struct smb_hdr) + (2 * ptr->WordCount) +
+        return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
                2 /* size of the bcc field */ + BCC(ptr));
 }
 unsigned int
 smbCalcSize_LE(struct smb_hdr *ptr)
 {
-        return (sizeof (struct smb_hdr) + (2 * ptr->WordCount) +
+        return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
                2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr)));
 }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 3746580e9701..0f22def4bdff 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -171,7 +171,13 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
        /* Linux can not store file creation time unfortunately so ignore it */
        cifsInfo->cifsAttrs = attr;
-        cifsInfo->time = jiffies;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+                /* get more accurate mode via ACL - so force inode refresh */
+                cifsInfo->time = 0;
+        } else
+#endif /* CONFIG_CIFS_EXPERIMENTAL */
+                cifsInfo->time = jiffies;
        /* treat dos attribute of read-only as read-only mode bit e.g. 555? */
        /* 2767 perms - indicate mandatory locking */
@@ -495,7 +501,7 @@ ffirst_retry:
 static int cifs_unicode_bytelen(char *str)
 {
        int len;
-        __le16 * ustr = (__le16 *)str;
+        __le16 *ustr = (__le16 *)str;
        for (len = 0; len <= PATH_MAX; len++) {
                if (ustr[len] == 0)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 899dc6078d9a..d2153abcba6d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include <linux/utsname.h>
+#include "cifs_spnego.h"
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
                         unsigned char *p24);
@@ -340,11 +341,12 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
        SESSION_SETUP_ANDX *pSMB;
        __u32 capabilities;
        int count;
-        int resp_buf_type = 0;
+        int resp_buf_type;
-        struct kvec iov[2];
+        struct kvec iov[3];
        enum securityEnum type;
        __u16 action;
        int bytes_remaining;
+        struct key *spnego_key = NULL;
        if (ses == NULL)
                return -EINVAL;
@@ -377,24 +379,32 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
        capabilities = cifs_ssetup_hdr(ses, pSMB);
-        /* we will send the SMB in two pieces,
+        /* we will send the SMB in three pieces:
-        a fixed length beginning part, and a
+        a fixed length beginning part, an optional
-        second part which will include the strings
+        SPNEGO blob (which can be zero length), and a
-        and rest of bcc area, in order to avoid having
+        last part which will include the strings
-        to do a large buffer 17K allocation */
+        and rest of bcc area. This allows us to avoid
+        a large buffer 17K allocation */
        iov[0].iov_base = (char *)pSMB;
        iov[0].iov_len = smb_buf->smb_buf_length + 4;
+        /* setting this here allows the code at the end of the function
+           to free the request buffer if there's an error */
+        resp_buf_type = CIFS_SMALL_BUFFER;
        /* 2000 big enough to fit max user, domain, NOS name etc. */
        str_area = kmalloc(2000, GFP_KERNEL);
        if (str_area == NULL) {
-                cifs_small_buf_release(smb_buf);
+                rc = -ENOMEM;
-                return -ENOMEM;
+                goto ssetup_exit;
        }
        bcc_ptr = str_area;
        ses->flags &= ~CIFS_SES_LANMAN;
+        iov[1].iov_base = NULL;
+        iov[1].iov_len = 0;
        if (type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
                char lnm_session_key[CIFS_SESS_KEY_SIZE];
@@ -463,8 +473,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
                   struct ntlmv2_resp */
                if (v2_sess_key == NULL) {
-                        cifs_small_buf_release(smb_buf);
+                        rc = -ENOMEM;
-                        return -ENOMEM;
+                        goto ssetup_exit;
                }
                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
@@ -499,22 +509,69 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
                        unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
                } else
                        ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-        } else /* NTLMSSP or SPNEGO */ {
+        } else if (type == Kerberos) {
+#ifdef CONFIG_CIFS_UPCALL
+                struct cifs_spnego_msg *msg;
+                spnego_key = cifs_get_spnego_key(ses);
+                if (IS_ERR(spnego_key)) {
+                        rc = PTR_ERR(spnego_key);
+                        spnego_key = NULL;
+                        goto ssetup_exit;
+                }
+                msg = spnego_key->payload.data;
+                /* bail out if key is too long */
+                if (msg->sesskey_len >
+                    sizeof(ses->server->mac_signing_key.data.krb5)) {
+                        cERROR(1, ("Kerberos signing key too long (%u bytes)",
+                                msg->sesskey_len));
+                        rc = -EOVERFLOW;
+                        goto ssetup_exit;
+                }
+                if (first_time) {
+                        ses->server->mac_signing_key.len = msg->sesskey_len;
+                        memcpy(ses->server->mac_signing_key.data.krb5,
+                                msg->data, msg->sesskey_len);
+                }
                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
                capabilities |= CAP_EXTENDED_SECURITY;
                pSMB->req.Capabilities = cpu_to_le32(capabilities);
-                /* BB set password lengths */
+                iov[1].iov_base = msg->data + msg->sesskey_len;
+                iov[1].iov_len = msg->secblob_len;
+                pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len);
+                if (ses->capabilities & CAP_UNICODE) {
+                        /* unicode strings must be word aligned */
+                        if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+                                *bcc_ptr = 0;
+                                bcc_ptr++;
+                        }
+                        unicode_oslm_strings(&bcc_ptr, nls_cp);
+                        unicode_domain_string(&bcc_ptr, ses, nls_cp);
+                } else
+                /* BB: is this right? */
+                        ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+#else /* ! CONFIG_CIFS_UPCALL */
+                cERROR(1, ("Kerberos negotiated but upcall support disabled!"));
+                rc = -ENOSYS;
+                goto ssetup_exit;
+#endif /* CONFIG_CIFS_UPCALL */
+        } else {
+                cERROR(1, ("secType %d not supported!", type));
+                rc = -ENOSYS;
+                goto ssetup_exit;
        }
-        count = (long) bcc_ptr - (long) str_area;
+        iov[2].iov_base = str_area;
+        iov[2].iov_len = (long) bcc_ptr - (long) str_area;
+        count = iov[1].iov_len + iov[2].iov_len;
        smb_buf->smb_buf_length += count;
        BCC_LE(smb_buf) = cpu_to_le16(count);
-        iov[1].iov_base = str_area;
+        rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
-        iov[1].iov_len = count;
+                          CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
-        rc = SendReceive2(xid, ses, iov, 2 /* num_iovecs */, &resp_buf_type,
-                          0 /* not long op */, 1 /* log NT STATUS if any */ );
        /* SMB request buf freed in SendReceive2 */
        cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
@@ -560,6 +617,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
                                         ses, nls_cp);
 ssetup_exit:
+        if (spnego_key)
+                key_put(spnego_key);
        kfree(str_area);
        if (resp_buf_type == CIFS_SMALL_BUFFER) {
                cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 90542a39be17..58bbfd992cc0 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -80,7 +80,7 @@ SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
 /* Routines for Windows NT MD4 Hash functions. */
 static int
-_my_wcslen(__u16 * str)
+_my_wcslen(__u16 *str)
 {
        int len = 0;
        while (*str++ != 0)
@@ -96,7 +96,7 @@ _my_wcslen(__u16 * str)
 */
 static int
-_my_mbstowcs(__u16 * dst, const unsigned char *src, int len)
+_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
 {       /* BB not a very good conversion routine - change/fix */
        int i;
        __u16 val;
@@ -125,9 +125,9 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
        /* Password cannot be longer than 128 characters */
        if (passwd) {
                len = strlen((char *) passwd);
-                if (len > 128) {
+                if (len > 128)
                        len = 128;
-                }
                /* Password must be converted to NT unicode */
                _my_mbstowcs(wpwd, passwd, len);
        } else
@@ -135,7 +135,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
        wpwd[len] = 0;  /* Ensure string is null terminated */
        /* Calculate length in bytes */
-        len = _my_wcslen(wpwd) * sizeof (__u16);
+        len = _my_wcslen(wpwd) * sizeof(__u16);
        mdfour(p16, (unsigned char *) wpwd, len);
        memset(wpwd, 0, 129 * 2);
@@ -167,7 +167,7 @@ nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
        E_P16((unsigned char *) passwd, (unsigned char *) p16);
        /* clear out local copy of user's password (just being paranoid). */
-        memset(passwd, '\0', sizeof (passwd));
+        memset(passwd, '\0', sizeof(passwd));
 }
 #endif
@@ -189,8 +189,10 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
                return;
        dom_u = user_u + 1024;
-        /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
+        /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2,
-           push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
+                        STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
+           push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2,
+                        STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
        /* BB user and domain may need to be uppercased */
        user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ed32b3cb781..50b623ad9320 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -308,7 +308,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
 {
-        if (long_op == -1) {
+        if (long_op == CIFS_ASYNC_OP) {
                /* oplock breaks must not be held up */
                atomic_inc(&ses->server->inFlight);
        } else {
@@ -337,7 +337,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
                                   as they are allowed to block on server */
                                /* update # of requests on the wire to server */
-                                if (long_op < 3)
+                                if (long_op != CIFS_BLOCKING_OP)
                                        atomic_inc(&ses->server->inFlight);
                                spin_unlock(&GlobalMid_Lock);
                                break;
@@ -415,17 +415,48 @@ static int wait_for_response(struct cifsSesInfo *ses,
        }
 }
+/*
+ *
+ * Send an SMB Request.  No response info (other than return code)
+ * needs to be parsed.
+ *
+ * flags indicate the type of request buffer and how long to wait
+ * and whether to log NT STATUS code (error) before mapping it to POSIX error
+ *
+ * The request in in_buf is handed to SendReceive2() as a single kvec, with
+ * CIFS_NO_RESP or'ed into the caller's flags.  The return value is whatever
+ * SendReceive2() returns; the response buffer type it reports is ignored.
+ *
+ */
+int
+SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
+                struct smb_hdr *in_buf, int flags)
+{
+        int rc;
+        struct kvec iov[1];
+        int resp_buf_type;      /* written by SendReceive2(), never read here */
+        iov[0].iov_base = (char *)in_buf;
+        iov[0].iov_len = in_buf->smb_buf_length + 4;    /* NOTE(review): the + 4
+                                                         * presumably covers a length
+                                                         * prefix not counted in
+                                                         * smb_buf_length -- confirm */
+        flags |= CIFS_NO_RESP;  /* NOTE(review): with this flag SendReceive2() leaves
+                                 * midQ->resp_buf set, presumably so DeleteMidQEntry
+                                 * frees the response -- confirm */
+        rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, ("SendRcvNoR flags %d rc %d", flags, rc));
+#endif
+        return rc;
+}
 int
 SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
             struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
-             const int long_op, const int logError)
+             const int flags)
 {
        int rc = 0;
+        int long_op;
        unsigned int receive_len;
        unsigned long timeout;
        struct mid_q_entry *midQ;
        struct smb_hdr *in_buf = iov[0].iov_base;
+        long_op = flags & CIFS_TIMEOUT_MASK;
        *pRespBufType = CIFS_NO_BUFFER;  /* no response buf yet */
        if ((ses == NULL) || (ses->server == NULL)) {
@@ -483,15 +514,22 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
        if (rc < 0)
                goto out;
-        if (long_op == -1)
+        if (long_op == CIFS_STD_OP)
-                goto out;
+                timeout = 15 * HZ;
-        else if (long_op == 2) /* writes past end of file can take loong time */
+        else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
                timeout = 180 * HZ;
-        else if (long_op == 1)
+        else if (long_op == CIFS_LONG_OP)
                timeout = 45 * HZ; /* should be greater than
                        servers oplock break timeout (about 43 seconds) */
-        else
+        else if (long_op == CIFS_ASYNC_OP)
-                timeout = 15 * HZ;
+                goto out;
+        else if (long_op == CIFS_BLOCKING_OP)
+                timeout = 0x7FFFFFFF; /*  large, but not so large as to wrap */
+        else {
+                cERROR(1, ("unknown timeout flag %d", long_op));
+                rc = -EIO;
+                goto out;
+        }
        /* wait for 15 seconds or until woken up due to response arriving or
           due to last connection to this server being unmounted */
@@ -566,7 +604,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
                        }
                        /* BB special case reconnect tid and uid here? */
-                        rc = map_smb_to_linux_error(midQ->resp_buf, logError);
+                        rc = map_smb_to_linux_error(midQ->resp_buf,
+                                                flags & CIFS_LOG_ERROR);
                        /* convert ByteCount if necessary */
                        if (receive_len >= sizeof(struct smb_hdr) - 4
@@ -574,8 +613,10 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
                            (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
                                BCC(midQ->resp_buf) =
                                        le16_to_cpu(BCC_LE(midQ->resp_buf));
-                        midQ->resp_buf = NULL;  /* mark it so will not be freed
+                        if ((flags & CIFS_NO_RESP) == 0)
-                                                by DeleteMidQEntry */
+                                midQ->resp_buf = NULL;  /* mark it so buf will
+                                                           not be freed by
+                                                           DeleteMidQEntry */
                } else {
                        rc = -EIO;
                        cFYI(1, ("Bad MID state?"));
@@ -663,17 +704,25 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
        if (rc < 0)
                goto out;
-        if (long_op == -1)
+        if (long_op == CIFS_STD_OP)
+                timeout = 15 * HZ;
+        /* wait for 15 seconds or until woken up due to response arriving or
+           due to last connection to this server being unmounted */
+        else if (long_op == CIFS_ASYNC_OP)
                goto out;
-        else if (long_op == 2) /* writes past end of file can take loong time */
+        else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
                timeout = 180 * HZ;
-        else if (long_op == 1)
+        else if (long_op == CIFS_LONG_OP)
                timeout = 45 * HZ; /* should be greater than
                        servers oplock break timeout (about 43 seconds) */
-        else
+        else if (long_op == CIFS_BLOCKING_OP)
-                timeout = 15 * HZ;
+                timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
-        /* wait for 15 seconds or until woken up due to response arriving or
+        else {
-           due to last connection to this server being unmounted */
+                cERROR(1, ("unknown timeout flag %d", long_op));
+                rc = -EIO;
+                goto out;
+        }
        if (signal_pending(current)) {
                /* if signal pending do not hold up user for full smb timeout
                but we still give response a chance to complete */
@@ -812,7 +861,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
        pSMB->hdr.Mid = GetNextMid(ses->server);
        return SendReceive(xid, ses, in_buf, out_buf,
-                        &bytes_returned, 0);
+                        &bytes_returned, CIFS_STD_OP);
 }
 int
@@ -844,7 +893,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
           to the same server. We may make this configurable later or
           use ses->maxReq */
-        rc = wait_for_free_request(ses, 3);
+        rc = wait_for_free_request(ses, CIFS_BLOCKING_OP);
        if (rc)
                return rc;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 369e838bebd3..54e8ef96cb79 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -265,7 +265,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
                else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
                        __u16 fid;
                        int oplock = FALSE;
-                        if (experimEnabled) 
+                        struct cifs_ntsd *pacl = NULL;
+                        __u32 buflen = 0;
+                        if (experimEnabled)
                                rc = CIFSSMBOpen(xid, pTcon, full_path,
                                        FILE_OPEN, GENERIC_READ, 0, &fid,
                                        &oplock, NULL, cifs_sb->local_nls,
@@ -273,10 +275,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                        /* else rc is EOPNOTSUPP from above */
-                        if(rc == 0) {
+                        if (rc == 0) {
-                                rc = CIFSSMBGetCIFSACL(xid, pTcon, fid,
+                                rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
-                                        ea_value, buf_size,
+                                                      &buflen);
-                                        ACL_TYPE_ACCESS);
                                CIFSSMBClose(xid, pTcon, fid);
                        }
                }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index dcc6aead70f5..e3eb3556622b 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,8 +362,8 @@ static int init_coda_psdev(void)
                goto out_chrdev;
        }               
        for (i = 0; i < MAX_CODADEVS; i++)
-                class_device_create(coda_psdev_class, NULL,
+                device_create(coda_psdev_class, NULL,
-                                MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
+                              MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i);
        coda_sysctl_init();
        goto out;
@@ -405,7 +405,7 @@ static int __init init_coda(void)
        return 0;
 out:
        for (i = 0; i < MAX_CODADEVS; i++)
-                class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+                device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
        class_destroy(coda_psdev_class);
        unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
        coda_sysctl_clean();
@@ -424,7 +424,7 @@ static void __exit exit_coda(void)
                printk("coda: failed to unregister filesystem\n");
        }
        for (i = 0; i < MAX_CODADEVS; i++)
-                class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+                device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
        class_destroy(coda_psdev_class);
        unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
        coda_sysctl_clean();
diff --git a/fs/compat.c b/fs/compat.c
index 15078ce4c04a..5216c3fd7517 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1104,10 +1104,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
        if (ret < 0)
                goto out;
-        ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
-        if (ret)
-                goto out;
        fnv = NULL;
        if (type == READ) {
                fn = file->f_op->read;
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
new file mode 100644
index 000000000000..0adced2f296f
--- /dev/null
+++ b/fs/compat_binfmt_elf.c
@@ -0,0 +1,131 @@
+/*
+ * 32-bit compatibility support for ELF format executables and core dumps.
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ *
+ * This file is used in a 64-bit kernel that wants to support 32-bit ELF.
+ * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros
+ * used below, with definitions appropriate for 32-bit ABI compatibility.
+ *
+ * We use macros to rename the ABI types and machine-dependent
+ * functions used in binfmt_elf.c to compat versions.
+ */
+#include <linux/elfcore-compat.h>
+#include <linux/time.h>
+/*
+ * Rename the basic ELF layout types to refer to the 32-bit class of files.
+ */
+#undef  ELF_CLASS
+#define ELF_CLASS       ELFCLASS32
+#undef  elfhdr
+#undef  elf_phdr
+#undef  elf_note
+#undef  elf_addr_t
+#define elfhdr          elf32_hdr
+#define elf_phdr        elf32_phdr
+#define elf_note        elf32_note
+#define elf_addr_t      Elf32_Addr
+/*
+ * The machine-dependent core note format types are defined in elfcore-compat.h,
+ * which requires asm/elf.h to define compat_elf_gregset_t et al.
+ */
+#define elf_prstatus    compat_elf_prstatus
+#define elf_prpsinfo    compat_elf_prpsinfo
+/*
+ * Compat version of cputime_to_timeval, perhaps this
+ * should be an inline in <linux/compat.h>.
+ *
+ * Converts cputime with the native cputime_to_timeval() into a local
+ * struct timeval, then copies tv_sec and tv_usec into the caller's
+ * struct compat_timeval.  The macro redefinition that follows the
+ * function makes later uses of cputime_to_timeval in this translation
+ * unit (including the binfmt_elf.c include at the end of the file)
+ * resolve to this helper.
+ */
+static void cputime_to_compat_timeval(const cputime_t cputime,
+                                      struct compat_timeval *value)
+{
+        struct timeval tv;
+        cputime_to_timeval(cputime, &tv);       /* still the native converter here */
+        value->tv_sec = tv.tv_sec;
+        value->tv_usec = tv.tv_usec;
+}
+#undef cputime_to_timeval
+#define cputime_to_timeval cputime_to_compat_timeval
+/*
+ * To use this file, asm/elf.h must define compat_elf_check_arch.
+ * The other following macros can be defined if the compat versions
+ * differ from the native ones, or omitted when they match.
+ */
+#undef  ELF_ARCH
+#undef  elf_check_arch
+#define elf_check_arch  compat_elf_check_arch
+#ifdef  COMPAT_ELF_PLATFORM
+#undef  ELF_PLATFORM
+#define ELF_PLATFORM            COMPAT_ELF_PLATFORM
+#endif
+#ifdef  COMPAT_ELF_HWCAP
+#undef  ELF_HWCAP
+#define ELF_HWCAP               COMPAT_ELF_HWCAP
+#endif
+#ifdef  COMPAT_ARCH_DLINFO
+#undef  ARCH_DLINFO
+#define ARCH_DLINFO             COMPAT_ARCH_DLINFO
+#endif
+#ifdef  COMPAT_ELF_ET_DYN_BASE
+#undef  ELF_ET_DYN_BASE
+#define ELF_ET_DYN_BASE         COMPAT_ELF_ET_DYN_BASE
+#endif
+#ifdef COMPAT_ELF_EXEC_PAGESIZE
+#undef  ELF_EXEC_PAGESIZE
+#define ELF_EXEC_PAGESIZE       COMPAT_ELF_EXEC_PAGESIZE
+#endif
+#ifdef  COMPAT_ELF_PLAT_INIT
+#undef  ELF_PLAT_INIT
+#define ELF_PLAT_INIT           COMPAT_ELF_PLAT_INIT
+#endif
+#ifdef  COMPAT_SET_PERSONALITY
+#undef  SET_PERSONALITY
+#define SET_PERSONALITY         COMPAT_SET_PERSONALITY
+#endif
+#ifdef  compat_start_thread
+#undef  start_thread
+#define start_thread            compat_start_thread
+#endif
+#ifdef  compat_arch_setup_additional_pages
+#undef  ARCH_HAS_SETUP_ADDITIONAL_PAGES
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+#undef  arch_setup_additional_pages
+#define arch_setup_additional_pages compat_arch_setup_additional_pages
+#endif
+/*
+ * Rename a few of the symbols that binfmt_elf.c will define.
+ * These are all local so the names don't really matter, but it
+ * might make some debugging less confusing not to duplicate them.
+ */
+#define elf_format              compat_elf_format
+#define init_elf_binfmt         init_compat_elf_binfmt
+#define exit_elf_binfmt         exit_compat_elf_binfmt
+/*
+ * We share all the actual code with the native (64-bit) version.
+ */
+#include "binfmt_elf.c"
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a4284ccac1f9..ffdc022cae64 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -10,6 +10,8 @@
 * ioctls.
 */
+#include <linux/joystick.h>
 #include <linux/types.h>
 #include <linux/compat.h>
 #include <linux/kernel.h>
@@ -322,7 +324,7 @@ static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
        int err;
        uifr = compat_alloc_user_space(sizeof(struct ifreq));
-        if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)));
+        if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)))
                return -EFAULT;
        err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
@@ -1374,7 +1376,7 @@ static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg)
        return -EINVAL;
 }
-static __attribute_used__ int 
+static __used int
 ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
        return -EINVAL;
@@ -1954,6 +1956,12 @@ ULONG_IOCTL(TIOCSCTTY)
 COMPATIBLE_IOCTL(TIOCGPTN)
 COMPATIBLE_IOCTL(TIOCSPTLCK)
 COMPATIBLE_IOCTL(TIOCSERGETLSR)
+#ifdef TCGETS2
+COMPATIBLE_IOCTL(TCGETS2)
+COMPATIBLE_IOCTL(TCSETS2)
+COMPATIBLE_IOCTL(TCSETSW2)
+COMPATIBLE_IOCTL(TCSETSF2)
+#endif
 /* Little f */
 COMPATIBLE_IOCTL(FIOCLEX)
 COMPATIBLE_IOCTL(FIONCLEX)
@@ -2636,6 +2644,12 @@ COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
 COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
 COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
+/* joystick */
+COMPATIBLE_IOCTL(JSIOCGVERSION)
+COMPATIBLE_IOCTL(JSIOCGAXES)
+COMPATIBLE_IOCTL(JSIOCGBUTTONS)
+COMPATIBLE_IOCTL(JSIOCGNAME(0))
 /* now things that need handlers */
 HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
 HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 50ed691098bc..a48dc7dd8765 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group)
                 * That said, taking our i_mutex is closer to mkdir
                 * emulation, and shouldn't hurt.
                 */
-                mutex_lock(&dentry->d_inode->i_mutex);
+                mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
                for (i = 0; group->default_groups[i]; i++) {
                        new_group = group->default_groups[i];
@@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
        sd = configfs_sb->s_root->d_fsdata;
        link_group(to_config_group(sd->s_element), group);
-        mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+        mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
+                        I_MUTEX_PARENT);
        name.name = group->cg_item.ci_name;
        name.len = strlen(name.name);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index a3658f9a082c..397cb503a180 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
        umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
        int error = 0;
-        mutex_lock(&dir->d_inode->i_mutex);
+        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
        error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
        mutex_unlock(&dir->d_inode->i_mutex);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3bf0278ea843..de3b31d0a37d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -128,7 +128,7 @@ void configfs_release_fs(void)
 }
-static decl_subsys(config, NULL, NULL);
+static struct kobject *config_kobj;
 static int __init configfs_init(void)
 {
@@ -140,9 +140,8 @@ static int __init configfs_init(void)
        if (!configfs_dir_cachep)
                goto out;
-        kobj_set_kset_s(&config_subsys, kernel_subsys);
+        config_kobj = kobject_create_and_add("config", kernel_kobj);
-        err = subsystem_register(&config_subsys);
+        if (!config_kobj) {
-        if (err) {
                kmem_cache_destroy(configfs_dir_cachep);
                configfs_dir_cachep = NULL;
                goto out;
@@ -151,7 +150,7 @@ static int __init configfs_init(void)
        err = register_filesystem(&configfs_fs_type);
        if (err) {
                printk(KERN_ERR "configfs: Unable to register filesystem!\n");
-                subsystem_unregister(&config_subsys);
+                kobject_put(config_kobj);
                kmem_cache_destroy(configfs_dir_cachep);
                configfs_dir_cachep = NULL;
                goto out;
@@ -160,7 +159,7 @@ static int __init configfs_init(void)
        err = configfs_inode_init();
        if (err) {
                unregister_filesystem(&configfs_fs_type);
-                subsystem_unregister(&config_subsys);
+                kobject_put(config_kobj);
                kmem_cache_destroy(configfs_dir_cachep);
                configfs_dir_cachep = NULL;
        }
@@ -171,7 +170,7 @@ out:
 static void __exit configfs_exit(void)
 {
        unregister_filesystem(&configfs_fs_type);
-        subsystem_unregister(&config_subsys);
+        kobject_put(config_kobj);
        kmem_cache_destroy(configfs_dir_cachep);
        configfs_dir_cachep = NULL;
        configfs_inode_exit();
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6a713b33992f..d26e2826ba5b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -426,20 +426,19 @@ exit:
 }
 EXPORT_SYMBOL_GPL(debugfs_rename);
-static decl_subsys(debug, NULL, NULL);
+static struct kobject *debug_kobj;
 static int __init debugfs_init(void)
 {
        int retval;
-        kobj_set_kset_s(&debug_subsys, kernel_subsys);
+        debug_kobj = kobject_create_and_add("debug", kernel_kobj);
-        retval = subsystem_register(&debug_subsys);
+        if (!debug_kobj)
-        if (retval)
+                return -EINVAL;
-                return retval;
        retval = register_filesystem(&debug_fs_type);
        if (retval)
-                subsystem_unregister(&debug_subsys);
+                kobject_put(debug_kobj);
        return retval;
 }
@@ -447,7 +446,7 @@ static void __exit debugfs_exit(void)
 {
        simple_release_fs(&debugfs_mount, &debugfs_mount_count);
        unregister_filesystem(&debug_fs_type);
-        subsystem_unregister(&debug_subsys);
+        kobject_put(debug_kobj);
 }
 core_initcall(debugfs_init);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 46754553fdcc..ff97ba924333 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
        spin_unlock(&ls->ls_recover_list_lock);
        if (!found)
-                de = allocate_direntry(ls, len);
+                de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
        return de;
 }
@@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls)
                de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
                                list);
                list_del(&de->list);
-                free_direntry(de);
+                kfree(de);
        }
        spin_unlock(&ls->ls_recover_list_lock);
 }
@@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
        }
        list_del(&de->list);
-        free_direntry(de);
+        kfree(de);
 out:
        write_unlock(&ls->ls_dirtbl[bucket].lock);
 }
@@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
        write_unlock(&ls->ls_dirtbl[bucket].lock);
-        de = allocate_direntry(ls, namelen);
+        de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
        if (!de)
                return -ENOMEM;
@@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
        write_lock(&ls->ls_dirtbl[bucket].lock);
        tmp = search_bucket(ls, name, namelen, bucket);
        if (tmp) {
-                free_direntry(de);
+                kfree(de);
                de = tmp;
        } else {
                list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
@@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
        return get_entry(ls, nodeid, name, namelen, r_nodeid);
 }
-/* Copy the names of master rsb's into the buffer provided.
+static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
-   Only select names whose dir node is the given nodeid. */
+{
+        struct dlm_rsb *r;
+        down_read(&ls->ls_root_sem);    /* read-hold ls_root_sem while walking
+                                         * ls_root_list.  NOTE(review): the
+                                         * dlm_copy_master_names hunk in this diff
+                                         * calls this with ls_root_sem already
+                                         * read-held, so the read lock nests --
+                                         * confirm that is intended. */
+        list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+                if (len == r->res_length && !memcmp(name, r->res_name, len)) {  /* same length and same name bytes */
+                        up_read(&ls->ls_root_sem);      /* released before returning the match.
+                                                         * NOTE(review): no reference is taken
+                                                         * on r here -- confirm callers keep it
+                                                         * valid. */
+                        return r;
+                }
+        }
+        up_read(&ls->ls_root_sem);
+        return NULL;    /* no rsb on the root list has this exact name */
+}
+/* Find the rsb where we left off (or start again), then send rsb names
+   for rsb's we're master of and whose directory node matches the requesting
+   node.  inbuf is the rsb name last sent, inlen is the name's length */
 void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
                           char *outbuf, int outlen, int nodeid)
 {
        struct list_head *list;
-        struct dlm_rsb *start_r = NULL, *r = NULL;
+        struct dlm_rsb *r;
-        int offset = 0, start_namelen, error, dir_nodeid;
+        int offset = 0, dir_nodeid;
-        char *start_name;
        uint16_t be_namelen;
-        /*
-         * Find the rsb where we left off (or start again)
-         */
-        start_namelen = inlen;
-        start_name = inbuf;
-        if (start_namelen > 1) {
-                /*
-                 * We could also use a find_rsb_root() function here that
-                 * searched the ls_root_list.
-                 */
-                error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
-                                     &start_r);
-                DLM_ASSERT(!error && start_r,
-                           printk("error %d\n", error););
-                DLM_ASSERT(!list_empty(&start_r->res_root_list),
-                           dlm_print_rsb(start_r););
-                dlm_put_rsb(start_r);
-        }
-        /*
-         * Send rsb names for rsb's we're master of and whose directory node
-         * matches the requesting node.
-         */
        down_read(&ls->ls_root_sem);
-        if (start_r)
-                list = start_r->res_root_list.next;
+        if (inlen > 1) {
-        else
+                r = find_rsb_root(ls, inbuf, inlen);
+                if (!r) {
+                        inbuf[inlen - 1] = '\0';
+                        log_error(ls, "copy_master_names from %d start %d %s",
+                                  nodeid, inlen, inbuf);
+                        goto out;
+                }
+                list = r->res_root_list.next;
+        } else {
                list = ls->ls_root_list.next;
+        }
        for (offset = 0; list != &ls->ls_root_list; list = list->next) {
                r = list_entry(list, struct dlm_rsb, res_root_list);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d2fc2384c3be..ec61bbaf25df 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
        return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
 }
+/* Prototypes for functions defined in other dlm source files. */
+int dlm_netlink_init(void);
+void dlm_netlink_exit(void);
+void dlm_timeout_warn(struct dlm_lkb *lkb);
+/* Without CONFIG_DLM_DEBUG the debugfs hooks below are empty inline stubs
+   (the int-returning ones return 0), so callers need no #ifdef of their
+   own. */
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
 #endif                          /* __DLM_INTERNAL_DOT_H__ */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 3915b8e14146..ff4a198fa677 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 static int receive_extralen(struct dlm_message *ms);
 static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
 static void del_timeout(struct dlm_lkb *lkb);
-void dlm_timeout_warn(struct dlm_lkb *lkb);
 /*
 * Lock compatibilty matrix - thanks Steve
@@ -335,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
 {
        struct dlm_rsb *r;
-        r = allocate_rsb(ls, len);
+        r = dlm_allocate_rsb(ls, len);
        if (!r)
                return NULL;
@@ -478,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
        error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
        if (!error) {
                write_unlock(&ls->ls_rsbtbl[bucket].lock);
-                free_rsb(r);
+                dlm_free_rsb(r);
                r = tmp;
                goto out;
        }
@@ -490,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
        return error;
 }
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
-                 unsigned int flags, struct dlm_rsb **r_ret)
-{
-        return find_rsb(ls, name, namelen, flags, r_ret);
-}
 /* This is only called to add a reference when the code already holds
   a valid reference to the rsb, so there's no need for locking. */
@@ -519,7 +512,7 @@ static void toss_rsb(struct kref *kref)
        list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
        r->res_toss_time = jiffies;
        if (r->res_lvbptr) {
-                free_lvb(r->res_lvbptr);
+                dlm_free_lvb(r->res_lvbptr);
                r->res_lvbptr = NULL;
        }
 }
@@ -589,7 +582,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
        uint32_t lkid = 0;
        uint16_t bucket;
-        lkb = allocate_lkb(ls);
+        lkb = dlm_allocate_lkb(ls);
        if (!lkb)
                return -ENOMEM;
@@ -683,8 +676,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
                /* for local/process lkbs, lvbptr points to caller's lksb */
                if (lkb->lkb_lvbptr && is_master_copy(lkb))
-                        free_lvb(lkb->lkb_lvbptr);
+                        dlm_free_lvb(lkb->lkb_lvbptr);
-                free_lkb(lkb);
+                dlm_free_lkb(lkb);
                return 1;
        } else {
                write_unlock(&ls->ls_lkbtbl[bucket].lock);
@@ -988,7 +981,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
                        if (is_master(r))
                                dir_remove(r);
-                        free_rsb(r);
+                        dlm_free_rsb(r);
                        count++;
                } else {
                        write_unlock(&ls->ls_rsbtbl[b].lock);
@@ -1171,7 +1164,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
                        return;
                if (!r->res_lvbptr)
-                        r->res_lvbptr = allocate_lvb(r->res_ls);
+                        r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
                if (!r->res_lvbptr)
                        return;
@@ -1203,7 +1196,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
                return;
        if (!r->res_lvbptr)
-                r->res_lvbptr = allocate_lvb(r->res_ls);
+                r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
        if (!r->res_lvbptr)
                return;
@@ -1852,7 +1845,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
 static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
        struct dlm_ls *ls = r->res_ls;
-        int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+        int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
        if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
                rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -1886,7 +1879,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
                return 1;
        }
-        for (;;) {
+        for (i = 0; i < 2; i++) {
                /* It's possible for dlm_scand to remove an old rsb for
                   this same resource from the toss list, us to create
                   a new one, look up the master locally, and find it
@@ -1900,6 +1893,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
                log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
                schedule();
        }
+        if (error && error != -EEXIST)
+                return error;
        if (ret_nodeid == our_nodeid) {
                r->res_first_lkid = 0;
@@ -1941,8 +1936,11 @@ static void confirm_master(struct dlm_rsb *r, int error)
                break;
        case -EAGAIN:
-                /* the remote master didn't queue our NOQUEUE request;
+        case -EBADR:
-                   make a waiting lkb the first_lkid */
+        case -ENOTBLK:
+                /* the remote request failed and won't be retried (it was
+                   a NOQUEUE, or has been canceled/unlocked); make a waiting
+                   lkb the first_lkid */
                r->res_first_lkid = 0;
@@ -2108,17 +2106,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
        /* an lkb may be waiting for an rsb lookup to complete where the
           lookup was initiated by another lock */
-        if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
+        if (!list_empty(&lkb->lkb_rsb_lookup)) {
-                if (!list_empty(&lkb->lkb_rsb_lookup)) {
+                if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
                        log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
                        list_del_init(&lkb->lkb_rsb_lookup);
                        queue_cast(lkb->lkb_resource, lkb,
                                   args->flags & DLM_LKF_CANCEL ?
                                   -DLM_ECANCEL : -DLM_EUNLOCK);
                        unhold_lkb(lkb); /* undoes create_lkb() */
-                        rv = -EBUSY;
-                        goto out;
                }
+                /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
+                rv = -EBUSY;
+                goto out;
        }
        /* cancel not allowed with another cancel/unlock in progress */
@@ -2986,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
                if (!lkb->lkb_lvbptr)
-                        lkb->lkb_lvbptr = allocate_lvb(ls);
+                        lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
                if (!lkb->lkb_lvbptr)
                        return -ENOMEM;
                len = receive_extralen(ms);
@@ -3006,11 +3005,9 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
        lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
        lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
-        DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
                /* lkb was just created so there won't be an lvb yet */
-                lkb->lkb_lvbptr = allocate_lvb(ls);
+                lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
                if (!lkb->lkb_lvbptr)
                        return -ENOMEM;
        }
@@ -3021,16 +3018,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
                                struct dlm_message *ms)
 {
-        if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
-                log_error(ls, "convert_args nodeid %d %d lkid %x %x",
-                          lkb->lkb_nodeid, ms->m_header.h_nodeid,
-                          lkb->lkb_id, lkb->lkb_remid);
-                return -EINVAL;
-        }
-        if (!is_master_copy(lkb))
-                return -EINVAL;
        if (lkb->lkb_status != DLM_LKSTS_GRANTED)
                return -EBUSY;
@@ -3046,8 +3033,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
                               struct dlm_message *ms)
 {
-        if (!is_master_copy(lkb))
-                return -EINVAL;
        if (receive_lvb(ls, lkb, ms))
                return -ENOMEM;
        return 0;
@@ -3063,6 +3048,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
        lkb->lkb_remid = ms->m_lkid;
 }
+/* This is called after the rsb is locked so that we can safely inspect
+   fields in the lkb.  It checks that the kind of lkb (master copy or
+   process copy) and its lkb_nodeid are consistent with the type of ms
+   and the node ms came from.  Returns 0 if the message is acceptable,
+   or -EINVAL (after logging an error) if it is not. */
+static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+        /* node id taken from the message header */
+        int from = ms->m_header.h_nodeid;
+        int error = 0;
+        switch (ms->m_type) {
+        /* these types require a master copy lkb whose lkb_nodeid is the
+           sending node */
+        case DLM_MSG_CONVERT:
+        case DLM_MSG_UNLOCK:
+        case DLM_MSG_CANCEL:
+                if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
+                        error = -EINVAL;
+                break;
+        /* these types require a process copy lkb whose lkb_nodeid is the
+           sending node */
+        case DLM_MSG_CONVERT_REPLY:
+        case DLM_MSG_UNLOCK_REPLY:
+        case DLM_MSG_CANCEL_REPLY:
+        case DLM_MSG_GRANT:
+        case DLM_MSG_BAST:
+                if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
+                        error = -EINVAL;
+                break;
+        /* a request reply also requires a process copy, but an lkb_nodeid
+           of -1 is accepted from any sender (presumably the master is not
+           yet known in that case -- TODO confirm); any other lkb_nodeid
+           must equal the sending node */
+        case DLM_MSG_REQUEST_REPLY:
+                if (!is_process_copy(lkb))
+                        error = -EINVAL;
+                else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
+                        error = -EINVAL;
+                break;
+        /* no other message type is accepted by this check */
+        default:
+                error = -EINVAL;
+        }
+        /* log the message type, sender and lkb state for a rejected message */
+        if (error)
+                log_error(lkb->lkb_resource->res_ls,
+                          "ignore invalid message %d from %d %x %x %x %d",
+                          ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
+                          lkb->lkb_flags, lkb->lkb_nodeid);
+        return error;
+}
 static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 {
        struct dlm_lkb *lkb;
@@ -3124,17 +3153,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        receive_flags(lkb, ms);
        error = receive_convert_args(ls, lkb, ms);
        if (error)
-                goto out;
+                goto out_reply;
        reply = !down_conversion(lkb);
        error = do_convert(r, lkb);
- out:
+ out_reply:
        if (reply)
                send_convert_reply(r, lkb, error);
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3160,15 +3193,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        receive_flags(lkb, ms);
        error = receive_unlock_args(ls, lkb, ms);
        if (error)
-                goto out;
+                goto out_reply;
        error = do_unlock(r, lkb);
- out:
+ out_reply:
        send_unlock_reply(r, lkb, error);
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3196,9 +3233,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        error = do_cancel(r, lkb);
        send_cancel_reply(r, lkb, error);
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3217,22 +3258,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-                log_error(ls, "receive_grant no lkb");
+                log_debug(ls, "receive_grant from %d no lkb %x",
+                          ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
        r = lkb->lkb_resource;
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        receive_flags_reply(lkb, ms);
        if (is_altmode(lkb))
                munge_altmode(lkb, ms);
        grant_lock_pc(r, lkb, ms);
        queue_cast(r, lkb, 0);
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3246,18 +3291,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-                log_error(ls, "receive_bast no lkb");
+                log_debug(ls, "receive_bast from %d no lkb %x",
+                          ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
        r = lkb->lkb_resource;
        hold_rsb(r);
        lock_rsb(r);
-        queue_bast(r, lkb, ms->m_bastmode);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
+        queue_bast(r, lkb, ms->m_bastmode);
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3323,15 +3372,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-                log_error(ls, "receive_request_reply no lkb");
+                log_debug(ls, "receive_request_reply from %d no lkb %x",
+                          ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
        r = lkb->lkb_resource;
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        mstype = lkb->lkb_wait_type;
        error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
        if (error)
@@ -3383,6 +3436,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
                if (is_overlap(lkb)) {
                        /* we'll ignore error in cancel/unlock reply */
                        queue_cast_overlap(r, lkb);
+                        confirm_master(r, result);
                        unhold_lkb(lkb); /* undoes create_lkb() */
                } else
                        _request_lock(r, lkb);
@@ -3463,6 +3517,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        /* stub reply can happen with waiters_mutex held */
        error = remove_from_waiters_ms(lkb, ms);
        if (error)
@@ -3481,10 +3539,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-                log_error(ls, "receive_convert_reply no lkb");
+                log_debug(ls, "receive_convert_reply from %d no lkb %x",
+                          ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
        _receive_convert_reply(lkb, ms);
        dlm_put_lkb(lkb);
@@ -3498,6 +3556,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        /* stub reply can happen with waiters_mutex held */
        error = remove_from_waiters_ms(lkb, ms);
        if (error)
@@ -3529,10 +3591,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-                log_error(ls, "receive_unlock_reply no lkb");
+                log_debug(ls, "receive_unlock_reply from %d no lkb %x",
+                          ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
        _receive_unlock_reply(lkb, ms);
        dlm_put_lkb(lkb);
@@ -3546,6 +3608,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
+        error = validate_message(lkb, ms);
+        if (error)
+                goto out;
        /* stub reply can happen with waiters_mutex held */
        error = remove_from_waiters_ms(lkb, ms);
        if (error)
@@ -3577,10 +3643,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-                log_error(ls, "receive_cancel_reply no lkb");
+                log_debug(ls, "receive_cancel_reply from %d no lkb %x",
+                          ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
        _receive_cancel_reply(lkb, ms);
        dlm_put_lkb(lkb);
@@ -3640,6 +3706,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
 {
+        if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
+                log_debug(ls, "ignore non-member message %d from %d %x %x %d",
+                          ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
+                          ms->m_remid, ms->m_result);
+                return;
+        }
        switch (ms->m_type) {
        /* messages sent to a master node */
@@ -3778,8 +3851,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
        ls = dlm_find_lockspace_global(hd->h_lockspace);
        if (!ls) {
-                log_print("invalid h_lockspace %x from %d cmd %d type %d",
+                if (dlm_config.ci_log_debug)
-                          hd->h_lockspace, nodeid, hd->h_cmd, type);
+                        log_print("invalid lockspace %x from %d cmd %d type %d",
+                                  hd->h_lockspace, nodeid, hd->h_cmd, type);
                if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
                        dlm_send_ls_not_ready(nodeid, rc);
@@ -3806,6 +3880,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
                ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
                ls->ls_stub_ms.m_result = -EINPROGRESS;
                ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
                _receive_convert_reply(lkb, &ls->ls_stub_ms);
                /* Same special case as in receive_rcom_lock_args() */
@@ -3847,6 +3922,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
 void dlm_recover_waiters_pre(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb, *safe;
+        int wait_type, stub_unlock_result, stub_cancel_result;
        mutex_lock(&ls->ls_waiters_mutex);
@@ -3865,7 +3941,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                if (!waiter_needs_recovery(ls, lkb))
                        continue;
-                switch (lkb->lkb_wait_type) {
+                wait_type = lkb->lkb_wait_type;
+                stub_unlock_result = -DLM_EUNLOCK;
+                stub_cancel_result = -DLM_ECANCEL;
+                /* Main reply may have been received leaving a zero wait_type,
+                   but a reply for the overlapping op may not have been
+                   received.  In that case we need to fake the appropriate
+                   reply for the overlap op. */
+                if (!wait_type) {
+                        if (is_overlap_cancel(lkb)) {
+                                wait_type = DLM_MSG_CANCEL;
+                                if (lkb->lkb_grmode == DLM_LOCK_IV)
+                                        stub_cancel_result = 0;
+                        }
+                        if (is_overlap_unlock(lkb)) {
+                                wait_type = DLM_MSG_UNLOCK;
+                                if (lkb->lkb_grmode == DLM_LOCK_IV)
+                                        stub_unlock_result = -ENOENT;
+                        }
+                        log_debug(ls, "rwpre overlap %x %x %d %d %d",
+                                  lkb->lkb_id, lkb->lkb_flags, wait_type,
+                                  stub_cancel_result, stub_unlock_result);
+                }
+                switch (wait_type) {
                case DLM_MSG_REQUEST:
                        lkb->lkb_flags |= DLM_IFL_RESEND;
@@ -3878,8 +3980,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                case DLM_MSG_UNLOCK:
                        hold_lkb(lkb);
                        ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
-                        ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+                        ls->ls_stub_ms.m_result = stub_unlock_result;
                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                        ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
                        _receive_unlock_reply(lkb, &ls->ls_stub_ms);
                        dlm_put_lkb(lkb);
                        break;
@@ -3887,15 +3990,16 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                case DLM_MSG_CANCEL:
                        hold_lkb(lkb);
                        ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
-                        ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+                        ls->ls_stub_ms.m_result = stub_cancel_result;
                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                        ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
                        _receive_cancel_reply(lkb, &ls->ls_stub_ms);
                        dlm_put_lkb(lkb);
                        break;
                default:
-                        log_error(ls, "invalid lkb wait_type %d",
+                        log_error(ls, "invalid lkb wait_type %d %d",
-                                  lkb->lkb_wait_type);
+                                  lkb->lkb_wait_type, wait_type);
                }
                schedule();
        }
@@ -4184,7 +4288,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
        lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
-                lkb->lkb_lvbptr = allocate_lvb(ls);
+                lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
                if (!lkb->lkb_lvbptr)
                        return -ENOMEM;
                lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4259,7 +4363,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
        put_rsb(r);
 out:
        if (error)
-                log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+                log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid);
        rl->rl_result = error;
        return error;
 }
@@ -4342,7 +4446,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
                }
        }
-        /* After ua is attached to lkb it will be freed by free_lkb().
+        /* After ua is attached to lkb it will be freed by dlm_free_lkb().
           When DLM_IFL_USER is set, the dlm knows that this is a userspace
           lock and that lkb_astparam is the dlm_user_args structure. */
@@ -4679,6 +4783,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
        }
        list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
+                lkb->lkb_ast_type = 0;
                list_del(&lkb->lkb_astqueue);
                dlm_put_lkb(lkb);
        }
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index ada04680a1e5..27b6ed302911 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb);
 void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
 void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
 int dlm_modes_compat(int mode1, int mode2);
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
-        unsigned int flags, struct dlm_rsb **r_ret);
 void dlm_put_rsb(struct dlm_rsb *r);
 void dlm_hold_rsb(struct dlm_rsb *r);
 int dlm_put_lkb(struct dlm_lkb *lkb);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 6353a8384520..b180fdc51085 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -24,14 +24,6 @@
 #include "recover.h"
 #include "requestqueue.h"
-#ifdef CONFIG_DLM_DEBUG
-int dlm_create_debug_file(struct dlm_ls *ls);
-void dlm_delete_debug_file(struct dlm_ls *ls);
-#else
-static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
-static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
-#endif
 static int                      ls_count;
 static struct mutex             ls_lock;
 static struct list_head         lslist;
@@ -166,26 +158,7 @@ static struct kobj_type dlm_ktype = {
        .release       = lockspace_kobj_release,
 };
-static struct kset dlm_kset = {
+static struct kset *dlm_kset;
-        .ktype  = &dlm_ktype,
-};
-static int kobject_setup(struct dlm_ls *ls)
-{
-        char lsname[DLM_LOCKSPACE_LEN];
-        int error;
-        memset(lsname, 0, DLM_LOCKSPACE_LEN);
-        snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
-        error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
-        if (error)
-                return error;
-        ls->ls_kobj.kset = &dlm_kset;
-        ls->ls_kobj.ktype = &dlm_ktype;
-        return 0;
-}
 static int do_uevent(struct dlm_ls *ls, int in)
 {
@@ -220,24 +193,22 @@ static int do_uevent(struct dlm_ls *ls, int in)
 int dlm_lockspace_init(void)
 {
-        int error;
        ls_count = 0;
        mutex_init(&ls_lock);
        INIT_LIST_HEAD(&lslist);
        spin_lock_init(&lslist_lock);
-        kobject_set_name(&dlm_kset.kobj, "dlm");
+        dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj);
-        kobj_set_kset_s(&dlm_kset, kernel_subsys);
+        if (!dlm_kset) {
-        error = kset_register(&dlm_kset);
+                printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
-        if (error)
+                return -ENOMEM;
-                printk("dlm_lockspace_init: cannot register kset %d\n", error);
+        }
-        return error;
+        return 0;
 }
 void dlm_lockspace_exit(void)
 {
-        kset_unregister(&dlm_kset);
+        kset_unregister(dlm_kset);
 }
 static int dlm_scand(void *data)
@@ -549,13 +520,12 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
                goto out_delist;
        }
-        error = kobject_setup(ls);
+        ls->ls_kobj.kset = dlm_kset;
-        if (error)
+        error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
-                goto out_stop;
+                                     "%s", ls->ls_name);
-        error = kobject_register(&ls->ls_kobj);
        if (error)
                goto out_stop;
+        kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
        /* let kobject handle freeing of ls if there's an error */
        do_unreg = 1;
@@ -601,7 +571,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
        kfree(ls->ls_rsbtbl);
 out_lsfree:
        if (do_unreg)
-                kobject_unregister(&ls->ls_kobj);
+                kobject_put(&ls->ls_kobj);
        else
                kfree(ls);
 out:
@@ -706,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
                        dlm_del_ast(lkb);
                        if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
-                                free_lvb(lkb->lkb_lvbptr);
+                                dlm_free_lvb(lkb->lkb_lvbptr);
-                        free_lkb(lkb);
+                        dlm_free_lkb(lkb);
                }
        }
        dlm_astd_resume();
@@ -726,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
                                         res_hashchain);
                        list_del(&rsb->res_hashchain);
-                        free_rsb(rsb);
+                        dlm_free_rsb(rsb);
                }
                head = &ls->ls_rsbtbl[i].toss;
@@ -734,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
                        rsb = list_entry(head->next, struct dlm_rsb,
                                         res_hashchain);
                        list_del(&rsb->res_hashchain);
-                        free_rsb(rsb);
+                        dlm_free_rsb(rsb);
                }
        }
@@ -750,7 +720,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
        dlm_clear_members(ls);
        dlm_clear_members_gone(ls);
        kfree(ls->ls_node_array);
-        kobject_unregister(&ls->ls_kobj);
+        kobject_put(&ls->ls_kobj);
        /* The ls structure will be freed when the kobject is done with */
        mutex_lock(&ls_lock);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 58bf3f5cdbe2..7c1e5e5cccd8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con)
 static void tcp_connect_to_sock(struct connection *con)
 {
        int result = -EHOSTUNREACH;
-        struct sockaddr_storage saddr;
+        struct sockaddr_storage saddr, src_addr;
        int addr_len;
        struct socket *sock;
@@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con)
        con->connect_action = tcp_connect_to_sock;
        add_sock(sock, con);
+        /* Bind to our cluster-known address before connecting, to avoid
+           routing problems */
+        memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+        make_sockaddr(&src_addr, 0, &addr_len);
+        result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
+                                 addr_len);
+        if (result < 0) {
+                log_print("could not bind for connect: %d", result);
+                /* This *may* not indicate a critical error */
+        }
        make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
        log_print("connecting to %d", con->nodeid);
@@ -1062,7 +1073,7 @@ static int sctp_listen_for_all(void)
        subscribe.sctp_shutdown_event = 1;
        subscribe.sctp_partial_delivery_event = 1;
-        result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+        result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE,
                                 (char *)&bufsize, sizeof(bufsize));
        if (result)
                log_print("Error increasing buffer space on socket %d", result);
@@ -1426,6 +1437,8 @@ void dlm_lowcomms_stop(void)
                con = __nodeid2con(i, 0);
                if (con) {
                        close_connection(con, true);
+                        if (con->othercon)
+                                kmem_cache_free(con_cache, con->othercon);
                        kmem_cache_free(con_cache, con);
                }
        }
@@ -1454,10 +1467,6 @@ int dlm_lowcomms_start(void)
        if (!con_cache)
                goto out;
-        /* Set some sysctl minima */
-        if (sysctl_rmem_max < NEEDED_RMEM)
-                sysctl_rmem_max = NEEDED_RMEM;
        /* Start listening */
        if (dlm_config.ci_protocol == 0)
                error = tcp_listen_for_all();
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index eca2907f2386..58487fb95a4c 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -18,16 +18,6 @@
 #include "memory.h"
 #include "config.h"
-#ifdef CONFIG_DLM_DEBUG
-int dlm_register_debugfs(void);
-void dlm_unregister_debugfs(void);
-#else
-static inline int dlm_register_debugfs(void) { return 0; }
-static inline void dlm_unregister_debugfs(void) { }
-#endif
-int dlm_netlink_init(void);
-void dlm_netlink_exit(void);
 static int __init init_dlm(void)
 {
        int error;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index e9cdcab306e2..fa17f5a27883 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
        ls->ls_num_nodes--;
 }
-static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
 {
        struct dlm_member *memb;
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 927c08c19214..7a26fca1e0b5 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls);
 void dlm_clear_members_gone(struct dlm_ls *ls);
 int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
 int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+int dlm_is_member(struct dlm_ls *ls, int nodeid);
 #endif                          /* __MEMBER_DOT_H__ */
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ecf0e5cb2035..f7783867491a 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,7 @@ void dlm_memory_exit(void)
                kmem_cache_destroy(lkb_cache);
 }
-char *allocate_lvb(struct dlm_ls *ls)
+char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
        char *p;
@@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls)
        return p;
 }
-void free_lvb(char *p)
+void dlm_free_lvb(char *p)
 {
        kfree(p);
 }
@@ -51,7 +51,7 @@ void free_lvb(char *p)
 /* FIXME: have some minimal space built-in to rsb for the name and
   kmalloc a separate name if needed, like dentries are done */
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
 {
        struct dlm_rsb *r;
@@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
        return r;
 }
-void free_rsb(struct dlm_rsb *r)
+void dlm_free_rsb(struct dlm_rsb *r)
 {
        if (r->res_lvbptr)
-                free_lvb(r->res_lvbptr);
+                dlm_free_lvb(r->res_lvbptr);
        kfree(r);
 }
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb;
@@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
        return lkb;
 }
-void free_lkb(struct dlm_lkb *lkb)
+void dlm_free_lkb(struct dlm_lkb *lkb)
 {
        if (lkb->lkb_flags & DLM_IFL_USER) {
                struct dlm_user_args *ua;
@@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb)
        kmem_cache_free(lkb_cache, lkb);
 }
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
-{
-        struct dlm_direntry *de;
-        DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
-                   printk("namelen = %d\n", namelen););
-        de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL);
-        return de;
-}
-void free_direntry(struct dlm_direntry *de)
-{
-        kfree(de);
-}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 6ead158ccc5c..485fb29143bd 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -16,14 +16,12 @@
 int dlm_memory_init(void);
 void dlm_memory_exit(void);
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen);
-void free_rsb(struct dlm_rsb *r);
+void dlm_free_rsb(struct dlm_rsb *r);
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
-void free_lkb(struct dlm_lkb *l);
+void dlm_free_lkb(struct dlm_lkb *l);
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
+char *dlm_allocate_lvb(struct dlm_ls *ls);
-void free_direntry(struct dlm_direntry *de);
+void dlm_free_lvb(char *l);
-char *allocate_lvb(struct dlm_ls *ls);
-void free_lvb(char *l);
 #endif          /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f8c69dda16a0..e69926e984db 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset,
 int dlm_process_incoming_buffer(int nodeid, const void *base,
                                unsigned offset, unsigned len, unsigned limit)
 {
-        unsigned char __tmp[DLM_INBUF_LEN];
+        union {
-        struct dlm_header *msg = (struct dlm_header *) __tmp;
+                unsigned char __buf[DLM_INBUF_LEN];
+                /* this is to force proper alignment on some arches */
+                struct dlm_header dlm;
+        } __tmp;
+        struct dlm_header *msg = &__tmp.dlm;
        int ret = 0;
        int err = 0;
        uint16_t msglen;
@@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
                   in the buffer on the stack (which should work for most
                   ordinary messages). */
-                if (msglen > sizeof(__tmp) &&
+                if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) {
-                    msg == (struct dlm_header *) __tmp) {
                        msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
                        if (msg == NULL)
                                return ret;
@@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
                dlm_receive_buffer(msg, nodeid);
        }
-        if (msg != (struct dlm_header *) __tmp)
+        if (msg != &__tmp.dlm)
                kfree(msg);
        return err ? err : ret;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index ae2fd97fa4ad..026824cd3acb 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        spin_unlock(&ls->ls_rcom_spin);
 }
-static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-        receive_sync_reply(ls, rc_in);
-}
 int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 {
        struct dlm_rcom *rc;
@@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        send_rcom(ls, mh, rc);
 }
-static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-        receive_sync_reply(ls, rc_in);
-}
 int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
 {
        struct dlm_rcom *rc;
@@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        send_rcom(ls, mh, rc);
 }
-static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-        dlm_recover_process_copy(ls, rc_in);
-}
 /* If the lockspace doesn't exist then still send a status message
   back; it's possible that it just doesn't have its global_id yet. */
@@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
                break;
        case DLM_RCOM_STATUS_REPLY:
-                receive_rcom_status_reply(ls, rc);
+                receive_sync_reply(ls, rc);
                break;
        case DLM_RCOM_NAMES_REPLY:
-                receive_rcom_names_reply(ls, rc);
+                receive_sync_reply(ls, rc);
                break;
        case DLM_RCOM_LOOKUP_REPLY:
@@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
                break;
        case DLM_RCOM_LOCK_REPLY:
-                receive_rcom_lock_reply(ls, rc);
+                dlm_recover_process_copy(ls, rc);
                break;
        default:
-                DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+                log_error(ls, "receive_rcom bad type %d", rc->rc_type);
        }
 out:
        return;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index c2cc7694cd16..df075dc300fa 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r)
                goto out;
        if (!r->res_lvbptr) {
-                r->res_lvbptr = allocate_lvb(r->res_ls);
+                r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
                if (!r->res_lvbptr)
                        goto out;
        }
@@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls)
                        list_add(&r->res_root_list, &ls->ls_root_list);
                        dlm_hold_rsb(r);
                }
+                /* If we're using a directory, add tossed rsbs to the root
+                   list; they'll have entries created in the new directory,
+                   but no other recovery steps should do anything with them. */
+                if (dlm_no_directory(ls)) {
+                        read_unlock(&ls->ls_rsbtbl[i].lock);
+                        continue;
+                }
+                list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
+                        list_add(&r->res_root_list, &ls->ls_root_list);
+                        dlm_hold_rsb(r);
+                }
                read_unlock(&ls->ls_rsbtbl[i].lock);
        }
 out:
@@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls)
        up_write(&ls->ls_root_sem);
 }
+/* If not using a directory, clear the entire toss list, there's no benefit to
+   caching the master value since it's fixed.  If we are using a dir, keep the
+   rsb's we're the master of.  Recovery will add them to the root list and from
+   there they'll be entered in the rebuilt directory. */
 void dlm_clear_toss_list(struct dlm_ls *ls)
 {
        struct dlm_rsb *r, *safe;
@@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
                write_lock(&ls->ls_rsbtbl[i].lock);
                list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
                                         res_hashchain) {
-                        list_del(&r->res_hashchain);
+                        if (dlm_no_directory(ls) || !is_master(r)) {
-                        free_rsb(r);
+                                list_del(&r->res_hashchain);
+                                dlm_free_rsb(r);
+                        }
                }
                write_unlock(&ls->ls_rsbtbl[i].lock);
        }
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 4b89e20eebe7..997f9531d594 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        dlm_astd_resume();
        /*
-         * This list of root rsb's will be the basis of most of the recovery
+         * Free non-master tossed rsb's.  Master rsb's are kept on toss
-         * routines.
+         * list and put on root list to be included in resdir recovery.
         */
-        dlm_create_root_list(ls);
+        dlm_clear_toss_list(ls);
        /*
-         * Free all the tossed rsb's so we don't have to recover them.
+         * This list of root rsb's will be the basis of most of the recovery
+         * routines.
         */
-        dlm_clear_toss_list(ls);
+        dlm_create_root_list(ls);
        /*
         * Add or remove nodes from the lockspace's ls_nodes list.
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 4f741546f4bb..7cbc6826239b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,8 +24,7 @@
 #include "lvb_table.h"
 #include "user.h"
-static const char *name_prefix="dlm";
+static const char name_prefix[] = "dlm";
-static struct miscdevice ctl_device;
 static const struct file_operations device_fops;
 #ifdef CONFIG_COMPAT
@@ -82,7 +81,8 @@ struct dlm_lock_result32 {
 };
 static void compat_input(struct dlm_write_request *kb,
-                         struct dlm_write_request32 *kb32)
+                         struct dlm_write_request32 *kb32,
+                         int max_namelen)
 {
        kb->version[0] = kb32->version[0];
        kb->version[1] = kb32->version[1];
@@ -112,7 +112,11 @@ static void compat_input(struct dlm_write_request *kb,
                kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
                kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
                memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
-                memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+                if (kb->i.lock.namelen <= max_namelen)
+                        memcpy(kb->i.lock.name, kb32->i.lock.name,
+                               kb->i.lock.namelen);
+                else
+                        kb->i.lock.namelen = max_namelen;
        }
 }
@@ -236,12 +240,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
        spin_unlock(&proc->asts_spin);
        if (eol) {
-                spin_lock(&ua->proc->locks_spin);
+                spin_lock(&proc->locks_spin);
                if (!list_empty(&lkb->lkb_ownqueue)) {
                        list_del_init(&lkb->lkb_ownqueue);
                        dlm_put_lkb(lkb);
                }
-                spin_unlock(&ua->proc->locks_spin);
+                spin_unlock(&proc->locks_spin);
        }
 out:
        mutex_unlock(&ls->ls_clear_proc_locks);
@@ -529,7 +533,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
                if (proc)
                        set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
-                compat_input(kbuf, k32buf);
+                compat_input(kbuf, k32buf,
+                             count - sizeof(struct dlm_write_request32));
                kfree(k32buf);
        }
 #endif
@@ -896,14 +901,16 @@ static const struct file_operations ctl_device_fops = {
        .owner   = THIS_MODULE,
 };
+static struct miscdevice ctl_device = {
+        .name  = "dlm-control",
+        .fops  = &ctl_device_fops,
+        .minor = MISC_DYNAMIC_MINOR,
+};
 int dlm_user_init(void)
 {
        int error;
-        ctl_device.name = "dlm-control";
-        ctl_device.fops = &ctl_device_fops;
-        ctl_device.minor = MISC_DYNAMIC_MINOR;
        error = misc_register(&ctl_device);
        if (error)
                log_print("misc_register failed for control device");
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 963889cf6740..4d9c1f4e1bd1 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,14 @@
 #include "rcom.h"
 #include "util.h"
+#define DLM_ERRNO_EDEADLK               35
+#define DLM_ERRNO_EBADR                 53
+#define DLM_ERRNO_EBADSLT               57
+#define DLM_ERRNO_EPROTO                71
+#define DLM_ERRNO_EOPNOTSUPP            95
+#define DLM_ERRNO_ETIMEDOUT            110
+#define DLM_ERRNO_EINPROGRESS          115
 static void header_out(struct dlm_header *hd)
 {
        hd->h_version           = cpu_to_le32(hd->h_version);
@@ -30,11 +38,54 @@ static void header_in(struct dlm_header *hd)
        hd->h_length            = le16_to_cpu(hd->h_length);
 }
-void dlm_message_out(struct dlm_message *ms)
+/* higher errno values are inconsistent across architectures, so select
+   one set of values for on the wire */
+static int to_dlm_errno(int err)
+{
+        switch (err) {
+        case -EDEADLK:
+                return -DLM_ERRNO_EDEADLK;
+        case -EBADR:
+                return -DLM_ERRNO_EBADR;
+        case -EBADSLT:
+                return -DLM_ERRNO_EBADSLT;
+        case -EPROTO:
+                return -DLM_ERRNO_EPROTO;
+        case -EOPNOTSUPP:
+                return -DLM_ERRNO_EOPNOTSUPP;
+        case -ETIMEDOUT:
+                return -DLM_ERRNO_ETIMEDOUT;
+        case -EINPROGRESS:
+                return -DLM_ERRNO_EINPROGRESS;
+        }
+        return err;
+}
+static int from_dlm_errno(int err)
 {
-        struct dlm_header *hd = (struct dlm_header *) ms;
+        switch (err) {
+        case -DLM_ERRNO_EDEADLK:
+                return -EDEADLK;
+        case -DLM_ERRNO_EBADR:
+                return -EBADR;
+        case -DLM_ERRNO_EBADSLT:
+                return -EBADSLT;
+        case -DLM_ERRNO_EPROTO:
+                return -EPROTO;
+        case -DLM_ERRNO_EOPNOTSUPP:
+                return -EOPNOTSUPP;
+        case -DLM_ERRNO_ETIMEDOUT:
+                return -ETIMEDOUT;
+        case -DLM_ERRNO_EINPROGRESS:
+                return -EINPROGRESS;
+        }
+        return err;
+}
-        header_out(hd);
+void dlm_message_out(struct dlm_message *ms)
+{
+        header_out(&ms->m_header);
        ms->m_type              = cpu_to_le32(ms->m_type);
        ms->m_nodeid            = cpu_to_le32(ms->m_nodeid);
@@ -53,14 +104,12 @@ void dlm_message_out(struct dlm_message *ms)
        ms->m_rqmode            = cpu_to_le32(ms->m_rqmode);
        ms->m_bastmode          = cpu_to_le32(ms->m_bastmode);
        ms->m_asts              = cpu_to_le32(ms->m_asts);
-        ms->m_result            = cpu_to_le32(ms->m_result);
+        ms->m_result            = cpu_to_le32(to_dlm_errno(ms->m_result));
 }
 void dlm_message_in(struct dlm_message *ms)
 {
-        struct dlm_header *hd = (struct dlm_header *) ms;
+        header_in(&ms->m_header);
-        header_in(hd);
        ms->m_type              = le32_to_cpu(ms->m_type);
        ms->m_nodeid            = le32_to_cpu(ms->m_nodeid);
@@ -79,7 +128,7 @@ void dlm_message_in(struct dlm_message *ms)
        ms->m_rqmode            = le32_to_cpu(ms->m_rqmode);
        ms->m_bastmode          = le32_to_cpu(ms->m_bastmode);
        ms->m_asts              = le32_to_cpu(ms->m_asts);
-        ms->m_result            = le32_to_cpu(ms->m_result);
+        ms->m_result            = from_dlm_errno(le32_to_cpu(ms->m_result));
 }
 static void rcom_lock_out(struct rcom_lock *rl)
@@ -126,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf)
 void dlm_rcom_out(struct dlm_rcom *rc)
 {
-        struct dlm_header *hd = (struct dlm_header *) rc;
        int type = rc->rc_type;
-        header_out(hd);
+        header_out(&rc->rc_header);
        rc->rc_type             = cpu_to_le32(rc->rc_type);
        rc->rc_result           = cpu_to_le32(rc->rc_result);
@@ -137,7 +185,7 @@ void dlm_rcom_out(struct dlm_rcom *rc)
        rc->rc_seq              = cpu_to_le64(rc->rc_seq);
        rc->rc_seq_reply        = cpu_to_le64(rc->rc_seq_reply);
-        if (type == DLM_RCOM_LOCK)
+        if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
                rcom_lock_out((struct rcom_lock *) rc->rc_buf);
        else if (type == DLM_RCOM_STATUS_REPLY)
@@ -146,9 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc)
 void dlm_rcom_in(struct dlm_rcom *rc)
 {
-        struct dlm_header *hd = (struct dlm_header *) rc;
+        int type;
-        header_in(hd);
+        header_in(&rc->rc_header);
        rc->rc_type             = le32_to_cpu(rc->rc_type);
        rc->rc_result           = le32_to_cpu(rc->rc_result);
@@ -156,10 +204,12 @@ void dlm_rcom_in(struct dlm_rcom *rc)
        rc->rc_seq              = le64_to_cpu(rc->rc_seq);
        rc->rc_seq_reply        = le64_to_cpu(rc->rc_seq_reply);
-        if (rc->rc_type == DLM_RCOM_LOCK)
+        type = rc->rc_type;
+        if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
                rcom_lock_in((struct rcom_lock *) rc->rc_buf);
-        else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
+        else if (type == DLM_RCOM_STATUS_REPLY)
                rcom_config_in((struct rcom_config *) rc->rc_buf);
 }
diff --git a/fs/dquot.c b/fs/dquot.c
index 2809768d9c41..cee7c6f428f0 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -827,6 +827,18 @@ static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 }
+static int warning_issued(struct dquot *dquot, const int warntype)
+{
+        int flag = (warntype == QUOTA_NL_BHARDWARN ||
+                warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B :
+                ((warntype == QUOTA_NL_IHARDWARN ||
+                warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0);
+        if (!flag)
+                return 0;
+        return test_and_set_bit(flag, &dquot->dq_flags);
+}
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 static int flag_print_warnings = 1;
@@ -845,16 +857,12 @@ static inline int need_print_warning(struct dquot *dquot)
 }
 /* Print warning to user which exceeded quota */
-static void print_warning(struct dquot *dquot, const char warntype)
+static void print_warning(struct dquot *dquot, const int warntype)
 {
        char *msg = NULL;
        struct tty_struct *tty;
-        int flag = (warntype == QUOTA_NL_BHARDWARN ||
-                warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B :
-                ((warntype == QUOTA_NL_IHARDWARN ||
-                warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0);
-        if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags)))
+        if (!need_print_warning(dquot))
                return;
        mutex_lock(&tty_mutex);
@@ -895,9 +903,6 @@ out_lock:
 #ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-/* Size of quota netlink message - actually an upperbound for buffer size */
-#define QUOTA_NL_MSG_SIZE 32
 /* Netlink family structure for quota */
 static struct genl_family quota_genl_family = {
        .id = GENL_ID_GENERATE,
@@ -914,11 +919,13 @@ static void send_warning(const struct dquot *dquot, const char warntype)
        struct sk_buff *skb;
        void *msg_head;
        int ret;
+        int msg_size = 4 * nla_total_size(sizeof(u32)) +
+                       2 * nla_total_size(sizeof(u64));
        /* We have to allocate using GFP_NOFS as we are called from a
         * filesystem performing write and thus further recursion into
         * the fs to free some data could cause deadlocks. */
-        skb = genlmsg_new(QUOTA_NL_MSG_SIZE, GFP_NOFS);
+        skb = genlmsg_new(msg_size, GFP_NOFS);
        if (!skb) {
                printk(KERN_ERR
                  "VFS: Not enough memory to send quota warning.\n");
@@ -959,18 +966,19 @@ static void send_warning(const struct dquot *dquot, const char warntype)
                        "VFS: Failed to send notification message: %d\n", ret);
        return;
 attr_err_out:
-        printk(KERN_ERR "VFS: Failed to compose quota message: %d\n", ret);
+        printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
 err_out:
        kfree_skb(skb);
 }
 #endif
-static inline void flush_warnings(struct dquot **dquots, char *warntype)
+static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
 {
        int i;
        for (i = 0; i < MAXQUOTAS; i++)
-                if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN) {
+                if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN &&
+                    !warning_issued(dquots[i], warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
                        print_warning(dquots[i], warntype[i]);
 #endif
@@ -1216,7 +1224,7 @@ warn_put_all:
                for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                        if (inode->i_dquot[cnt])
                                mark_dquot_dirty(inode->i_dquot[cnt]);
-        flush_warnings((struct dquot **)inode->i_dquot, warntype);
+        flush_warnings(inode->i_dquot, warntype);
        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        return ret;
 }
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 9d70289f7df3..f8ef0af919e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -115,11 +115,29 @@ static int ecryptfs_calculate_md5(char *dst,
                }
                crypt_stat->hash_tfm = desc.tfm;
        }
-        crypto_hash_init(&desc);
+        rc = crypto_hash_init(&desc);
-        crypto_hash_update(&desc, &sg, len);
+        if (rc) {
-        crypto_hash_final(&desc, dst);
+                printk(KERN_ERR
-        mutex_unlock(&crypt_stat->cs_hash_tfm_mutex);
+                       "%s: Error initializing crypto hash; rc = [%d]\n",
+                       __FUNCTION__, rc);
+                goto out;
+        }
+        rc = crypto_hash_update(&desc, &sg, len);
+        if (rc) {
+                printk(KERN_ERR
+                       "%s: Error updating crypto hash; rc = [%d]\n",
+                       __FUNCTION__, rc);
+                goto out;
+        }
+        rc = crypto_hash_final(&desc, dst);
+        if (rc) {
+                printk(KERN_ERR
+                       "%s: Error finalizing crypto hash; rc = [%d]\n",
+                       __FUNCTION__, rc);
+                goto out;
+        }
 out:
+        mutex_unlock(&crypt_stat->cs_hash_tfm_mutex);
        return rc;
 }
@@ -504,7 +522,6 @@ int ecryptfs_encrypt_page(struct page *page)
                                        "\n", rc);
                        goto out;
                }
-                extent_offset++;
        }
 out:
        kfree(enc_extent_virt);
@@ -640,7 +657,6 @@ int ecryptfs_decrypt_page(struct page *page)
                               "rc = [%d]\n", __FUNCTION__, rc);
                        goto out;
                }
-                extent_offset++;
        }
 out:
        kfree(enc_extent_virt);
@@ -783,7 +799,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
        rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name,
                                                    crypt_stat->cipher, "cbc");
        if (rc)
-                goto out;
+                goto out_unlock;
        crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0,
                                                 CRYPTO_ALG_ASYNC);
        kfree(full_alg_name);
@@ -792,12 +808,12 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
                ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
                                "Error initializing cipher [%s]\n",
                                crypt_stat->cipher);
-                mutex_unlock(&crypt_stat->cs_tfm_mutex);
+                goto out_unlock;
-                goto out;
        }
        crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
-        mutex_unlock(&crypt_stat->cs_tfm_mutex);
        rc = 0;
+out_unlock:
+        mutex_unlock(&crypt_stat->cs_tfm_mutex);
 out:
        return rc;
 }
@@ -1831,6 +1847,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
        mutex_init(&tmp_tfm->key_tfm_mutex);
        strncpy(tmp_tfm->cipher_name, cipher_name,
                ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+        tmp_tfm->cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
        tmp_tfm->key_size = key_size;
        rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm,
                                         tmp_tfm->cipher_name,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0b1ab016fa2e..5a719180983c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -120,22 +120,9 @@ ecryptfs_do_create(struct inode *directory_inode,
        rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
                                             ecryptfs_dentry, mode, nd);
        if (rc) {
-                struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
+                printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
-                struct ecryptfs_inode_info *inode_info =
+                       "rc = [%d]\n", __FUNCTION__, rc);
-                        ecryptfs_inode_to_private(ecryptfs_inode);
+                goto out_lock;
-                printk(KERN_WARNING "%s: Error creating underlying file; "
-                       "rc = [%d]; checking for existing\n", __FUNCTION__, rc);
-                if (inode_info) {
-                        mutex_lock(&inode_info->lower_file_mutex);
-                        if (!inode_info->lower_file) {
-                                mutex_unlock(&inode_info->lower_file_mutex);
-                                printk(KERN_ERR "%s: Failure to set underlying "
-                                       "file; rc = [%d]\n", __FUNCTION__, rc);
-                                goto out_lock;
-                        }
-                        mutex_unlock(&inode_info->lower_file_mutex);
-                }
        }
        rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
                                directory_inode->i_sb, 0);
@@ -451,6 +438,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
        dentry->d_inode->i_nlink =
                ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
        dentry->d_inode->i_ctime = dir->i_ctime;
+        d_drop(dentry);
 out_unlock:
        unlock_parent(lower_dentry);
        return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 263fed88c0ca..f458c1f35565 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1860,7 +1860,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
        struct ecryptfs_global_auth_tok *new_auth_tok;
        int rc = 0;
-        new_auth_tok = kmem_cache_alloc(ecryptfs_global_auth_tok_cache,
+        new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache,
                                        GFP_KERNEL);
        if (!new_auth_tok) {
                rc = -ENOMEM;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index b83a512b7e08..0249aa4ae181 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -138,11 +138,14 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
                inode_info->lower_file = dentry_open(lower_dentry,
                                                     lower_mnt,
                                                     (O_RDWR | O_LARGEFILE));
-                if (IS_ERR(inode_info->lower_file))
+                if (IS_ERR(inode_info->lower_file)) {
+                        dget(lower_dentry);
+                        mntget(lower_mnt);
                        inode_info->lower_file = dentry_open(lower_dentry,
                                                             lower_mnt,
                                                             (O_RDONLY
                                                              | O_LARGEFILE));
+                }
                if (IS_ERR(inode_info->lower_file)) {
                        printk(KERN_ERR "Error opening lower persistent file "
                               "for lower_dentry [0x%p] and lower_mnt [0x%p]\n",
@@ -523,6 +526,7 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
        lower_mnt = nd.mnt;
        ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
        sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
+        sb->s_blocksize = lower_root->d_sb->s_blocksize;
        ecryptfs_set_dentry_lower(sb->s_root, lower_root);
        ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
        rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0);
@@ -730,127 +734,40 @@ static int ecryptfs_init_kmem_caches(void)
        return 0;
 }
-struct ecryptfs_obj {
+static struct kobject *ecryptfs_kobj;
-        char *name;
-        struct list_head slot_list;
-        struct kobject kobj;
-};
-struct ecryptfs_attribute {
-        struct attribute attr;
-        ssize_t(*show) (struct ecryptfs_obj *, char *);
-        ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
-};
-static ssize_t
-ecryptfs_attr_store(struct kobject *kobj,
-                    struct attribute *attr, const char *buf, size_t len)
-{
-        struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
-                                                kobj);
-        struct ecryptfs_attribute *attribute =
-                container_of(attr, struct ecryptfs_attribute, attr);
-        return (attribute->store ? attribute->store(obj, buf, len) : 0);
-}
-static ssize_t
+static ssize_t version_show(struct kobject *kobj,
-ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
+                            struct kobj_attribute *attr, char *buff)
 {
-        struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
+        return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
-                                                kobj);
-        struct ecryptfs_attribute *attribute =
-                container_of(attr, struct ecryptfs_attribute, attr);
-        return (attribute->show ? attribute->show(obj, buf) : 0);
 }
-static struct sysfs_ops ecryptfs_sysfs_ops = {
+static struct kobj_attribute version_attr = __ATTR_RO(version);
-        .show = ecryptfs_attr_show,
-        .store = ecryptfs_attr_store
-};
-static struct kobj_type ecryptfs_ktype = {
+static struct attribute *attributes[] = {
-        .sysfs_ops = &ecryptfs_sysfs_ops
+        &version_attr.attr,
+        NULL,
 };
-static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
+static struct attribute_group attr_group = {
+        .attrs = attributes,
-static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
-{
-        return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
-}
-static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
-static struct ecryptfs_version_str_map_elem {
-        u32 flag;
-        char *str;
-} ecryptfs_version_str_map[] = {
-        {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
-        {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
-        {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
-        {ECRYPTFS_VERSIONING_POLICY, "policy"},
-        {ECRYPTFS_VERSIONING_XATTR, "metadata in extended attribute"},
-        {ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"}
 };
-static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
-{
-        int i;
-        int remaining = PAGE_SIZE;
-        int total_written = 0;
-        buff[0] = '\0';
-        for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
-                int entry_size;
-                if (!(ECRYPTFS_VERSIONING_MASK
-                      & ecryptfs_version_str_map[i].flag))
-                        continue;
-                entry_size = strlen(ecryptfs_version_str_map[i].str);
-                if ((entry_size + 2) > remaining)
-                        goto out;
-                memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
-                buff[entry_size++] = '\n';
-                buff[entry_size] = '\0';
-                buff += entry_size;
-                total_written += entry_size;
-                remaining -= entry_size;
-        }
-out:
-        return total_written;
-}
-static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
 static int do_sysfs_registration(void)
 {
        int rc;
-        rc = subsystem_register(&ecryptfs_subsys);
+        ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj);
-        if (rc) {
+        if (!ecryptfs_kobj) {
-                printk(KERN_ERR
+                printk(KERN_ERR "Unable to create ecryptfs kset\n");
-                       "Unable to register ecryptfs sysfs subsystem\n");
+                rc = -ENOMEM;
-                goto out;
-        }
-        rc = sysfs_create_file(&ecryptfs_subsys.kobj,
-                               &sysfs_attr_version.attr);
-        if (rc) {
-                printk(KERN_ERR
-                       "Unable to create ecryptfs version attribute\n");
-                subsystem_unregister(&ecryptfs_subsys);
                goto out;
        }
-        rc = sysfs_create_file(&ecryptfs_subsys.kobj,
+        rc = sysfs_create_group(ecryptfs_kobj, &attr_group);
-                               &sysfs_attr_version_str.attr);
        if (rc) {
                printk(KERN_ERR
-                       "Unable to create ecryptfs version_str attribute\n");
+                       "Unable to create ecryptfs version attributes\n");
-                sysfs_remove_file(&ecryptfs_subsys.kobj,
+                kobject_put(ecryptfs_kobj);
-                                  &sysfs_attr_version.attr);
-                subsystem_unregister(&ecryptfs_subsys);
-                goto out;
        }
 out:
        return rc;
@@ -858,11 +775,8 @@ out:
 static void do_sysfs_unregistration(void)
 {
-        sysfs_remove_file(&ecryptfs_subsys.kobj,
+        sysfs_remove_group(ecryptfs_kobj, &attr_group);
-                          &sysfs_attr_version.attr);
+        kobject_put(ecryptfs_kobj);
-        sysfs_remove_file(&ecryptfs_subsys.kobj,
-                          &sysfs_attr_version_str.attr);
-        subsystem_unregister(&ecryptfs_subsys);
 }
 static int __init ecryptfs_init(void)
@@ -890,7 +804,6 @@ static int __init ecryptfs_init(void)
                printk(KERN_ERR "Failed to register filesystem\n");
                goto out_free_kmem_caches;
        }
-        kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
        rc = do_sysfs_registration();
        if (rc) {
                printk(KERN_ERR "sysfs registration failed\n");
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index a96d341d154d..9cc2aec27b0d 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -427,6 +427,7 @@ int ecryptfs_init_messaging(unsigned int transport)
        if (!ecryptfs_daemon_id_hash) {
                rc = -ENOMEM;
                ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n");
+                mutex_unlock(&ecryptfs_daemon_id_hash_mux);
                goto out;
        }
        for (i = 0; i < ecryptfs_hash_buckets; i++)
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 16a7a555f392..32c5711d79a3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -263,14 +263,13 @@ out:
        return 0;
 }
+/* This function must zero any hole we create */
 static int ecryptfs_prepare_write(struct file *file, struct page *page,
                                  unsigned from, unsigned to)
 {
        int rc = 0;
+        loff_t prev_page_end_size;
-        if (from == 0 && to == PAGE_CACHE_SIZE)
-                goto out;       /* If we are writing a full page, it will be
-                                   up to date. */
        if (!PageUptodate(page)) {
                rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
                                                      PAGE_CACHE_SIZE,
@@ -283,22 +282,32 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
                } else
                        SetPageUptodate(page);
        }
-        if (page->index != 0) {
-                loff_t end_of_prev_pg_pos =
-                        (((loff_t)page->index << PAGE_CACHE_SHIFT) - 1);
-                if (end_of_prev_pg_pos > i_size_read(page->mapping->host)) {
+        prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT);
+        /*
+         * If creating a page or more of holes, zero them out via truncate.
+         * Note, this will increase i_size.
+         */
+        if (page->index != 0) {
+                if (prev_page_end_size > i_size_read(page->mapping->host)) {
                        rc = ecryptfs_truncate(file->f_path.dentry,
-                                               end_of_prev_pg_pos);
+                                               prev_page_end_size);
                        if (rc) {
                                printk(KERN_ERR "Error on attempt to "
                                       "truncate to (higher) offset [%lld];"
-                                       " rc = [%d]\n", end_of_prev_pg_pos, rc);
+                                       " rc = [%d]\n", prev_page_end_size, rc);
                                goto out;
                        }
                }
-                if (end_of_prev_pg_pos + 1 > i_size_read(page->mapping->host))
+        }
-                        zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+        /*
+         * Writing to a new page, and creating a small hole from start of page?
+         * Zero it out.
+         */
+        if ((i_size_read(page->mapping->host) == prev_page_end_size) &&
+            (from != 0)) {
+                zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
        }
 out:
        return rc;
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index 9aa345121e09..f638a698dc52 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -237,7 +237,6 @@ out:
 */
 void ecryptfs_release_netlink(void)
 {
-        if (ecryptfs_nl_sock && ecryptfs_nl_sock->sk_socket)
+        netlink_kernel_release(ecryptfs_nl_sock);
-                sock_release(ecryptfs_nl_sock->sk_socket);
        ecryptfs_nl_sock = NULL;
 }
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 2150edf9a58e..948f57624c05 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -87,7 +87,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
        loff_t offset;
        int rc;
-        offset = ((((off_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
+        offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
                  + offset_in_page);
        virt = kmap(page_for_lower);
        rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
@@ -124,6 +124,10 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
        loff_t pos;
        int rc = 0;
+        /*
+         * if we are writing beyond current size, then start pos
+         * at the current size - we'll fill in zeros from there.
+         */
        if (offset > ecryptfs_file_size)
                pos = ecryptfs_file_size;
        else
@@ -137,6 +141,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
                if (num_bytes > total_remaining_bytes)
                        num_bytes = total_remaining_bytes;
                if (pos < offset) {
+                        /* remaining zeros to write, up to destination offset */
                        size_t total_remaining_zeros = (offset - pos);
                        if (num_bytes > total_remaining_zeros)
@@ -167,17 +172,27 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
                        }
                }
                ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
+                /*
+                 * pos: where we're now writing, offset: where the request was
+                 * If current pos is before request, we are filling zeros
+                 * If we are at or beyond request, we are writing the *data*
+                 * If we're in a fresh page beyond eof, zero it in either case
+                 */
+                if (pos < offset || !start_offset_in_page) {
+                        /* We are extending past the previous end of the file.
+                         * Fill in zero values to the end of the page */
+                        memset(((char *)ecryptfs_page_virt
+                                + start_offset_in_page), 0,
+                                PAGE_CACHE_SIZE - start_offset_in_page);
+                }
+                /* pos >= offset, we are now writing the data request */
                if (pos >= offset) {
                        memcpy(((char *)ecryptfs_page_virt
                                + start_offset_in_page),
                               (data + data_offset), num_bytes);
                        data_offset += num_bytes;
-                } else {
-                        /* We are extending past the previous end of the file.
-                         * Fill in zero values up to the start of where we
-                         * will be writing data. */
-                        memset(((char *)ecryptfs_page_virt
-                                + start_offset_in_page), 0, num_bytes);
                }
                kunmap_atomic(ecryptfs_page_virt, KM_USER0);
                flush_dcache_page(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f8cdab2bee3d..4859c4eecd65 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -86,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
                        fput(inode_info->lower_file);
                        inode_info->lower_file = NULL;
                        d_drop(lower_dentry);
-                        d_delete(lower_dentry);
                }
        }
        mutex_unlock(&inode_info->lower_file_mutex);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 34f68f3a069a..81c04abfb1aa 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -656,8 +656,7 @@ is_linked:
         * wait list.
         */
        if (waitqueue_active(&ep->wq))
-                __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                wake_up_locked(&ep->wq);
-                                 TASK_INTERRUPTIBLE);
        if (waitqueue_active(&ep->poll_wait))
                pwake++;
@@ -780,7 +779,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
-                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
+                        wake_up_locked(&ep->wq);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }
@@ -854,8 +853,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
                        /* Notify waiting tasks that events are available */
                        if (waitqueue_active(&ep->wq))
-                                __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                wake_up_locked(&ep->wq);
-                                                 TASK_INTERRUPTIBLE);
                        if (waitqueue_active(&ep->poll_wait))
                                pwake++;
                }
@@ -978,8 +976,7 @@ errxit:
                 * wait list (delayed after we release the lock).
                 */
                if (waitqueue_active(&ep->wq))
-                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                        wake_up_locked(&ep->wq);
-                                         TASK_INTERRUPTIBLE);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }
diff --git a/fs/exec.c b/fs/exec.c
index 2c942e2d14ea..282240afe99e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1692,7 +1692,10 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
        if (!binfmt || !binfmt->core_dump)
                goto fail;
        down_write(&mm->mmap_sem);
-        if (!get_dumpable(mm)) {
+        /*
+         * If another thread got here first, or we are not dumpable, bail out.
+         */
+        if (mm->core_waiters || !get_dumpable(mm)) {
                up_write(&mm->mmap_sem);
                goto fail;
        }
@@ -1706,7 +1709,6 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
                flag = O_EXCL;          /* Stop rewrite attacks */
                current->fsuid = 0;     /* Dump root private */
        }
-        set_dumpable(mm, 0);
        retval = coredump_wait(exit_code);
        if (retval < 0)
@@ -1778,6 +1780,12 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
           but keep the previous behaviour for now. */
        if (!ispipe && !S_ISREG(inode->i_mode))
                goto close_fail;
+        /*
+         * Don't allow local users to get cute and trick others to coredump
+         * into their pre-created files:
+         */
+        if (inode->i_uid != current->fsuid)
+                goto close_fail;
        if (!file->f_op)
                goto close_fail;
        if (!file->f_op->write)
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 18a42de25b55..377ad172d74b 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -69,14 +69,6 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
        return desc + offset;
 }
-static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
-{
-        return ext2_test_bit ((block -
-                le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block)) %
-                         EXT2_BLOCKS_PER_GROUP(sb), map);
-}
 /*
 * Read the bitmap for a given block_group, reading into the specified 
 * slot in the superblock's bitmap cache.
@@ -86,51 +78,20 @@ block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
 static struct buffer_head *
 read_block_bitmap(struct super_block *sb, unsigned int block_group)
 {
-        int i;
        struct ext2_group_desc * desc;
        struct buffer_head * bh = NULL;
-        unsigned int bitmap_blk;
+        
        desc = ext2_get_group_desc (sb, block_group, NULL);
        if (!desc)
-                return NULL;
+                goto error_out;
-        bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+        bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
-        bh = sb_bread(sb, bitmap_blk);
        if (!bh)
-                ext2_error (sb, __FUNCTION__,
+                ext2_error (sb, "read_block_bitmap",
                            "Cannot read block bitmap - "
                            "block_group = %d, block_bitmap = %u",
                            block_group, le32_to_cpu(desc->bg_block_bitmap));
-        /* check whether block bitmap block number is set */
-        if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-                /* bad block bitmap */
-                goto error_out;
-        }
-        /* check whether the inode bitmap block number is set */
-        bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
-        if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-                /* bad block bitmap */
-                goto error_out;
-        }
-        /* check whether the inode table block number is set */
-        bitmap_blk = le32_to_cpu(desc->bg_inode_table);
-        for (i = 0; i < EXT2_SB(sb)->s_itb_per_group; i++, bitmap_blk++) {
-                if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-                        /* bad block bitmap */
-                        goto error_out;
-                }
-        }
-        return bh;
 error_out:
-        brelse(bh);
+        return bh;
-        ext2_error(sb, __FUNCTION__,
-                        "Invalid block bitmap - "
-                        "block_group = %d, block = %u",
-                        block_group, bitmap_blk);
-        return NULL;
 }
 static void release_blocks(struct super_block *sb, int count)
@@ -1461,7 +1422,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
 #endif
 }
 static inline int test_root(int a, int b)
 {
        int num = b;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 7730388c4931..c87ae29c19cb 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -178,3 +178,10 @@ extern const struct inode_operations ext2_special_inode_operations;
 /* symlink.c */
 extern const struct inode_operations ext2_fast_symlink_inode_operations;
 extern const struct inode_operations ext2_symlink_inode_operations;
+static inline ext2_fsblk_t
+ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+        return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
+                le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
+}
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index c2324d5fe4ac..320b2cb3d4d2 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -47,6 +47,11 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                        flags &= ~EXT2_DIRSYNC_FL;
                mutex_lock(&inode->i_mutex);
+                /* Is it quota file? Do not allow user to mess with it */
+                if (IS_NOQUOTA(inode)) {
+                        mutex_unlock(&inode->i_mutex);
+                        return -EPERM;
+                }
                oldflags = ei->i_flags;
                /*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 154e25f13d77..6abaf75163f0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -680,11 +680,31 @@ static int ext2_check_descriptors (struct super_block * sb)
 static loff_t ext2_max_size(int bits)
 {
        loff_t res = EXT2_NDIR_BLOCKS;
-        /* This constant is calculated to be the largest file size for a
+        int meta_blocks;
-         * dense, 4k-blocksize file such that the total number of
+        loff_t upper_limit;
+        /* This is calculated to be the largest file size for a
+         * dense file such that the total number of
         * sectors in the file, including data and all indirect blocks,
-         * does not exceed 2^32. */
+         * does not exceed 2^32 -1
-        const loff_t upper_limit = 0x1ff7fffd000LL;
+         * __u32 i_blocks representing the total number of
+         * 512 bytes blocks of the file
+         */
+        upper_limit = (1LL << 32) - 1;
+        /* total blocks in file system block size */
+        upper_limit >>= (bits - 9);
+        /* indirect blocks */
+        meta_blocks = 1;
+        /* double indirect blocks */
+        meta_blocks += 1 + (1LL << (bits-2));
+        /* triple indirect blocks */
+        meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+        upper_limit -= meta_blocks;
+        upper_limit <<= bits;
        res += 1LL << (bits-2);
        res += 1LL << (2*(bits-2));
@@ -692,6 +712,10 @@ static loff_t ext2_max_size(int bits)
        res <<= bits;
        if (res > upper_limit)
                res = upper_limit;
+        if (res > MAX_LFS_FILESIZE)
+                res = MAX_LFS_FILESIZE;
        return res;
 }
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 7a87d15523be..a8ba7e831278 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -80,14 +80,6 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
        return desc + offset;
 }
-static inline int
-block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
-{
-        return ext3_test_bit ((block -
-                le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
-                         EXT3_BLOCKS_PER_GROUP(sb), map);
-}
 /**
 * read_block_bitmap()
 * @sb:                 super block
@@ -101,51 +93,20 @@ block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 static struct buffer_head *
 read_block_bitmap(struct super_block *sb, unsigned int block_group)
 {
-        int i;
        struct ext3_group_desc * desc;
        struct buffer_head * bh = NULL;
-        ext3_fsblk_t bitmap_blk;
        desc = ext3_get_group_desc (sb, block_group, NULL);
        if (!desc)
-                return NULL;
+                goto error_out;
-        bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+        bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
-        bh = sb_bread(sb, bitmap_blk);
        if (!bh)
-                ext3_error (sb, __FUNCTION__,
+                ext3_error (sb, "read_block_bitmap",
                            "Cannot read block bitmap - "
                            "block_group = %d, block_bitmap = %u",
                            block_group, le32_to_cpu(desc->bg_block_bitmap));
-        /* check whether block bitmap block number is set */
-        if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-                /* bad block bitmap */
-                goto error_out;
-        }
-        /* check whether the inode bitmap block number is set */
-        bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
-        if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-                /* bad block bitmap */
-                goto error_out;
-        }
-        /* check whether the inode table block number is set */
-        bitmap_blk = le32_to_cpu(desc->bg_inode_table);
-        for (i = 0; i < EXT3_SB(sb)->s_itb_per_group; i++, bitmap_blk++) {
-                if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-                        /* bad block bitmap */
-                        goto error_out;
-                }
-        }
-        return bh;
 error_out:
-        brelse(bh);
+        return bh;
-        ext3_error(sb, __FUNCTION__,
-                        "Invalid block bitmap - "
-                        "block_group = %d, block = %lu",
-                        block_group, bitmap_blk);
-        return NULL;
 }
 /*
 * The reservation window structure operations
@@ -1772,7 +1733,6 @@ ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 #endif
 }
 static inline int test_root(int a, int b)
 {
        int num = b;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index c8e4ee3af1d0..8ca3bfd72427 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -67,7 +67,7 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
                          unsigned long offset)
 {
        const char * error_msg = NULL;
-        const int rlen = le16_to_cpu(de->rec_len);
+        const int rlen = ext3_rec_len_from_disk(de->rec_len);
        if (rlen < EXT3_DIR_REC_LEN(1))
                error_msg = "rec_len is smaller than minimal";
@@ -173,10 +173,10 @@ revalidate:
                                 * least that it is non-zero.  A
                                 * failure will be detected in the
                                 * dirent test below. */
-                                if (le16_to_cpu(de->rec_len) <
+                                if (ext3_rec_len_from_disk(de->rec_len) <
                                                EXT3_DIR_REC_LEN(1))
                                        break;
-                                i += le16_to_cpu(de->rec_len);
+                                i += ext3_rec_len_from_disk(de->rec_len);
                        }
                        offset = i;
                        filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -197,7 +197,7 @@ revalidate:
                                ret = stored;
                                goto out;
                        }
-                        offset += le16_to_cpu(de->rec_len);
+                        offset += ext3_rec_len_from_disk(de->rec_len);
                        if (le32_to_cpu(de->inode)) {
                                /* We might block in the next section
                                 * if the data destination is
@@ -219,7 +219,7 @@ revalidate:
                                        goto revalidate;
                                stored ++;
                        }
-                        filp->f_pos += le16_to_cpu(de->rec_len);
+                        filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
                }
                offset = 0;
                brelse (bh);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 4a2a02c95bf9..023a070f55f1 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -51,6 +51,11 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                        flags &= ~EXT3_DIRSYNC_FL;
                mutex_lock(&inode->i_mutex);
+                /* Is it a quota file? Do not allow user to mess with it */
+                if (IS_NOQUOTA(inode)) {
+                        mutex_unlock(&inode->i_mutex);
+                        return -EPERM;
+                }
                oldflags = ei->i_flags;
                /* The JOURNAL_DATA flag is modifiable only by root */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ec8170adac53..4ab6f76e63d0 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -177,6 +177,16 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
 /*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext3_dir_entry_2 *
+ext3_next_entry(struct ext3_dir_entry_2 *p)
+{
+        return (struct ext3_dir_entry_2 *)((char *)p +
+                ext3_rec_len_from_disk(p->rec_len));
+}
+/*
 * Future: use high four bits of block for coalesce-on-delete flags
 * Mask them off for now.
 */
@@ -280,7 +290,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
                        space += EXT3_DIR_REC_LEN(de->name_len);
                        names++;
                }
-                de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+                de = ext3_next_entry(de);
        }
        printk("(%i)\n", names);
        return (struct stats) { names, space, 1 };
@@ -547,14 +557,6 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 /*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
-{
-        return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
-}
-/*
 * This function fills a red-black tree with information from a
 * directory block.  It returns the number directory entries loaded
 * into the tree.  If there is an error it is returned in err.
@@ -720,7 +722,7 @@ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
                        cond_resched();
                }
                /* XXX: do we need to check rec_len == 0 case? -Chris */
-                de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+                de = ext3_next_entry(de);
        }
        return count;
 }
@@ -822,7 +824,7 @@ static inline int search_dirblock(struct buffer_head * bh,
                        return 1;
                }
                /* prevent looping on a bad block */
-                de_len = le16_to_cpu(de->rec_len);
+                de_len = ext3_rec_len_from_disk(de->rec_len);
                if (de_len <= 0)
                        return -1;
                offset += de_len;
@@ -1130,7 +1132,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
                rec_len = EXT3_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
                ((struct ext3_dir_entry_2 *) to)->rec_len =
-                                cpu_to_le16(rec_len);
+                                ext3_rec_len_to_disk(rec_len);
                de->inode = 0;
                map++;
                to += rec_len;
@@ -1149,13 +1151,12 @@ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
        prev = to = de;
        while ((char*)de < base + size) {
-                next = (struct ext3_dir_entry_2 *) ((char *) de +
+                next = ext3_next_entry(de);
-                                                    le16_to_cpu(de->rec_len));
                if (de->inode && de->name_len) {
                        rec_len = EXT3_DIR_REC_LEN(de->name_len);
                        if (de > to)
                                memmove(to, de, rec_len);
-                        to->rec_len = cpu_to_le16(rec_len);
+                        to->rec_len = ext3_rec_len_to_disk(rec_len);
                        prev = to;
                        to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
                }
@@ -1229,8 +1230,8 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        /* Fancy dance to stay within two buffers */
        de2 = dx_move_dirents(data1, data2, map + split, count - split);
        de = dx_pack_dirents(data1,blocksize);
-        de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+        de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
-        de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+        de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
        dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
@@ -1300,7 +1301,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                                return -EEXIST;
                        }
                        nlen = EXT3_DIR_REC_LEN(de->name_len);
-                        rlen = le16_to_cpu(de->rec_len);
+                        rlen = ext3_rec_len_from_disk(de->rec_len);
                        if ((de->inode? rlen - nlen: rlen) >= reclen)
                                break;
                        de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
@@ -1319,11 +1320,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        /* By now the buffer is marked for journaling */
        nlen = EXT3_DIR_REC_LEN(de->name_len);
-        rlen = le16_to_cpu(de->rec_len);
+        rlen = ext3_rec_len_from_disk(de->rec_len);
        if (de->inode) {
                struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
-                de1->rec_len = cpu_to_le16(rlen - nlen);
+                de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
-                de->rec_len = cpu_to_le16(nlen);
+                de->rec_len = ext3_rec_len_to_disk(nlen);
                de = de1;
        }
        de->file_type = EXT3_FT_UNKNOWN;
@@ -1400,17 +1401,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        /* The 0th block becomes the root, move the dirents out */
        fde = &root->dotdot;
-        de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
+        de = (struct ext3_dir_entry_2 *)((char *)fde +
+                        ext3_rec_len_from_disk(fde->rec_len));
        len = ((char *) root) + blocksize - (char *) de;
        memcpy (data1, de, len);
        de = (struct ext3_dir_entry_2 *) data1;
        top = data1 + len;
-        while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
+        while ((char *)(de2 = ext3_next_entry(de)) < top)
                de = de2;
-        de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+        de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
        /* Initialize the root; the dot dirents already exist */
        de = (struct ext3_dir_entry_2 *) (&root->dotdot);
-        de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
+        de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
        memset (&root->info, 0, sizeof(root->info));
        root->info.info_length = sizeof(root->info);
        root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
@@ -1490,7 +1492,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
                return retval;
        de = (struct ext3_dir_entry_2 *) bh->b_data;
        de->inode = 0;
-        de->rec_len = cpu_to_le16(blocksize);
+        de->rec_len = ext3_rec_len_to_disk(blocksize);
        return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
@@ -1553,7 +1555,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        goto cleanup;
                node2 = (struct dx_node *)(bh2->b_data);
                entries2 = node2->entries;
-                node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+                node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
                node2->fake.inode = 0;
                BUFFER_TRACE(frame->bh, "get_write_access");
                err = ext3_journal_get_write_access(handle, frame->bh);
@@ -1651,9 +1653,9 @@ static int ext3_delete_entry (handle_t *handle,
                        BUFFER_TRACE(bh, "get_write_access");
                        ext3_journal_get_write_access(handle, bh);
                        if (pde)
-                                pde->rec_len =
+                                pde->rec_len = ext3_rec_len_to_disk(
-                                        cpu_to_le16(le16_to_cpu(pde->rec_len) +
+                                        ext3_rec_len_from_disk(pde->rec_len) +
-                                                    le16_to_cpu(de->rec_len));
+                                        ext3_rec_len_from_disk(de->rec_len));
                        else
                                de->inode = 0;
                        dir->i_version++;
@@ -1661,10 +1663,9 @@ static int ext3_delete_entry (handle_t *handle,
                        ext3_journal_dirty_metadata(handle, bh);
                        return 0;
                }
-                i += le16_to_cpu(de->rec_len);
+                i += ext3_rec_len_from_disk(de->rec_len);
                pde = de;
-                de = (struct ext3_dir_entry_2 *)
+                de = ext3_next_entry(de);
-                        ((char *) de + le16_to_cpu(de->rec_len));
        }
        return -ENOENT;
 }
@@ -1798,13 +1799,13 @@ retry:
        de = (struct ext3_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
-        de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
+        de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
        strcpy (de->name, ".");
        ext3_set_de_type(dir->i_sb, de, S_IFDIR);
-        de = (struct ext3_dir_entry_2 *)
+        de = ext3_next_entry(de);
-                        ((char *) de + le16_to_cpu(de->rec_len));
        de->inode = cpu_to_le32(dir->i_ino);
-        de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
+        de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
+                                        EXT3_DIR_REC_LEN(1));
        de->name_len = 2;
        strcpy (de->name, "..");
        ext3_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1856,8 +1857,7 @@ static int empty_dir (struct inode * inode)
                return 1;
        }
        de = (struct ext3_dir_entry_2 *) bh->b_data;
-        de1 = (struct ext3_dir_entry_2 *)
+        de1 = ext3_next_entry(de);
-                        ((char *) de + le16_to_cpu(de->rec_len));
        if (le32_to_cpu(de->inode) != inode->i_ino ||
                        !le32_to_cpu(de1->inode) ||
                        strcmp (".", de->name) ||
@@ -1868,9 +1868,9 @@ static int empty_dir (struct inode * inode)
                brelse (bh);
                return 1;
        }
-        offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+        offset = ext3_rec_len_from_disk(de->rec_len) +
-        de = (struct ext3_dir_entry_2 *)
+                        ext3_rec_len_from_disk(de1->rec_len);
-                        ((char *) de1 + le16_to_cpu(de1->rec_len));
+        de = ext3_next_entry(de1);
        while (offset < inode->i_size ) {
                if (!bh ||
                        (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1899,9 +1899,8 @@ static int empty_dir (struct inode * inode)
                        brelse (bh);
                        return 0;
                }
-                offset += le16_to_cpu(de->rec_len);
+                offset += ext3_rec_len_from_disk(de->rec_len);
-                de = (struct ext3_dir_entry_2 *)
+                de = ext3_next_entry(de);
-                                ((char *) de + le16_to_cpu(de->rec_len));
        }
        brelse (bh);
        return 1;
@@ -2255,8 +2254,7 @@ retry:
 }
 #define PARENT_INO(buffer) \
-        ((struct ext3_dir_entry_2 *) ((char *) buffer + \
+        (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
-        le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode
 /*
 * Anybody can rename anything with this: the permission checks are left to the
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index de55da9e28ba..f3675cc630e9 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1436,11 +1436,31 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 static loff_t ext3_max_size(int bits)
 {
        loff_t res = EXT3_NDIR_BLOCKS;
-        /* This constant is calculated to be the largest file size for a
+        int meta_blocks;
-         * dense, 4k-blocksize file such that the total number of
+        loff_t upper_limit;
+        /* This is calculated to be the largest file size for a
+         * dense file such that the total number of
         * sectors in the file, including data and all indirect blocks,
-         * does not exceed 2^32. */
+         * does not exceed 2^32 -1
-        const loff_t upper_limit = 0x1ff7fffd000LL;
+         * (__u32 i_blocks represents the total number of
+         * 512-byte blocks of the file).
+         */
+        upper_limit = (1LL << 32) - 1;
+        /* total blocks in file system block size */
+        upper_limit >>= (bits - 9);
+        /* indirect blocks */
+        meta_blocks = 1;
+        /* double indirect blocks */
+        meta_blocks += 1 + (1LL << (bits-2));
+        /* triple indirect blocks */
+        meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+        upper_limit -= meta_blocks;
+        upper_limit <<= bits;
        res += 1LL << (bits-2);
        res += 1LL << (2*(bits-2));
@@ -1448,6 +1468,10 @@ static loff_t ext3_max_size(int bits)
        res <<= bits;
        if (res > upper_limit)
                res = upper_limit;
+        if (res > MAX_LFS_FILESIZE)
+                res = MAX_LFS_FILESIZE;
        return res;
 }
@@ -1676,7 +1700,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-        if (EXT3_INODE_SIZE(sb) == 0)
+        if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
                goto cantfind_ext3;
        sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ae6e7e502ac9..ac6fa8ca0a2f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
 ext4dev-y       := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
                   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-                   ext4_jbd2.o
+                   ext4_jbd2.o migrate.o mballoc.o
 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)      += xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)  += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e906b65448e2..ac75ea953d83 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -29,7 +29,7 @@
 * Calculate the block group number and offset, given a block number
 */
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-                unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
+                ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
 {
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        ext4_grpblk_t offset;
@@ -46,7 +46,7 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 /* Initializes an uninitialized block bitmap if given, and returns the
 * number of blocks free in the group. */
 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
-                                int block_group, struct ext4_group_desc *gdp)
+                 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
        unsigned long start;
        int bit, bit_max;
@@ -60,7 +60,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 * essentially implementing a per-group read-only flag. */
                if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
                        ext4_error(sb, __FUNCTION__,
-                                   "Checksum bad for group %u\n", block_group);
+                                  "Checksum bad for group %lu\n", block_group);
                        gdp->bg_free_blocks_count = 0;
                        gdp->bg_free_inodes_count = 0;
                        gdp->bg_itable_unused = 0;
@@ -153,7 +153,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 *                      group descriptor
 */
 struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
-                                             unsigned int block_group,
+                                             ext4_group_t block_group,
                                             struct buffer_head ** bh)
 {
        unsigned long group_desc;
@@ -164,7 +164,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
        if (block_group >= sbi->s_groups_count) {
                ext4_error (sb, "ext4_get_group_desc",
                            "block_group >= groups_count - "
-                            "block_group = %d, groups_count = %lu",
+                            "block_group = %lu, groups_count = %lu",
                            block_group, sbi->s_groups_count);
                return NULL;
@@ -176,7 +176,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
        if (!sbi->s_group_desc[group_desc]) {
                ext4_error (sb, "ext4_get_group_desc",
                            "Group descriptor not loaded - "
-                            "block_group = %d, group_desc = %lu, desc = %lu",
+                            "block_group = %lu, group_desc = %lu, desc = %lu",
                             block_group, group_desc, offset);
                return NULL;
        }
@@ -189,29 +189,71 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
        return desc;
 }
-static inline int
+static int ext4_valid_block_bitmap(struct super_block *sb,
-block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
+                                        struct ext4_group_desc *desc,
+                                        unsigned int block_group,
+                                        struct buffer_head *bh)
 {
        ext4_grpblk_t offset;
+        ext4_grpblk_t next_zero_bit;
+        ext4_fsblk_t bitmap_blk;
+        ext4_fsblk_t group_first_block;
-        ext4_get_group_no_and_offset(sb, block, NULL, &offset);
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-        return ext4_test_bit (offset, map);
+                /* with FLEX_BG, the inode/block bitmaps and itable
-}
+                 * blocks may not be in the group at all
+                 * so the bitmap validation will be skipped for those groups
+                 * or it has to also read the block group where the bitmaps
+                 * are located to verify they are set.
+                 */
+                return 1;
+        }
+        group_first_block = ext4_group_first_block_no(sb, block_group);
+        /* check whether block bitmap block number is set */
+        bitmap_blk = ext4_block_bitmap(sb, desc);
+        offset = bitmap_blk - group_first_block;
+        if (!ext4_test_bit(offset, bh->b_data))
+                /* bad block bitmap */
+                goto err_out;
+        /* check whether the inode bitmap block number is set */
+        bitmap_blk = ext4_inode_bitmap(sb, desc);
+        offset = bitmap_blk - group_first_block;
+        if (!ext4_test_bit(offset, bh->b_data))
+                /* bad block bitmap */
+                goto err_out;
+        /* check whether the inode table block number is set */
+        bitmap_blk = ext4_inode_table(sb, desc);
+        offset = bitmap_blk - group_first_block;
+        next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
+                                offset + EXT4_SB(sb)->s_itb_per_group,
+                                offset);
+        if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
+                /* good bitmap for inode tables */
+                return 1;
+err_out:
+        ext4_error(sb, __FUNCTION__,
+                        "Invalid block bitmap - "
+                        "block_group = %d, block = %llu",
+                        block_group, bitmap_blk);
+        return 0;
+}
 /**
 * read_block_bitmap()
 * @sb:                 super block
 * @block_group:        given block group
 *
- * Read the bitmap for a given block_group, reading into the specified
+ * Read the bitmap for a given block_group, and validate that the
- * slot in the superblock's bitmap cache.
+ * bits for block/inode/inode tables are set in the bitmaps
 *
 * Return buffer_head on success or NULL in case of failure.
 */
 struct buffer_head *
-read_block_bitmap(struct super_block *sb, unsigned int block_group)
+read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
-        int i;
        struct ext4_group_desc * desc;
        struct buffer_head * bh = NULL;
        ext4_fsblk_t bitmap_blk;
@@ -220,57 +262,37 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
        if (!desc)
                return NULL;
        bitmap_blk = ext4_block_bitmap(sb, desc);
-        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+        bh = sb_getblk(sb, bitmap_blk);
-                bh = sb_getblk(sb, bitmap_blk);
+        if (unlikely(!bh)) {
-                if (!buffer_uptodate(bh)) {
+                ext4_error(sb, __FUNCTION__,
-                        lock_buffer(bh);
-                        if (!buffer_uptodate(bh)) {
-                                ext4_init_block_bitmap(sb, bh, block_group,
-                                                       desc);
-                                set_buffer_uptodate(bh);
-                        }
-                        unlock_buffer(bh);
-                }
-        } else {
-                bh = sb_bread(sb, bitmap_blk);
-        }
-        if (!bh)
-                ext4_error (sb, __FUNCTION__,
                            "Cannot read block bitmap - "
                            "block_group = %d, block_bitmap = %llu",
-                            block_group, bitmap_blk);
+                            (int)block_group, (unsigned long long)bitmap_blk);
+                return NULL;
-        /* check whether block bitmap block number is set */
-        if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-                /* bad block bitmap */
-                goto error_out;
        }
+        if (bh_uptodate_or_lock(bh))
+                return bh;
-        /* check whether the inode bitmap block number is set */
+        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-        bitmap_blk = ext4_inode_bitmap(sb, desc);
+                ext4_init_block_bitmap(sb, bh, block_group, desc);
-        if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
+                set_buffer_uptodate(bh);
-                /* bad block bitmap */
+                unlock_buffer(bh);
-                goto error_out;
+                return bh;
        }
-        /* check whether the inode table block number is set */
+        if (bh_submit_read(bh) < 0) {
-        bitmap_blk = ext4_inode_table(sb, desc);
+                put_bh(bh);
-        for (i = 0; i < EXT4_SB(sb)->s_itb_per_group; i++, bitmap_blk++) {
+                ext4_error(sb, __FUNCTION__,
-                if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
+                            "Cannot read block bitmap - "
-                        /* bad block bitmap */
+                            "block_group = %d, block_bitmap = %llu",
-                        goto error_out;
+                            (int)block_group, (unsigned long long)bitmap_blk);
-                }
+                return NULL;
+        }
+        if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) {
+                put_bh(bh);
+                return NULL;
        }
        return bh;
-error_out:
-        brelse(bh);
-        ext4_error(sb, __FUNCTION__,
-                        "Invalid block bitmap - "
-                        "block_group = %d, block = %llu",
-                        block_group, bitmap_blk);
-        return NULL;
 }
 /*
 * The reservation window structure operations
@@ -361,7 +383,7 @@ restart:
 */
 static int
 goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
-                        unsigned int group, struct super_block * sb)
+                        ext4_group_t group, struct super_block *sb)
 {
        ext4_fsblk_t group_first_block, group_last_block;
@@ -504,7 +526,7 @@ static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
 * when setting the reservation window size through ioctl before the file
 * is open for write (needs block allocation).
 *
- * Needs truncate_mutex protection prior to call this function.
+ * Needs down_write(i_data_sem) protection prior to call this function.
 */
 void ext4_init_block_alloc_info(struct inode *inode)
 {
@@ -555,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
        struct ext4_reserve_window_node *rsv;
        spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
+        ext4_mb_discard_inode_preallocations(inode);
        if (!block_i)
                return;
@@ -581,7 +605,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 {
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gd_bh;
-        unsigned long block_group;
+        ext4_group_t block_group;
        ext4_grpblk_t bit;
        unsigned long i;
        unsigned long overflow;
@@ -628,11 +652,13 @@ do_more:
            in_range(ext4_inode_bitmap(sb, desc), block, count) ||
            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
            in_range(block + count - 1, ext4_inode_table(sb, desc),
-                     sbi->s_itb_per_group))
+                     sbi->s_itb_per_group)) {
                ext4_error (sb, "ext4_free_blocks",
                            "Freeing blocks in system zones - "
                            "Block = %llu, count = %lu",
                            block, count);
+                goto error_return;
+        }
        /*
         * We are about to start releasing blocks in the bitmap,
@@ -761,19 +787,29 @@ error_return:
 * @inode:              inode
 * @block:              start physical block to free
 * @count:              number of blocks to count
+ * @metadata:           Are these metadata blocks
 */
 void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t block, unsigned long count)
+                        ext4_fsblk_t block, unsigned long count,
+                        int metadata)
 {
        struct super_block * sb;
        unsigned long dquot_freed_blocks;
+        /* this isn't the right place to decide whether block is metadata
+         * inode.c/extents.c knows better, but for safety ... */
+        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+                        ext4_should_journal_data(inode))
+                metadata = 1;
        sb = inode->i_sb;
-        if (!sb) {
-                printk ("ext4_free_blocks: nonexistent device");
+        if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
-                return;
+                ext4_free_blocks_sb(handle, sb, block, count,
-        }
+                                                &dquot_freed_blocks);
-        ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+        else
+                ext4_mb_free_blocks(handle, inode, block, count,
+                                                metadata, &dquot_freed_blocks);
        if (dquot_freed_blocks)
                DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
        return;
@@ -961,9 +997,10 @@ claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
 * ext4_journal_release_buffer(), else we'll run out of credits.
 */
 static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
+ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
-                        struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
+                        ext4_group_t group, struct buffer_head *bitmap_bh,
-                        unsigned long *count, struct ext4_reserve_window *my_rsv)
+                        ext4_grpblk_t grp_goal, unsigned long *count,
+                        struct ext4_reserve_window *my_rsv)
 {
        ext4_fsblk_t group_first_block;
        ext4_grpblk_t start, end;
@@ -1197,7 +1234,7 @@ static int find_next_reservable_window(
 */
 static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
                ext4_grpblk_t grp_goal, struct super_block *sb,
-                unsigned int group, struct buffer_head *bitmap_bh)
+                ext4_group_t group, struct buffer_head *bitmap_bh)
 {
        struct ext4_reserve_window_node *search_head;
        ext4_fsblk_t group_first_block, group_end_block, start_block;
@@ -1395,7 +1432,7 @@ static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
 */
 static ext4_grpblk_t
 ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-                        unsigned int group, struct buffer_head *bitmap_bh,
+                        ext4_group_t group, struct buffer_head *bitmap_bh,
                        ext4_grpblk_t grp_goal,
                        struct ext4_reserve_window_node * my_rsv,
                        unsigned long *count, int *errp)
@@ -1551,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 /**
- * ext4_new_blocks() -- core block(s) allocation function
+ * ext4_new_blocks_old() -- core block(s) allocation function
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block(filesystem wide)
@@ -1564,17 +1601,17 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 * any specific goal block.
 *
 */
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gdp_bh;
-        unsigned long group_no;
+        ext4_group_t group_no;
-        int goal_group;
+        ext4_group_t goal_group;
        ext4_grpblk_t grp_target_blk;   /* blockgroup relative goal block */
        ext4_grpblk_t grp_alloc_blk;    /* blockgroup-relative allocated block*/
        ext4_fsblk_t ret_block;         /* filesyetem-wide allocated block */
-        int bgi;                        /* blockgroup iteration index */
+        ext4_group_t bgi;                       /* blockgroup iteration index */
        int fatal = 0, err;
        int performed_allocation = 0;
        ext4_grpblk_t free_blocks;      /* number of free blocks in a group */
@@ -1585,10 +1622,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
        struct ext4_reserve_window_node *my_rsv = NULL;
        struct ext4_block_alloc_info *block_i;
        unsigned short windowsz = 0;
-#ifdef EXT4FS_DEBUG
+        ext4_group_t ngroups;
-        static int goal_hits, goal_attempts;
-#endif
-        unsigned long ngroups;
        unsigned long num = *count;
        *errp = -ENOSPC;
@@ -1608,7 +1642,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
        sbi = EXT4_SB(sb);
        es = EXT4_SB(sb)->s_es;
-        ext4_debug("goal=%lu.\n", goal);
+        ext4_debug("goal=%llu.\n", goal);
        /*
         * Allocate a block from reservation only when
         * filesystem is mounted with reservation(default,-o reservation), and
@@ -1718,7 +1752,7 @@ retry_alloc:
 allocated:
-        ext4_debug("using block group %d(%d)\n",
+        ext4_debug("using block group %lu(%d)\n",
                        group_no, gdp->bg_free_blocks_count);
        BUFFER_TRACE(gdp_bh, "get_write_access");
@@ -1733,11 +1767,13 @@ allocated:
            in_range(ret_block, ext4_inode_table(sb, gdp),
                     EXT4_SB(sb)->s_itb_per_group) ||
            in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
-                     EXT4_SB(sb)->s_itb_per_group))
+                     EXT4_SB(sb)->s_itb_per_group)) {
                ext4_error(sb, "ext4_new_block",
                            "Allocating block in system zone - "
                            "blocks from %llu, length %lu",
                             ret_block, num);
+                goto out;
+        }
        performed_allocation = 1;
@@ -1784,9 +1820,6 @@ allocated:
         * list of some description.  We don't know in advance whether
         * the caller wants to use it as metadata or data.
         */
-        ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
-                        ret_block, goal_hits, goal_attempts);
        spin_lock(sb_bgl_lock(sbi, group_no));
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
@@ -1828,13 +1861,46 @@ out:
 }
 ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t goal, int *errp)
+                ext4_fsblk_t goal, int *errp)
+{
+        struct ext4_allocation_request ar;
+        ext4_fsblk_t ret;
+        if (!test_opt(inode->i_sb, MBALLOC)) {
+                unsigned long count = 1;
+                ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
+                return ret;
+        }
+        memset(&ar, 0, sizeof(ar));
+        ar.inode = inode;
+        ar.goal = goal;
+        ar.len = 1;
+        ret = ext4_mb_new_blocks(handle, &ar, errp);
+        return ret;
+}
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+                ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-        unsigned long count = 1;
+        struct ext4_allocation_request ar;
+        ext4_fsblk_t ret;
-        return ext4_new_blocks(handle, inode, goal, &count, errp);
+        if (!test_opt(inode->i_sb, MBALLOC)) {
+                ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
+                return ret;
+        }
+        memset(&ar, 0, sizeof(ar));
+        ar.inode = inode;
+        ar.goal = goal;
+        ar.len = *count;
+        ret = ext4_mb_new_blocks(handle, &ar, errp);
+        *count = ar.len;
+        return ret;
 }
 /**
 * ext4_count_free_blocks() -- count filesystem free blocks
 * @sb:         superblock
@@ -1845,8 +1911,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 {
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
-        int i;
+        ext4_group_t i;
-        unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
@@ -1870,14 +1936,14 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                        continue;
                x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-                printk("group %d: stored = %d, counted = %lu\n",
+                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
                        i, le16_to_cpu(gdp->bg_free_blocks_count), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
        printk("ext4_count_free_blocks: stored = %llu"
                ", computed = %llu, %llu\n",
-               EXT4_FREE_BLOCKS_COUNT(es),
+                ext4_free_blocks_count(es),
                desc_count, bitmap_count);
        return bitmap_count;
 #else
@@ -1894,7 +1960,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 #endif
 }
-static inline int test_root(int a, int b)
+static inline int test_root(ext4_group_t a, int b)
 {
        int num = b;
@@ -1903,7 +1969,7 @@ static inline int test_root(int a, int b)
        return num == a;
 }
-static int ext4_group_sparse(int group)
+static int ext4_group_sparse(ext4_group_t group)
 {
        if (group <= 1)
                return 1;
@@ -1921,7 +1987,7 @@ static int ext4_group_sparse(int group)
 *      Return the number of blocks used by the superblock (primary or backup)
 *      in this group.  Currently this will be only 0 or 1.
 */
-int ext4_bg_has_super(struct super_block *sb, int group)
+int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
 {
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1930,18 +1996,20 @@ int ext4_bg_has_super(struct super_block *sb, int group)
        return 1;
 }
-static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
+static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
+                                        ext4_group_t group)
 {
        unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
-        unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
+        ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
-        unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
+        ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
        if (group == first || group == first + 1 || group == last)
                return 1;
        return 0;
 }
-static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
+static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
+                                        ext4_group_t group)
 {
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1959,7 +2027,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
 *      (primary or backup) in this group.  In the future there may be a
 *      different number of descriptor blocks in each group.
 */
-unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
+unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 {
        unsigned long first_meta_bg =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f612bef98315..33888bb58144 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
                          unsigned long offset)
 {
        const char * error_msg = NULL;
-        const int rlen = le16_to_cpu(de->rec_len);
+        const int rlen = ext4_rec_len_from_disk(de->rec_len);
        if (rlen < EXT4_DIR_REC_LEN(1))
                error_msg = "rec_len is smaller than minimal";
@@ -124,7 +124,7 @@ static int ext4_readdir(struct file * filp,
        offset = filp->f_pos & (sb->s_blocksize - 1);
        while (!error && !stored && filp->f_pos < inode->i_size) {
-                unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+                ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
                struct buffer_head map_bh;
                struct buffer_head *bh = NULL;
@@ -172,10 +172,10 @@ revalidate:
                                 * least that it is non-zero.  A
                                 * failure will be detected in the
                                 * dirent test below. */
-                                if (le16_to_cpu(de->rec_len) <
+                                if (ext4_rec_len_from_disk(de->rec_len)
-                                                EXT4_DIR_REC_LEN(1))
+                                                < EXT4_DIR_REC_LEN(1))
                                        break;
-                                i += le16_to_cpu(de->rec_len);
+                                i += ext4_rec_len_from_disk(de->rec_len);
                        }
                        offset = i;
                        filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -197,7 +197,7 @@ revalidate:
                                ret = stored;
                                goto out;
                        }
-                        offset += le16_to_cpu(de->rec_len);
+                        offset += ext4_rec_len_from_disk(de->rec_len);
                        if (le32_to_cpu(de->inode)) {
                                /* We might block in the next section
                                 * if the data destination is
@@ -219,7 +219,7 @@ revalidate:
                                        goto revalidate;
                                stored ++;
                        }
-                        filp->f_pos += le16_to_cpu(de->rec_len);
+                        filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
                }
                offset = 0;
                brelse (bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 85287742f2ae..bc7081f1fbe8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -61,7 +61,7 @@ static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 * idx_pblock:
 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
 */
-static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
+ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
 {
        ext4_fsblk_t block;
@@ -75,7 +75,7 @@ static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
 * stores a large physical block number into an extent struct,
 * breaking it into parts
 */
-static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
 {
        ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
        ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
@@ -144,7 +144,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                              struct ext4_ext_path *path,
-                              ext4_fsblk_t block)
+                              ext4_lblk_t block)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_fsblk_t bg_start;
@@ -367,13 +367,14 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path)
 * the header must be checked before calling this
 */
 static void
-ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
+ext4_ext_binsearch_idx(struct inode *inode,
+                        struct ext4_ext_path *path, ext4_lblk_t block)
 {
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent_idx *r, *l, *m;
-        ext_debug("binsearch for %d(idx):  ", block);
+        ext_debug("binsearch for %u(idx):  ", block);
        l = EXT_FIRST_INDEX(eh) + 1;
        r = EXT_LAST_INDEX(eh);
@@ -425,7 +426,8 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc
 * the header must be checked before calling this
 */
 static void
-ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
+ext4_ext_binsearch(struct inode *inode,
+                struct ext4_ext_path *path, ext4_lblk_t block)
 {
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent *r, *l, *m;
@@ -438,7 +440,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
                return;
        }
-        ext_debug("binsearch for %d:  ", block);
+        ext_debug("binsearch for %u:  ", block);
        l = EXT_FIRST_EXTENT(eh) + 1;
        r = EXT_LAST_EXTENT(eh);
@@ -494,7 +496,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 }
 struct ext4_ext_path *
-ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
+ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
+                                        struct ext4_ext_path *path)
 {
        struct ext4_extent_header *eh;
        struct buffer_head *bh;
@@ -763,7 +766,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        while (k--) {
                oldblock = newblock;
                newblock = ablocks[--a];
-                bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
+                bh = sb_getblk(inode->i_sb, newblock);
                if (!bh) {
                        err = -EIO;
                        goto cleanup;
@@ -783,9 +786,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                fidx->ei_block = border;
                ext4_idx_store_pblock(fidx, oldblock);
-                ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
+                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
-                                newblock, (unsigned long) le32_to_cpu(border),
+                                i, newblock, le32_to_cpu(border), oldblock);
-                                oldblock);
                /* copy indexes */
                m = 0;
                path[i].p_idx++;
@@ -851,7 +853,7 @@ cleanup:
                for (i = 0; i < depth; i++) {
                        if (!ablocks[i])
                                continue;
-                        ext4_free_blocks(handle, inode, ablocks[i], 1);
+                        ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
                }
        }
        kfree(ablocks);
@@ -979,8 +981,8 @@ repeat:
                /* refill path */
                ext4_ext_drop_refs(path);
                path = ext4_ext_find_extent(inode,
-                                            le32_to_cpu(newext->ee_block),
+                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-                                            path);
+                                    path);
                if (IS_ERR(path))
                        err = PTR_ERR(path);
        } else {
@@ -992,8 +994,8 @@ repeat:
                /* refill path */
                ext4_ext_drop_refs(path);
                path = ext4_ext_find_extent(inode,
-                                            le32_to_cpu(newext->ee_block),
+                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-                                            path);
+                                    path);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        goto out;
@@ -1015,13 +1017,157 @@ out:
 }
 /*
+ * ext4_ext_search_left:
+ * Search for the closest allocated block to the left of *logical and
+ * return it at @logical, with its physical address at @phys.
+ * If nothing is found to the left (the path has depth 0 and holds no
+ * extent, or *logical lies before the extent in @path), @phys is set
+ * to 0 and *logical is left unchanged.
+ * @path must be non-NULL and *logical must not fall inside the extent
+ * held in @path; both conditions are enforced with BUG_ON().
+ * @inode is not referenced by this function.
+ * Returns 0; no path visible here returns an error code.
+ */
+int
+ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
+                        ext4_lblk_t *logical, ext4_fsblk_t *phys)
+{
+        struct ext4_extent_idx *ix;
+        struct ext4_extent *ex;
+        int depth, ee_len;
+        BUG_ON(path == NULL);
+        depth = path->p_depth;
+        *phys = 0;      /* default answer: nothing allocated to the left */
+        if (depth == 0 && path->p_ext == NULL)
+                return 0;
+        /* usually the extent in the path covers blocks smaller
+         * than *logical, but it can be that the extent is the
+         * first one in the file */
+        ex = path[depth].p_ext;
+        ee_len = ext4_ext_get_actual_len(ex);
+        if (*logical < le32_to_cpu(ex->ee_block)) {
+                BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+                while (--depth >= 0) {  /* sanity-check every index level */
+                        ix = path[depth].p_idx;
+                        BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+                }
+                return 0;       /* *phys is still 0 */
+        }
+        BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; /* last block of ex */
+        *phys = ext_pblock(ex) + ee_len - 1;
+        return 0;
+}
+/*
+ * ext4_ext_search_right:
+ * Search for the closest allocated block to the right of *logical and
+ * return it at @logical, with its physical address at @phys.
+ * If *logical is the largest allocated block (no extent and no index
+ * lies to the right, or the path has depth 0 and holds no extent),
+ * @phys is set to 0 and *logical is left unchanged.
+ * @path must be non-NULL and *logical must not fall inside the extent
+ * held in @path; both conditions are enforced with BUG_ON().
+ * Returns 0 on success, or -EIO if an extent-tree block cannot be read
+ * or its header fails ext4_ext_check_header().
+ */
+int
+ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
+                        ext4_lblk_t *logical, ext4_fsblk_t *phys)
+{
+        struct buffer_head *bh = NULL;
+        struct ext4_extent_header *eh;
+        struct ext4_extent_idx *ix;
+        struct ext4_extent *ex;
+        ext4_fsblk_t block;
+        int depth, ee_len;
+        BUG_ON(path == NULL);
+        depth = path->p_depth;
+        *phys = 0;      /* default answer: nothing allocated to the right */
+        if (depth == 0 && path->p_ext == NULL)
+                return 0;
+        /* usually the extent in the path covers blocks smaller
+         * than *logical, but it can be that the extent is the
+         * first one in the file */
+        ex = path[depth].p_ext;
+        ee_len = ext4_ext_get_actual_len(ex);
+        if (*logical < le32_to_cpu(ex->ee_block)) {
+                BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+                while (--depth >= 0) {  /* sanity-check every index level */
+                        ix = path[depth].p_idx;
+                        BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+                }
+                /* the extent in the path is itself the right neighbour */
+                *logical = le32_to_cpu(ex->ee_block);
+                *phys = ext_pblock(ex);
+                return 0;
+        }
+        BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
+                /* next allocated block in this leaf */
+                ex++;
+                *logical = le32_to_cpu(ex->ee_block);
+                *phys = ext_pblock(ex);
+                return 0;
+        }
+        /* go up and search for index to the right */
+        while (--depth >= 0) {
+                ix = path[depth].p_idx;
+                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
+                        break;
+        }
+        if (depth < 0) {
+                /* we've gone up to the root and
+                 * found no index to the right */
+                return 0;
+        }
+        /* we've found index to the right, let's
+         * follow it and find the closest allocated
+         * block to the right */
+        ix++;
+        block = idx_pblock(ix);
+        while (++depth < path->p_depth) {
+                bh = sb_bread(inode->i_sb, block);
+                if (bh == NULL)
+                        return -EIO;
+                eh = ext_block_hdr(bh);
+                /* depth counts levels down from the root, but the header
+                 * check wants the levels remaining below this block (the
+                 * leaf check after the loop uses the same form) */
+                if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+                        put_bh(bh);
+                        return -EIO;
+                }
+                /* keep to the leftmost child on the way down */
+                ix = EXT_FIRST_INDEX(eh);
+                block = idx_pblock(ix);
+                put_bh(bh);
+        }
+        /* depth == path->p_depth here: block is the leaf to the right */
+        bh = sb_bread(inode->i_sb, block);
+        if (bh == NULL)
+                return -EIO;
+        eh = ext_block_hdr(bh);
+        if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+                put_bh(bh);
+                return -EIO;
+        }
+        ex = EXT_FIRST_EXTENT(eh);
+        *logical = le32_to_cpu(ex->ee_block);
+        *phys = ext_pblock(ex);
+        put_bh(bh);
+        return 0;
+}
+/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
-static unsigned long
+static ext4_lblk_t
 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 {
        int depth;
@@ -1054,7 +1200,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCK
 */
-static unsigned ext4_ext_next_leaf_block(struct inode *inode,
+static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
                                        struct ext4_ext_path *path)
 {
        int depth;
@@ -1072,7 +1218,8 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode,
        while (depth >= 0) {
                if (path[depth].p_idx !=
                                EXT_LAST_INDEX(path[depth].p_hdr))
-                  return le32_to_cpu(path[depth].p_idx[1].ei_block);
+                        return (ext4_lblk_t)
+                                le32_to_cpu(path[depth].p_idx[1].ei_block);
                depth--;
        }
@@ -1085,7 +1232,7 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode,
 * then we have to correct all indexes above.
 * TODO: do we need to correct tree in all cases?
 */
-int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
+static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path *path)
 {
        struct ext4_extent_header *eh;
@@ -1171,7 +1318,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
        if (ext1_ee_len + ext2_ee_len > max_len)
                return 0;
 #ifdef AGGRESSIVE_TEST
-        if (le16_to_cpu(ex1->ee_len) >= 4)
+        if (ext1_ee_len >= 4)
                return 0;
 #endif
@@ -1239,7 +1386,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
                                    struct ext4_extent *newext,
                                    struct ext4_ext_path *path)
 {
-        unsigned long b1, b2;
+        ext4_lblk_t b1, b2;
        unsigned int depth, len1;
        unsigned int ret = 0;
@@ -1260,7 +1407,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
                        goto out;
        }
-        /* check for wrap through zero */
+        /* check for wrap through zero on extent logical start block*/
        if (b1 + len1 < b1) {
                len1 = EXT_MAX_BLOCK - b1;
                newext->ee_len = cpu_to_le16(len1);
@@ -1290,7 +1437,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        struct ext4_extent *ex, *fex;
        struct ext4_extent *nearex; /* nearest extent */
        struct ext4_ext_path *npath = NULL;
-        int depth, len, err, next;
+        int depth, len, err;
+        ext4_lblk_t next;
        unsigned uninitialized = 0;
        BUG_ON(ext4_ext_get_actual_len(newext) == 0);
@@ -1435,114 +1583,8 @@ cleanup:
        return err;
 }
-int ext4_ext_walk_space(struct inode *inode, unsigned long block,
-                        unsigned long num, ext_prepare_callback func,
-                        void *cbdata)
-{
-        struct ext4_ext_path *path = NULL;
-        struct ext4_ext_cache cbex;
-        struct ext4_extent *ex;
-        unsigned long next, start = 0, end = 0;
-        unsigned long last = block + num;
-        int depth, exists, err = 0;
-        BUG_ON(func == NULL);
-        BUG_ON(inode == NULL);
-        while (block < last && block != EXT_MAX_BLOCK) {
-                num = last - block;
-                /* find extent for this block */
-                path = ext4_ext_find_extent(inode, block, path);
-                if (IS_ERR(path)) {
-                        err = PTR_ERR(path);
-                        path = NULL;
-                        break;
-                }
-                depth = ext_depth(inode);
-                BUG_ON(path[depth].p_hdr == NULL);
-                ex = path[depth].p_ext;
-                next = ext4_ext_next_allocated_block(path);
-                exists = 0;
-                if (!ex) {
-                        /* there is no extent yet, so try to allocate
-                         * all requested space */
-                        start = block;
-                        end = block + num;
-                } else if (le32_to_cpu(ex->ee_block) > block) {
-                        /* need to allocate space before found extent */
-                        start = block;
-                        end = le32_to_cpu(ex->ee_block);
-                        if (block + num < end)
-                                end = block + num;
-                } else if (block >= le32_to_cpu(ex->ee_block)
-                                        + ext4_ext_get_actual_len(ex)) {
-                        /* need to allocate space after found extent */
-                        start = block;
-                        end = block + num;
-                        if (end >= next)
-                                end = next;
-                } else if (block >= le32_to_cpu(ex->ee_block)) {
-                        /*
-                         * some part of requested space is covered
-                         * by found extent
-                         */
-                        start = block;
-                        end = le32_to_cpu(ex->ee_block)
-                                + ext4_ext_get_actual_len(ex);
-                        if (block + num < end)
-                                end = block + num;
-                        exists = 1;
-                } else {
-                        BUG();
-                }
-                BUG_ON(end <= start);
-                if (!exists) {
-                        cbex.ec_block = start;
-                        cbex.ec_len = end - start;
-                        cbex.ec_start = 0;
-                        cbex.ec_type = EXT4_EXT_CACHE_GAP;
-                } else {
-                        cbex.ec_block = le32_to_cpu(ex->ee_block);
-                        cbex.ec_len = ext4_ext_get_actual_len(ex);
-                        cbex.ec_start = ext_pblock(ex);
-                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
-                }
-                BUG_ON(cbex.ec_len == 0);
-                err = func(inode, path, &cbex, cbdata);
-                ext4_ext_drop_refs(path);
-                if (err < 0)
-                        break;
-                if (err == EXT_REPEAT)
-                        continue;
-                else if (err == EXT_BREAK) {
-                        err = 0;
-                        break;
-                }
-                if (ext_depth(inode) != depth) {
-                        /* depth was changed. we have to realloc path */
-                        kfree(path);
-                        path = NULL;
-                }
-                block = cbex.ec_block + cbex.ec_len;
-        }
-        if (path) {
-                ext4_ext_drop_refs(path);
-                kfree(path);
-        }
-        return err;
-}
 static void
-ext4_ext_put_in_cache(struct inode *inode, __u32 block,
+ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
                        __u32 len, ext4_fsblk_t start, int type)
 {
        struct ext4_ext_cache *cex;
@@ -1561,10 +1603,11 @@ ext4_ext_put_in_cache(struct inode *inode, __u32 block,
 */
 static void
 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
-                                unsigned long block)
+                                ext4_lblk_t block)
 {
        int depth = ext_depth(inode);
-        unsigned long lblock, len;
+        unsigned long len;
+        ext4_lblk_t lblock;
        struct ext4_extent *ex;
        ex = path[depth].p_ext;
@@ -1576,32 +1619,34 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
        } else if (block < le32_to_cpu(ex->ee_block)) {
                lblock = block;
                len = le32_to_cpu(ex->ee_block) - block;
-                ext_debug("cache gap(before): %lu [%lu:%lu]",
+                ext_debug("cache gap(before): %u [%u:%u]",
-                                (unsigned long) block,
+                                block,
-                                (unsigned long) le32_to_cpu(ex->ee_block),
+                                le32_to_cpu(ex->ee_block),
-                                (unsigned long) ext4_ext_get_actual_len(ex));
+                                 ext4_ext_get_actual_len(ex));
        } else if (block >= le32_to_cpu(ex->ee_block)
                        + ext4_ext_get_actual_len(ex)) {
+                ext4_lblk_t next;
                lblock = le32_to_cpu(ex->ee_block)
                        + ext4_ext_get_actual_len(ex);
-                len = ext4_ext_next_allocated_block(path);
-                ext_debug("cache gap(after): [%lu:%lu] %lu",
+                next = ext4_ext_next_allocated_block(path);
-                                (unsigned long) le32_to_cpu(ex->ee_block),
+                ext_debug("cache gap(after): [%u:%u] %u",
-                                (unsigned long) ext4_ext_get_actual_len(ex),
+                                le32_to_cpu(ex->ee_block),
-                                (unsigned long) block);
+                                ext4_ext_get_actual_len(ex),
-                BUG_ON(len == lblock);
+                                block);
-                len = len - lblock;
+                BUG_ON(next == lblock);
+                len = next - lblock;
        } else {
                lblock = len = 0;
                BUG();
        }
-        ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
+        ext_debug(" -> %u:%lu\n", lblock, len);
        ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
 }
 static int
-ext4_ext_in_cache(struct inode *inode, unsigned long block,
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                        struct ext4_extent *ex)
 {
        struct ext4_ext_cache *cex;
@@ -1618,11 +1663,9 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block,
                ex->ee_block = cpu_to_le32(cex->ec_block);
                ext4_ext_store_pblock(ex, cex->ec_start);
                ex->ee_len = cpu_to_le16(cex->ec_len);
-                ext_debug("%lu cached by %lu:%lu:%llu\n",
+                ext_debug("%u cached by %u:%u:%llu\n",
-                                (unsigned long) block,
+                                block,
-                                (unsigned long) cex->ec_block,
+                                cex->ec_block, cex->ec_len, cex->ec_start);
-                                (unsigned long) cex->ec_len,
-                                cex->ec_start);
                return cex->ec_type;
        }
@@ -1636,7 +1679,7 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block,
 * It's used in truncate case only, thus all requests are for
 * last index in the block only.
 */
-int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
+static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path)
 {
        struct buffer_head *bh;
@@ -1657,7 +1700,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        ext_debug("index is empty, remove it, free block %llu\n", leaf);
        bh = sb_find_get_block(inode->i_sb, leaf);
        ext4_forget(handle, 1, inode, bh, leaf);
-        ext4_free_blocks(handle, inode, leaf, 1);
+        ext4_free_blocks(handle, inode, leaf, 1, 1);
        return err;
 }
@@ -1666,7 +1709,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 * This routine returns max. credits that the extent tree can consume.
 * It should be OK for low-performance paths like ->writepage()
 * To allow many writing processes to fit into a single transaction,
- * the caller should calculate credits under truncate_mutex and
+ * the caller should calculate credits under i_data_sem and
 * pass the actual path.
 */
 int ext4_ext_calc_credits_for_insert(struct inode *inode,
@@ -1714,12 +1757,14 @@ int ext4_ext_calc_credits_for_insert(struct inode *inode,
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_extent *ex,
-                                unsigned long from, unsigned long to)
+                                ext4_lblk_t from, ext4_lblk_t to)
 {
        struct buffer_head *bh;
        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
-        int i;
+        int i, metadata = 0;
+        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+                metadata = 1;
 #ifdef EXTENTS_STATS
        {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1738,42 +1783,45 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
        if (from >= le32_to_cpu(ex->ee_block)
            && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
                /* tail removal */
-                unsigned long num;
+                ext4_lblk_t num;
                ext4_fsblk_t start;
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
                start = ext_pblock(ex) + ee_len - num;
-                ext_debug("free last %lu blocks starting %llu\n", num, start);
+                ext_debug("free last %u blocks starting %llu\n", num, start);
                for (i = 0; i < num; i++) {
                        bh = sb_find_get_block(inode->i_sb, start + i);
                        ext4_forget(handle, 0, inode, bh, start + i);
                }
-                ext4_free_blocks(handle, inode, start, num);
+                ext4_free_blocks(handle, inode, start, num, metadata);
        } else if (from == le32_to_cpu(ex->ee_block)
                   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-                printk("strange request: removal %lu-%lu from %u:%u\n",
+                printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
                        from, to, le32_to_cpu(ex->ee_block), ee_len);
        } else {
-                printk("strange request: removal(2) %lu-%lu from %u:%u\n",
+                printk(KERN_INFO "strange request: removal(2) "
-                        from, to, le32_to_cpu(ex->ee_block), ee_len);
+                                "%u-%u from %u:%u\n",
+                                from, to, le32_to_cpu(ex->ee_block), ee_len);
        }
        return 0;
 }
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-                struct ext4_ext_path *path, unsigned long start)
+                struct ext4_ext_path *path, ext4_lblk_t start)
 {
        int err = 0, correct_index = 0;
        int depth = ext_depth(inode), credits;
        struct ext4_extent_header *eh;
-        unsigned a, b, block, num;
+        ext4_lblk_t a, b, block;
-        unsigned long ex_ee_block;
+        unsigned num;
+        ext4_lblk_t ex_ee_block;
        unsigned short ex_ee_len;
        unsigned uninitialized = 0;
        struct ext4_extent *ex;
        /* the header must be checked already in ext4_ext_remove_space() */
-        ext_debug("truncate since %lu in leaf\n", start);
+        ext_debug("truncate since %u in leaf\n", start);
        if (!path[depth].p_hdr)
                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
        eh = path[depth].p_hdr;
@@ -1904,7 +1952,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
        return 1;
 }
-int ext4_ext_remove_space(struct inode *inode, unsigned long start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 {
        struct super_block *sb = inode->i_sb;
        int depth = ext_depth(inode);
@@ -1912,7 +1960,7 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
        handle_t *handle;
        int i = 0, err = 0;
-        ext_debug("truncate since %lu\n", start);
+        ext_debug("truncate since %u\n", start);
        /* probably first extent we're gonna free will be last in block */
        handle = ext4_journal_start(inode, depth + 1);
@@ -2094,17 +2142,19 @@ void ext4_ext_release(struct super_block *sb)
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Somone is writing in middle of the extent
 */
-int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
+static int ext4_ext_convert_to_initialized(handle_t *handle,
-                                        struct ext4_ext_path *path,
+                                                struct inode *inode,
-                                        ext4_fsblk_t iblock,
+                                                struct ext4_ext_path *path,
-                                        unsigned long max_blocks)
+                                                ext4_lblk_t iblock,
+                                                unsigned long max_blocks)
 {
        struct ext4_extent *ex, newex;
        struct ext4_extent *ex1 = NULL;
        struct ext4_extent *ex2 = NULL;
        struct ext4_extent *ex3 = NULL;
        struct ext4_extent_header *eh;
-        unsigned int allocated, ee_block, ee_len, depth;
+        ext4_lblk_t ee_block;
+        unsigned int allocated, ee_len, depth;
        ext4_fsblk_t newblock;
        int err = 0;
        int ret = 0;
@@ -2225,8 +2275,13 @@ out:
        return err ? err : allocated;
 }
+/*
+ * Need to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
+ * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t iblock,
+                        ext4_lblk_t iblock,
                        unsigned long max_blocks, struct buffer_head *bh_result,
                        int create, int extend_disksize)
 {
@@ -2236,11 +2291,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        ext4_fsblk_t goal, newblock;
        int err = 0, depth, ret;
        unsigned long allocated = 0;
+        struct ext4_allocation_request ar;
        __clear_bit(BH_New, &bh_result->b_state);
-        ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
+        ext_debug("blocks %u/%lu requested for inode %u\n",
-                        max_blocks, (unsigned) inode->i_ino);
+                        iblock, max_blocks, inode->i_ino);
-        mutex_lock(&EXT4_I(inode)->truncate_mutex);
        /* check in cache */
        goal = ext4_ext_in_cache(inode, iblock, &newex);
@@ -2260,7 +2315,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                   - le32_to_cpu(newex.ee_block)
                                   + ext_pblock(&newex);
                        /* number of remaining blocks in the extent */
-                        allocated = le16_to_cpu(newex.ee_len) -
+                        allocated = ext4_ext_get_actual_len(&newex) -
                                        (iblock - le32_to_cpu(newex.ee_block));
                        goto out;
                } else {
@@ -2288,7 +2343,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        ex = path[depth].p_ext;
        if (ex) {
-                unsigned long ee_block = le32_to_cpu(ex->ee_block);
+                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
                ext4_fsblk_t ee_start = ext_pblock(ex);
                unsigned short ee_len;
@@ -2302,7 +2357,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        newblock = iblock - ee_block + ee_start;
                        /* number of remaining blocks in the extent */
                        allocated = ee_len - (iblock - ee_block);
-                        ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
+                        ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
                                        ee_block, ee_len, newblock);
                        /* Do not put uninitialized extent in the cache */
@@ -2320,9 +2375,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ret = ext4_ext_convert_to_initialized(handle, inode,
                                                                path, iblock,
                                                                max_blocks);
-                        if (ret <= 0)
+                        if (ret <= 0) {
+                                err = ret;
                                goto out2;
-                        else
+                        } else
                                allocated = ret;
                        goto outnew;
                }
@@ -2347,8 +2403,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
                ext4_init_block_alloc_info(inode);
-        /* allocate new block */
+        /* find neighbour allocated blocks */
-        goal = ext4_ext_find_goal(inode, path, iblock);
+        ar.lleft = iblock;
+        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
+        if (err)
+                goto out2;
+        ar.lright = iblock;
+        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+        if (err)
+                goto out2;
        /*
         * See if request is beyond maximum number of blocks we can have in
@@ -2368,10 +2431,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        newex.ee_len = cpu_to_le16(max_blocks);
        err = ext4_ext_check_overlap(inode, &newex, path);
        if (err)
-                allocated = le16_to_cpu(newex.ee_len);
+                allocated = ext4_ext_get_actual_len(&newex);
        else
                allocated = max_blocks;
-        newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
+        /* allocate new block */
+        ar.inode = inode;
+        ar.goal = ext4_ext_find_goal(inode, path, iblock);
+        ar.logical = iblock;
+        ar.len = allocated;
+        if (S_ISREG(inode->i_mode))
+                ar.flags = EXT4_MB_HINT_DATA;
+        else
+                /* disable in-core preallocation for non-regular files */
+                ar.flags = 0;
+        newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out2;
        ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
@@ -2379,14 +2453,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        /* try to insert new extent into found leaf and return */
        ext4_ext_store_pblock(&newex, newblock);
-        newex.ee_len = cpu_to_le16(allocated);
+        newex.ee_len = cpu_to_le16(ar.len);
        if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
                ext4_ext_mark_uninitialized(&newex);
        err = ext4_ext_insert_extent(handle, inode, path, &newex);
        if (err) {
                /* free data blocks we just allocated */
+                /* not a good idea to call discard here directly,
+                 * but otherwise we'd need to call it every free() */
+                ext4_mb_discard_inode_preallocations(inode);
                ext4_free_blocks(handle, inode, ext_pblock(&newex),
-                                        le16_to_cpu(newex.ee_len));
+                                        ext4_ext_get_actual_len(&newex), 0);
                goto out2;
        }
@@ -2395,6 +2472,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        /* previous routine could use block we allocated */
        newblock = ext_pblock(&newex);
+        allocated = ext4_ext_get_actual_len(&newex);
 outnew:
        __set_bit(BH_New, &bh_result->b_state);
@@ -2414,8 +2492,6 @@ out2:
                ext4_ext_drop_refs(path);
                kfree(path);
        }
-        mutex_unlock(&EXT4_I(inode)->truncate_mutex);
        return err ? err : allocated;
 }
@@ -2423,7 +2499,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 {
        struct address_space *mapping = inode->i_mapping;
        struct super_block *sb = inode->i_sb;
-        unsigned long last_block;
+        ext4_lblk_t last_block;
        handle_t *handle;
        int err = 0;
@@ -2445,9 +2521,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
        if (page)
                ext4_block_truncate_page(handle, page, mapping, inode->i_size);
-        mutex_lock(&EXT4_I(inode)->truncate_mutex);
+        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_ext_invalidate_cache(inode);
+        ext4_mb_discard_inode_preallocations(inode);
        /*
         * TODO: optimization is possible here.
         * Probably we need not scan at all,
@@ -2481,7 +2559,7 @@ out_stop:
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);
-        mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+        up_write(&EXT4_I(inode)->i_data_sem);
        ext4_journal_stop(handle);
 }
@@ -2516,7 +2594,8 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 {
        handle_t *handle;
-        ext4_fsblk_t block, max_blocks;
+        ext4_lblk_t block;
+        unsigned long max_blocks;
        ext4_fsblk_t nblocks = 0;
        int ret = 0;
        int ret2 = 0;
@@ -2544,6 +2623,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
         * modify 1 super block, 1 block bitmap and 1 group descriptor.
         */
        credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+        down_write((&EXT4_I(inode)->i_data_sem));
 retry:
        while (ret >= 0 && ret < max_blocks) {
                block = block + ret;
@@ -2557,12 +2637,12 @@ retry:
                ret = ext4_ext_get_blocks(handle, inode, block,
                                          max_blocks, &map_bh,
                                          EXT4_CREATE_UNINITIALIZED_EXT, 0);
-                WARN_ON(!ret);
+                WARN_ON(ret <= 0);
-                if (!ret) {
+                if (ret <= 0) {
                        ext4_error(inode->i_sb, "ext4_fallocate",
-                                   "ext4_ext_get_blocks returned 0! inode#%lu"
+                                    "ext4_ext_get_blocks returned error: "
-                                   ", block=%llu, max_blocks=%llu",
+                                    "inode#%lu, block=%u, max_blocks=%lu",
-                                   inode->i_ino, block, max_blocks);
+                                    inode->i_ino, block, max_blocks);
                        ret = -EIO;
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
@@ -2600,6 +2680,7 @@ retry:
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
+        up_write((&EXT4_I(inode)->i_data_sem));
        /*
         * Time to update the file size.
         * Update only when preallocation was requested beyond the file size.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a81cd66d63b..ac35ec58db55 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -37,9 +37,9 @@ static int ext4_release_file (struct inode * inode, struct file * filp)
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1))
        {
-                mutex_lock(&EXT4_I(inode)->truncate_mutex);
+                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_reservation(inode);
-                mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);
@@ -56,8 +56,25 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
        ssize_t ret;
        int err;
-        ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+        /*
+         * If we have encountered a bitmap-format file, the size limit
+         * is smaller than s_maxbytes, which is for extent-mapped files.
+         */
+        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+                size_t length = iov_length(iov, nr_segs);
+                if (pos > sbi->s_bitmap_maxbytes)
+                        return -EFBIG;
+                if (pos + length > sbi->s_bitmap_maxbytes) {
+                        nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
+                                              sbi->s_bitmap_maxbytes - pos);
+                }
+        }
+        ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
        /*
         * Skip flushing if there was an error, or if nothing was written.
         */
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 1577910bb58b..7eb0604e7eea 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -14,14 +14,16 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
                                       struct ext4_group_desc *gdp);
 struct buffer_head *read_block_bitmap(struct super_block *sb,
-                                      unsigned int block_group);
+                                      ext4_group_t block_group);
 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
-                                       struct buffer_head *bh, int group,
+                                       struct buffer_head *bh,
+                                       ext4_group_t group,
                                       struct ext4_group_desc *desc);
 #define ext4_free_blocks_after_init(sb, group, desc)                    \
                ext4_init_block_bitmap(sb, NULL, group, desc)
 extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-                                       struct buffer_head *bh, int group,
+                                       struct buffer_head *bh,
+                                       ext4_group_t group,
                                       struct ext4_group_desc *desc);
 extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 #endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c61f37fd3f05..575b5215c808 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,8 +64,8 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 /* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb,
+unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
-                                struct buffer_head *bh, int block_group,
+                                ext4_group_t block_group,
                                struct ext4_group_desc *gdp)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -75,7 +75,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
        /* If checksum is bad mark all blocks and inodes use to prevent
         * allocation, essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-                ext4_error(sb, __FUNCTION__, "Checksum bad for group %u\n",
+                ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n",
                           block_group);
                gdp->bg_free_blocks_count = 0;
                gdp->bg_free_inodes_count = 0;
@@ -98,7 +98,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
 * Return buffer_head of bitmap on success or NULL.
 */
 static struct buffer_head *
-read_inode_bitmap(struct super_block * sb, unsigned long block_group)
+read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
        struct ext4_group_desc *desc;
        struct buffer_head *bh = NULL;
@@ -152,7 +152,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
        unsigned long ino;
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *bh2;
-        unsigned long block_group;
+        ext4_group_t block_group;
        unsigned long bit;
        struct ext4_group_desc * gdp;
        struct ext4_super_block * es;
@@ -260,12 +260,14 @@ error_return:
 * For other inodes, search forward from the parent directory\'s block
 * group to find a free inode.
 */
-static int find_group_dir(struct super_block *sb, struct inode *parent)
+static int find_group_dir(struct super_block *sb, struct inode *parent,
+                                ext4_group_t *best_group)
 {
-        int ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        unsigned int freei, avefreei;
        struct ext4_group_desc *desc, *best_desc = NULL;
-        int group, best_group = -1;
+        ext4_group_t group;
+        int ret = -1;
        freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
        avefreei = freei / ngroups;
@@ -279,11 +281,12 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
                if (!best_desc ||
                    (le16_to_cpu(desc->bg_free_blocks_count) >
                     le16_to_cpu(best_desc->bg_free_blocks_count))) {
-                        best_group = group;
+                        *best_group = group;
                        best_desc = desc;
+                        ret = 0;
                }
        }
-        return best_group;
+        return ret;
 }
 /*
@@ -314,12 +317,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
 #define INODE_COST 64
 #define BLOCK_COST 256
-static int find_group_orlov(struct super_block *sb, struct inode *parent)
+static int find_group_orlov(struct super_block *sb, struct inode *parent,
+                                ext4_group_t *group)
 {
-        int parent_group = EXT4_I(parent)->i_block_group;
+        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
-        int ngroups = sbi->s_groups_count;
+        ext4_group_t ngroups = sbi->s_groups_count;
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei;
        ext4_fsblk_t freeb, avefreeb;
@@ -327,7 +331,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
        unsigned int ndirs;
        int max_debt, max_dirs, min_inodes;
        ext4_grpblk_t min_blocks;
-        int group = -1, i;
+        ext4_group_t i;
        struct ext4_group_desc *desc;
        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
@@ -340,13 +344,14 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
        if ((parent == sb->s_root->d_inode) ||
            (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
                int best_ndir = inodes_per_group;
-                int best_group = -1;
+                ext4_group_t grp;
+                int ret = -1;
-                get_random_bytes(&group, sizeof(group));
+                get_random_bytes(&grp, sizeof(grp));
-                parent_group = (unsigned)group % ngroups;
+                parent_group = (unsigned)grp % ngroups;
                for (i = 0; i < ngroups; i++) {
-                        group = (parent_group + i) % ngroups;
+                        grp = (parent_group + i) % ngroups;
-                        desc = ext4_get_group_desc (sb, group, NULL);
+                        desc = ext4_get_group_desc(sb, grp, NULL);
                        if (!desc || !desc->bg_free_inodes_count)
                                continue;
                        if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
@@ -355,11 +360,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
                                continue;
                        if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
                                continue;
-                        best_group = group;
+                        *group = grp;
+                        ret = 0;
                        best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
                }
-                if (best_group >= 0)
+                if (ret == 0)
-                        return best_group;
+                        return ret;
                goto fallback;
        }
@@ -380,8 +386,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
                max_debt = 1;
        for (i = 0; i < ngroups; i++) {
-                group = (parent_group + i) % ngroups;
+                *group = (parent_group + i) % ngroups;
-                desc = ext4_get_group_desc (sb, group, NULL);
+                desc = ext4_get_group_desc(sb, *group, NULL);
                if (!desc || !desc->bg_free_inodes_count)
                        continue;
                if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
@@ -390,17 +396,16 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
                        continue;
                if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
                        continue;
-                return group;
+                return 0;
        }
 fallback:
        for (i = 0; i < ngroups; i++) {
-                group = (parent_group + i) % ngroups;
+                *group = (parent_group + i) % ngroups;
-                desc = ext4_get_group_desc (sb, group, NULL);
+                desc = ext4_get_group_desc(sb, *group, NULL);
-                if (!desc || !desc->bg_free_inodes_count)
+                if (desc && desc->bg_free_inodes_count &&
-                        continue;
+                        le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
-                if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+                        return 0;
-                        return group;
        }
        if (avefreei) {
@@ -415,21 +420,22 @@ fallback:
        return -1;
 }
-static int find_group_other(struct super_block *sb, struct inode *parent)
+static int find_group_other(struct super_block *sb, struct inode *parent,
+                                ext4_group_t *group)
 {
-        int parent_group = EXT4_I(parent)->i_block_group;
+        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
-        int ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        struct ext4_group_desc *desc;
-        int group, i;
+        ext4_group_t i;
        /*
         * Try to place the inode in its parent directory
         */
-        group = parent_group;
+        *group = parent_group;
-        desc = ext4_get_group_desc (sb, group, NULL);
+        desc = ext4_get_group_desc(sb, *group, NULL);
        if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
                        le16_to_cpu(desc->bg_free_blocks_count))
-                return group;
+                return 0;
        /*
         * We're going to place this inode in a different blockgroup from its
@@ -440,33 +446,33 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
         *
         * So add our directory's i_ino into the starting point for the hash.
         */
-        group = (group + parent->i_ino) % ngroups;
+        *group = (*group + parent->i_ino) % ngroups;
        /*
         * Use a quadratic hash to find a group with a free inode and some free
         * blocks.
         */
        for (i = 1; i < ngroups; i <<= 1) {
-                group += i;
+                *group += i;
-                if (group >= ngroups)
+                if (*group >= ngroups)
-                        group -= ngroups;
+                        *group -= ngroups;
-                desc = ext4_get_group_desc (sb, group, NULL);
+                desc = ext4_get_group_desc(sb, *group, NULL);
                if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
                                le16_to_cpu(desc->bg_free_blocks_count))
-                        return group;
+                        return 0;
        }
        /*
         * That failed: try linear search for a free inode, even if that group
         * has no free blocks.
         */
-        group = parent_group;
+        *group = parent_group;
        for (i = 0; i < ngroups; i++) {
-                if (++group >= ngroups)
+                if (++*group >= ngroups)
-                        group = 0;
+                        *group = 0;
-                desc = ext4_get_group_desc (sb, group, NULL);
+                desc = ext4_get_group_desc(sb, *group, NULL);
                if (desc && le16_to_cpu(desc->bg_free_inodes_count))
-                        return group;
+                        return 0;
        }
        return -1;
@@ -487,16 +493,17 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        struct super_block *sb;
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *bh2;
-        int group;
+        ext4_group_t group = 0;
        unsigned long ino = 0;
        struct inode * inode;
        struct ext4_group_desc * gdp = NULL;
        struct ext4_super_block * es;
        struct ext4_inode_info *ei;
        struct ext4_sb_info *sbi;
-        int err = 0;
+        int ret2, err = 0;
        struct inode *ret;
-        int i, free = 0;
+        ext4_group_t i;
+        int free = 0;
        /* Cannot create files in a deleted directory */
        if (!dir || !dir->i_nlink)
@@ -512,14 +519,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        es = sbi->s_es;
        if (S_ISDIR(mode)) {
                if (test_opt (sb, OLDALLOC))
-                        group = find_group_dir(sb, dir);
+                        ret2 = find_group_dir(sb, dir, &group);
                else
-                        group = find_group_orlov(sb, dir);
+                        ret2 = find_group_orlov(sb, dir, &group);
        } else
-                group = find_group_other(sb, dir);
+                ret2 = find_group_other(sb, dir, &group);
        err = -ENOSPC;
-        if (group == -1)
+        if (ret2 == -1)
                goto out;
        for (i = 0; i < sbi->s_groups_count; i++) {
@@ -583,7 +590,7 @@ got:
            ino > EXT4_INODES_PER_GROUP(sb)) {
                ext4_error(sb, __FUNCTION__,
                           "reserved inode or inode > inodes count - "
-                           "block_group = %d, inode=%lu", group,
+                           "block_group = %lu, inode=%lu", group,
                           ino + group * EXT4_INODES_PER_GROUP(sb));
                err = -EIO;
                goto fail;
@@ -702,7 +709,6 @@ got:
        if (!S_ISDIR(mode))
                ei->i_flags &= ~EXT4_DIRSYNC_FL;
        ei->i_file_acl = 0;
-        ei->i_dir_acl = 0;
        ei->i_dtime = 0;
        ei->i_block_alloc_info = NULL;
        ei->i_block_group = group;
@@ -741,13 +747,10 @@ got:
        if (test_opt(sb, EXTENTS)) {
                EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
                ext4_ext_tree_init(handle, inode);
-                if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+                err = ext4_update_incompat_feature(handle, sb,
-                        err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+                                                EXT4_FEATURE_INCOMPAT_EXTENTS);
-                        if (err) goto fail;
+                if (err)
-                        EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
+                        goto fail;
-                        BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
-                        err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
-                }
        }
        ext4_debug("allocating inode %lu\n", inode->i_ino);
@@ -777,7 +780,7 @@ fail_drop:
 struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 {
        unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
-        unsigned long block_group;
+        ext4_group_t block_group;
        int bit;
        struct buffer_head *bitmap_bh = NULL;
        struct inode *inode = NULL;
@@ -833,7 +836,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 {
        unsigned long desc_count;
        struct ext4_group_desc *gdp;
-        int i;
+        ext4_group_t i;
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        unsigned long bitmap_count, x;
@@ -854,7 +857,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
                        continue;
                x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
-                printk("group %d: stored = %d, counted = %lu\n",
+                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
                        i, le16_to_cpu(gdp->bg_free_inodes_count), x);
                bitmap_count += x;
        }
@@ -879,7 +882,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 unsigned long ext4_count_dirs (struct super_block * sb)
 {
        unsigned long count = 0;
-        int i;
+        ext4_group_t i;
        for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
                struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5489703d9573..bb717cbb749c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -105,7 +105,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 */
 static unsigned long blocks_for_truncate(struct inode *inode)
 {
-        unsigned long needed;
+        ext4_lblk_t needed;
        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
@@ -243,13 +243,6 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
        p->bh = bh;
 }
-static int verify_chain(Indirect *from, Indirect *to)
-{
-        while (from <= to && from->key == *from->p)
-                from++;
-        return (from > to);
-}
 /**
 *      ext4_block_to_path - parse the block number into array of offsets
 *      @inode: inode in question (we are only interested in its superblock)
@@ -282,7 +275,8 @@ static int verify_chain(Indirect *from, Indirect *to)
 */
 static int ext4_block_to_path(struct inode *inode,
-                        long i_block, int offsets[4], int *boundary)
+                        ext4_lblk_t i_block,
+                        ext4_lblk_t offsets[4], int *boundary)
 {
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -313,7 +307,10 @@ static int ext4_block_to_path(struct inode *inode,
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
-                ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
+                ext4_warning(inode->i_sb, "ext4_block_to_path",
+                                "block %lu > max",
+                                i_block + direct_blocks +
+                                indirect_blocks + double_blocks);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -344,12 +341,14 @@ static int ext4_block_to_path(struct inode *inode,
 *              (pointer to last triple returned, *@err == 0)
 *      or when it gets an IO error reading an indirect block
 *              (ditto, *@err == -EIO)
- *      or when it notices that chain had been changed while it was reading
- *              (ditto, *@err == -EAGAIN)
 *      or when it reads all @depth-1 indirect blocks successfully and finds
 *      the whole chain, all way to the data (returns %NULL, *err == 0).
+ *
+ *      Needs to be called with
+ *      down_read(&EXT4_I(inode)->i_data_sem)
 */
-static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+                                 ext4_lblk_t  *offsets,
                                 Indirect chain[4], int *err)
 {
        struct super_block *sb = inode->i_sb;
@@ -365,9 +364,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
                bh = sb_bread(sb, le32_to_cpu(p->key));
                if (!bh)
                        goto failure;
-                /* Reader: pointers */
-                if (!verify_chain(chain, p))
-                        goto changed;
                add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
@@ -375,10 +371,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
        }
        return NULL;
-changed:
-        brelse(bh);
-        *err = -EAGAIN;
-        goto no_block;
 failure:
        *err = -EIO;
 no_block:
@@ -445,7 +437,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 *      stores it in *@goal and returns zero.
 */
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                Indirect chain[4], Indirect *partial)
 {
        struct ext4_block_alloc_info *block_i;
@@ -559,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
        return ret;
 failed_out:
        for (i = 0; i <index; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1);
+                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
        return ret;
 }
@@ -590,7 +582,7 @@ failed_out:
 */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                        int indirect_blks, int *blks, ext4_fsblk_t goal,
-                        int *offsets, Indirect *branch)
+                        ext4_lblk_t *offsets, Indirect *branch)
 {
        int blocksize = inode->i_sb->s_blocksize;
        int i, n = 0;
@@ -658,9 +650,9 @@ failed:
                ext4_journal_forget(handle, branch[i].bh);
        }
        for (i = 0; i <indirect_blks; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1);
+                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
-        ext4_free_blocks(handle, inode, new_blocks[i], num);
+        ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
        return err;
 }
@@ -680,7 +672,7 @@ failed:
 * chain to new block and return 0.
 */
 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
-                        long block, Indirect *where, int num, int blks)
+                        ext4_lblk_t block, Indirect *where, int num, int blks)
 {
        int i;
        int err = 0;
@@ -757,9 +749,10 @@ err_out:
        for (i = 1; i <= num; i++) {
                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
                ext4_journal_forget(handle, where[i].bh);
-                ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
+                ext4_free_blocks(handle, inode,
+                                        le32_to_cpu(where[i-1].key), 1, 0);
        }
-        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
        return err;
 }
@@ -782,14 +775,19 @@ err_out:
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
+ *
+ *
+ * Needs to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
+ * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
 */
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-                sector_t iblock, unsigned long maxblocks,
+                ext4_lblk_t iblock, unsigned long maxblocks,
                struct buffer_head *bh_result,
                int create, int extend_disksize)
 {
        int err = -EIO;
-        int offsets[4];
+        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        ext4_fsblk_t goal;
@@ -803,7 +801,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
        J_ASSERT(handle != NULL || create == 0);
-        depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
+        depth = ext4_block_to_path(inode, iblock, offsets,
+                                        &blocks_to_boundary);
        if (depth == 0)
                goto out;
@@ -819,18 +818,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
                while (count < maxblocks && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;
-                        if (!verify_chain(chain, partial)) {
-                                /*
-                                 * Indirect block might be removed by
-                                 * truncate while we were reading it.
-                                 * Handling of that case: forget what we've
-                                 * got now. Flag the err as EAGAIN, so it
-                                 * will reread.
-                                 */
-                                err = -EAGAIN;
-                                count = 0;
-                                break;
-                        }
                        blk = le32_to_cpu(*(chain[depth-1].p + count));
                        if (blk == first_block + count)
@@ -838,44 +825,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
                        else
                                break;
                }
-                if (err != -EAGAIN)
+                goto got_it;
-                        goto got_it;
        }
        /* Next simple case - plain lookup or failed read of indirect block */
        if (!create || err == -EIO)
                goto cleanup;
-        mutex_lock(&ei->truncate_mutex);
-        /*
-         * If the indirect block is missing while we are reading
-         * the chain(ext4_get_branch() returns -EAGAIN err), or
-         * if the chain has been changed after we grab the semaphore,
-         * (either because another process truncated this branch, or
-         * another get_block allocated this branch) re-grab the chain to see if
-         * the request block has been allocated or not.
-         *
-         * Since we already block the truncate/other get_block
-         * at this point, we will have the current copy of the chain when we
-         * splice the branch into the tree.
-         */
-        if (err == -EAGAIN || !verify_chain(chain, partial)) {
-                while (partial > chain) {
-                        brelse(partial->bh);
-                        partial--;
-                }
-                partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-                if (!partial) {
-                        count++;
-                        mutex_unlock(&ei->truncate_mutex);
-                        if (err)
-                                goto cleanup;
-                        clear_buffer_new(bh_result);
-                        goto got_it;
-                }
-        }
        /*
         * Okay, we need to do block allocation.  Lazily initialize the block
         * allocation info here if necessary
@@ -911,13 +867,12 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
                err = ext4_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
        /*
-         * i_disksize growing is protected by truncate_mutex.  Don't forget to
+         * i_disksize growing is protected by i_data_sem.  Don't forget to
         * protect it if you're about to implement concurrent
         * ext4_get_block() -bzzz
        */
        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
                ei->i_disksize = inode->i_size;
-        mutex_unlock(&ei->truncate_mutex);
        if (err)
                goto cleanup;
@@ -942,6 +897,47 @@ out:
 #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
+int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
+                        unsigned long max_blocks, struct buffer_head *bh,
+                        int create, int extend_disksize)
+{
+        int retval;
+        /*
+         * Try to see if we can get the block without requesting
+         * a new file system block.
+         */
+        down_read((&EXT4_I(inode)->i_data_sem));
+        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                                bh, 0, 0);
+        } else {
+                retval = ext4_get_blocks_handle(handle,
+                                inode, block, max_blocks, bh, 0, 0);
+        }
+        up_read((&EXT4_I(inode)->i_data_sem));
+        if (!create || (retval > 0))
+                return retval;
+        /*
+         * We need to allocate new blocks which will result
+         * in i_data update
+         */
+        down_write((&EXT4_I(inode)->i_data_sem));
+        /*
+         * We need to re-check EXT4_EXTENTS_FL here because migrate
+         * could have changed the inode type in between
+         */
+        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                                bh, create, extend_disksize);
+        } else {
+                retval = ext4_get_blocks_handle(handle, inode, block,
+                                max_blocks, bh, create, extend_disksize);
+        }
+        up_write((&EXT4_I(inode)->i_data_sem));
+        return retval;
+}
 static int ext4_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
 {
@@ -996,7 +992,7 @@ get_block:
 * `handle' can be NULL if create is zero
 */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
-                                long block, int create, int *errp)
+                                ext4_lblk_t block, int create, int *errp)
 {
        struct buffer_head dummy;
        int fatal = 0, err;
@@ -1063,7 +1059,7 @@ err:
 }
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
-                               int block, int create, int *err)
+                               ext4_lblk_t block, int create, int *err)
 {
        struct buffer_head * bh;
@@ -1446,7 +1442,7 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 *      ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
+ * lock_journal and i_data_sem
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
@@ -1828,7 +1824,8 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
-        unsigned blocksize, iblock, length, pos;
+        unsigned blocksize, length, pos;
+        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
        int err = 0;
@@ -1964,7 +1961,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
 *                      (no partially truncated stuff there).  */
 static Indirect *ext4_find_shared(struct inode *inode, int depth,
-                        int offsets[4], Indirect chain[4], __le32 *top)
+                        ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
 {
        Indirect *partial, *p;
        int k, err;
@@ -2048,15 +2045,15 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
        for (p = first; p < last; p++) {
                u32 nr = le32_to_cpu(*p);
                if (nr) {
-                        struct buffer_head *bh;
+                        struct buffer_head *tbh;
                        *p = 0;
-                        bh = sb_find_get_block(inode->i_sb, nr);
+                        tbh = sb_find_get_block(inode->i_sb, nr);
-                        ext4_forget(handle, 0, inode, bh, nr);
+                        ext4_forget(handle, 0, inode, tbh, nr);
                }
        }
-        ext4_free_blocks(handle, inode, block_to_free, count);
+        ext4_free_blocks(handle, inode, block_to_free, count, 0);
 }
 /**
@@ -2229,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                ext4_journal_test_restart(handle, inode);
                        }
-                        ext4_free_blocks(handle, inode, nr, 1);
+                        ext4_free_blocks(handle, inode, nr, 1, 1);
                        if (parent_bh) {
                                /*
@@ -2289,12 +2286,12 @@ void ext4_truncate(struct inode *inode)
        __le32 *i_data = ei->i_data;
        int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        struct address_space *mapping = inode->i_mapping;
-        int offsets[4];
+        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        __le32 nr = 0;
        int n;
-        long last_block;
+        ext4_lblk_t last_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
        struct page *page;
@@ -2320,8 +2317,10 @@ void ext4_truncate(struct inode *inode)
                        return;
        }
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-                return ext4_ext_truncate(inode, page);
+                ext4_ext_truncate(inode, page);
+                return;
+        }
        handle = start_transaction(inode);
        if (IS_ERR(handle)) {
@@ -2369,7 +2368,7 @@ void ext4_truncate(struct inode *inode)
         * From here we block out all ext4_get_block() callers who want to
         * modify the block allocation tree.
         */
-        mutex_lock(&ei->truncate_mutex);
+        down_write(&ei->i_data_sem);
        if (n == 1) {           /* direct blocks */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
@@ -2433,7 +2432,7 @@ do_indirects:
        ext4_discard_reservation(inode);
-        mutex_unlock(&ei->truncate_mutex);
+        up_write(&ei->i_data_sem);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
@@ -2460,7 +2459,8 @@ out_stop:
 static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
                unsigned long ino, struct ext4_iloc *iloc)
 {
-        unsigned long desc, group_desc, block_group;
+        unsigned long desc, group_desc;
+        ext4_group_t block_group;
        unsigned long offset;
        ext4_fsblk_t block;
        struct buffer_head *bh;
@@ -2547,7 +2547,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
                        struct ext4_group_desc *desc;
                        int inodes_per_buffer;
                        int inode_offset, i;
-                        int block_group;
+                        ext4_group_t block_group;
                        int start;
                        block_group = (inode->i_ino - 1) /
@@ -2660,6 +2660,28 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
        if (flags & S_DIRSYNC)
                ei->i_flags |= EXT4_DIRSYNC_FL;
 }
+static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+                                        struct ext4_inode_info *ei)
+{
+        blkcnt_t i_blocks ;
+        struct inode *inode = &(ei->vfs_inode);
+        struct super_block *sb = inode->i_sb;
+        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+                /* we are using combined 48 bit field */
+                i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
+                                        le32_to_cpu(raw_inode->i_blocks_lo);
+                if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+                        /* i_blocks is in units of file system block size */
+                        return i_blocks  << (inode->i_blkbits - 9);
+                } else {
+                        return i_blocks;
+                }
+        } else {
+                return le32_to_cpu(raw_inode->i_blocks_lo);
+        }
+}
 void ext4_read_inode(struct inode * inode)
 {
@@ -2687,7 +2709,6 @@ void ext4_read_inode(struct inode * inode)
                inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
-        inode->i_size = le32_to_cpu(raw_inode->i_size);
        ei->i_state = 0;
        ei->i_dir_start_lookup = 0;
@@ -2709,19 +2730,15 @@ void ext4_read_inode(struct inode * inode)
                 * recovery code: that's fine, we're about to complete
                 * the process of deleting those. */
        }
-        inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
+        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-            cpu_to_le32(EXT4_OS_HURD))
+            cpu_to_le32(EXT4_OS_HURD)) {
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-        if (!S_ISREG(inode->i_mode)) {
-                ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
-        } else {
-                inode->i_size |=
-                        ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
        }
+        inode->i_size = ext4_isize(raw_inode);
        ei->i_disksize = inode->i_size;
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
@@ -2765,6 +2782,13 @@ void ext4_read_inode(struct inode * inode)
        EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+        inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+                if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+                        inode->i_version |=
+                        (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+        }
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
@@ -2797,6 +2821,55 @@ bad_inode:
        return;
 }
+static int ext4_inode_blocks_set(handle_t *handle,
+                                struct ext4_inode *raw_inode,
+                                struct ext4_inode_info *ei)
+{
+        struct inode *inode = &(ei->vfs_inode);
+        u64 i_blocks = inode->i_blocks;
+        struct super_block *sb = inode->i_sb;
+        int err = 0;
+        if (i_blocks <= ~0U) {
+                /*
+                 * i_blocks can be represented in a 32 bit variable
+                 * as multiple of 512 bytes
+                 */
+                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+                raw_inode->i_blocks_high = 0;
+                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+        } else if (i_blocks <= 0xffffffffffffULL) {
+                /*
+                 * i_blocks can be represented in a 48 bit variable
+                 * as multiple of 512 bytes
+                 */
+                err = ext4_update_rocompat_feature(handle, sb,
+                                            EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+                if (err)
+                        goto  err_out;
+                /* i_blocks is stored in the split 48 bit fields */
+                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+        } else {
+                /*
+                 * i_blocks should be represented in a 48 bit variable
+                 * as multiple of file system block size
+                 */
+                err = ext4_update_rocompat_feature(handle, sb,
+                                            EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+                if (err)
+                        goto  err_out;
+                ei->i_flags |= EXT4_HUGE_FILE_FL;
+                /* i_blocks is stored in units of file system block size */
+                i_blocks = i_blocks >> (inode->i_blkbits - 9);
+                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+        }
+err_out:
+        return err;
+}
 /*
 * Post the struct inode info into an on-disk inode location in the
 * buffer-cache.  This gobbles the caller's reference to the
@@ -2845,47 +2918,42 @@ static int ext4_do_update_inode(handle_t *handle,
                raw_inode->i_gid_high = 0;
        }
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-        raw_inode->i_size = cpu_to_le32(ei->i_disksize);
        EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
        EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
        EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
-        raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+        if (ext4_inode_blocks_set(handle, raw_inode, ei))
+                goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_HURD))
                raw_inode->i_file_acl_high =
                        cpu_to_le16(ei->i_file_acl >> 32);
-        raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+        raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-        if (!S_ISREG(inode->i_mode)) {
+        ext4_isize_set(raw_inode, ei->i_disksize);
-                raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
+        if (ei->i_disksize > 0x7fffffffULL) {
-        } else {
+                struct super_block *sb = inode->i_sb;
-                raw_inode->i_size_high =
+                if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                        cpu_to_le32(ei->i_disksize >> 32);
+                                EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
-                if (ei->i_disksize > 0x7fffffffULL) {
+                                EXT4_SB(sb)->s_es->s_rev_level ==
-                        struct super_block *sb = inode->i_sb;
+                                cpu_to_le32(EXT4_GOOD_OLD_REV)) {
-                        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                        /* If this is the first large file
-                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+                         * created, add a flag to the superblock.
-                            EXT4_SB(sb)->s_es->s_rev_level ==
+                         */
-                                        cpu_to_le32(EXT4_GOOD_OLD_REV)) {
+                        err = ext4_journal_get_write_access(handle,
-                               /* If this is the first large file
+                                        EXT4_SB(sb)->s_sbh);
-                                * created, add a flag to the superblock.
+                        if (err)
-                                */
+                                goto out_brelse;
-                                err = ext4_journal_get_write_access(handle,
+                        ext4_update_dynamic_rev(sb);
-                                                EXT4_SB(sb)->s_sbh);
+                        EXT4_SET_RO_COMPAT_FEATURE(sb,
-                                if (err)
-                                        goto out_brelse;
-                                ext4_update_dynamic_rev(sb);
-                                EXT4_SET_RO_COMPAT_FEATURE(sb,
                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
-                                sb->s_dirt = 1;
+                        sb->s_dirt = 1;
-                                handle->h_sync = 1;
+                        handle->h_sync = 1;
-                                err = ext4_journal_dirty_metadata(handle,
+                        err = ext4_journal_dirty_metadata(handle,
-                                                EXT4_SB(sb)->s_sbh);
+                                        EXT4_SB(sb)->s_sbh);
-                        }
                }
        }
        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -2903,8 +2971,14 @@ static int ext4_do_update_inode(handle_t *handle,
        } else for (block = 0; block < EXT4_N_BLOCKS; block++)
                raw_inode->i_block[block] = ei->i_data[block];
-        if (ei->i_extra_isize)
+        raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+        if (ei->i_extra_isize) {
+                if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+                        raw_inode->i_version_hi =
+                        cpu_to_le32(inode->i_version >> 32);
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+        }
        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
        rc = ext4_journal_dirty_metadata(handle, bh);
@@ -3024,6 +3098,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                ext4_journal_stop(handle);
        }
+        if (attr->ia_valid & ATTR_SIZE) {
+                if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+                                error = -EFBIG;
+                                goto err_out;
+                        }
+                }
+        }
        if (S_ISREG(inode->i_mode) &&
            attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
                handle_t *handle;
@@ -3120,6 +3205,9 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 {
        int err = 0;
+        if (test_opt(inode->i_sb, I_VERSION))
+                inode_inc_iversion(inode);
        /* the do_update_inode consumes one bh->b_count */
        get_bh(iloc->bh);
@@ -3158,8 +3246,10 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 * Expand an inode by new_extra_isize bytes.
 * Returns 0 on success or negative error number on failure.
 */
-int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
+static int ext4_expand_extra_isize(struct inode *inode,
-                        struct ext4_iloc iloc, handle_t *handle)
+                                   unsigned int new_extra_isize,
+                                   struct ext4_iloc iloc,
+                                   handle_t *handle)
 {
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c04c7ccba9e3..2ed7c37f897e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -51,6 +51,11 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                        flags &= ~EXT4_DIRSYNC_FL;
                mutex_lock(&inode->i_mutex);
+                /* Is it quota file? Do not allow user to mess with it */
+                if (IS_NOQUOTA(inode)) {
+                        mutex_unlock(&inode->i_mutex);
+                        return -EPERM;
+                }
                oldflags = ei->i_flags;
                /* The JOURNAL_DATA flag is modifiable only by root */
@@ -194,7 +199,7 @@ flags_err:
                 * need to allocate reservation structure for this inode
                 * before set the window size
                 */
-                mutex_lock(&ei->truncate_mutex);
+                down_write(&ei->i_data_sem);
                if (!ei->i_block_alloc_info)
                        ext4_init_block_alloc_info(inode);
@@ -202,7 +207,7 @@ flags_err:
                        struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
                        rsv->rsv_goal_size = rsv_window_size;
                }
-                mutex_unlock(&ei->truncate_mutex);
+                up_write(&ei->i_data_sem);
                return 0;
        }
        case EXT4_IOC_GROUP_EXTEND: {
@@ -249,6 +254,9 @@ flags_err:
                return err;
        }
+        case EXT4_IOC_MIGRATE:
+                return ext4_ext_migrate(inode, filp, cmd, arg);
        default:
                return -ENOTTY;
        }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644
index 000000000000..76e5fedc0a0b
--- /dev/null
+++ b/fs/ext4/mballoc.c
@@ -0,0 +1,4552 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * mballoc.c contains the multiblocks allocation routines
+ */
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include "group.h"
+/*
+ * MUSTDO:
+ *   - test ext4_ext_search_left() and ext4_ext_search_right()
+ *   - search for metadata in few groups
+ *
+ * TODO v4:
+ *   - normalization should take into account whether file is still open
+ *   - discard preallocations if no free space left (policy?)
+ *   - don't normalize tails
+ *   - quota
+ *   - reservation for superuser
+ *
+ * TODO v3:
+ *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ *   - track min/max extents in each group for better group selection
+ *   - mb_mark_used() may allocate chunk right after splitting buddy
+ *   - tree of groups sorted by number of free blocks
+ *   - error handling
+ */
+/*
+ * The allocation request involve request for multiple number of blocks
+ * near to the goal(block) value specified.
+ *
+ * During initialization phase of the allocator we decide to use the group
+ * preallocation or inode preallocation depending on the size file. The
+ * size of the file could be the resulting file size we would have after
+ * allocation or the current file size which ever is larger. If the size is
+ * less than sbi->s_mb_stream_request we select the group
+ * preallocation. The default value of s_mb_stream_request is 16
+ * blocks. This can also be tuned via
+ * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
+ * of number of blocks.
+ *
+ * The main motivation for having small file use group preallocation is to
+ * ensure that we have small file closer in the disk.
+ *
+ * First stage the allocator looks at the inode prealloc list
+ * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
+ * this particular inode. The inode prealloc space is represented as:
+ *
+ * pa_lstart -> the logical start block for this prealloc space
+ * pa_pstart -> the physical start block for this prealloc space
+ * pa_len    -> length for this prealloc space
+ * pa_free   ->  free space available in this prealloc space
+ *
+ * The inode preallocation space is used looking at the _logical_ start
+ * block. If only the logical file block falls within the range of prealloc
+ * space we will consume the particular prealloc space. This makes sure
+ * that we have contiguous physical blocks representing the file blocks
+ *
+ * The important thing to be noted in case of inode prealloc space is that
+ * we don't modify the values associated to inode prealloc space except
+ * pa_free.
+ *
+ * If we are not able to find blocks in the inode prealloc space and if we
+ * have the group allocation flag set then we look at the locality group
+ * prealloc space. These are per CPU prealloc list represented as
+ *
+ * ext4_sb_info.s_locality_groups[smp_processor_id()]
+ *
+ * The reason for having a per cpu locality group is to reduce the contention
+ * between CPUs. It is possible to get scheduled at this point.
+ *
+ * The locality group prealloc space is used looking at whether we have
+ * enough free space (pa_free) within the prealloc space.
+ *
+ * If we can't allocate blocks via inode prealloc or/and locality group
+ * prealloc then we look at the buddy cache. The buddy cache is represented
+ * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
+ * mapped to the buddy and bitmap information regarding different
+ * groups. The buddy information is attached to buddy cache inode so that
+ * we can access them through the page cache. The information regarding
+ * each group is loaded via ext4_mb_load_buddy.  The information involve
+ * block bitmap and buddy information. The information are stored in the
+ * inode as:
+ *
+ *  {                        page                        }
+ *  [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.  So for each group we
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
+ * blocksize) blocks.  So it can have information regarding groups_per_page
+ * which is blocks_per_page/2
+ *
+ * The buddy cache inode is not stored on disk. The inode is thrown
+ * away when the filesystem is unmounted.
+ *
+ * We look for count number of blocks in the buddy cache. If we were able
+ * to locate that many free blocks we return with additional information
+ * regarding rest of the contiguous physical block available
+ *
+ * Before allocating blocks via buddy cache we normalize the request
+ * blocks. This ensures we ask for more blocks than we needed. The extra
+ * blocks that we get after allocation is added to the respective prealloc
+ * list. In case of inode preallocation we follow a list of heuristics
+ * based on file size. This can be found in ext4_mb_normalize_request. If
+ * we are doing a group prealloc we try to normalize the request to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * 512 blocks. This can be tuned via
+ * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in
+ * terms of number of blocks. If we have mounted the file system with -O
+ * stripe=<value> option the group prealloc request is normalized to the
+ * stripe value (sbi->s_stripe)
+ *
+ * The regular allocator(using the buddy cache) support few tunables.
+ *
+ * /proc/fs/ext4/<partition>/min_to_scan
+ * /proc/fs/ext4/<partition>/max_to_scan
+ * /proc/fs/ext4/<partition>/order2_req
+ *
+ * The regular allocator use buddy scan only if the request len is power of
+ * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
+ * value of s_mb_order2_reqs can be tuned via
+ * /proc/fs/ext4/<partition>/order2_req.  If the request len is equal to
+ * stripe size (sbi->s_stripe), we try to search for contiguous blocks in
+ * stripe size. This should result in better allocation on RAID setup. If
+ * not we search in the specific group using bitmap for best extents. The
+ * tunables min_to_scan and max_to_scan control the behaviour here.
+ * min_to_scan indicates how long the mballoc __must__ look for a best
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
+ * best extent in the found extents. Searching for the blocks starts with
+ * the group specified as the goal value in allocation context via
+ * ac_g_ex. Each group is first checked based on the criteria whether it
+ * can used for allocation. ext4_mb_good_group explains how the groups are
+ * checked.
+ *
+ * Both the prealloc space are getting populated as above. So for the first
+ * request we will hit the buddy cache which will result in this prealloc
+ * space getting filled. The prealloc space is then later used for the
+ * subsequent request.
+ */
+/*
+ * mballoc operates on the following data:
+ *  - on-disk bitmap
+ *  - in-core buddy (actually includes buddy and bitmap)
+ *  - preallocation descriptors (PAs)
+ *
+ * there are two types of preallocations:
+ *  - inode
+ *    assigned to specific inode and can be used for this inode only.
+ *    it describes part of inode's space preallocated to specific
+ *    physical blocks. any block from that preallocated can be used
+ *    independent. the descriptor just tracks number of blocks left
+ *    unused. so, before taking some block from descriptor, one must
+ *    make sure corresponded logical block isn't allocated yet. this
+ *    also means that freeing any block within descriptor's range
+ *    must discard all preallocated blocks.
+ *  - locality group
+ *    assigned to specific locality group which does not translate to
+ *    permanent set of inodes: inode can join and leave group. space
+ *    from this type of preallocation can be used for any inode. thus
+ *    it's consumed from the beginning to the end.
+ *
+ * relation between them can be expressed as:
+ *    in-core buddy = on-disk bitmap + preallocation descriptors
+ *
+ * this mean blocks mballoc considers used are:
+ *  - allocated blocks (persistent)
+ *  - preallocated blocks (non-persistent)
+ *
+ * consistency in mballoc world means that at any time a block is either
+ * free or used in ALL structures. notice: "any time" should not be read
+ * literally -- time is discrete and delimited by locks.
+ *
+ *  to keep it simple, we don't use block numbers, instead we count number of
+ *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
+ *
+ * all operations can be expressed as:
+ *  - init buddy:                       buddy = on-disk + PAs
+ *  - new PA:                           buddy += N; PA = N
+ *  - use inode PA:                     on-disk += N; PA -= N
+ *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
+ *  - use locality group PA             on-disk += N; PA -= N
+ *  - discard locality group PA         buddy -= PA; PA = 0
+ *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
+ *        is used in real operation because we can't know actual used
+ *        bits from PA, only from on-disk bitmap
+ *
+ * if we follow this strict logic, then all operations above should be atomic.
+ * given some of them can block, we'd have to use something like semaphores
+ * killing performance on high-end SMP hardware. let's try to relax it using
+ * the following knowledge:
+ *  1) if buddy is referenced, it's already initialized
+ *  2) while block is used in buddy and the buddy is referenced,
+ *     nobody can re-allocate that block
+ *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
+ *     bit set and PA claims same block, it's OK. IOW, one can set bit in
+ *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
+ *     block
+ *
+ * so, now we're building a concurrency table:
+ *  - init buddy vs.
+ *    - new PA
+ *      blocks for PA are allocated in the buddy, buddy must be referenced
+ *      until PA is linked to allocation group to avoid concurrent buddy init
+ *    - use inode PA
+ *      we need to make sure that either on-disk bitmap or PA has uptodate data
+ *      given (3) we care that PA-=N operation doesn't interfere with init
+ *    - discard inode PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *    - use locality group PA
+ *      again PA-=N must be serialized with init
+ *    - discard locality group PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *  - new PA vs.
+ *    - use inode PA
+ *      i_data_sem serializes them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      some mutex should serialize them
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *  - use inode PA
+ *    - use inode PA
+ *      i_data_sem or another mutex should serializes them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      nothing wrong here -- they're different PAs covering different blocks
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *
+ * now we're ready to make few consequences:
+ *  - PA is referenced and while it is no discard is possible
+ *  - PA is referenced until block isn't marked in on-disk bitmap
+ *  - PA changes only after on-disk bitmap
+ *  - discard must not compete with init. either init is done before
+ *    any discard or they're serialized somehow
+ *  - buddy init as sum of on-disk bitmap and PAs is done atomically
+ *
+ * a special case when we've used PA to emptiness. no need to modify buddy
+ * in this case, but we should care about concurrent init
+ *
+ */
+ /*
+ * Logic in few words:
+ *
+ *  - allocation:
+ *    load group
+ *    find blocks
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - use preallocation:
+ *    find proper PA (per-inode or group)
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *    release PA
+ *
+ *  - free:
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - discard preallocations in group:
+ *    mark PAs deleted
+ *    move them onto local list
+ *    load on-disk bitmap
+ *    load group
+ *    remove PA from object (inode or locality group)
+ *    mark free blocks in-core
+ *
+ *  - discard inode's preallocations:
+ */
+/*
+ * Locking rules
+ *
+ * Locks:
+ *  - bitlock on a group        (group)
+ *  - object (inode/locality)   (object)
+ *  - per-pa lock               (pa)
+ *
+ * Paths:
+ *  - new pa
+ *    object
+ *    group
+ *
+ *  - find and use pa:
+ *    pa
+ *
+ *  - release consumed pa:
+ *    pa
+ *    group
+ *    object
+ *
+ *  - generate in-core bitmap:
+ *    group
+ *        pa
+ *
+ *  - discard all for given object (inode, locality group):
+ *    object
+ *        pa
+ *    group
+ *
+ *  - discard all for given group:
+ *    group
+ *        pa
+ *    group
+ *        object
+ *
+ */
+/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+/*
+ * with MB_DEBUG defined mb_debug() forwards its arguments to printk().
+ * only MB_DEBUG__ (note the trailing underscores) is defined below, so
+ * the #ifdef MB_DEBUG test fails and mb_debug() expands to nothing
+ */
+#define MB_DEBUG__
+#ifdef MB_DEBUG
+#define mb_debug(fmt, a...)     printk(fmt, ##a)
+#else
+#define mb_debug(fmt, a...)
+#endif
+/*
+ * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
+ * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
+ */
+#define EXT4_MB_HISTORY
+#define EXT4_MB_HISTORY_ALLOC           1       /* allocation */
+#define EXT4_MB_HISTORY_PREALLOC        2       /* preallocated blocks used */
+#define EXT4_MB_HISTORY_DISCARD         4       /* preallocation discarded */
+#define EXT4_MB_HISTORY_FREE            8       /* free */
+#define EXT4_MB_HISTORY_DEFAULT         (EXT4_MB_HISTORY_ALLOC | \
+                                         EXT4_MB_HISTORY_PREALLOC)
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN          200
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN          10
+/*
+ * How many groups mballoc will scan looking for the best chunk
+ */
+#define MB_DEFAULT_MAX_GROUPS_TO_SCAN   5
+/*
+ * with 'ext4_mb_stats' allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MB_DEFAULT_STATS                1
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, which purpose is to pack requests
+ * as close each to other as possible to produce smooth I/O traffic
+ * We use locality group prealloc space for stream request.
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD     16      /* 64K */
+/*
+ * for which requests use 2^N search using buddies
+ */
+#define MB_DEFAULT_ORDER2_REQS          2
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC       512
+static struct kmem_cache *ext4_pspace_cachep;
+#ifdef EXT4_BB_MAX_BLOCKS
+#undef EXT4_BB_MAX_BLOCKS
+#endif
+#define EXT4_BB_MAX_BLOCKS      30
+struct ext4_free_metadata {
+        ext4_group_t group;
+        unsigned short num;
+        ext4_grpblk_t  blocks[EXT4_BB_MAX_BLOCKS];
+        struct list_head list;
+};
+/*
+ * in-core per-group allocator state, obtained through
+ * ext4_get_group_info(sb, group)
+ */
+struct ext4_group_info {
+        /* bit flags: EXT4_GROUP_INFO_NEED_INIT_BIT, EXT4_GROUP_INFO_LOCKED_BIT */
+        unsigned long   bb_state;
+        unsigned long   bb_tid;
+        struct ext4_free_metadata *bb_md_cur;
+        unsigned short  bb_first_free;
+        unsigned short  bb_free;
+        unsigned short  bb_fragments;
+        struct          list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+        /* shadow bitmap maintained by the mb_*_double() checking helpers */
+        void            *bb_bitmap;
+#endif
+        /* NOTE(review): presumably one counter per buddy order - confirm */
+        unsigned short  bb_counters[];
+};
+#define EXT4_GROUP_INFO_NEED_INIT_BIT   0
+#define EXT4_GROUP_INFO_LOCKED_BIT      1
+#define EXT4_MB_GRP_NEED_INIT(grp)      \
+        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+struct ext4_prealloc_space {
+        struct list_head        pa_inode_list;
+        struct list_head        pa_group_list;
+        union {
+                struct list_head pa_tmp_list;
+                struct rcu_head pa_rcu;
+        } u;
+        spinlock_t              pa_lock;
+        atomic_t                pa_count;
+        unsigned                pa_deleted;
+        ext4_fsblk_t            pa_pstart;      /* phys. block */
+        ext4_lblk_t             pa_lstart;      /* log. block */
+        unsigned short          pa_len;         /* len of preallocated chunk */
+        unsigned short          pa_free;        /* how many blocks are free */
+        unsigned short          pa_linear;      /* consumed in one direction
+                                                 * strictly, for grp prealloc */
+        spinlock_t              *pa_obj_lock;
+        struct inode            *pa_inode;      /* hack, for history only */
+};
+/*
+ * an extent described relative to a block group; ext4_grp_offs_to_block()
+ * turns (fe_group, fe_start) into a filesystem-wide block number
+ */
+struct ext4_free_extent {
+        ext4_lblk_t fe_logical; /* NOTE(review): presumably logical block in file */
+        ext4_grpblk_t fe_start; /* start offset inside group fe_group */
+        ext4_group_t fe_group;  /* block group number */
+        int fe_len;             /* NOTE(review): presumably length in blocks */
+};
+/*
+ * Locality group:
+ *   we try to group all related changes together
+ *   so that writeback can flush/allocate them together as well
+ */
+struct ext4_locality_group {
+        /* for allocator */
+        struct mutex            lg_mutex;       /* to serialize allocates */
+        struct list_head        lg_prealloc_list;/* list of preallocations */
+        spinlock_t              lg_prealloc_lock;
+};
+/*
+ * working state for a single allocation request: the original, goal,
+ * best-found and pre-preallocation extents plus search bookkeeping
+ */
+struct ext4_allocation_context {
+        struct inode *ac_inode;
+        struct super_block *ac_sb;
+        /* original request */
+        struct ext4_free_extent ac_o_ex;
+        /* goal request (after normalization) */
+        struct ext4_free_extent ac_g_ex;
+        /* the best found extent */
+        struct ext4_free_extent ac_b_ex;
+        /* copy of the best found extent taken before preallocation efforts */
+        struct ext4_free_extent ac_f_ex;
+        /* number of iterations done. we have to track to limit searching */
+        unsigned long ac_ex_scanned;
+        __u16 ac_groups_scanned;
+        __u16 ac_found;
+        __u16 ac_tail;
+        __u16 ac_buddy;
+        __u16 ac_flags;         /* allocation hints */
+        __u8 ac_status;         /* one of the AC_STATUS_* values */
+        __u8 ac_criteria;
+        __u8 ac_repeats;
+        __u8 ac_2order;         /* if request is to allocate 2^N blocks and
+                                 * N > 0, the field stores N, otherwise 0 */
+        __u8 ac_op;             /* operation, for history only */
+        struct page *ac_bitmap_page;
+        struct page *ac_buddy_page;
+        struct ext4_prealloc_space *ac_pa;
+        struct ext4_locality_group *ac_lg;
+};
+#define AC_STATUS_CONTINUE      1
+#define AC_STATUS_FOUND         2
+#define AC_STATUS_BREAK         3
+struct ext4_mb_history {
+        struct ext4_free_extent orig;   /* orig allocation */
+        struct ext4_free_extent goal;   /* goal allocation */
+        struct ext4_free_extent result; /* result allocation */
+        unsigned pid;
+        unsigned ino;
+        __u16 found;    /* how many extents have been found */
+        __u16 groups;   /* how many groups have been scanned */
+        __u16 tail;     /* what tail broke some buddy */
+        __u16 buddy;    /* buddy the tail ^^^ broke */
+        __u16 flags;
+        __u8 cr:3;      /* which phase the result extent was found at */
+        __u8 op:4;
+        __u8 merged:1;
+};
+/*
+ * handle on one group's buddy and bitmap data together with the group
+ * info and superblock they belong to
+ */
+struct ext4_buddy {
+        struct page *bd_buddy_page;
+        void *bd_buddy;         /* read through EXT4_MB_BUDDY() */
+        struct page *bd_bitmap_page;
+        void *bd_bitmap;        /* read through EXT4_MB_BITMAP() */
+        struct ext4_group_info *bd_info;
+        struct super_block *bd_sb;
+        /* mb_find_buddy() treats 1 << (bd_blkbits + 3) as the order-0 bit count */
+        __u16 bd_blkbits;
+        ext4_group_t bd_group;  /* group number, used for locking and messages */
+};
+#define EXT4_MB_BITMAP(e4b)     ((e4b)->bd_bitmap)
+#define EXT4_MB_BUDDY(e4b)      ((e4b)->bd_buddy)
+#ifdef EXT4_MB_HISTORY
+/* history enabled: the real implementation is defined later in the file */
+static void ext4_mb_store_history(struct ext4_allocation_context *ac);
+#else
+/* history disabled: recording an allocation is a no-op */
+static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+}
+#endif
+/*
+ * true when b lies in the inclusive range [first, first + len - 1].
+ * b and first are each evaluated twice, so pass side-effect-free arguments
+ */
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+static struct proc_dir_entry *proc_root_ext4;
+struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
+ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+                        ext4_fsblk_t goal, unsigned long *count, int *errp);
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                        ext4_group_t group);
+static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
+static void ext4_mb_free_committed_blocks(struct super_block *);
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+                                        struct ext4_buddy *e4b, sector_t block,
+                                        int count);
+static void ext4_mb_put_pa(struct ext4_allocation_context *,
+                        struct super_block *, struct ext4_prealloc_space *pa);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+/*
+ * take the bit spinlock held in bit EXT4_GROUP_INFO_LOCKED_BIT of the
+ * group's bb_state word
+ */
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+        bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT,
+                        &(ext4_get_group_info(sb, group)->bb_state));
+}
+/*
+ * release the bit spinlock held in bit EXT4_GROUP_INFO_LOCKED_BIT of the
+ * group's bb_state word
+ */
+static inline void ext4_unlock_group(struct super_block *sb,
+                                        ext4_group_t group)
+{
+        bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT,
+                        &(ext4_get_group_info(sb, group)->bb_state));
+}
+/*
+ * report whether the group's bit spinlock (EXT4_GROUP_INFO_LOCKED_BIT in
+ * bb_state) is currently held; returns bit_spin_is_locked()'s result
+ */
+static inline int ext4_is_group_locked(struct super_block *sb,
+                                        ext4_group_t group)
+{
+        unsigned long *state = &(ext4_get_group_info(sb, group)->bb_state);
+        return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, state);
+}
+/*
+ * translate the group-relative start of an extent (fe_group, fe_start)
+ * into a filesystem-wide block number, offset by the superblock's
+ * s_first_data_block
+ */
+static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+                                        struct ext4_free_extent *fex)
+{
+        /* widen the group number before multiplying */
+        ext4_fsblk_t group_base =
+                (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb);
+        return group_base + fex->fe_start
+                + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+}
+/*
+ * mb_correct_addr_and_bit() rewrites both of its lvalue arguments in
+ * place: the low address bits that make addr misaligned for an unsigned
+ * long are folded into bit (8 bits per byte) and addr is rounded down to
+ * the enclosing unsigned long boundary.
+ *
+ * the body is wrapped in do { } while (0) so that an invocation followed
+ * by ';' is exactly one statement and stays safe inside an unbraced
+ * if/else; the arguments are parenthesised against operator surprises
+ */
+#if BITS_PER_LONG == 64
+#define mb_correct_addr_and_bit(bit, addr)                      \
+do {                                                            \
+        (bit) += ((unsigned long) (addr) & 7UL) << 3;           \
+        (addr) = (void *) ((unsigned long) (addr) & ~7UL);      \
+} while (0)
+#elif BITS_PER_LONG == 32
+#define mb_correct_addr_and_bit(bit, addr)                      \
+do {                                                            \
+        (bit) += ((unsigned long) (addr) & 3UL) << 3;           \
+        (addr) = (void *) ((unsigned long) (addr) & ~3UL);      \
+} while (0)
+#else
+#error "how many bits you are?!"
+#endif
+/*
+ * test a bit in a bitmap whose address need not be unsigned long aligned.
+ * bit and addr are adjusted in place by mb_correct_addr_and_bit() and the
+ * result of ext4_test_bit() on the adjusted pair is returned
+ */
+static inline int mb_test_bit(int bit, void *addr)
+{
+        /*
+         * ext4_test_bit on architecture like powerpc
+         * needs unsigned long aligned address
+         */
+        mb_correct_addr_and_bit(bit, addr);
+        return ext4_test_bit(bit, addr);
+}
+/*
+ * set a bit via ext4_set_bit() after mb_correct_addr_and_bit() has
+ * aligned addr to an unsigned long boundary and adjusted bit to match
+ */
+static inline void mb_set_bit(int bit, void *addr)
+{
+        mb_correct_addr_and_bit(bit, addr);
+        ext4_set_bit(bit, addr);
+}
+/*
+ * as mb_set_bit(), but goes through ext4_set_bit_atomic(), to which the
+ * caller's lock is handed unchanged
+ */
+static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
+{
+        mb_correct_addr_and_bit(bit, addr);
+        ext4_set_bit_atomic(lock, bit, addr);
+}
+/*
+ * clear a bit via ext4_clear_bit() after mb_correct_addr_and_bit() has
+ * aligned addr to an unsigned long boundary and adjusted bit to match
+ */
+static inline void mb_clear_bit(int bit, void *addr)
+{
+        mb_correct_addr_and_bit(bit, addr);
+        ext4_clear_bit(bit, addr);
+}
+/*
+ * as mb_clear_bit(), but goes through ext4_clear_bit_atomic(), to which
+ * the caller's lock is handed unchanged
+ */
+static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
+{
+        mb_correct_addr_and_bit(bit, addr);
+        ext4_clear_bit_atomic(lock, bit, addr);
+}
+/*
+ * locate the bitmap describing chunks of the given order.
+ *
+ * order 0 is the block bitmap itself, with 1 << (bd_blkbits + 3) bits.
+ * higher orders live inside the buddy area at s_mb_offsets[order] and
+ * hold s_mb_maxs[order] bits.  an order above bd_blkbits + 1 yields NULL
+ * with *max set to 0.  *max always receives the bit count on return
+ */
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+{
+        struct ext4_sb_info *sbi;
+        /* FIXME!! is this needed */
+        BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+        BUG_ON(max == NULL);
+        if (order > e4b->bd_blkbits + 1) {
+                *max = 0;
+                return NULL;
+        }
+        if (order == 0) {
+                /* at order 0 we see each particular block */
+                *max = 1 << (e4b->bd_blkbits + 3);
+                return EXT4_MB_BITMAP(e4b);
+        }
+        sbi = EXT4_SB(e4b->bd_sb);
+        *max = sbi->s_mb_maxs[order];
+        return (char *) EXT4_MB_BUDDY(e4b) + sbi->s_mb_offsets[order];
+}
+#ifdef DOUBLE_CHECK
+/*
+ * DOUBLE_CHECK helper: clear bits first .. first + count - 1 in the
+ * group's shadow bitmap (bb_bitmap).  a bit that is already clear means
+ * the block is being freed twice and is reported through ext4_error();
+ * inode may be NULL, in which case inode number 0 is reported.
+ * does nothing when the group has no shadow bitmap.  the caller must
+ * hold the group lock
+ */
+static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
+                           int first, int count)
+{
+        int i;
+        struct super_block *sb = e4b->bd_sb;
+        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+                return;
+        BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+        for (i = 0; i < count; i++) {
+                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
+                        ext4_fsblk_t blocknr;
+                        /*
+                         * widen the group number before multiplying, as
+                         * ext4_grp_offs_to_block() does, so the product
+                         * cannot wrap where ext4_group_t is narrower
+                         * than ext4_fsblk_t
+                         */
+                        blocknr = (ext4_fsblk_t) e4b->bd_group *
+                                        EXT4_BLOCKS_PER_GROUP(sb);
+                        blocknr += first + i;
+                        blocknr +=
+                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+                        ext4_error(sb, __FUNCTION__, "double-free of inode"
+                                   " %lu's block %llu(bit %u in group %lu)\n",
+                                   inode ? inode->i_ino : 0, blocknr,
+                                   first + i, e4b->bd_group);
+                }
+                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
+        }
+}
+/*
+ * DOUBLE_CHECK helper: set bits first .. first + count - 1 in the group's
+ * shadow bitmap, BUG()ing if any of them is already set.  does nothing
+ * when the group has no shadow bitmap.  the caller must hold the group
+ * lock
+ */
+static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
+{
+        int n;
+        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+                return;
+        BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+        for (n = 0; n < count; n++) {
+                int bit = first + n;
+                void *shadow = e4b->bd_info->bb_bitmap;
+                BUG_ON(mb_test_bit(bit, shadow));
+                mb_set_bit(bit, shadow);
+        }
+}
+/*
+ * DOUBLE_CHECK helper: compare the group's shadow bitmap with the given
+ * bitmap over s_blocksize bytes.  on the first differing byte the
+ * mismatch is printed and BUG() is raised.  like the other DOUBLE_CHECK
+ * helpers it does nothing when the group has no shadow bitmap, instead
+ * of handing a NULL pointer to memcmp()
+ */
+static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+                return;
+        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
+                unsigned char *b1, *b2;
+                int i;
+                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
+                b2 = (unsigned char *) bitmap;
+                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
+                        if (b1[i] != b2[i]) {
+                                printk("corruption in group %lu at byte %u(%u):"
+                                       " %x in copy != %x on disk/prealloc\n",
+                                        e4b->bd_group, i, i * 8, b1[i], b2[i]);
+                                BUG();
+                        }
+                }
+        }
+}
+#else
+/* DOUBLE_CHECK disabled: freeing needs no shadow-bitmap bookkeeping */
+static inline void mb_free_blocks_double(struct inode *inode,
+                                struct ext4_buddy *e4b, int first, int count)
+{
+}
+/* DOUBLE_CHECK disabled: marking used needs no shadow-bitmap bookkeeping */
+static inline void mb_mark_used_double(struct ext4_buddy *e4b,
+                                                int first, int count)
+{
+}
+/* DOUBLE_CHECK disabled: there is no shadow bitmap to compare against */
+static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+}
+#endif
+#ifdef AGGRESSIVE_CHECK
+/*
+ * if the condition is false, print it together with the caller location
+ * at KERN_EMERG and BUG().  the expansion refers to identifiers named
+ * function, file and line, so these must be in scope where the macro is
+ * used (they are the parameters of __mb_check_buddy())
+ */
+#define MB_CHECK_ASSERT(assert)                                         \
+do {                                                                    \
+        if (!(assert)) {                                                \
+                printk(KERN_EMERG                                       \
+                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
+                        function, file, line, # assert);                \
+                BUG();                                                  \
+        }                                                               \
+} while (0)
+/*
+ * Cross-check the buddy bitmaps, the block bitmap, the per-group counters
+ * and the preallocation list of the group described by @e4b.  Any
+ * inconsistency triggers MB_CHECK_ASSERT, i.e. a message plus BUG().
+ *
+ * @file, @function and @line identify the call site and are only used in
+ * the assertion message.
+ *
+ * The check is skipped when the MBALLOC mount option is not set, and it
+ * only really runs on every 100th call (see mb_check_counter below).
+ *
+ * Always returns 0.
+ */
+static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+                                const char *function, int line)
+{
+        struct super_block *sb = e4b->bd_sb;
+        int order = e4b->bd_blkbits + 1;
+        int max;
+        int max2;
+        int i;
+        int j;
+        int k;
+        int count;
+        struct ext4_group_info *grp;
+        int fragments = 0;
+        int fstart;
+        struct list_head *cur;
+        void *buddy;
+        void *buddy2;
+        if (!test_opt(sb, MBALLOC))
+                return 0;
+        {
+                /* rate limit: do the full walk only once per 100 calls */
+                static int mb_check_counter;
+                if (mb_check_counter++ % 100 != 0)
+                        return 0;
+        }
+        /*
+         * Walk the orders from the highest one down to 2, comparing each
+         * order's bitmap (buddy) with the bitmap one order below (buddy2).
+         */
+        while (order > 1) {
+                buddy = mb_find_buddy(e4b, order, &max);
+                MB_CHECK_ASSERT(buddy);
+                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
+                MB_CHECK_ASSERT(buddy2);
+                MB_CHECK_ASSERT(buddy != buddy2);
+                MB_CHECK_ASSERT(max * 2 == max2);
+                count = 0;
+                for (i = 0; i < max; i++) {
+                        if (mb_test_bit(i, buddy)) {
+                                /* at most one of the two bits in buddy2
+                                 * may be 0 */
+                                if (!mb_test_bit(i << 1, buddy2)) {
+                                        MB_CHECK_ASSERT(
+                                                mb_test_bit((i<<1)+1, buddy2));
+                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
+                                        MB_CHECK_ASSERT(
+                                                mb_test_bit(i << 1, buddy2));
+                                }
+                                continue;
+                        }
+                        /* bit i is clear at this order: both bits in
+                         * buddy2 must be 1 ... */
+                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
+                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
+                        /* ... and every block it covers must be clear in
+                         * the block bitmap */
+                        for (j = 0; j < (1 << order); j++) {
+                                k = (i * (1 << order)) + j;
+                                MB_CHECK_ASSERT(
+                                        !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+                        }
+                        count++;
+                }
+                /* number of clear bits must match the per-order counter */
+                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
+                order--;
+        }
+        /*
+         * Order 0 pass: count runs of clear bits (fragments) and verify
+         * that every set bit is also set at all orders.
+         */
+        fstart = -1;
+        buddy = mb_find_buddy(e4b, 0, &max);
+        for (i = 0; i < max; i++) {
+                if (!mb_test_bit(i, buddy)) {
+                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
+                        if (fstart == -1) {
+                                /* first clear bit of a new run */
+                                fragments++;
+                                fstart = i;
+                        }
+                        continue;
+                }
+                fstart = -1;
+                /* check used bits only */
+                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
+                        buddy2 = mb_find_buddy(e4b, j, &max2);
+                        k = i >> j;
+                        MB_CHECK_ASSERT(k < max2);
+                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
+                }
+        }
+        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
+        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
+        /*
+         * Every preallocation on the group's list must belong to this
+         * group and have all of its blocks set in the order-0 bitmap.
+         */
+        grp = ext4_get_group_info(sb, e4b->bd_group);
+        buddy = mb_find_buddy(e4b, 0, &max);
+        list_for_each(cur, &grp->bb_prealloc_list) {
+                ext4_group_t groupnr;
+                struct ext4_prealloc_space *pa;
+                pa = list_entry(cur, struct ext4_prealloc_space, group_list);
+                ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k);
+                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
+                for (i = 0; i < pa->len; i++)
+                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
+        }
+        return 0;
+}
+#undef MB_CHECK_ASSERT
+/* Run __mb_check_buddy() on @e4b, passing the call site for messages. */
+#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
+                                        __FILE__, __FUNCTION__, __LINE__)
+#else
+#define mb_check_buddy(e4b)
+#endif
+/*
+ * Record the free run [@first, @first + @len) in the buddy data of @grp.
+ *
+ * The run is cut into power-of-two chunks.  For each chunk the order is
+ * the smaller of the alignment of the current start position and the
+ * largest power of two that still fits into the remaining length.
+ * grp->bb_counters[order] is incremented for every chunk; for orders
+ * above 0 the matching bit is also cleared in that order's bitmap, which
+ * lives at @buddy + sbi->s_mb_offsets[order].
+ *
+ * NOTE(review): the locals are unsigned short, so `border' and `chunk'
+ * are truncated to 16 bits - confirm this is fine for every supported
+ * block size.
+ */
+static void ext4_mb_mark_free_simple(struct super_block *sb,
+                                void *buddy, unsigned first, int len,
+                                        struct ext4_group_info *grp)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        unsigned short min;
+        unsigned short max;
+        unsigned short chunk;
+        unsigned short border;
+        BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
+        /* or-ing this bit into `first' caps the order found by ffs() */
+        border = 2 << sb->s_blocksize_bits;
+        while (len > 0) {
+                /* find how many blocks can be covered since this position */
+                max = ffs(first | border) - 1;
+                /* find how many blocks of power 2 we need to mark */
+                min = fls(len) - 1;
+                if (max < min)
+                        min = max;
+                chunk = 1 << min;
+                /* mark multiblock chunks only */
+                grp->bb_counters[min]++;
+                if (min > 0)
+                        mb_clear_bit(first >> min,
+                                     buddy + sbi->s_mb_offsets[min]);
+                len -= chunk;
+                first += chunk;
+        }
+}
+/*
+ * Build the buddy data of @group from @bitmap.
+ *
+ * @bitmap is scanned for runs of zero bits.  Every run counts as one
+ * fragment; a run longer than one block is handed to
+ * ext4_mb_mark_free_simple(), a single-block run only bumps
+ * bb_counters[0].  Afterwards bb_first_free and bb_fragments are stored
+ * in the group info, bb_free is overwritten (with a KERN_DEBUG message)
+ * if it disagrees with the number of zero bits found, and the NEED_INIT
+ * state bit is cleared.
+ *
+ * The time spent is added to the s_mb_generation_time statistic under
+ * s_bal_lock.
+ */
+static void ext4_mb_generate_buddy(struct super_block *sb,
+                                void *buddy, void *bitmap, ext4_group_t group)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+        unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
+        unsigned short i = 0;
+        unsigned short first;
+        unsigned short len;
+        unsigned free = 0;
+        unsigned fragments = 0;
+        unsigned long long period = get_cycles();
+        /* initialize buddy from bitmap which is aggregation
+         * of on-disk bitmap and preallocations */
+        i = ext4_find_next_zero_bit(bitmap, max, 0);
+        grp->bb_first_free = i;
+        while (i < max) {
+                /* i is the first zero bit of a run; find where it ends */
+                fragments++;
+                first = i;
+                i = ext4_find_next_bit(bitmap, max, i);
+                len = i - first;
+                free += len;
+                if (len > 1)
+                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
+                else
+                        grp->bb_counters[0]++;
+                /* skip the set bits up to the start of the next run */
+                if (i < max)
+                        i = ext4_find_next_zero_bit(bitmap, max, i);
+        }
+        grp->bb_fragments = fragments;
+        if (free != grp->bb_free) {
+                printk(KERN_DEBUG
+                        "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+                        group, free, grp->bb_free);
+                grp->bb_free = free;
+        }
+        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+        period = get_cycles() - period;
+        spin_lock(&EXT4_SB(sb)->s_bal_lock);
+        EXT4_SB(sb)->s_mb_buddies_generated++;
+        EXT4_SB(sb)->s_mb_generation_time += period;
+        spin_unlock(&EXT4_SB(sb)->s_bal_lock);
+}
+/* The buddy information is attached to the buddy cache inode
+ * for convenience. The information regarding each group
+ * is loaded via ext4_mb_load_buddy. The information involves
+ * the block bitmap and the buddy information, and is
+ * stored in the inode as
+ *
+ * {                        page                        }
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.
+ * So for each group we take up 2 blocks. A page can
+ * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
+ * So it can have information regarding groups_per_page which
+ * is blocks_per_page/2
+ */
+/*
+ * Fill one page of the buddy cache inode.
+ *
+ * @page:   page of the buddy cache inode to initialise.
+ * @incore: bitmap to build the buddy from when the first block of @page
+ *          is a buddy block (odd block number); must be NULL when the
+ *          first block of @page is a bitmap block.
+ *
+ * The on-disk block bitmaps of all groups covered by the page are read
+ * (or initialised for BLOCK_UNINIT groups), then every block of the page
+ * is filled: even blocks get a copy of the bitmap with preallocations
+ * marked as used, odd blocks get the buddy generated from that copy.
+ *
+ * Returns 0 on success with the page marked uptodate, -ENOMEM if a
+ * buffer could not be obtained, or -EIO if a group descriptor or bitmap
+ * could not be read.
+ */
+static int ext4_mb_init_cache(struct page *page, char *incore)
+{
+        int blocksize;
+        int blocks_per_page;
+        int groups_per_page;
+        int err = 0;
+        int i;
+        ext4_group_t first_group;
+        int first_block;
+        struct super_block *sb;
+        /*
+         * Must start out NULL: when only one group fits in the page, bh
+         * points at bhs and the loops below stop at the first NULL entry,
+         * even if the read loop left before storing a buffer.
+         */
+        struct buffer_head *bhs = NULL;
+        struct buffer_head **bh;
+        struct inode *inode;
+        char *data;
+        char *bitmap;
+        mb_debug("init page %lu\n", page->index);
+        inode = page->mapping->host;
+        sb = inode->i_sb;
+        blocksize = 1 << inode->i_blkbits;
+        blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+        groups_per_page = blocks_per_page >> 1;
+        if (groups_per_page == 0)
+                groups_per_page = 1;
+        /* allocate buffer_heads to read bitmaps */
+        if (groups_per_page > 1) {
+                err = -ENOMEM;
+                i = sizeof(struct buffer_head *) * groups_per_page;
+                bh = kzalloc(i, GFP_NOFS);
+                if (bh == NULL)
+                        goto out;
+        } else
+                bh = &bhs;
+        first_group = page->index * blocks_per_page / 2;
+        /* read all groups the page covers into the cache */
+        for (i = 0; i < groups_per_page; i++) {
+                struct ext4_group_desc *desc;
+                if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+                        break;
+                err = -EIO;
+                desc = ext4_get_group_desc(sb, first_group + i, NULL);
+                if (desc == NULL)
+                        goto out;
+                err = -ENOMEM;
+                bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
+                if (bh[i] == NULL)
+                        goto out;
+                /* already uptodate: nothing to read; otherwise the
+                 * buffer is now locked */
+                if (bh_uptodate_or_lock(bh[i]))
+                        continue;
+                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+                        ext4_init_block_bitmap(sb, bh[i],
+                                                first_group + i, desc);
+                        set_buffer_uptodate(bh[i]);
+                        unlock_buffer(bh[i]);
+                        continue;
+                }
+                get_bh(bh[i]);
+                bh[i]->b_end_io = end_buffer_read_sync;
+                submit_bh(READ, bh[i]);
+                mb_debug("read bitmap for group %lu\n", first_group + i);
+        }
+        /* wait for I/O completion */
+        for (i = 0; i < groups_per_page && bh[i]; i++)
+                wait_on_buffer(bh[i]);
+        err = -EIO;
+        for (i = 0; i < groups_per_page && bh[i]; i++)
+                if (!buffer_uptodate(bh[i]))
+                        goto out;
+        first_block = page->index * blocks_per_page;
+        for (i = 0; i < blocks_per_page; i++) {
+                int group;
+                struct ext4_group_info *grinfo;
+                group = (first_block + i) >> 1;
+                if (group >= EXT4_SB(sb)->s_groups_count)
+                        break;
+                /*
+                 * data carry information regarding this
+                 * particular group in the format specified
+                 * above
+                 *
+                 */
+                data = page_address(page) + (i * blocksize);
+                bitmap = bh[group - first_group]->b_data;
+                /*
+                 * We place the buddy block and bitmap block
+                 * close together
+                 */
+                if ((first_block + i) & 1) {
+                        /* this is block of buddy */
+                        BUG_ON(incore == NULL);
+                        mb_debug("put buddy for group %u in page %lu/%x\n",
+                                group, page->index, i * blocksize);
+                        memset(data, 0xff, blocksize);
+                        grinfo = ext4_get_group_info(sb, group);
+                        grinfo->bb_fragments = 0;
+                        memset(grinfo->bb_counters, 0,
+                               sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+                        /*
+                         * incore got set to the group block bitmap below
+                         */
+                        ext4_mb_generate_buddy(sb, data, incore, group);
+                        incore = NULL;
+                } else {
+                        /* this is block of bitmap */
+                        BUG_ON(incore != NULL);
+                        mb_debug("put bitmap for group %u in page %lu/%x\n",
+                                group, page->index, i * blocksize);
+                        /* see comments in ext4_mb_put_pa() */
+                        ext4_lock_group(sb, group);
+                        memcpy(data, bitmap, blocksize);
+                        /* mark all preallocated blks used in in-core bitmap */
+                        ext4_mb_generate_from_pa(sb, data, group);
+                        ext4_unlock_group(sb, group);
+                        /* set incore so that the buddy information can be
+                         * generated using this
+                         */
+                        incore = data;
+                }
+        }
+        SetPageUptodate(page);
+        /* everything worked: do not leak the -EIO set before the checks */
+        err = 0;
+out:
+        if (bh) {
+                for (i = 0; i < groups_per_page && bh[i]; i++)
+                        brelse(bh[i]);
+                if (bh != &bhs)
+                        kfree(bh);
+        }
+        return err;
+}
+/*
+ * Look up (and if necessary initialise) the buddy cache pages holding the
+ * bitmap and the buddy of @group, and describe them in @e4b.
+ *
+ * On success 0 is returned and @e4b holds one page reference for
+ * bd_bitmap_page and one for bd_buddy_page; the caller drops them with
+ * ext4_mb_release_desc().
+ *
+ * On failure -EIO is returned, every page reference taken here has been
+ * dropped again, and bd_bitmap/bd_buddy are NULL.
+ */
+static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+                struct ext4_buddy *e4b)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct inode *inode = sbi->s_buddy_cache;
+        int blocks_per_page;
+        int block;
+        int pnum;
+        int poff;
+        struct page *page;
+        mb_debug("load group %lu\n", group);
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        e4b->bd_blkbits = sb->s_blocksize_bits;
+        e4b->bd_info = ext4_get_group_info(sb, group);
+        e4b->bd_sb = sb;
+        e4b->bd_group = group;
+        e4b->bd_buddy_page = NULL;
+        e4b->bd_bitmap_page = NULL;
+        /*
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        poff = block % blocks_per_page;
+        /* we could use find_or_create_page(), but it locks page
+         * what we'd like to avoid in fast path ... */
+        page = find_get_page(inode->i_mapping, pnum);
+        if (page == NULL || !PageUptodate(page)) {
+                if (page)
+                        page_cache_release(page);
+                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+                if (page) {
+                        BUG_ON(page->mapping != inode->i_mapping);
+                        if (!PageUptodate(page)) {
+                                ext4_mb_init_cache(page, NULL);
+                                mb_cmp_bitmaps(e4b, page_address(page) +
+                                               (poff * sb->s_blocksize));
+                        }
+                        unlock_page(page);
+                }
+        }
+        /* success of the initialisation is judged by the uptodate flag */
+        if (page == NULL || !PageUptodate(page))
+                goto err;
+        e4b->bd_bitmap_page = page;
+        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+        mark_page_accessed(page);
+        /* the buddy block directly follows the bitmap block */
+        block++;
+        pnum = block / blocks_per_page;
+        poff = block % blocks_per_page;
+        page = find_get_page(inode->i_mapping, pnum);
+        if (page == NULL || !PageUptodate(page)) {
+                if (page)
+                        page_cache_release(page);
+                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+                if (page) {
+                        BUG_ON(page->mapping != inode->i_mapping);
+                        if (!PageUptodate(page))
+                                ext4_mb_init_cache(page, e4b->bd_bitmap);
+                        unlock_page(page);
+                }
+        }
+        if (page == NULL || !PageUptodate(page))
+                goto err;
+        e4b->bd_buddy_page = page;
+        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+        mark_page_accessed(page);
+        BUG_ON(e4b->bd_bitmap_page == NULL);
+        BUG_ON(e4b->bd_buddy_page == NULL);
+        return 0;
+err:
+        /*
+         * A page that was looked up but is still not uptodate has not
+         * been recorded in e4b, yet we hold a reference on it.  Drop that
+         * reference here so it is not leaked.
+         */
+        if (page)
+                page_cache_release(page);
+        if (e4b->bd_bitmap_page)
+                page_cache_release(e4b->bd_bitmap_page);
+        if (e4b->bd_buddy_page)
+                page_cache_release(e4b->bd_buddy_page);
+        e4b->bd_buddy = NULL;
+        e4b->bd_bitmap = NULL;
+        return -EIO;
+}
+/* Drop the page references that @e4b holds on its bitmap and buddy pages. */
+static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+{
+        struct page *bitmap_page = e4b->bd_bitmap_page;
+        struct page *buddy_page = e4b->bd_buddy_page;
+        if (bitmap_page != NULL)
+                page_cache_release(bitmap_page);
+        if (buddy_page != NULL)
+                page_cache_release(buddy_page);
+}
+/*
+ * Walk the buddy bitmaps upwards from order 1 and return the first order
+ * at which the chunk containing @block has its bit clear.  Returns 0 when
+ * no order up to bd_blkbits + 1 has a clear bit for it.
+ */
+static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
+{
+        void *level;
+        int ord;
+        BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
+        level = EXT4_MB_BUDDY(e4b);
+        for (ord = 1; ord <= e4b->bd_blkbits + 1; ord++) {
+                /* index of the chunk holding the block at this order */
+                block >>= 1;
+                if (!mb_test_bit(block, level))
+                        return ord;
+                /* step over this order's bitmap to the next one */
+                level += 1 << (e4b->bd_blkbits - ord);
+        }
+        return 0;
+}
+/*
+ * Clear @len bits of bitmap @bm starting at bit @cur.  Aligned runs of 32
+ * bits are zeroed with one plain store; all other bits are cleared one by
+ * one through mb_clear_bit_atomic() using @lock.
+ */
+static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+{
+        const int end = cur + len;
+        __u32 *word;
+        while (cur < end) {
+                if ((cur & 31) != 0 || (end - cur) < 32) {
+                        /* unaligned start or short tail: single bit */
+                        mb_clear_bit_atomic(lock, cur, bm);
+                        cur++;
+                } else {
+                        /* whole aligned word at once */
+                        word = bm + (cur >> 3);
+                        *word = 0;
+                        cur += 32;
+                }
+        }
+}
+/*
+ * Set @len bits of bitmap @bm starting at bit @cur.  Aligned runs of 32
+ * bits are filled with one plain store; all other bits are set one by one
+ * through mb_set_bit_atomic() using @lock.
+ */
+static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+{
+        const int end = cur + len;
+        __u32 *word;
+        while (cur < end) {
+                if ((cur & 31) != 0 || (end - cur) < 32) {
+                        /* unaligned start or short tail: single bit */
+                        mb_set_bit_atomic(lock, cur, bm);
+                        cur++;
+                } else {
+                        /* whole aligned word at once */
+                        word = bm + (cur >> 3);
+                        *word = 0xffffffff;
+                        cur += 32;
+                }
+        }
+}
+/*
+ * Return @count blocks starting at group-relative bit @first to the
+ * free pool of the group described by @e4b.
+ *
+ * Updates bb_free, bb_first_free and bb_fragments, clears the bits in the
+ * block bitmap and merges freed blocks with their free buddies, keeping
+ * bb_counters[] in step.  A block whose bit is already clear is reported
+ * through ext4_error() as a double free; @inode is only used for that
+ * message and may be NULL.
+ *
+ * The group lock must be held (enforced by BUG_ON).  Always returns 0.
+ */
+static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+                          int first, int count)
+{
+        int block = 0;
+        int max = 0;
+        int order;
+        void *buddy;
+        void *buddy2;
+        struct super_block *sb = e4b->bd_sb;
+        BUG_ON(first + count > (sb->s_blocksize << 3));
+        BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+        mb_check_buddy(e4b);
+        mb_free_blocks_double(inode, e4b, first, count);
+        e4b->bd_info->bb_free += count;
+        if (first < e4b->bd_info->bb_first_free)
+                e4b->bd_info->bb_first_free = first;
+        /* let's maintain fragments counter */
+        /* here block/max are flags: is the block just before / just
+         * after the freed range already free? */
+        if (first != 0)
+                block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
+        if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
+                max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
+        /* free on both sides: two fragments become one;
+         * free on neither side: a new fragment appears */
+        if (block && max)
+                e4b->bd_info->bb_fragments--;
+        else if (!block && !max)
+                e4b->bd_info->bb_fragments++;
+        /* let's maintain buddy itself */
+        while (count-- > 0) {
+                block = first++;
+                order = 0;
+                if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
+                        /* bit already clear: compute the filesystem
+                         * block number for the error message */
+                        ext4_fsblk_t blocknr;
+                        blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+                        blocknr += block;
+                        blocknr +=
+                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+                        ext4_error(sb, __FUNCTION__, "double-free of inode"
+                                   " %lu's block %llu(bit %u in group %lu)\n",
+                                   inode ? inode->i_ino : 0, blocknr, block,
+                                   e4b->bd_group);
+                }
+                mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
+                e4b->bd_info->bb_counters[order]++;
+                /* start of the buddy */
+                buddy = mb_find_buddy(e4b, order, &max);
+                do {
+                        /* index of the even member of the buddy pair */
+                        block &= ~1UL;
+                        if (mb_test_bit(block, buddy) ||
+                                        mb_test_bit(block + 1, buddy))
+                                break;
+                        /* both the buddies are free, try to coalesce them */
+                        buddy2 = mb_find_buddy(e4b, order + 1, &max);
+                        if (!buddy2)
+                                break;
+                        if (order > 0) {
+                                /* for special purposes, we don't set
+                                 * free bits in bitmap */
+                                mb_set_bit(block, buddy);
+                                mb_set_bit(block + 1, buddy);
+                        }
+                        /* two chunks of this order become one chunk of
+                         * the next order */
+                        e4b->bd_info->bb_counters[order]--;
+                        e4b->bd_info->bb_counters[order]--;
+                        block = block >> 1;
+                        order++;
+                        e4b->bd_info->bb_counters[order]++;
+                        mb_clear_bit(block, buddy2);
+                        buddy = buddy2;
+                } while (1);
+        }
+        mb_check_buddy(e4b);
+        return 0;
+}
+/*
+ * Find the free extent that starts at bit @block of the order-@order
+ * bitmap and store it in @ex.
+ *
+ * If the bit is set, @ex is zeroed and 0 is returned.  Otherwise the
+ * extent runs from @block to the end of the free chunk containing it and
+ * is then extended by the following free chunks for as long as it is
+ * shorter than @needed.
+ *
+ * The group lock must be held (enforced by BUG_ON).  Returns ex->fe_len.
+ */
+static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+                                int needed, struct ext4_free_extent *ex)
+{
+        int next = block;
+        int max;
+        int ord;
+        void *buddy;
+        BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+        BUG_ON(ex == NULL);
+        buddy = mb_find_buddy(e4b, order, &max);
+        BUG_ON(buddy == NULL);
+        BUG_ON(block >= max);
+        if (mb_test_bit(block, buddy)) {
+                ex->fe_len = 0;
+                ex->fe_start = 0;
+                ex->fe_group = 0;
+                return 0;
+        }
+        /* FIXME drop order completely ? */
+        if (likely(order == 0)) {
+                /* find actual order */
+                order = mb_find_order_for_block(e4b, block);
+                block = block >> order;
+        }
+        ex->fe_len = 1 << order;
+        ex->fe_start = block << order;
+        ex->fe_group = e4b->bd_group;
+        /* calc difference from given start */
+        next = next - ex->fe_start;
+        ex->fe_len -= next;
+        ex->fe_start += next;
+        /* append following free chunks until the request is covered */
+        while (needed > ex->fe_len &&
+               (buddy = mb_find_buddy(e4b, order, &max))) {
+                if (block + 1 >= max)
+                        break;
+                /* first block after the current chunk */
+                next = (block + 1) * (1 << order);
+                if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+                        break;
+                ord = mb_find_order_for_block(e4b, next);
+                order = ord;
+                block = next >> order;
+                ex->fe_len += 1 << order;
+        }
+        BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
+        return ex->fe_len;
+}
+/*
+ * Mark the extent @ex as in use in the group described by @e4b.
+ *
+ * Updates bb_free, bb_first_free and bb_fragments, removes the range from
+ * the buddy bitmaps (splitting larger free chunks where the range covers
+ * only part of them) and finally sets the bits in the block bitmap.
+ *
+ * The group lock must be held (enforced by BUG_ON).
+ *
+ * Returns 0 if no chunk had to be split.  Otherwise returns, for the
+ * first split, the length still to be marked at that point in the low 16
+ * bits and the order of the chunk being split in the bits above.
+ */
+static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
+{
+        int ord;
+        int mlen = 0;
+        int max = 0;
+        int cur;
+        int start = ex->fe_start;
+        int len = ex->fe_len;
+        unsigned ret = 0;
+        int len0 = len;
+        void *buddy;
+        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
+        BUG_ON(e4b->bd_group != ex->fe_group);
+        BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+        mb_check_buddy(e4b);
+        mb_mark_used_double(e4b, start, len);
+        e4b->bd_info->bb_free -= len;
+        if (e4b->bd_info->bb_first_free == start)
+                e4b->bd_info->bb_first_free += len;
+        /* let's maintain fragments counter */
+        /* here mlen/max are flags: is the block just before / just
+         * after the range free? */
+        if (start != 0)
+                mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
+                max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+        /* free on both sides: one fragment is cut in two;
+         * free on neither side: a whole fragment disappears */
+        if (mlen && max)
+                e4b->bd_info->bb_fragments++;
+        else if (!mlen && !max)
+                e4b->bd_info->bb_fragments--;
+        /* let's maintain buddy itself */
+        while (len) {
+                ord = mb_find_order_for_block(e4b, start);
+                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+                        /* the whole chunk may be allocated at once! */
+                        mlen = 1 << ord;
+                        buddy = mb_find_buddy(e4b, ord, &max);
+                        BUG_ON((start >> ord) >= max);
+                        mb_set_bit(start >> ord, buddy);
+                        e4b->bd_info->bb_counters[ord]--;
+                        start += mlen;
+                        len -= mlen;
+                        BUG_ON(len < 0);
+                        continue;
+                }
+                /* store for history */
+                if (ret == 0)
+                        ret = len | (ord << 16);
+                /* we have to split large buddy */
+                BUG_ON(ord <= 0);
+                buddy = mb_find_buddy(e4b, ord, &max);
+                mb_set_bit(start >> ord, buddy);
+                e4b->bd_info->bb_counters[ord]--;
+                /* both halves become free chunks one order below; the
+                 * next iteration looks at `start' again */
+                ord--;
+                cur = (start >> ord) & ~1U;
+                buddy = mb_find_buddy(e4b, ord, &max);
+                mb_clear_bit(cur, buddy);
+                mb_clear_bit(cur + 1, buddy);
+                e4b->bd_info->bb_counters[ord]++;
+                e4b->bd_info->bb_counters[ord]++;
+        }
+        /* finally mark the whole original range in the block bitmap */
+        mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
+                        EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+        mb_check_buddy(e4b);
+        return ret;
+}
+/*
+ * Commit the best extent recorded in @ac: trim it to the goal length,
+ * mark it used in the buddy of @e4b and flag the context as FOUND.
+ *
+ * Must be called under group lock!
+ */
+static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
+                                        struct ext4_buddy *e4b)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+        struct ext4_free_extent *best = &ac->ac_b_ex;
+        int packed;
+        BUG_ON(best->fe_group != e4b->bd_group);
+        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+        best->fe_len = min(best->fe_len, ac->ac_g_ex.fe_len);
+        best->fe_logical = ac->ac_g_ex.fe_logical;
+        packed = mb_mark_used(e4b, best);
+        /* preallocation can change ac_b_ex, so keep a copy of what was
+         * actually allocated for history */
+        ac->ac_f_ex = *best;
+        ac->ac_status = AC_STATUS_FOUND;
+        /* mb_mark_used() packs two values: low 16 bits and the rest */
+        ac->ac_tail = packed & 0xffff;
+        ac->ac_buddy = packed >> 16;
+        /*
+         * Remember the bitmap and buddy pages in the context, taking a
+         * reference on each.
+         * FIXME: why the context needs them is not documented here.
+         */
+        ac->ac_bitmap_page = e4b->bd_bitmap_page;
+        get_page(ac->ac_bitmap_page);
+        ac->ac_buddy_page = e4b->bd_buddy_page;
+        get_page(ac->ac_buddy_page);
+        /* store last allocated for subsequent stream allocation */
+        if (ac->ac_flags & EXT4_MB_HINT_DATA) {
+                spin_lock(&sbi->s_md_lock);
+                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
+                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
+                spin_unlock(&sbi->s_md_lock);
+        }
+}
+/*
+ * regular allocator, for general purposes allocation
+ */
+/*
+ * Decide whether scanning should stop.
+ *
+ * Sets AC_STATUS_BREAK once more than s_mb_max_to_scan extents were seen
+ * (unless EXT4_MB_HINT_FIRST is set).  Otherwise, if the best extent
+ * already covers the goal length, lies in the group of @e4b, and either
+ * @finish_group is set or more than s_mb_min_to_scan extents were seen,
+ * the extent is re-checked and used if it is still large enough.
+ */
+static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
+                                        struct ext4_buddy *e4b,
+                                        int finish_group)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+        struct ext4_free_extent *best = &ac->ac_b_ex;
+        struct ext4_free_extent *goal = &ac->ac_g_ex;
+        struct ext4_free_extent found;
+        int avail;
+        /* we don't want to scan forever */
+        if (ac->ac_found > sbi->s_mb_max_to_scan &&
+                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+                ac->ac_status = AC_STATUS_BREAK;
+                return;
+        }
+        /* nothing good enough yet: keep scanning */
+        if (best->fe_len < goal->fe_len)
+                return;
+        if (!finish_group && ac->ac_found <= sbi->s_mb_min_to_scan)
+                return;
+        if (best->fe_group != e4b->bd_group)
+                return;
+        /* recheck the chunk's availability - we don't know whether it
+         * was found within this lock-unlock period or not */
+        avail = mb_find_extent(e4b, 0, best->fe_start, goal->fe_len, &found);
+        if (avail >= goal->fe_len)
+                ext4_mb_use_best_found(ac, e4b);
+}
+/*
+ * Evaluate a newly found free extent @ex.
+ *
+ * If EXT4_MB_HINT_FIRST is set or @ex has exactly the goal length, it is
+ * taken at once: marked used and the context flagged so scanning stops.
+ * Otherwise @ex is compared with the best extent stored so far and
+ * replaces it when it is better; the stored extent is used later if
+ * nothing good enough turns up.
+ *
+ * FIXME: real allocation policy is to be designed yet!
+ */
+static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
+                                        struct ext4_free_extent *ex,
+                                        struct ext4_buddy *e4b)
+{
+        struct ext4_free_extent *best = &ac->ac_b_ex;
+        struct ext4_free_extent *goal = &ac->ac_g_ex;
+        BUG_ON(ex->fe_len <= 0);
+        BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+        BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
+        ac->ac_found++;
+        /* take-the-first-one hint, or a chunk of exactly the goal size */
+        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST) ||
+                        ex->fe_len == goal->fe_len) {
+                *best = *ex;
+                ext4_mb_use_best_found(ac, e4b);
+                return;
+        }
+        /* first extent seen: just remember it */
+        if (best->fe_len == 0) {
+                *best = *ex;
+                return;
+        }
+        /*
+         * Replace the stored extent when the new one is better:
+         *  - request not yet satisfied: any larger extent is better;
+         *  - request satisfied: a smaller extent that is still larger
+         *    than the goal is better.
+         */
+        if ((best->fe_len < goal->fe_len && ex->fe_len > best->fe_len) ||
+            (best->fe_len >= goal->fe_len && ex->fe_len > goal->fe_len &&
+             ex->fe_len < best->fe_len))
+                *best = *ex;
+        ext4_mb_check_limits(ac, e4b, 0);
+}
+/*
+ * Try to allocate the best extent remembered in @ac.  The buddy of its
+ * group is loaded into @e4b, the extent is looked up again under the
+ * group lock and, if any part of it is still free, it is used.
+ *
+ * Returns the error from ext4_mb_load_buddy(), or 0 otherwise (whether
+ * or not something was allocated).
+ */
+static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+                                        struct ext4_buddy *e4b)
+{
+        struct ext4_free_extent cand = ac->ac_b_ex;
+        ext4_group_t grp = cand.fe_group;
+        int rc;
+        BUG_ON(cand.fe_len <= 0);
+        rc = ext4_mb_load_buddy(ac->ac_sb, grp, e4b);
+        if (rc)
+                return rc;
+        ext4_lock_group(ac->ac_sb, grp);
+        /* cand is both the request and the place the result is stored */
+        if (mb_find_extent(e4b, 0, cand.fe_start, cand.fe_len, &cand) > 0) {
+                ac->ac_b_ex = cand;
+                ext4_mb_use_best_found(ac, e4b);
+        }
+        ext4_unlock_group(ac->ac_sb, grp);
+        ext4_mb_release_desc(e4b);
+        return 0;
+}
+/*
+ * Try to allocate exactly at the goal extent ac->ac_g_ex.
+ *
+ * Nothing is attempted unless EXT4_MB_HINT_TRY_GOAL is set. Otherwise
+ * the goal group's buddy is loaded and, under the group lock, the free
+ * extent at the goal start is looked up:
+ *  - a stripe-sized request is taken only if the extent is long enough
+ *    and its absolute start block is a multiple of the stripe size;
+ *  - any other request is taken if the extent covers the whole goal
+ *    length, or, with EXT4_MB_HINT_MERGE, if it has any length at all.
+ *
+ * Returns 0, or the error from ext4_mb_load_buddy(). Whether something
+ * was taken is visible to the caller through ac->ac_status.
+ */
+static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+                                struct ext4_buddy *e4b)
+{
+        ext4_group_t group = ac->ac_g_ex.fe_group;
+        int max;
+        int err;
+        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+        struct ext4_super_block *es = sbi->s_es;
+        struct ext4_free_extent ex;
+        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+                return 0;
+        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
+        if (err)
+                return err;
+        ext4_lock_group(ac->ac_sb, group);
+        max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+                             ac->ac_g_ex.fe_len, &ex);
+        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+                ext4_fsblk_t start;
+                /*
+                 * Widen the group number before multiplying: the product
+                 * of two native-word values can wrap on 32-bit machines
+                 * before it is stored in the wider block number.
+                 */
+                start = (ext4_fsblk_t)e4b->bd_group *
+                        EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
+                        ex.fe_start + le32_to_cpu(es->s_first_data_block);
+                /* use do_div to get remainder (would be 64-bit modulo) */
+                if (do_div(start, sbi->s_stripe) == 0) {
+                        ac->ac_found++;
+                        ac->ac_b_ex = ex;
+                        ext4_mb_use_best_found(ac, e4b);
+                }
+        } else if (max >= ac->ac_g_ex.fe_len ||
+                   (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE))) {
+                /*
+                 * Either the goal is fully satisfied, or the caller asked
+                 * to merge even a small number of blocks into an existing
+                 * extent. Both cases take whatever was found at the goal.
+                 */
+                BUG_ON(ex.fe_len <= 0);
+                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
+                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
+                ac->ac_found++;
+                ac->ac_b_ex = ex;
+                ext4_mb_use_best_found(ac, e4b);
+        }
+        ext4_unlock_group(ac->ac_sb, group);
+        ext4_mb_release_desc(e4b);
+        return 0;
+}
+/*
+ * Buddy-only scan (the block bitmap is not consulted): starting at
+ * order ac->ac_2order and going up to order s_blocksize_bits + 1, pick
+ * the first order whose bb_counters[] entry is non-zero and take the
+ * chunk at the first zero bit of that order's buddy bitmap.
+ * If no order qualifies the context is left untouched.
+ */
+static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+                                        struct ext4_buddy *e4b)
+{
+        struct super_block *sb = ac->ac_sb;
+        struct ext4_group_info *grp = e4b->bd_info;
+        int top_order = sb->s_blocksize_bits + 1;
+        int order = ac->ac_2order;
+        void *buddy;
+        int slot;
+        int max;
+        BUG_ON(ac->ac_2order <= 0);
+        /* advance to the first order that has a non-zero counter */
+        while (order <= top_order && grp->bb_counters[order] == 0)
+                order++;
+        if (order > top_order)
+                return;
+        buddy = mb_find_buddy(e4b, order, &max);
+        BUG_ON(buddy == NULL);
+        slot = ext4_find_next_zero_bit(buddy, max, 0);
+        /* the counter said there is a chunk, so a zero bit must exist */
+        BUG_ON(slot >= max);
+        ac->ac_found++;
+        ac->ac_b_ex.fe_group = e4b->bd_group;
+        ac->ac_b_ex.fe_start = slot << order;
+        ac->ac_b_ex.fe_len = 1 << order;
+        ext4_mb_use_best_found(ac, e4b);
+        BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+        if (EXT4_SB(sb)->s_mb_stats)
+                atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
+}
+/*
+ * Walk the group's block bitmap and pass every extent reported by
+ * mb_find_extent() to ext4_mb_measure_extent(). The walk begins at
+ * bb_first_free and is bounded by the group's free-block count
+ * (bb_free): it ends once that many blocks have been accounted for, or
+ * as soon as ac->ac_status is no longer AC_STATUS_CONTINUE.
+ */
+static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+                                        struct ext4_buddy *e4b)
+{
+        struct super_block *sb = ac->ac_sb;
+        void *bitmap = EXT4_MB_BITMAP(e4b);
+        struct ext4_free_extent ex;
+        int pos = e4b->bd_info->bb_first_free;
+        int left = e4b->bd_info->bb_free;
+        BUG_ON(left <= 0);
+        while (left != 0 && ac->ac_status == AC_STATUS_CONTINUE) {
+                pos = ext4_find_next_zero_bit(bitmap,
+                                                EXT4_BLOCKS_PER_GROUP(sb), pos);
+                if (pos >= EXT4_BLOCKS_PER_GROUP(sb)) {
+                        /* bitmap exhausted, so the counter must agree */
+                        BUG_ON(left != 0);
+                        break;
+                }
+                mb_find_extent(e4b, 0, pos, ac->ac_g_ex.fe_len, &ex);
+                BUG_ON(ex.fe_len <= 0);
+                BUG_ON(left < ex.fe_len);
+                ext4_mb_measure_extent(ac, &ex, e4b);
+                /* skip past the extent just measured */
+                left -= ex.fe_len;
+                pos += ex.fe_len;
+        }
+        ext4_mb_check_limits(ac, e4b, 1);
+}
+/*
+ * Special case for striped storage such as raid5: for a stripe-sized
+ * request, probe only the stripe-aligned positions of the group and
+ * take the first one where a whole stripe is available.
+ * XXX should do so at least for multiples of stripe size as well
+ */
+static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+                                 struct ext4_buddy *e4b)
+{
+        struct super_block *sb = ac->ac_sb;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        void *bitmap = EXT4_MB_BITMAP(e4b);
+        struct ext4_free_extent ex;
+        ext4_fsblk_t first_group_block;
+        ext4_fsblk_t a;
+        ext4_grpblk_t i;
+        int max;
+        BUG_ON(sbi->s_stripe == 0);
+        /*
+         * Absolute number of the group's first block. Widen the group
+         * number before multiplying so the product cannot wrap in a
+         * native word on 32-bit machines.
+         */
+        first_group_block = (ext4_fsblk_t)e4b->bd_group *
+                        EXT4_BLOCKS_PER_GROUP(sb) +
+                        le32_to_cpu(sbi->s_es->s_first_data_block);
+        /* round up to the next stripe boundary, as an in-group offset */
+        a = first_group_block + sbi->s_stripe - 1;
+        do_div(a, sbi->s_stripe);
+        i = (a * sbi->s_stripe) - first_group_block;
+        for (; i < EXT4_BLOCKS_PER_GROUP(sb); i += sbi->s_stripe) {
+                if (mb_test_bit(i, bitmap))
+                        continue;
+                max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+                if (max >= sbi->s_stripe) {
+                        ac->ac_found++;
+                        ac->ac_b_ex = ex;
+                        ext4_mb_use_best_found(ac, e4b);
+                        break;
+                }
+        }
+}
+/*
+ * Decide whether @group is worth scanning under criterion @cr.
+ * Returns 1 if it is, 0 otherwise. Groups with no free blocks or no
+ * fragments are always rejected.
+ *   cr 0: the group must not be flagged EXT4_BG_BLOCK_UNINIT and must
+ *         have a non-zero counter at order ac->ac_2order or above
+ *   cr 1: the average fragment (free / fragments) covers the goal length
+ *   cr 2: the total free count covers the goal length
+ *   cr 3: any group with free blocks is accepted
+ */
+static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+                                ext4_group_t group, int cr)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+        struct ext4_group_desc *desc;
+        unsigned nfree, nfrags;
+        unsigned order, top_order;
+        BUG_ON(cr < 0 || cr >= 4);
+        BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+        nfree = grp->bb_free;
+        nfrags = grp->bb_fragments;
+        if (nfree == 0 || nfrags == 0)
+                return 0;
+        switch (cr) {
+        case 0:
+                BUG_ON(ac->ac_2order == 0);
+                /* If this group is uninitialized, skip it initially */
+                desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
+                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
+                        return 0;
+                top_order = ac->ac_sb->s_blocksize_bits + 1;
+                for (order = ac->ac_2order; order <= top_order; order++) {
+                        if (grp->bb_counters[order] > 0)
+                                return 1;
+                }
+                return 0;
+        case 1:
+                return (nfree / nfrags) >= ac->ac_g_ex.fe_len;
+        case 2:
+                return nfree >= ac->ac_g_ex.fe_len;
+        case 3:
+                return 1;
+        default:
+                BUG();
+        }
+        return 0;
+}
+static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+{       /*
+         * Main search of the allocator for one request. The goal position
+         * is tried first; failing that, all groups are scanned under
+         * criteria cr = 0..3 until ac->ac_status leaves
+         * AC_STATUS_CONTINUE. Returns 0, or the error reported by
+         * ext4_mb_find_by_goal() / ext4_mb_load_buddy(). A return of 0
+         * does not by itself mean blocks were found: the outcome is
+         * carried in ac->ac_status and ac->ac_b_ex.
+         */
+        ext4_group_t group;
+        ext4_group_t i;
+        int cr;
+        int err = 0;
+        int bsbits;
+        struct ext4_sb_info *sbi;
+        struct super_block *sb;
+        struct ext4_buddy e4b;
+        loff_t size, isize;
+        sb = ac->ac_sb;
+        sbi = EXT4_SB(sb);
+        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+        /* first, try the goal */
+        err = ext4_mb_find_by_goal(ac, &e4b);
+        if (err || ac->ac_status == AC_STATUS_FOUND)
+                goto out;
+        /* the caller accepts nothing but the goal: give up here */
+        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+                goto out;
+        /*
+         * ac->ac_2order is set only if fe_len is a power of 2.
+         * If ac_2order is set we also start at criteria 0 so that we
+         * try an exact allocation using the buddy data.
+         * Note that 'i' temporarily holds fls(fe_len) here; it is
+         * reused as the group counter in the scan loop below.
+         */
+        i = fls(ac->ac_g_ex.fe_len);
+        ac->ac_2order = 0;
+        /*
+         * We search using buddy data only if the order of the request
+         * is greater than or equal to sbi->s_mb_order2_reqs.
+         * You can tune it via /proc/fs/ext4/<partition>/order2_req
+         */
+        if (i >= sbi->s_mb_order2_reqs) {
+                /*
+                 * This should tell if fe_len is exactly a power of 2:
+                 * clearing the top set bit must leave nothing behind.
+                 */
+                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+                        ac->ac_2order = i - 1;
+        }
+        bsbits = ac->ac_sb->s_blocksize_bits;
+        /*
+         * If stream allocation applies, use the global goal.
+         * 'size' is the larger of the logical end of the request and the
+         * inode size shifted down by the block-size bits.
+         */
+        size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+        isize = i_size_read(ac->ac_inode) >> bsbits;
+        if (size < isize)
+                size = isize;
+        if (size < sbi->s_mb_stream_request &&
+                        (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+                /* TBD: may be hot point */
+                spin_lock(&sbi->s_md_lock);
+                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
+                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
+                spin_unlock(&sbi->s_md_lock);
+        }
+        /* the search for a suitable group starts from the goal's group */
+        group = ac->ac_g_ex.fe_group;
+        /* Let's just scan groups to find more-or-less suitable blocks */
+        cr = ac->ac_2order ? 0 : 1;
+        /*
+         * cr == 0 try to get exact allocation,
+         * cr == 3  try to get anything
+         */
+repeat:
+        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
+                ac->ac_criteria = cr;
+                /*
+                 * Visit every group once per criterion: 'i' counts the
+                 * visits while 'group' wraps around to 0 at the end.
+                 */
+                for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+                        struct ext4_group_info *grp;
+                        struct ext4_group_desc *desc;
+                        if (group == EXT4_SB(sb)->s_groups_count)
+                                group = 0;
+                        /* quick check to skip empty groups */
+                        grp = ext4_get_group_info(ac->ac_sb, group);
+                        if (grp->bb_free == 0)
+                                continue;
+                        /*
+                         * If the group is already initialized we can check
+                         * whether it is a good group without loading the
+                         * buddy; otherwise load it once first.
+                         */
+                        if (EXT4_MB_GRP_NEED_INIT(grp)) {
+                                /*
+                                 * we need full data about the group
+                                 * to make a good selection
+                                 */
+                                err = ext4_mb_load_buddy(sb, group, &e4b);
+                                if (err)
+                                        goto out;
+                                ext4_mb_release_desc(&e4b);
+                        }
+                        /*
+                         * If the particular group doesn't satisfy our
+                         * criteria we continue with the next group
+                         */
+                        if (!ext4_mb_good_group(ac, group, cr))
+                                continue;
+                        err = ext4_mb_load_buddy(sb, group, &e4b);
+                        if (err)
+                                goto out;
+                        ext4_lock_group(sb, group);
+                        /* re-check now that the group lock is held */
+                        if (!ext4_mb_good_group(ac, group, cr)) {
+                                /* someone did allocation from this group */
+                                ext4_unlock_group(sb, group);
+                                ext4_mb_release_desc(&e4b);
+                                continue;
+                        }
+                        ac->ac_groups_scanned++;
+                        desc = ext4_get_group_desc(sb, group, NULL);
+                        /*
+                         * Choose the scanner: buddy-only for cr 0 or for an
+                         * uninitialized group with a power-of-2 request,
+                         * stripe-aligned for stripe-sized requests at cr 1,
+                         * full bitmap scan otherwise.
+                         */
+                        if (cr == 0 || (desc->bg_flags &
+                                        cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
+                                        ac->ac_2order != 0))
+                                ext4_mb_simple_scan_group(ac, &e4b);
+                        else if (cr == 1 &&
+                                        ac->ac_g_ex.fe_len == sbi->s_stripe)
+                                ext4_mb_scan_aligned(ac, &e4b);
+                        else
+                                ext4_mb_complex_scan_group(ac, &e4b);
+                        ext4_unlock_group(sb, group);
+                        ext4_mb_release_desc(&e4b);
+                        if (ac->ac_status != AC_STATUS_CONTINUE)
+                                break;
+                }
+        }
+        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+                /*
+                 * We've been searching too long. Let's try to allocate
+                 * the best chunk we've found so far. The return value is
+                 * not checked; only ac->ac_status is examined afterwards.
+                 */
+                ext4_mb_try_best_found(ac, &e4b);
+                if (ac->ac_status != AC_STATUS_FOUND) {
+                        /*
+                         * Someone more lucky has already allocated it.
+                         * The only thing we can do is just take the first
+                         * found block(s): forget the stale best extent,
+                         * set EXT4_MB_HINT_FIRST and rescan at cr == 3.
+                         */
+                        ac->ac_b_ex.fe_group = 0;
+                        ac->ac_b_ex.fe_start = 0;
+                        ac->ac_b_ex.fe_len = 0;
+                        ac->ac_status = AC_STATUS_CONTINUE;
+                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
+                        cr = 3;
+                        atomic_inc(&sbi->s_mb_lost_chunks);
+                        goto repeat;
+                }
+        }
+out:
+        return err;
+}
+#ifdef EXT4_MB_HISTORY
+struct ext4_mb_proc_session {   /* private state of one open mb_history file */
+        struct ext4_mb_history *history;        /* copy of the ring taken at open time */
+        struct super_block *sb; /* filesystem the proc file belongs to */
+        int start;      /* ring index at which iteration begins and ends */
+        int max;        /* number of entries in history[] */
+};
+/*
+ * Starting at @hs, return the next record of the snapshot whose
+ * orig.fe_len is non-zero, treating history[] as a ring of s->max
+ * entries. Returns NULL once the walk comes back to index s->start.
+ * @first must be non-zero for the initial call, which itself begins at
+ * s->start and therefore must not be treated as "wrapped around".
+ */
+static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
+                                        struct ext4_mb_history *hs,
+                                        int first)
+{
+        struct ext4_mb_history *const ring_end = s->history + s->max;
+        struct ext4_mb_history *const ring_head = s->history + s->start;
+        if (hs == ring_end)
+                hs = s->history;
+        if (hs == ring_head && !first)
+                return NULL;
+        for (; hs->orig.fe_len == 0; ) {
+                if (++hs == ring_end)
+                        hs = s->history;
+                if (hs == ring_head)
+                        return NULL;
+        }
+        return hs;
+}
+/*
+ * seq_file ->start: position 0 yields SEQ_START_TOKEN (the header
+ * line); position n > 0 yields the n-th non-empty record of the
+ * snapshot, or NULL when there are fewer than n.
+ */
+static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
+{
+        struct ext4_mb_proc_session *s = seq->private;
+        struct ext4_mb_history *hs;
+        int remaining = *pos;
+        if (remaining == 0)
+                return SEQ_START_TOKEN;
+        hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
+        if (!hs)
+                return NULL;
+        /* hs is record #1; step forward until record #*pos is reached */
+        while (--remaining) {
+                hs = ext4_mb_history_skip_empty(s, hs + 1, 0);
+                if (hs == NULL)
+                        break;
+        }
+        return hs;
+}
+/*
+ * seq_file ->next: advance *pos and return the following non-empty
+ * record. After the header token this is the first record of the ring;
+ * otherwise it is the one after @v. NULL ends the iteration.
+ */
+static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
+                                      loff_t *pos)
+{
+        struct ext4_mb_proc_session *s = seq->private;
+        struct ext4_mb_history *cur = v;
+        ++*pos;
+        if (v != SEQ_START_TOKEN)
+                return ext4_mb_history_skip_empty(s, cur + 1, 0);
+        return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
+}
+/*
+ * seq_file ->show: print the column header for SEQ_START_TOKEN, or one
+ * line describing the history record @v. The line layout depends on
+ * the record's operation (alloc, prealloc, discard, free); records
+ * with any other op print nothing. Always returns 0.
+ */
+static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
+{
+        /*
+         * Extents are rendered as "group/start/len@logical". With a full
+         * unsigned long group and three 32-bit fields that can take well
+         * over the 23 characters the columns are padded to, so the
+         * buffers are sized generously and every write is bounded.
+         */
+        char buf[64], buf2[64], buf3[64], *fmt;
+        struct ext4_mb_history *hs = v;
+        if (v == SEQ_START_TOKEN) {
+                seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
+                                "%-5s %-2s %-5s %-5s %-5s %-6s\n",
+                          "pid", "inode", "original", "goal", "result", "found",
+                           "grps", "cr", "flags", "merge", "tail", "broken");
+                return 0;
+        }
+        if (hs->op == EXT4_MB_HISTORY_ALLOC) {
+                fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
+                        "%-5u %-5s %-5u %-6u\n";
+                snprintf(buf2, sizeof(buf2), "%lu/%d/%u@%u",
+                        hs->result.fe_group,
+                        hs->result.fe_start, hs->result.fe_len,
+                        hs->result.fe_logical);
+                snprintf(buf, sizeof(buf), "%lu/%d/%u@%u",
+                        hs->orig.fe_group,
+                        hs->orig.fe_start, hs->orig.fe_len,
+                        hs->orig.fe_logical);
+                snprintf(buf3, sizeof(buf3), "%lu/%d/%u@%u",
+                        hs->goal.fe_group,
+                        hs->goal.fe_start, hs->goal.fe_len,
+                        hs->goal.fe_logical);
+                seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
+                                hs->found, hs->groups, hs->cr, hs->flags,
+                                hs->merged ? "M" : "", hs->tail,
+                                hs->buddy ? 1 << hs->buddy : 0);
+        } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
+                fmt = "%-5u %-8u %-23s %-23s %-23s\n";
+                snprintf(buf2, sizeof(buf2), "%lu/%d/%u@%u",
+                        hs->result.fe_group,
+                        hs->result.fe_start, hs->result.fe_len,
+                        hs->result.fe_logical);
+                snprintf(buf, sizeof(buf), "%lu/%d/%u@%u",
+                        hs->orig.fe_group,
+                        hs->orig.fe_start, hs->orig.fe_len,
+                        hs->orig.fe_logical);
+                /* prealloc records have no goal: leave that column blank */
+                seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
+        } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
+                snprintf(buf2, sizeof(buf2), "%lu/%d/%u",
+                        hs->result.fe_group,
+                        hs->result.fe_start, hs->result.fe_len);
+                seq_printf(seq, "%-5u %-8u %-23s discard\n",
+                                hs->pid, hs->ino, buf2);
+        } else if (hs->op == EXT4_MB_HISTORY_FREE) {
+                snprintf(buf2, sizeof(buf2), "%lu/%d/%u",
+                        hs->result.fe_group,
+                        hs->result.fe_start, hs->result.fe_len);
+                seq_printf(seq, "%-5u %-8u %-23s free\n",
+                                hs->pid, hs->ino, buf2);
+        }
+        return 0;
+}
+static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
+{       /* intentionally empty: ->start and ->next take no lock or reference to undo */
+}
+static struct seq_operations ext4_mb_seq_history_ops = {        /* iterator over the mb_history snapshot */
+        .start  = ext4_mb_seq_history_start,    /* header token, or the *pos-th record */
+        .next   = ext4_mb_seq_history_next,     /* following non-empty record */
+        .stop   = ext4_mb_seq_history_stop,     /* no-op */
+        .show   = ext4_mb_seq_history_show,     /* prints the header or one record */
+};
+/*
+ * open() for the mb_history proc file: take a private copy of the
+ * history ring under s_mb_history_lock and attach it to the seq_file,
+ * so that reading iterates over a stable snapshot.
+ *
+ * Returns 0 on success, -ENOMEM if the filesystem has no history ring
+ * or a buffer cannot be allocated, or the error from seq_open().
+ */
+static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
+{
+        struct super_block *sb = PDE(inode)->data;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_mb_proc_session *s;
+        int rc;
+        int size;
+        /*
+         * The ring is optional: ext4_mb_history_init() carries on without
+         * it when its allocation fails, and there is then nothing to copy.
+         */
+        if (unlikely(sbi->s_mb_history == NULL))
+                return -ENOMEM;
+        s = kmalloc(sizeof(*s), GFP_KERNEL);
+        if (s == NULL)
+                return -ENOMEM;
+        s->sb = sb;
+        size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
+        s->history = kmalloc(size, GFP_KERNEL);
+        if (s->history == NULL) {
+                rc = -ENOMEM;
+                goto out_free_session;
+        }
+        spin_lock(&sbi->s_mb_history_lock);
+        memcpy(s->history, sbi->s_mb_history, size);
+        s->max = sbi->s_mb_history_max;
+        s->start = sbi->s_mb_history_cur % s->max;
+        spin_unlock(&sbi->s_mb_history_lock);
+        rc = seq_open(file, &ext4_mb_seq_history_ops);
+        if (rc)
+                goto out_free_history;
+        /* seq_open() stored its seq_file in file->private_data */
+        ((struct seq_file *)file->private_data)->private = s;
+        return 0;
+out_free_history:
+        kfree(s->history);
+out_free_session:
+        kfree(s);
+        return rc;
+}
+/*
+ * release() for the mb_history proc file: free the snapshot and the
+ * session attached at open time, then let seq_release() finish.
+ */
+static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *m = file->private_data;
+        struct ext4_mb_proc_session *sess = m->private;
+        kfree(sess->history);
+        kfree(sess);
+        return seq_release(inode, file);
+}
+/*
+ * write() for the mb_history proc file: parse the written text as an
+ * integer (any base accepted by simple_strtol() with base 0) and store
+ * it as the filesystem's s_mb_history_filter.
+ *
+ * Returns @count on success, -EOVERFLOW if the input does not fit the
+ * local buffer, -EFAULT if it cannot be copied from user space, and
+ * -ERANGE for a negative value.
+ */
+static ssize_t ext4_mb_seq_history_write(struct file *file,
+                                const char __user *buffer,
+                                size_t count, loff_t *ppos)
+{
+        struct seq_file *seq = (struct seq_file *)file->private_data;
+        struct ext4_mb_proc_session *s = seq->private;
+        struct super_block *sb = s->sb;
+        char str[32];
+        int value;
+        if (count >= sizeof(str)) {
+                printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
+                                "mb_history", (unsigned int)sizeof(str));
+                return -EOVERFLOW;
+        }
+        if (copy_from_user(str, buffer, count))
+                return -EFAULT;
+        /*
+         * User data carries no terminator; without one the parser would
+         * run on into uninitialized stack bytes. count < sizeof(str) was
+         * checked above, so the index is in range.
+         */
+        str[count] = '\0';
+        value = simple_strtol(str, NULL, 0);
+        if (value < 0)
+                return -ERANGE;
+        EXT4_SB(sb)->s_mb_history_filter = value;
+        return count;
+}
+static struct file_operations ext4_mb_seq_history_fops = {      /* file operations of the mb_history proc entry */
+        .owner          = THIS_MODULE,
+        .open           = ext4_mb_seq_history_open,     /* snapshots the history ring */
+        .read           = seq_read,
+        .write          = ext4_mb_seq_history_write,    /* sets s_mb_history_filter */
+        .llseek         = seq_lseek,
+        .release        = ext4_mb_seq_history_release,  /* frees the snapshot */
+};
+/*
+ * seq_file ->start for mb_groups: the iterator cookie is the group
+ * number plus one, so that group 0 is not confused with NULL (end of
+ * sequence). Positions outside [0, s_groups_count) end the iteration.
+ */
+static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+{
+        struct super_block *sb = seq->private;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        loff_t idx = *pos;
+        if (idx < 0 || idx >= sbi->s_groups_count)
+                return NULL;
+        return (void *)(ext4_group_t)(idx + 1);
+}
+/*
+ * seq_file ->next for mb_groups: advance *pos and return the cookie
+ * (group number plus one) for the new position, or NULL once it falls
+ * outside [0, s_groups_count).
+ */
+static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+        struct super_block *sb = seq->private;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        loff_t idx;
+        ++*pos;
+        idx = *pos;
+        if (idx < 0 || idx >= sbi->s_groups_count)
+                return NULL;
+        return (void *)(ext4_group_t)(idx + 1);
+}
+/*
+ * seq_file ->show for mb_groups: print one line with the free-block
+ * count, fragment count, first free block and the per-order counters
+ * 2^0..2^13 of the group encoded in @v (group number plus one). The
+ * column header is printed before group 0. If the buddy cannot be
+ * loaded an "I/O error" line is printed instead. Always returns 0.
+ */
+static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+{
+        struct super_block *sb = seq->private;
+        /* undo the +1 applied by ->start/->next */
+        ext4_group_t group = (ext4_group_t)(unsigned long) v - 1;
+        int i;
+        int len;
+        int err;
+        struct ext4_buddy e4b;
+        /* local copy of the group info followed by room for 16 counters */
+        struct sg {
+                struct ext4_group_info info;
+                unsigned short counters[16];
+        } sg;
+        if (group == 0)
+                seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
+                                "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+                                  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+                           "group", "free", "frags", "first",
+                           "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+                           "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+        len = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+                sizeof(struct ext4_group_info);
+        /*
+         * The number of counters grows with the block size, but the
+         * local copy has a fixed capacity. Never copy more than fits;
+         * only the first 14 counters are printed below anyway.
+         */
+        if (len > (int) sizeof(sg))
+                len = sizeof(sg);
+        err = ext4_mb_load_buddy(sb, group, &e4b);
+        if (err) {
+                seq_printf(seq, "#%-5lu: I/O error\n", group);
+                return 0;
+        }
+        /* copy under the group lock, print after dropping it */
+        ext4_lock_group(sb, group);
+        memcpy(&sg, ext4_get_group_info(sb, group), len);
+        ext4_unlock_group(sb, group);
+        ext4_mb_release_desc(&e4b);
+        seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+                        sg.info.bb_fragments, sg.info.bb_first_free);
+        for (i = 0; i <= 13; i++)
+                seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+                                sg.info.bb_counters[i] : 0);
+        seq_printf(seq, " ]\n");
+        return 0;
+}
+static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
+{       /* intentionally empty: ->start and ->next take no lock or reference to undo */
+}
+static struct seq_operations ext4_mb_seq_groups_ops = { /* iterator over all block groups */
+        .start  = ext4_mb_seq_groups_start,     /* cookie is group number + 1 */
+        .next   = ext4_mb_seq_groups_next,      /* cookie for the next group, NULL at the end */
+        .stop   = ext4_mb_seq_groups_stop,      /* no-op */
+        .show   = ext4_mb_seq_groups_show,      /* prints one group's statistics */
+};
+/*
+ * open() for the mb_groups proc file: set up the seq_file iterator and
+ * hand it the super block stored in the proc entry's data pointer.
+ * Returns the result of seq_open().
+ */
+static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
+{
+        struct super_block *sb = PDE(inode)->data;
+        struct seq_file *m;
+        int rc = seq_open(file, &ext4_mb_seq_groups_ops);
+        if (rc != 0)
+                return rc;
+        /* seq_open() stored its seq_file in file->private_data */
+        m = file->private_data;
+        m->private = sb;
+        return 0;
+}
+static struct file_operations ext4_mb_seq_groups_fops = {       /* file operations of the read-only mb_groups proc entry */
+        .owner          = THIS_MODULE,
+        .open           = ext4_mb_seq_groups_open,      /* attaches the super block to the seq_file */
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,  /* nothing private to free */
+};
+/*
+ * Undo ext4_mb_history_init(): remove the two proc files and free the
+ * history ring.
+ */
+static void ext4_mb_history_release(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        /*
+         * The entries were only created when a parent directory existed;
+         * mirror that here instead of handing a NULL parent to
+         * remove_proc_entry().
+         */
+        if (sbi->s_mb_proc != NULL) {
+                remove_proc_entry("mb_groups", sbi->s_mb_proc);
+                remove_proc_entry("mb_history", sbi->s_mb_proc);
+        }
+        kfree(sbi->s_mb_history);
+}
+/*
+ * Set up allocation history for @sb: a zeroed ring of 1000 records
+ * protected by s_mb_history_lock, plus the "mb_history" and
+ * "mb_groups" proc files when the filesystem has a proc directory.
+ * Failure to allocate the ring is tolerated; history is then simply
+ * not recorded.
+ */
+static void ext4_mb_history_init(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        int i;
+        /*
+         * Initialise the lock and the ring before the proc files become
+         * visible: their open handler takes the lock and reads the ring.
+         */
+        sbi->s_mb_history_max = 1000;
+        sbi->s_mb_history_cur = 0;
+        spin_lock_init(&sbi->s_mb_history_lock);
+        i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
+        sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+        /* if we can't allocate history, then we simply won't use it */
+        if (sbi->s_mb_proc != NULL) {
+                struct proc_dir_entry *p;
+                p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
+                if (p) {
+                        p->proc_fops = &ext4_mb_seq_history_fops;
+                        p->data = sb;
+                }
+                p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
+                if (p) {
+                        p->proc_fops = &ext4_mb_seq_groups_fops;
+                        p->data = sb;
+                }
+        }
+}
+/*
+ * Append a record describing the finished operation in @ac to the
+ * filesystem's history ring. Nothing is stored when the ring was never
+ * allocated or when ac->ac_op is masked out by s_mb_history_filter.
+ * The write index wraps to 0 after s_mb_history_max entries.
+ */
+static void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+        struct ext4_mb_history rec;
+        if (unlikely(sbi->s_mb_history == NULL))
+                return;
+        if ((ac->ac_op & sbi->s_mb_history_filter) == 0)
+                return;
+        rec.op = ac->ac_op;
+        rec.pid = current->pid;
+        rec.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
+        rec.flags = ac->ac_flags;
+        rec.orig = ac->ac_o_ex;
+        rec.found = ac->ac_found;
+        rec.groups = ac->ac_groups_scanned;
+        rec.cr = ac->ac_criteria;
+        rec.tail = ac->ac_tail;
+        rec.buddy = ac->ac_buddy;
+        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
+                /* allocations also record the goal and report ac_f_ex */
+                rec.goal = ac->ac_g_ex;
+                rec.result = ac->ac_f_ex;
+                rec.merged = (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+                              ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group);
+        } else {
+                rec.result = ac->ac_b_ex;
+                rec.merged = 0;
+        }
+        spin_lock(&sbi->s_mb_history_lock);
+        memcpy(&sbi->s_mb_history[sbi->s_mb_history_cur], &rec, sizeof(rec));
+        sbi->s_mb_history_cur++;
+        if (sbi->s_mb_history_cur >= sbi->s_mb_history_max)
+                sbi->s_mb_history_cur = 0;
+        spin_unlock(&sbi->s_mb_history_lock);
+}
+#else
+#define ext4_mb_history_release(sb)     /* expands to nothing when EXT4_MB_HISTORY is not defined */
+#define ext4_mb_history_init(sb)        /* expands to nothing when EXT4_MB_HISTORY is not defined */
+#endif
+/*
+ * Allocate the in-memory allocator state for every block group of @sb:
+ *  - sbi->s_group_info, a two-level table with one second-level array
+ *    per descriptor block's worth of groups,
+ *  - sbi->s_buddy_cache, an inode obtained from new_inode(),
+ *  - one zeroed ext4_group_info per group, flagged as needing
+ *    initialization and pre-loaded with its free-block count.
+ *
+ * Returns 0 on success or -ENOMEM after releasing everything that was
+ * allocated so far.
+ */
+static int ext4_mb_init_backend(struct super_block *sb)
+{
+        ext4_group_t i;
+        int j, len, metalen;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        int num_meta_group_infos =
+                (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+                        EXT4_DESC_PER_BLOCK_BITS(sb);
+        struct ext4_group_info **meta_group_info;
+        /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
+         * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
+         * So a two level scheme suffices for now. */
+        sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
+                                    num_meta_group_infos, GFP_KERNEL);
+        if (sbi->s_group_info == NULL) {
+                printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
+                return -ENOMEM;
+        }
+        sbi->s_buddy_cache = new_inode(sb);
+        if (sbi->s_buddy_cache == NULL) {
+                printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+                goto err_freesgi;
+        }
+        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+        metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
+        for (i = 0; i < num_meta_group_infos; i++) {
+                /* the last second-level array only covers the remainder */
+                if ((i + 1) == num_meta_group_infos)
+                        metalen = sizeof(*meta_group_info) *
+                                (sbi->s_groups_count -
+                                        (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
+                meta_group_info = kmalloc(metalen, GFP_KERNEL);
+                if (meta_group_info == NULL) {
+                        printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+                               "buddy group\n");
+                        /* arrays [0, i) exist and are freed by the label */
+                        goto err_freemeta;
+                }
+                sbi->s_group_info[i] = meta_group_info;
+        }
+        /*
+         * calculate needed size. if change bb_counters size,
+         * don't forget about ext4_mb_generate_buddy()
+         */
+        len = sizeof(struct ext4_group_info);
+        len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
+        for (i = 0; i < sbi->s_groups_count; i++) {
+                struct ext4_group_desc *desc;
+                meta_group_info =
+                        sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+                j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
+                /* kzalloc returns zeroed memory, no further clearing needed */
+                meta_group_info[j] = kzalloc(len, GFP_KERNEL);
+                if (meta_group_info[j] == NULL) {
+                        printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+                        /* groups [0, i) are allocated */
+                        goto err_freebuddy;
+                }
+                desc = ext4_get_group_desc(sb, i, NULL);
+                if (desc == NULL) {
+                        printk(KERN_ERR
+                                "EXT4-fs: can't read descriptor %lu\n", i);
+                        /* group i was allocated too: include it in cleanup */
+                        i++;
+                        goto err_freebuddy;
+                }
+                set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+                        &(meta_group_info[j]->bb_state));
+                /*
+                 * initialize bb_free to be able to skip
+                 * empty groups without initialization
+                 */
+                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+                        meta_group_info[j]->bb_free =
+                                ext4_free_blocks_after_init(sb, i, desc);
+                } else {
+                        meta_group_info[j]->bb_free =
+                                le16_to_cpu(desc->bg_free_blocks_count);
+                }
+                INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
+#ifdef DOUBLE_CHECK
+                {
+                        struct buffer_head *bh;
+                        meta_group_info[j]->bb_bitmap =
+                                kmalloc(sb->s_blocksize, GFP_KERNEL);
+                        BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
+                        bh = read_block_bitmap(sb, i);
+                        BUG_ON(bh == NULL);
+                        memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
+                                        sb->s_blocksize);
+                        put_bh(bh);
+                }
+#endif
+        }
+        return 0;
+        /*
+         * Error unwinding. 'i' is unsigned, so the loops count down with
+         * "i-- > 0": a ">= 0" test would always be true and never stop.
+         * On entry to each label 'i' is the number of items to free.
+         */
+err_freebuddy:
+        while (i-- > 0)
+                kfree(ext4_get_group_info(sb, i));
+        i = num_meta_group_infos;
+err_freemeta:
+        while (i-- > 0)
+                kfree(sbi->s_group_info[i]);
+        iput(sbi->s_buddy_cache);
+err_freesgi:
+        kfree(sbi->s_group_info);
+        return -ENOMEM;
+}
+/*
+ * Set up the multiblock allocator state that hangs off the superblock.
+ *
+ * Fills the per-order tables s_mb_offsets / s_mb_maxs, calls
+ * ext4_mb_init_backend(), resets the tunables to their MB_DEFAULT_* values
+ * and allocates NR_CPUS locality groups.
+ *
+ * Returns 0 on success (or when the MBALLOC mount option is not set),
+ * -ENOMEM on allocation failure, or the error reported by
+ * ext4_mb_init_backend().  Every failure path clears the MBALLOC option.
+ * @needs_recovery is not used by this function.
+ */
+int ext4_mb_init(struct super_block *sb, int needs_recovery)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        unsigned i;
+        unsigned offset;
+        unsigned max;
+        int ret;
+        if (!test_opt(sb, MBALLOC))
+                return 0;
+        /* one slot per order: index 0 .. s_blocksize_bits + 1 */
+        i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+        if (sbi->s_mb_offsets == NULL) {
+                clear_opt(sbi->s_mount_opt, MBALLOC);
+                return -ENOMEM;
+        }
+        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+        if (sbi->s_mb_maxs == NULL) {
+                clear_opt(sbi->s_mount_opt, MBALLOC);
+                /* s_mb_maxs is NULL here; the table to give back is s_mb_offsets */
+                kfree(sbi->s_mb_offsets);
+                return -ENOMEM;
+        }
+        /* order 0 is regular bitmap */
+        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+        sbi->s_mb_offsets[0] = 0;
+        i = 1;
+        offset = 0;
+        max = sb->s_blocksize << 2;
+        do {
+                sbi->s_mb_offsets[i] = offset;
+                sbi->s_mb_maxs[i] = max;
+                offset += 1 << (sb->s_blocksize_bits - i);
+                max = max >> 1;
+                i++;
+        } while (i <= sb->s_blocksize_bits + 1);
+        /* init file for buddy data */
+        ret = ext4_mb_init_backend(sb);
+        if (ret) {
+                clear_opt(sbi->s_mount_opt, MBALLOC);
+                kfree(sbi->s_mb_offsets);
+                kfree(sbi->s_mb_maxs);
+                return ret;
+        }
+        spin_lock_init(&sbi->s_md_lock);
+        INIT_LIST_HEAD(&sbi->s_active_transaction);
+        INIT_LIST_HEAD(&sbi->s_closed_transaction);
+        INIT_LIST_HEAD(&sbi->s_committed_transaction);
+        spin_lock_init(&sbi->s_bal_lock);
+        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+        sbi->s_mb_stats = MB_DEFAULT_STATS;
+        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+        sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
+        sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+        i = sizeof(struct ext4_locality_group) * NR_CPUS;
+        sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+        if (sbi->s_locality_groups == NULL) {
+                /*
+                 * NOTE(review): whatever ext4_mb_init_backend() set up is
+                 * not released on this path -- confirm and unwind it too.
+                 */
+                clear_opt(sbi->s_mount_opt, MBALLOC);
+                kfree(sbi->s_mb_offsets);
+                kfree(sbi->s_mb_maxs);
+                return -ENOMEM;
+        }
+        for (i = 0; i < NR_CPUS; i++) {
+                struct ext4_locality_group *lg;
+                lg = &sbi->s_locality_groups[i];
+                mutex_init(&lg->lg_mutex);
+                INIT_LIST_HEAD(&lg->lg_prealloc_list);
+                spin_lock_init(&lg->lg_prealloc_lock);
+        }
+        /* the return values of the two helpers below are not checked */
+        ext4_mb_init_per_dev_proc(sb);
+        ext4_mb_history_init(sb);
+        printk("EXT4-fs: mballoc enabled\n");
+        return 0;
+}
+/*
+ * Free every preallocation still linked on the group's bb_prealloc_list
+ * and report (via mb_debug) how many were left behind.
+ * Needs to be called with the ext4 group lock held (ext4_lock_group).
+ */
+static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+{
+        struct ext4_prealloc_space *pa;
+        struct list_head *cur, *tmp;
+        int count = 0;
+        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
+                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+                list_del(&pa->pa_group_list);
+                count++;
+                /*
+                 * struct ext4_prealloc_space has its own slab cache
+                 * (ext4_pspace_cachep, created in init_ext4_mballoc()),
+                 * so the object goes back to that cache, not to kfree().
+                 */
+                kmem_cache_free(ext4_pspace_cachep, pa);
+        }
+        if (count)
+                mb_debug("mballoc: %u PAs left\n", count);
+}
+/*
+ * Tear down the multiblock allocator state of @sb.
+ *
+ * Pending (active and closed) free-block transactions are moved onto the
+ * committed list and freed, then the per-group info, the order tables, the
+ * buddy cache inode, the locality groups, the history and the proc entries
+ * are released.  Statistics are printed when s_mb_stats is set.
+ * Does nothing when the MBALLOC option is not set.  Always returns 0.
+ */
+int ext4_mb_release(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_info *gi;
+        int nr_meta;
+        ext4_group_t g;
+        if (!test_opt(sb, MBALLOC))
+                return 0;
+        /* treat freed but not yet committed blocks as committed */
+        spin_lock(&sbi->s_md_lock);
+        list_splice_init(&sbi->s_closed_transaction,
+                        &sbi->s_committed_transaction);
+        list_splice_init(&sbi->s_active_transaction,
+                        &sbi->s_committed_transaction);
+        spin_unlock(&sbi->s_md_lock);
+        ext4_mb_free_committed_blocks(sb);
+        if (sbi->s_group_info) {
+                /* first the per-group structures ... */
+                g = 0;
+                while (g < sbi->s_groups_count) {
+                        gi = ext4_get_group_info(sb, g);
+#ifdef DOUBLE_CHECK
+                        kfree(gi->bb_bitmap);
+#endif
+                        ext4_lock_group(sb, g);
+                        ext4_mb_cleanup_pa(gi);
+                        ext4_unlock_group(sb, g);
+                        kfree(gi);
+                        g++;
+                }
+                /* ... then the second-level arrays that pointed at them */
+                nr_meta = (sbi->s_groups_count +
+                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
+                        EXT4_DESC_PER_BLOCK_BITS(sb);
+                g = 0;
+                while (g < nr_meta) {
+                        kfree(sbi->s_group_info[g]);
+                        g++;
+                }
+                kfree(sbi->s_group_info);
+        }
+        kfree(sbi->s_mb_offsets);
+        kfree(sbi->s_mb_maxs);
+        if (sbi->s_buddy_cache)
+                iput(sbi->s_buddy_cache);
+        if (sbi->s_mb_stats) {
+                printk(KERN_INFO
+                       "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
+                                atomic_read(&sbi->s_bal_allocated),
+                                atomic_read(&sbi->s_bal_reqs),
+                                atomic_read(&sbi->s_bal_success));
+                printk(KERN_INFO
+                      "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
+                                "%u 2^N hits, %u breaks, %u lost\n",
+                                atomic_read(&sbi->s_bal_ex_scanned),
+                                atomic_read(&sbi->s_bal_goals),
+                                atomic_read(&sbi->s_bal_2orders),
+                                atomic_read(&sbi->s_bal_breaks),
+                                atomic_read(&sbi->s_mb_lost_chunks));
+                printk(KERN_INFO
+                       "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
+                                sbi->s_mb_buddies_generated++,
+                                sbi->s_mb_generation_time);
+                printk(KERN_INFO
+                       "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
+                                atomic_read(&sbi->s_mb_preallocated),
+                                atomic_read(&sbi->s_mb_discarded));
+        }
+        kfree(sbi->s_locality_groups);
+        ext4_mb_history_release(sb);
+        ext4_mb_destroy_per_dev_proc(sb);
+        return 0;
+}
+/*
+ * Drain sbi->s_committed_transaction: for each ext4_free_metadata entry,
+ * hand its blocks to mb_free_blocks() one at a time under the group lock,
+ * drop the page references taken for it and free the entry.
+ */
+static void ext4_mb_free_committed_blocks(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_free_metadata *md;
+        struct ext4_buddy e4b;
+        int nr_blocks = 0;
+        int nr_entries = 0;
+        int err;
+        int i;
+        if (list_empty(&sbi->s_committed_transaction))
+                return;
+        /* committed blocks are still waiting to be freed */
+        for (;;) {
+                /* detach the next entry, if there is one */
+                md = NULL;
+                spin_lock(&sbi->s_md_lock);
+                if (!list_empty(&sbi->s_committed_transaction)) {
+                        md = list_entry(sbi->s_committed_transaction.next,
+                                        struct ext4_free_metadata, list);
+                        list_del(&md->list);
+                }
+                spin_unlock(&sbi->s_md_lock);
+                if (!md)
+                        break;
+                mb_debug("gonna free %u blocks in group %lu (0x%p):",
+                                md->num, md->group, md);
+                err = ext4_mb_load_buddy(sb, md->group, &e4b);
+                /* the buddy is expected to exist because it is pinned */
+                BUG_ON(err != 0);
+                /* putting the blocks into the buddy makes them really free */
+                nr_blocks += md->num;
+                nr_entries++;
+                ext4_lock_group(sb, md->group);
+                for (i = 0; i < md->num; i++) {
+                        mb_debug(" %u", md->blocks[i]);
+                        err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+                        BUG_ON(err != 0);
+                }
+                mb_debug("\n");
+                ext4_unlock_group(sb, md->group);
+                /* balance refcounts from ext4_mb_free_metadata() */
+                page_cache_release(e4b.bd_buddy_page);
+                page_cache_release(e4b.bd_bitmap_page);
+                kfree(md);
+                ext4_mb_release_desc(&e4b);
+        }
+        mb_debug("freed %u blocks in %u structures\n", nr_blocks, nr_entries);
+}
+#define EXT4_ROOT                       "ext4" /* dir created under proc_root_fs */
+#define EXT4_MB_STATS_NAME              "stats" /* entry for s_mb_stats */
+#define EXT4_MB_MAX_TO_SCAN_NAME        "max_to_scan" /* entry for s_mb_max_to_scan */
+#define EXT4_MB_MIN_TO_SCAN_NAME        "min_to_scan" /* entry for s_mb_min_to_scan */
+#define EXT4_MB_ORDER2_REQ              "order2_req" /* entry for s_mb_order2_reqs */
+#define EXT4_MB_STREAM_REQ              "stream_req" /* entry for s_mb_stream_request */
+#define EXT4_MB_GROUP_PREALLOC          "group_prealloc" /* entry for s_mb_group_prealloc */
+/*
+ * Generate ext4_mb_read_<name>(): a read_proc style handler that prints
+ * sbi->s_mb_<name> followed by a newline into @page.  Only offset 0
+ * produces output; *eof is always set.
+ */
+#define MB_PROC_VALUE_READ(name)                                \
+static int ext4_mb_read_##name(char *page, char **start,        \
+                off_t off, int count, int *eof, void *data)     \
+{                                                               \
+        struct ext4_sb_info *sbi = data;                        \
+        *eof = 1;                                               \
+        if (off)                                                \
+                return 0;                                       \
+        *start = page;                                          \
+        return sprintf(page, "%ld\n", sbi->s_mb_##name);        \
+}
+/*
+ * Generate ext4_mb_write_<name>(): a write_proc style handler that parses
+ * an integer from user space and stores it in sbi->s_mb_<name>.
+ * Returns cnt on success, -EINVAL when the input does not fit the local
+ * buffer, -EFAULT when it cannot be copied in and -ERANGE when the parsed
+ * value is <= 0.
+ */
+#define MB_PROC_VALUE_WRITE(name)                               \
+static int ext4_mb_write_##name(struct file *file,              \
+                const char __user *buf, unsigned long cnt, void *data)  \
+{                                                               \
+        struct ext4_sb_info *sbi = data;                        \
+        char str[32];                                           \
+        long value;                                             \
+        if (cnt >= sizeof(str))                                 \
+                return -EINVAL;                                 \
+        if (copy_from_user(str, buf, cnt))                      \
+                return -EFAULT;                                 \
+        /* terminate before parsing; cnt < sizeof(str) holds */ \
+        str[cnt] = '\0';                                        \
+        value = simple_strtol(str, NULL, 0);                    \
+        if (value <= 0)                                         \
+                return -ERANGE;                                 \
+        sbi->s_mb_##name = value;                               \
+        return cnt;                                             \
+}
+MB_PROC_VALUE_READ(stats); /* ext4_mb_read_stats(): sbi->s_mb_stats */
+MB_PROC_VALUE_WRITE(stats); /* ext4_mb_write_stats() */
+MB_PROC_VALUE_READ(max_to_scan); /* sbi->s_mb_max_to_scan */
+MB_PROC_VALUE_WRITE(max_to_scan);
+MB_PROC_VALUE_READ(min_to_scan); /* sbi->s_mb_min_to_scan */
+MB_PROC_VALUE_WRITE(min_to_scan);
+MB_PROC_VALUE_READ(order2_reqs); /* sbi->s_mb_order2_reqs */
+MB_PROC_VALUE_WRITE(order2_reqs);
+MB_PROC_VALUE_READ(stream_request); /* sbi->s_mb_stream_request */
+MB_PROC_VALUE_WRITE(stream_request);
+MB_PROC_VALUE_READ(group_prealloc); /* sbi->s_mb_group_prealloc */
+MB_PROC_VALUE_WRITE(group_prealloc);
+#define MB_PROC_HANDLER(name, var)                                      \
+do { /* uses the caller's proc, mode, sbi and its err_out label */      \
+        proc = create_proc_entry(name, mode, sbi->s_mb_proc);           \
+        if (proc == NULL) {                                             \
+                printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
+                goto err_out;                                           \
+        }                                                               \
+        proc->data = sbi; /* handed to the handlers as 'data' */        \
+        proc->read_proc  = ext4_mb_read_##var ;                         \
+        proc->write_proc = ext4_mb_write_##var;                         \
+} while (0)
+/*
+ * Create the per-device directory below proc_root_ext4 and populate it
+ * with one entry per mballoc tunable.  The directory is remembered in
+ * sbi->s_mb_proc, which is left NULL on any failure.
+ *
+ * Returns 0 on success, -EINVAL when the ext4 proc root does not exist and
+ * -ENOMEM when the directory or one of its entries cannot be created.
+ */
+static int ext4_mb_init_per_dev_proc(struct super_block *sb)
+{
+        mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct proc_dir_entry *proc;
+        char devname[64];
+        /* without a parent directory there is nowhere to put the entries */
+        if (proc_root_ext4 == NULL) {
+                sbi->s_mb_proc = NULL;
+                return -EINVAL;
+        }
+        /*
+         * bdevname() already writes the name into devname; formatting its
+         * return value back into the same buffer would be an overlapping
+         * copy.
+         */
+        bdevname(sb->s_bdev, devname);
+        sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
+        if (sbi->s_mb_proc == NULL) {
+                /* do not let the entries below be created against NULL */
+                printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+                return -ENOMEM;
+        }
+        MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
+        MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
+        MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
+        MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
+        MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
+        MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
+        return 0;
+err_out:
+        /* reached from MB_PROC_HANDLER; remove everything in reverse order */
+        printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+        remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
+        remove_proc_entry(devname, proc_root_ext4);
+        sbi->s_mb_proc = NULL;
+        return -ENOMEM;
+}
+/*
+ * Remove the per-device mballoc proc entries and their directory.
+ * Returns -EINVAL when no directory was registered (sbi->s_mb_proc is
+ * NULL), 0 otherwise.
+ */
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        char devname[64];
+        if (sbi->s_mb_proc == NULL)
+                return -EINVAL;
+        /*
+         * bdevname() fills devname itself; formatting its return value back
+         * into the same buffer would be an overlapping copy.
+         */
+        bdevname(sb->s_bdev, devname);
+        remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
+        remove_proc_entry(devname, proc_root_ext4);
+        /* the directory is gone; do not keep a stale pointer to it */
+        sbi->s_mb_proc = NULL;
+        return 0;
+}
+/*
+ * Module-level setup: create the slab cache for struct ext4_prealloc_space
+ * and, with CONFIG_PROC_FS, the EXT4_ROOT directory under proc_root_fs.
+ * Returns -ENOMEM if the cache cannot be created; a failure to create the
+ * proc directory is only logged.
+ */
+int __init init_ext4_mballoc(void)
+{
+        ext4_pspace_cachep = kmem_cache_create("ext4_prealloc_space",
+                                sizeof(struct ext4_prealloc_space), 0,
+                                SLAB_RECLAIM_ACCOUNT, NULL);
+        if (!ext4_pspace_cachep)
+                return -ENOMEM;
+#ifdef CONFIG_PROC_FS
+        proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
+        if (!proc_root_ext4)
+                printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
+#endif
+        return 0;
+}
+void exit_ext4_mballoc(void)
+{
+        /*
+         * Undo init_ext4_mballoc(): destroy the ext4_pspace_cachep cache
+         * and, with CONFIG_PROC_FS, remove the EXT4_ROOT proc directory.
+         * XXX: synchronize_rcu();
+         */
+        kmem_cache_destroy(ext4_pspace_cachep);
+#ifdef CONFIG_PROC_FS
+        remove_proc_entry(EXT4_ROOT, proc_root_fs);
+#endif
+}
+/*
+ * Mark the chosen extent (ac->ac_b_ex) as in use: set its bits in the
+ * group's block bitmap, lower the free-block count in the group descriptor
+ * (initialising it first when the group is flagged EXT4_BG_BLOCK_UNINIT),
+ * refresh the descriptor checksum, subtract the length from the
+ * filesystem-wide free-blocks counter and journal both buffers.
+ *
+ * Returns 0 on success, -EIO when the bitmap or the descriptor cannot be
+ * read, or the error returned by the journalling helpers.
+ */
+static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+                                handle_t *handle)
+{
+        struct buffer_head *bitmap_bh = NULL;
+        struct ext4_super_block *es;
+        struct ext4_group_desc *gdp;
+        struct buffer_head *gdp_bh;
+        struct ext4_sb_info *sbi;
+        struct super_block *sb;
+        ext4_fsblk_t block;
+        int err;
+        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+        BUG_ON(ac->ac_b_ex.fe_len <= 0);
+        sb = ac->ac_sb;
+        sbi = EXT4_SB(sb);
+        es = sbi->s_es;
+        err = -EIO;
+        bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+        if (!bitmap_bh)
+                goto out_err;
+        err = ext4_journal_get_write_access(handle, bitmap_bh);
+        if (err)
+                goto out_err;
+        err = -EIO;
+        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+        if (!gdp)
+                goto out_err;
+        /* only report the group once gdp actually points at its descriptor */
+        ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+                        gdp->bg_free_blocks_count);
+        err = ext4_journal_get_write_access(handle, gdp_bh);
+        if (err)
+                goto out_err;
+        /* widen before multiplying so the product is not truncated */
+        block = (ext4_fsblk_t)ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
+                + ac->ac_b_ex.fe_start
+                + le32_to_cpu(es->s_first_data_block);
+        if (block == ext4_block_bitmap(sb, gdp) ||
+                        block == ext4_inode_bitmap(sb, gdp) ||
+                        in_range(block, ext4_inode_table(sb, gdp),
+                                EXT4_SB(sb)->s_itb_per_group)) {
+                ext4_error(sb, __FUNCTION__,
+                           "Allocating block in system zone - block = %llu",
+                           block);
+        }
+#ifdef AGGRESSIVE_CHECK
+        {
+                int i;
+                /* none of the blocks may already be marked in the bitmap */
+                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
+                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
+                                                bitmap_bh->b_data));
+                }
+        }
+#endif
+        mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
+                                ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
+        spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+                /* first allocation in this group: give it a real count */
+                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+                gdp->bg_free_blocks_count =
+                        cpu_to_le16(ext4_free_blocks_after_init(sb,
+                                                ac->ac_b_ex.fe_group,
+                                                gdp));
+        }
+        gdp->bg_free_blocks_count =
+                cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
+                                - ac->ac_b_ex.fe_len);
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+        spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+        if (err)
+                goto out_err;
+        err = ext4_journal_dirty_metadata(handle, gdp_bh);
+out_err:
+        sb->s_dirt = 1;
+        /* bitmap_bh is still NULL when reading the bitmap failed */
+        brelse(bitmap_bh);
+        return err;
+}
+/*
+ * Normalize a request that is served from a locality group.
+ * The goal length becomes s_stripe when that is non-zero, otherwise
+ * s_mb_group_prealloc, which can be configured via
+ * /proc/fs/ext4/<partition>/group_prealloc
+ *
+ * XXX: should we try to preallocate more than the group has now?
+ */
+static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+        /* a group request without a locality group makes no sense */
+        BUG_ON(ac->ac_lg == NULL);
+        if (sbi->s_stripe)
+                ac->ac_g_ex.fe_len = sbi->s_stripe;
+        else
+                ac->ac_g_ex.fe_len = sbi->s_mb_group_prealloc;
+        mb_debug("#%u: goal %lu blocks for locality group\n",
+                current->pid, ac->ac_g_ex.fe_len);
+}
+/*
+ * Normalization means making request better in terms of
+ * size and alignment.
+ *
+ * Only requests flagged EXT4_MB_HINT_DATA are touched, and not when
+ * EXT4_MB_HINT_GOAL_ONLY or EXT4_MB_HINT_NOPREALLOC is set.  Requests
+ * flagged EXT4_MB_HINT_GROUP_ALLOC are handed to
+ * ext4_mb_normalize_group_request().  Otherwise the request is grown to a
+ * predicted size, trimmed against the already allocated neighbours
+ * described by @ar (lleft/pleft, lright/pright) and against the inode's
+ * live preallocations, and the result is stored in ac->ac_g_ex
+ * (fe_logical, fe_len).  When the result abuts a neighbour, ac->ac_f_ex is
+ * pointed at the adjacent physical blocks and EXT4_MB_HINT_TRY_GOAL is set.
+ */
+static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+                                struct ext4_allocation_request *ar)
+{
+        int bsbits, max;
+        ext4_lblk_t end;
+        struct list_head *cur;
+        loff_t size, orig_size, start_off;
+        ext4_lblk_t start, orig_start;
+        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+        /* do normalize only data requests, metadata requests
+           do not need preallocation */
+        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+                return;
+        /* sometime caller may want exact blocks */
+        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+                return;
+        /* caller may indicate that preallocation isn't
+         * required (it's a tail, for example) */
+        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
+                return;
+        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
+                ext4_mb_normalize_group_request(ac);
+                return ;
+        }
+        bsbits = ac->ac_sb->s_blocksize_bits;
+        /* first, let's learn actual file size
+         * given current request is allocated */
+        size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+        size = size << bsbits; /* blocks -> bytes */
+        if (size < i_size_read(ac->ac_inode))
+                size = i_size_read(ac->ac_inode);
+        /* max available blocks in a free group */
+        max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
+                                EXT4_SB(ac->ac_sb)->s_itb_per_group;
+#define NRL_CHECK_SIZE(req, size, max,bits)     \
+                (req <= (size) || max <= ((size) >> bits))
+        /* first, try to predict filesize */
+        /* XXX: should this table be tunable? */
+        start_off = 0;
+        if (size <= 16 * 1024) {
+                size = 16 * 1024;
+        } else if (size <= 32 * 1024) {
+                size = 32 * 1024;
+        } else if (size <= 64 * 1024) {
+                size = 64 * 1024;
+        } else if (size <= 128 * 1024) {
+                size = 128 * 1024;
+        } else if (size <= 256 * 1024) {
+                size = 256 * 1024;
+        } else if (size <= 512 * 1024) {
+                size = 512 * 1024;
+        } else if (size <= 1024 * 1024) {
+                size = 1024 * 1024;
+        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
+                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+                                                (20 - bsbits)) << 20;
+                size = 1024 * 1024;
+        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
+                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+                                                        (22 - bsbits)) << 22;
+                size = 4 * 1024 * 1024;
+        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+                                        (8<<20)>>bsbits, max, bsbits)) {
+                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+                                                        (23 - bsbits)) << 23;
+                size = 8 * 1024 * 1024;
+        } else {
+                /* nothing in the table applies: keep the request as it is */
+                start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+                size      = ac->ac_o_ex.fe_len << bsbits;
+        }
+        orig_size = size = size >> bsbits; /* bytes -> blocks */
+        orig_start = start = start_off >> bsbits; /* bytes -> blocks */
+        /* don't cover already allocated blocks in selected range */
+        if (ar->pleft && start <= ar->lleft) {
+                size -= ar->lleft + 1 - start;
+                start = ar->lleft + 1;
+        }
+        if (ar->pright && start + size - 1 >= ar->lright)
+                size -= start + size - ar->lright; /* now start + size == lright */
+        end = start + size;
+        /* check we don't cross already preallocated blocks */
+        rcu_read_lock();
+        list_for_each_rcu(cur, &ei->i_prealloc_list) {
+                struct ext4_prealloc_space *pa;
+                unsigned long pa_end;
+                pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+                if (pa->pa_deleted)
+                        continue;
+                spin_lock(&pa->pa_lock);
+                if (pa->pa_deleted) { /* re-checked now that pa_lock is held */
+                        spin_unlock(&pa->pa_lock);
+                        continue;
+                }
+                pa_end = pa->pa_lstart + pa->pa_len;
+                /* PA must not overlap original request */
+                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
+                        ac->ac_o_ex.fe_logical < pa->pa_lstart));
+                /* skip PA normalized request doesn't overlap with */
+                if (pa->pa_lstart >= end) {
+                        spin_unlock(&pa->pa_lock);
+                        continue;
+                }
+                if (pa_end <= start) {
+                        spin_unlock(&pa->pa_lock);
+                        continue;
+                }
+                BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+                if (pa_end <= ac->ac_o_ex.fe_logical) {
+                        /* PA lies before the request: start right after it */
+                        BUG_ON(pa_end < start);
+                        start = pa_end;
+                }
+                if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+                        /* PA lies after the request: stop right before it */
+                        BUG_ON(pa->pa_lstart > end);
+                        end = pa->pa_lstart;
+                }
+                spin_unlock(&pa->pa_lock);
+        }
+        rcu_read_unlock();
+        size = end - start;
+        /* XXX: extra loop to check we really don't overlap preallocations */
+        rcu_read_lock();
+        list_for_each_rcu(cur, &ei->i_prealloc_list) {
+                struct ext4_prealloc_space *pa;
+                unsigned long pa_end;
+                pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+                spin_lock(&pa->pa_lock);
+                if (pa->pa_deleted == 0) {
+                        pa_end = pa->pa_lstart + pa->pa_len;
+                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
+                }
+                spin_unlock(&pa->pa_lock);
+        }
+        rcu_read_unlock();
+        /* print the offending values before the BUG_ON below fires */
+        if (start + size <= ac->ac_o_ex.fe_logical &&
+                        start > ac->ac_o_ex.fe_logical) {
+                printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
+                        (unsigned long) start, (unsigned long) size,
+                        (unsigned long) ac->ac_o_ex.fe_logical);
+        }
+        BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
+                        start > ac->ac_o_ex.fe_logical);
+        BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+        /* now prepare goal request */
+        /* XXX: is it better to align blocks WRT to logical
+         * placement or satisfy big request as is */
+        ac->ac_g_ex.fe_logical = start;
+        ac->ac_g_ex.fe_len = size;
+        /* define goal start in order to merge */
+        if (ar->pright && (ar->lright == (start + size))) {
+                /* merge to the right */
+                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
+                                                &ac->ac_f_ex.fe_group,
+                                                &ac->ac_f_ex.fe_start);
+                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+        }
+        if (ar->pleft && (ar->lleft + 1 == start)) {
+                /* merge to the left */
+                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
+                                                &ac->ac_f_ex.fe_group,
+                                                &ac->ac_f_ex.fe_start);
+                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+        }
+        mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+                (unsigned) orig_size, (unsigned) start);
+}
+/*
+ * Update the allocator counters for a finished allocation (only when
+ * s_mb_stats is set and the goal was longer than one block), then record
+ * the allocation in the history in every case.
+ */
+static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+        if (!sbi->s_mb_stats || ac->ac_g_ex.fe_len <= 1)
+                goto store;
+        atomic_inc(&sbi->s_bal_reqs);
+        atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+        if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+                atomic_inc(&sbi->s_bal_success);
+        atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+        /* goal hit: best extent starts exactly where the goal did */
+        if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+                        ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+                atomic_inc(&sbi->s_bal_goals);
+        if (ac->ac_found > sbi->s_mb_max_to_scan)
+                atomic_inc(&sbi->s_bal_breaks);
+store:
+        ext4_mb_store_history(ac);
+}
+/*
+ * Serve the request from an inode preallocation: the best extent in @ac is
+ * set to the part of @pa that starts at the requested logical block, the
+ * status becomes AC_STATUS_FOUND and pa_free is reduced accordingly.
+ */
+static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
+                                struct ext4_prealloc_space *pa)
+{
+        ext4_fsblk_t first;
+        ext4_fsblk_t limit;
+        int nblocks;
+        /* physical block matching the requested logical block */
+        first = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+        /* never run past the end of the preallocation */
+        limit = min(pa->pa_pstart + pa->pa_len, first + ac->ac_o_ex.fe_len);
+        nblocks = limit - first;
+        ext4_get_group_no_and_offset(ac->ac_sb, first, &ac->ac_b_ex.fe_group,
+                                        &ac->ac_b_ex.fe_start);
+        ac->ac_b_ex.fe_len = nblocks;
+        ac->ac_status = AC_STATUS_FOUND;
+        ac->ac_pa = pa;
+        BUG_ON(first < pa->pa_pstart);
+        BUG_ON(first + nblocks > pa->pa_pstart + pa->pa_len);
+        BUG_ON(pa->pa_free < nblocks);
+        pa->pa_free -= nblocks;
+        mb_debug("use %llu/%lu from inode pa %p\n", first, nblocks, pa);
+}
+/*
+ * Serve the request from a locality group preallocation: the best extent
+ * in @ac is placed at pa_pstart with the originally requested length and
+ * the status becomes AC_STATUS_FOUND.  @pa itself is not modified here.
+ */
+static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
+                                struct ext4_prealloc_space *pa)
+{
+        unsigned want = ac->ac_o_ex.fe_len;
+        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
+                                        &ac->ac_b_ex.fe_group,
+                                        &ac->ac_b_ex.fe_start);
+        ac->ac_b_ex.fe_len = want;
+        ac->ac_status = AC_STATUS_FOUND;
+        ac->ac_pa = pa;
+        /*
+         * pa_pstart and the pa length are deliberately left alone here to
+         * avoid a possible race with the group being loaded concurrently;
+         * the pa is corrected later, once the blocks are marked in the
+         * on-disk bitmap -- see ext4_mb_release_context()
+         */
+        /*
+         * FIXME!! other CPUs can look at this pa in the meantime and
+         * conclude it still has enough free blocks, because pa_free is
+         * not updated here, right?
+         */
+        mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-want, want, pa);
+}
+/*
+ * Try to satisfy the request from existing preallocations: first the
+ * inode's own list, then (for EXT4_MB_HINT_GROUP_ALLOC requests) the list
+ * of the locality group.  Returns 1 when a preallocation was taken, 0
+ * otherwise.
+ */
+static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
+{
+        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+        struct ext4_locality_group *lg;
+        struct ext4_prealloc_space *pa;
+        struct list_head *pos;
+        /* preallocation exists for data only */
+        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+                return 0;
+        /* per-file preallocations come first */
+        rcu_read_lock();
+        list_for_each_rcu(pos, &ei->i_prealloc_list) {
+                pa = list_entry(pos, struct ext4_prealloc_space, pa_inode_list);
+                /* the fields tested here don't change,
+                 * so no locking is needed for them */
+                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
+                        ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+                        continue;
+                /* the range matches; is the pa still usable? */
+                spin_lock(&pa->pa_lock);
+                if (pa->pa_deleted || !pa->pa_free) {
+                        spin_unlock(&pa->pa_lock);
+                        continue;
+                }
+                atomic_inc(&pa->pa_count);
+                ext4_mb_use_inode_pa(ac, pa);
+                spin_unlock(&pa->pa_lock);
+                ac->ac_criteria = 10;
+                rcu_read_unlock();
+                return 1;
+        }
+        rcu_read_unlock();
+        /* is group allocation allowed for this request? */
+        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
+                return 0;
+        /* the request may come without a locality group */
+        lg = ac->ac_lg;
+        if (lg == NULL)
+                return 0;
+        rcu_read_lock();
+        list_for_each_rcu(pos, &lg->lg_prealloc_list) {
+                pa = list_entry(pos, struct ext4_prealloc_space, pa_inode_list);
+                spin_lock(&pa->pa_lock);
+                if (pa->pa_deleted != 0 ||
+                                pa->pa_free < ac->ac_o_ex.fe_len) {
+                        spin_unlock(&pa->pa_lock);
+                        continue;
+                }
+                atomic_inc(&pa->pa_count);
+                ext4_mb_use_group_pa(ac, pa);
+                spin_unlock(&pa->pa_lock);
+                ac->ac_criteria = 20;
+                rcu_read_unlock();
+                return 1;
+        }
+        rcu_read_unlock();
+        return 0;
+}
+/*
+ * Walk every preallocation linked to @group and mark its blocks as used
+ * in the in-core @bitmap; the buddy must be generated from this bitmap.
+ * Must be called with the ext4 group lock held (ext4_lock_group).
+ */
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                        ext4_group_t group)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+        struct ext4_prealloc_space *pa;
+        struct list_head *pos;
+        ext4_group_t pa_group;
+        ext4_grpblk_t first;
+        int preallocated = 0;
+        int count = 0;
+        int nr;
+        /*
+         * Every form of preallocation discard loads the group first, so
+         * the only code competing with us is preallocation use, and no
+         * extra locking is needed here.
+         * Note that preallocations with pa_deleted set are NOT ignored:
+         * otherwise used blocks could be left available for allocation
+         * in the buddy while a concurrent ext4_mb_put_pa() is dropping
+         * the preallocation.
+         */
+        list_for_each(pos, &grp->bb_prealloc_list) {
+                pa = list_entry(pos, struct ext4_prealloc_space, pa_group_list);
+                /* sample start and length together under pa_lock */
+                spin_lock(&pa->pa_lock);
+                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+                                             &pa_group, &first);
+                nr = pa->pa_len;
+                spin_unlock(&pa->pa_lock);
+                /* a pa with no length left has nothing to mark */
+                if (likely(nr != 0)) {
+                        BUG_ON(pa_group != group);
+                        mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+                                                        bitmap, first, nr);
+                        preallocated += nr;
+                        count++;
+                }
+        }
+        mb_debug("prellocated %u for group %lu\n", preallocated, group);
+}
+/* RCU callback: give a preallocation descriptor back to its slab cache */
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+        kmem_cache_free(ext4_pspace_cachep,
+                container_of(head, struct ext4_prealloc_space, u.pa_rcu));
+}
+/*
+ * drops a reference to preallocated space descriptor
+ * if this was the last reference and the space is consumed, the
+ * descriptor is marked deleted, unlinked from its group list and from
+ * its owner (inode or locality group) list, and freed via RCU
+ *
+ * @ac is not used by this function.
+ */
+static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+                        struct super_block *sb, struct ext4_prealloc_space *pa)
+{
+        ext4_fsblk_t grp_blk;
+        unsigned long grp;
+        /* nothing to tear down while references or free blocks remain */
+        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
+                return;
+        /* in this short window concurrent discard can set pa_deleted */
+        spin_lock(&pa->pa_lock);
+        if (pa->pa_deleted == 1) {
+                spin_unlock(&pa->pa_lock);
+                return;
+        }
+        pa->pa_deleted = 1;
+        spin_unlock(&pa->pa_lock);
+        /*
+         * Only a locality-group pa has pa_pstart advanced as its blocks
+         * are consumed (see ext4_mb_release_context()), so only for that
+         * kind can a fully used pa point at the first block of the next
+         * group and need stepping back by one.  An inode pa keeps its
+         * original pa_pstart; subtracting one from it would select the
+         * previous group whenever the pa begins on a group boundary and
+         * the wrong group lock would be taken below.
+         */
+        grp_blk = pa->pa_pstart;
+        if (pa->pa_linear)
+                grp_blk--;
+        ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+        /*
+         * possible race:
+         *
+         *  P1 (buddy init)                     P2 (regular allocation)
+         *                                      find block B in PA
+         *  copy on-disk bitmap to buddy
+         *                                      mark B in on-disk bitmap
+         *                                      drop PA from group
+         *  mark all PAs in buddy
+         *
+         * thus, P1 initializes buddy with B available. to prevent this
+         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
+         * against that pair
+         */
+        ext4_lock_group(sb, grp);
+        list_del(&pa->pa_group_list);
+        ext4_unlock_group(sb, grp);
+        /* unlink from the owner's list; RCU readers may still see the pa */
+        spin_lock(pa->pa_obj_lock);
+        list_del_rcu(&pa->pa_inode_list);
+        spin_unlock(pa->pa_obj_lock);
+        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+}
+/*
+ * Build a per-inode preallocation out of the extent just found.
+ *
+ * The found extent (ac_b_ex) is larger than the original request; it is
+ * recorded whole in a new descriptor, the original request is served
+ * from it, and the descriptor is linked into the group's list and the
+ * inode's list.
+ *
+ * Returns 0 on success, -ENOMEM if no descriptor could be allocated.
+ */
+static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+{
+        struct super_block *sb = ac->ac_sb;
+        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+        struct ext4_prealloc_space *pa;
+        struct ext4_group_info *grp;
+        /* a pa only makes sense when more was found than was asked for */
+        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+        if (pa == NULL)
+                return -ENOMEM;
+        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+                int room_left;
+                int room_len;
+                int shift;
+                int misalign;
+                /* less was found than the normalizer wanted, so the
+                 * found chunk needs a logical start that still covers
+                 * the original request */
+                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
+                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
+                /* how far the chunk may start before the requested
+                 * logical block without leaving the goal extent */
+                room_left = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+                /* how far it may start early and still cover the
+                 * whole original request */
+                room_len = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+                /* the tighter of the two limits is the real window */
+                shift = room_left < room_len ? room_left : room_len;
+                misalign = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+                if (misalign != 0 && misalign < shift)
+                        shift = misalign;
+                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - shift;
+                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
+                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+        }
+        /* using the pa below rewrites ac_b_ex, so keep a copy of what
+         * was really allocated for the history */
+        ac->ac_f_ex = ac->ac_b_ex;
+        /* describe the whole found extent; all of it starts out free */
+        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+        pa->pa_lstart = ac->ac_b_ex.fe_logical;
+        pa->pa_len = ac->ac_b_ex.fe_len;
+        pa->pa_free = pa->pa_len;
+        pa->pa_linear = 0;
+        pa->pa_deleted = 0;
+        spin_lock_init(&pa->pa_lock);
+        atomic_set(&pa->pa_count, 1);
+        mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+        /* carve the original request out of the fresh pa, then account
+         * for what is left in it */
+        ext4_mb_use_inode_pa(ac, pa);
+        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+        pa->pa_inode = ac->ac_inode;
+        pa->pa_obj_lock = &ei->i_prealloc_lock;
+        /* publish on the group list first, then on the inode list */
+        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+        spin_lock(pa->pa_obj_lock);
+        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+        spin_unlock(pa->pa_obj_lock);
+        return 0;
+}
+/*
+ * Build a preallocation for the locality group of the context out of
+ * the extent just found.
+ *
+ * The descriptor is marked linear, has no owning inode, and is linked
+ * into the group's list and the tail of the locality group's list.
+ *
+ * Returns 0 on success, -ENOMEM if no descriptor could be allocated.
+ */
+static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
+{
+        struct super_block *sb = ac->ac_sb;
+        struct ext4_prealloc_space *pa;
+        struct ext4_locality_group *lg;
+        struct ext4_group_info *grp;
+        /* a pa only makes sense when more was found than was asked for */
+        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+        BUG_ON(ext4_pspace_cachep == NULL);
+        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+        if (pa == NULL)
+                return -ENOMEM;
+        /* using the pa below rewrites ac_b_ex, so keep a copy of what
+         * was really allocated for the history */
+        ac->ac_f_ex = ac->ac_b_ex;
+        /* for this kind of pa the logical start mirrors the physical one */
+        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+        pa->pa_lstart = pa->pa_pstart;
+        pa->pa_len = ac->ac_b_ex.fe_len;
+        pa->pa_free = pa->pa_len;
+        pa->pa_linear = 1;
+        pa->pa_deleted = 0;
+        spin_lock_init(&pa->pa_lock);
+        atomic_set(&pa->pa_count, 1);
+        mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+        /* serve the original request from the fresh pa */
+        ext4_mb_use_group_pa(ac, pa);
+        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+        lg = ac->ac_lg;
+        BUG_ON(lg == NULL);
+        pa->pa_inode = NULL;
+        pa->pa_obj_lock = &lg->lg_prealloc_lock;
+        /* publish on the group list first, then on the locality group */
+        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+        spin_lock(pa->pa_obj_lock);
+        list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
+        spin_unlock(pa->pa_obj_lock);
+        return 0;
+}
+/*
+ * Create a preallocation of the kind selected for this context: a
+ * locality-group pa when EXT4_MB_HINT_GROUP_ALLOC is set, an inode pa
+ * otherwise.  Returns whatever the chosen constructor returns.
+ */
+static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+{
+        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+                return ext4_mb_new_group_pa(ac);
+        return ext4_mb_new_inode_pa(ac);
+}
+/*
+ * Find every block of @pa that is still unused in the on-disk bitmap
+ * and free it in the in-core bitmap and buddy.
+ * @pa must already be unlinked from the inode and group lists, so that
+ * nobody else can find or use it.
+ * The caller MUST hold the group/inode locks.
+ * The number of blocks freed must equal pa_free, otherwise this BUGs.
+ * Always returns 0.
+ * TODO: optimize the case when there are no in-core structures yet
+ */
+static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
+                                struct buffer_head *bitmap_bh,
+                                struct ext4_prealloc_space *pa)
+{
+        struct super_block *sb = e4b->bd_sb;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_allocation_context ac;
+        ext4_group_t group;
+        ext4_grpblk_t bit;
+        unsigned long end;
+        unsigned long next;
+        sector_t start;
+        int free = 0;
+        BUG_ON(pa->pa_deleted == 0);
+        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+        end = bit + pa->pa_len;
+        /* parts of the history record shared by every freed run */
+        ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+        ac.ac_inode = pa->pa_inode;
+        ac.ac_sb = sb;
+        while (bit < end) {
+                /* start of the next run of zero bits inside the pa */
+                bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+                if (bit >= end)
+                        break;
+                /* the run ends at the next set bit, clamped to the pa */
+                next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
+                if (next > end)
+                        next = end;
+                start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
+                                le32_to_cpu(sbi->s_es->s_first_data_block);
+                mb_debug("    free preallocated %u/%u in group %u\n",
+                                (unsigned) start, (unsigned) next - bit,
+                                (unsigned) group);
+                free += next - bit;
+                /* record the run in the history, then release it */
+                ac.ac_b_ex.fe_logical = 0;
+                ac.ac_b_ex.fe_len = next - bit;
+                ac.ac_b_ex.fe_start = bit;
+                ac.ac_b_ex.fe_group = group;
+                ext4_mb_store_history(&ac);
+                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
+                /* bit 'next' is in use (or is 'end'); resume after it */
+                bit = next + 1;
+        }
+        /* report the details before the consistency check below fires */
+        if (free != pa->pa_free) {
+                printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n",
+                        pa, (unsigned long) pa->pa_lstart,
+                        (unsigned long) pa->pa_pstart,
+                        (unsigned long) pa->pa_len);
+                printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free);
+        }
+        BUG_ON(free != pa->pa_free);
+        atomic_add(free, &sbi->s_mb_discarded);
+        return 0;
+}
+/*
+ * Release what is left of a locality-group preallocation: the pa_len
+ * blocks starting at pa_pstart are freed in the buddy, counted as
+ * discarded and logged in the history.  @pa must already be marked
+ * deleted.  Always returns 0.
+ */
+static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
+                                struct ext4_prealloc_space *pa)
+{
+        struct super_block *sb = e4b->bd_sb;
+        struct ext4_allocation_context ac;
+        ext4_grpblk_t bit;
+        ext4_group_t group;
+        BUG_ON(pa->pa_deleted == 0);
+        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
+        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+        /* log the discard in the allocator history */
+        ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+        ac.ac_sb = sb;
+        ac.ac_inode = NULL;
+        ac.ac_b_ex.fe_logical = 0;
+        ac.ac_b_ex.fe_len = pa->pa_len;
+        ac.ac_b_ex.fe_start = bit;
+        ac.ac_b_ex.fe_group = group;
+        ext4_mb_store_history(&ac);
+        return 0;
+}
+/*
+ * releases all preallocations in given group
+ *
+ * first, we need to decide discard policy:
+ * - when do we discard
+ *   1) ENOSPC
+ * - how many do we discard
+ *   1) how many requested
+ *
+ * @needed is the number of blocks wanted; 0 means "everything in the
+ * group".  Preallocations that are currently referenced are skipped and
+ * the scan is retried while fewer than @needed blocks were collected.
+ *
+ * Returns the number of free blocks carried by the discarded
+ * preallocations, or 0 if the block bitmap or the buddy could not be
+ * loaded.
+ */
+static int ext4_mb_discard_group_preallocations(struct super_block *sb,
+                                        ext4_group_t group, int needed)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+        struct buffer_head *bitmap_bh = NULL;
+        struct ext4_prealloc_space *pa, *tmp;
+        struct list_head list;
+        struct ext4_buddy e4b;
+        int err;
+        int busy = 0;
+        int free = 0;
+        mb_debug("discard preallocation for group %lu\n", group);
+        if (list_empty(&grp->bb_prealloc_list))
+                return 0;
+        bitmap_bh = read_block_bitmap(sb, group);
+        if (bitmap_bh == NULL) {
+                /* e4b has not been loaded yet, so there is nothing to
+                 * release; report the failure and discard nothing */
+                printk(KERN_ERR "EXT4-fs: error reading block bitmap "
+                                "for group %lu\n", (unsigned long) group);
+                return 0;
+        }
+        err = ext4_mb_load_buddy(sb, group, &e4b);
+        if (err) {
+                /* only the bitmap buffer is held at this point */
+                printk(KERN_ERR "EXT4-fs: error %d loading buddy "
+                                "for group %lu\n", err, (unsigned long) group);
+                put_bh(bitmap_bh);
+                return 0;
+        }
+        /* 0 means "discard everything": ask for more than a group holds */
+        if (needed == 0)
+                needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+        INIT_LIST_HEAD(&list);
+repeat:
+        ext4_lock_group(sb, group);
+        list_for_each_entry_safe(pa, tmp,
+                                &grp->bb_prealloc_list, pa_group_list) {
+                spin_lock(&pa->pa_lock);
+                /* still referenced by an allocation in progress */
+                if (atomic_read(&pa->pa_count)) {
+                        spin_unlock(&pa->pa_lock);
+                        busy = 1;
+                        continue;
+                }
+                /* somebody else is already tearing this one down */
+                if (pa->pa_deleted) {
+                        spin_unlock(&pa->pa_lock);
+                        continue;
+                }
+                /* seems this one can be freed ... */
+                pa->pa_deleted = 1;
+                /* we can trust pa_free ... */
+                free += pa->pa_free;
+                spin_unlock(&pa->pa_lock);
+                /* move it from the group list to our private list */
+                list_del(&pa->pa_group_list);
+                list_add(&pa->u.pa_tmp_list, &list);
+        }
+        /* if we still need more blocks and some PAs were used, try again */
+        if (free < needed && busy) {
+                busy = 0;
+                ext4_unlock_group(sb, group);
+                /*
+                 * Yield the CPU here so that we don't get soft lockup
+                 * in non preempt case.
+                 */
+                yield();
+                goto repeat;
+        }
+        /* found anything to free? */
+        if (list_empty(&list)) {
+                BUG_ON(free != 0);
+                goto out;
+        }
+        /* now free all selected PAs */
+        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+                /* remove from object (inode or locality group) */
+                spin_lock(pa->pa_obj_lock);
+                list_del_rcu(&pa->pa_inode_list);
+                spin_unlock(pa->pa_obj_lock);
+                if (pa->pa_linear)
+                        ext4_mb_release_group_pa(&e4b, pa);
+                else
+                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+                list_del(&pa->u.pa_tmp_list);
+                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+        }
+out:
+        /* the group lock is held on every path that reaches this label */
+        ext4_unlock_group(sb, group);
+        ext4_mb_release_desc(&e4b);
+        put_bh(bitmap_bh);
+        return free;
+}
+/*
+ * releases all non-used preallocated blocks for given inode
+ *
+ * It's important to discard preallocations under i_data_sem
+ * We don't want another block to be served from the prealloc
+ * space when we are discarding the inode prealloc space.
+ *
+ * FIXME!! Make sure it is valid at all the call sites
+ *
+ * Does nothing unless the MBALLOC mount option is set and the inode is
+ * a regular file.
+ *
+ * Works in two phases.  First, under i_prealloc_lock, every pa on the
+ * inode's list is marked deleted and moved to a private list; if a pa
+ * is still referenced (pa_count != 0) or is already being deleted by
+ * someone else, all locks are dropped, the task sleeps for HZ jiffies
+ * and the scan restarts from the head of the list.  Second, for each
+ * collected pa the buddy and the block bitmap of its group are loaded,
+ * the pa is unlinked from the group list and released under the group
+ * lock, and the descriptor is freed via RCU.
+ */
+void ext4_mb_discard_inode_preallocations(struct inode *inode)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        struct super_block *sb = inode->i_sb;
+        struct buffer_head *bitmap_bh = NULL;
+        struct ext4_prealloc_space *pa, *tmp;
+        ext4_group_t group = 0;
+        struct list_head list;
+        struct ext4_buddy e4b;
+        int err;
+        if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+                /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
+                return;
+        }
+        mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+        INIT_LIST_HEAD(&list);
+repeat:
+        /* first, collect all pa's in the inode; pa's moved to 'list' on
+         * an earlier pass stay there across a 'goto repeat' */
+        spin_lock(&ei->i_prealloc_lock);
+        while (!list_empty(&ei->i_prealloc_list)) {
+                pa = list_entry(ei->i_prealloc_list.next,
+                                struct ext4_prealloc_space, pa_inode_list);
+                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+                spin_lock(&pa->pa_lock);
+                if (atomic_read(&pa->pa_count)) {
+                        /* this shouldn't happen often - nobody should
+                         * use preallocation while we're discarding it */
+                        spin_unlock(&pa->pa_lock);
+                        spin_unlock(&ei->i_prealloc_lock);
+                        printk(KERN_ERR "uh-oh! used pa while discarding\n");
+                        WARN_ON(1);
+                        schedule_timeout_uninterruptible(HZ);
+                        goto repeat;
+                }
+                if (pa->pa_deleted == 0) {
+                        pa->pa_deleted = 1;
+                        spin_unlock(&pa->pa_lock);
+                        list_del_rcu(&pa->pa_inode_list);
+                        list_add(&pa->u.pa_tmp_list, &list);
+                        continue;
+                }
+                /* someone is deleting pa right now */
+                spin_unlock(&pa->pa_lock);
+                spin_unlock(&ei->i_prealloc_lock);
+                /* we have to wait here because pa_deleted
+                 * doesn't mean pa is already unlinked from
+                 * the list. as we might be called from
+                 * ->clear_inode() the inode will get freed
+                 * and concurrent thread which is unlinking
+                 * pa from inode's list may access already
+                 * freed memory, bad-bad-bad */
+                /* XXX: if this happens too often, we can
+                 * add a flag to force wait only in case
+                 * of ->clear_inode(), but not in case of
+                 * regular truncate */
+                schedule_timeout_uninterruptible(HZ);
+                goto repeat;
+        }
+        spin_unlock(&ei->i_prealloc_lock);
+        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+                BUG_ON(pa->pa_linear != 0);
+                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+                err = ext4_mb_load_buddy(sb, group, &e4b);
+                BUG_ON(err != 0); /* error handling here */
+                bitmap_bh = read_block_bitmap(sb, group);
+                if (bitmap_bh == NULL) {
+                        /* error handling here; the buddy was loaded just
+                         * above, so it is dropped before the BUG fires */
+                        ext4_mb_release_desc(&e4b);
+                        BUG_ON(bitmap_bh == NULL);
+                }
+                ext4_lock_group(sb, group);
+                list_del(&pa->pa_group_list);
+                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+                ext4_unlock_group(sb, group);
+                ext4_mb_release_desc(&e4b);
+                put_bh(bitmap_bh);
+                list_del(&pa->u.pa_tmp_list);
+                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+        }
+}
+/*
+ * Intended to find the preallocated spaces that blocks being freed
+ * belong to and return the blocks to them, freeing a space in the buddy
+ * once it becomes full again (no block of it is in use).
+ * XXX: at the moment truncate (which is the only way to free blocks)
+ * discards all preallocations, so nothing is returned here: the
+ * function only asserts that the inode has no preallocation left.
+ * @e4b, @block and @count are not used.
+ */
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+                                        struct ext4_buddy *e4b,
+                                        sector_t block, int count)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        BUG_ON(!list_empty(&ei->i_prealloc_list));
+}
+#ifdef MB_DEBUG
+/*
+ * Debug aid: dump the allocation context (status, flags, original/goal/
+ * best extents, scan counters) followed by, for every block group, its
+ * preallocations and - when the group has free blocks - its free and
+ * fragment counts.  Compiled to an empty stub unless MB_DEBUG is defined.
+ */
+static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+        struct super_block *sb = ac->ac_sb;
+        ext4_group_t i;
+        printk(KERN_ERR "EXT4-fs: Can't allocate:"
+                        " Allocation context details:\n");
+        printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
+                        ac->ac_status, ac->ac_flags);
+        printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
+                        "best %lu/%lu/%lu@%lu cr %d\n",
+                        (unsigned long)ac->ac_o_ex.fe_group,
+                        (unsigned long)ac->ac_o_ex.fe_start,
+                        (unsigned long)ac->ac_o_ex.fe_len,
+                        (unsigned long)ac->ac_o_ex.fe_logical,
+                        (unsigned long)ac->ac_g_ex.fe_group,
+                        (unsigned long)ac->ac_g_ex.fe_start,
+                        (unsigned long)ac->ac_g_ex.fe_len,
+                        (unsigned long)ac->ac_g_ex.fe_logical,
+                        (unsigned long)ac->ac_b_ex.fe_group,
+                        (unsigned long)ac->ac_b_ex.fe_start,
+                        (unsigned long)ac->ac_b_ex.fe_len,
+                        (unsigned long)ac->ac_b_ex.fe_logical,
+                        (int)ac->ac_criteria);
+        printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
+                ac->ac_found);
+        printk(KERN_ERR "EXT4-fs: groups: \n");
+        for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+                struct ext4_prealloc_space *pa;
+                ext4_grpblk_t start;
+                struct list_head *cur;
+                /* the group lock keeps bb_prealloc_list stable while
+                 * it is walked */
+                ext4_lock_group(sb, i);
+                list_for_each(cur, &grp->bb_prealloc_list) {
+                        pa = list_entry(cur, struct ext4_prealloc_space,
+                                        pa_group_list);
+                        spin_lock(&pa->pa_lock);
+                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+                                                     NULL, &start);
+                        spin_unlock(&pa->pa_lock);
+                        printk(KERN_ERR "PA:%lu:%d:%u \n", i,
+                                                        start, pa->pa_len);
+                }
+                /* release the lock taken above; taking it a second time
+                 * here would deadlock on this group or leave it locked */
+                ext4_unlock_group(sb, i);
+                if (grp->bb_free == 0)
+                        continue;
+                printk(KERN_ERR "%lu: %d/%d \n",
+                       i, grp->bb_free, grp->bb_fragments);
+        }
+        printk(KERN_ERR "\n");
+}
+#else
+static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+        return;
+}
+#endif
+/*
+ * Decide whether this allocation uses locality-group preallocation.
+ *
+ * Group preallocation is used for small files, where the size of the
+ * file is the larger of its current size and the size it will have
+ * after this allocation, both counted in blocks.
+ *
+ * One can tune the threshold via /proc/fs/ext4/<partition>/stream_req
+ *
+ * When group allocation is chosen, ac_lg is set, the
+ * EXT4_MB_HINT_GROUP_ALLOC flag is raised and this function returns
+ * with the locality group's lg_mutex held.
+ */
+static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+        int bsbits = ac->ac_sb->s_blocksize_bits;
+        loff_t size, isize;
+        int cpu;
+        /* only data allocations are candidates */
+        if ((ac->ac_flags & EXT4_MB_HINT_DATA) == 0)
+                return;
+        /* file size in blocks: after this request vs. right now */
+        size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+        isize = i_size_read(ac->ac_inode) >> bsbits;
+        if (isize > size)
+                size = isize;
+        /* don't use group allocation for large files */
+        if (size >= sbi->s_mb_stream_request)
+                return;
+        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+                return;
+        BUG_ON(ac->ac_lg != NULL);
+        /*
+         * locality group prealloc space are per cpu. The reason for having
+         * per cpu locality group is to reduce the contention between block
+         * request from multiple CPUs.
+         */
+        cpu = get_cpu();
+        ac->ac_lg = &sbi->s_locality_groups[cpu];
+        put_cpu();
+        /* from here on the context is a group allocation */
+        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
+        /* serialize all allocations in the group */
+        mutex_lock(&ac->ac_lg->lg_mutex);
+}
+/*
+ * Fill in an allocation context from an allocation request.
+ *
+ * The request length is capped just below the group size, an
+ * out-of-range goal is replaced by the first data block, and the
+ * original and goal extents are both set to the (capped) request at the
+ * goal position.  Finally the group-vs-file policy is applied, which
+ * may take the locality group's mutex.  Always returns 0.
+ */
+static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
+                                struct ext4_allocation_request *ar)
+{
+        struct super_block *sb = ar->inode->i_sb;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_super_block *es = sbi->s_es;
+        ext4_grpblk_t block;
+        ext4_group_t group;
+        unsigned long goal = ar->goal;
+        unsigned long len = ar->len;
+        /* we can't allocate > group size;
+         * just a dirty hack to filter too big requests  */
+        if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
+                len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+        /* start searching from the goal, if it is a valid block */
+        if (goal >= ext4_blocks_count(es) ||
+                        goal < le32_to_cpu(es->s_first_data_block))
+                goal = le32_to_cpu(es->s_first_data_block);
+        ext4_get_group_no_and_offset(sb, goal, &group, &block);
+        /* general state */
+        ac->ac_sb = sb;
+        ac->ac_inode = ar->inode;
+        ac->ac_flags = ar->flags;
+        ac->ac_status = AC_STATUS_CONTINUE;
+        ac->ac_groups_scanned = 0;
+        ac->ac_ex_scanned = 0;
+        ac->ac_found = 0;
+        ac->ac_2order = 0;
+        ac->ac_criteria = 0;
+        ac->ac_pa = NULL;
+        ac->ac_lg = NULL;
+        ac->ac_bitmap_page = NULL;
+        ac->ac_buddy_page = NULL;
+        /* original request */
+        ac->ac_o_ex.fe_logical = ar->logical;
+        ac->ac_o_ex.fe_group = group;
+        ac->ac_o_ex.fe_start = block;
+        ac->ac_o_ex.fe_len = len;
+        /* goal starts out identical to the original request */
+        ac->ac_g_ex.fe_logical = ar->logical;
+        ac->ac_g_ex.fe_group = group;
+        ac->ac_g_ex.fe_start = block;
+        ac->ac_g_ex.fe_len = len;
+        /* nothing found yet */
+        ac->ac_b_ex.fe_logical = ar->logical;
+        ac->ac_b_ex.fe_group = 0;
+        ac->ac_b_ex.fe_start = 0;
+        ac->ac_b_ex.fe_len = 0;
+        ac->ac_f_ex.fe_len = 0;
+        /* we have to define the context: will we work with a file or a
+         * locality group? this is a policy, actually */
+        ext4_mb_group_or_file(ac);
+        mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+                        "left: %u/%u, right %u/%u to %swritable\n",
+                        (unsigned) ar->len, (unsigned) ar->logical,
+                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
+                        (unsigned) ar->lleft, (unsigned) ar->pleft,
+                        (unsigned) ar->lright, (unsigned) ar->pright,
+                        atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+        return 0;
+}
+/*
+ * Release every resource the allocation context picked up: the
+ * reference on the preallocation (after advancing a locality-group pa
+ * past the blocks just handed out), the cached bitmap and buddy pages,
+ * and the locality group's mutex when group allocation was in use.
+ * Statistics are collected last.  Always returns 0.
+ */
+static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+{
+        struct ext4_prealloc_space *pa = ac->ac_pa;
+        if (pa != NULL) {
+                if (pa->pa_linear) {
+                        /* the correction postponed by
+                         * ext4_mb_use_group_pa() happens here */
+                        spin_lock(&pa->pa_lock);
+                        pa->pa_pstart += ac->ac_b_ex.fe_len;
+                        pa->pa_lstart += ac->ac_b_ex.fe_len;
+                        pa->pa_free -= ac->ac_b_ex.fe_len;
+                        pa->pa_len -= ac->ac_b_ex.fe_len;
+                        spin_unlock(&pa->pa_lock);
+                }
+                ext4_mb_put_pa(ac, ac->ac_sb, pa);
+        }
+        if (ac->ac_bitmap_page != NULL)
+                page_cache_release(ac->ac_bitmap_page);
+        if (ac->ac_buddy_page != NULL)
+                page_cache_release(ac->ac_buddy_page);
+        /* pairs with the mutex_lock() in ext4_mb_group_or_file() */
+        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+                mutex_unlock(&ac->ac_lg->lg_mutex);
+        ext4_mb_collect_stats(ac);
+        return 0;
+}
+/*
+ * Discard preallocations group by group, starting at group 0, until
+ * @needed blocks have been reclaimed or every group has been visited.
+ * Returns the total number of blocks reclaimed.
+ */
+static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
+{
+        ext4_group_t group = 0;
+        int freed = 0;
+        int got;
+        while (group < EXT4_SB(sb)->s_groups_count && needed > 0) {
+                got = ext4_mb_discard_group_preallocations(sb, group, needed);
+                needed -= got;
+                freed += got;
+                group++;
+        }
+        return freed;
+}
+/*
+ * Main entry point into mballoc to allocate blocks
+ * it tries to use preallocation first, then falls back
+ * to usual allocation
+ *
+ * @handle: journal handle the allocation is made under
+ * @ar:     allocation request; ar->len is updated to the number of
+ *          blocks actually allocated (0 on failure) and ar->flags may
+ *          gain EXT4_MB_HINT_NOPREALLOC when quota forced a shorter
+ *          request
+ * @errp:   receives 0 on success, -EDQUOT when quota allows no block at
+ *          all, -ENOSPC when nothing could be allocated, or the error
+ *          from context initialization
+ *
+ * When the MBALLOC mount option is off the request is passed straight
+ * to ext4_new_blocks_old().
+ *
+ * Returns the first allocated block, or 0 when nothing was allocated by
+ * this function.
+ */
+ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+                                 struct ext4_allocation_request *ar, int *errp)
+{
+        struct ext4_allocation_context ac;
+        struct ext4_sb_info *sbi;
+        struct super_block *sb;
+        ext4_fsblk_t block = 0;
+        int freed;
+        int inquota;
+        sb = ar->inode->i_sb;
+        sbi = EXT4_SB(sb);
+        if (!test_opt(sb, MBALLOC)) {
+                block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+                                            &(ar->len), errp);
+                return block;
+        }
+        while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { /* shrink the request until quota accepts it */
+                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+                ar->len--;
+        }
+        if (ar->len == 0) {
+                *errp = -EDQUOT;
+                return 0;
+        }
+        inquota = ar->len; /* blocks charged to quota; any excess is given back at 'out' */
+        ext4_mb_poll_new_transaction(sb, handle);
+        *errp = ext4_mb_initialize_context(&ac, ar);
+        if (*errp) {
+                ar->len = 0;
+                goto out;
+        }
+        ac.ac_op = EXT4_MB_HISTORY_PREALLOC;
+        if (!ext4_mb_use_preallocated(&ac)) {
+                ac.ac_op = EXT4_MB_HISTORY_ALLOC;
+                ext4_mb_normalize_request(&ac, ar);
+repeat:
+                /* allocate space in core; this point is re-entered from
+                 * the failure branch below after preallocations have
+                 * been discarded */
+                ext4_mb_regular_allocator(&ac);
+                /* as we've just preallocated more space than
+                 * user requested originally, we store allocated
+                 * space in a special descriptor
+                 * NOTE(review): the return value of
+                 * ext4_mb_new_preallocation() is not checked here */
+                if (ac.ac_status == AC_STATUS_FOUND &&
+                                ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len)
+                        ext4_mb_new_preallocation(&ac);
+        }
+        if (likely(ac.ac_status == AC_STATUS_FOUND)) {
+                ext4_mb_mark_diskspace_used(&ac, handle);
+                *errp = 0;
+                block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex);
+                ar->len = ac.ac_b_ex.fe_len;
+        } else {
+                freed  = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len);
+                if (freed) /* something was reclaimed, so search again */
+                        goto repeat;
+                *errp = -ENOSPC;
+                ac.ac_b_ex.fe_len = 0;
+                ar->len = 0;
+                ext4_mb_show_ac(&ac);
+        }
+        ext4_mb_release_context(&ac);
+out:
+        if (ar->len < inquota)
+                DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+        return block;
+}
+/*
+ * Notice when @handle belongs to a transaction other than the one recorded
+ * in sbi->s_last_transaction.  If so, age the per-superblock lists of freed
+ * metadata by one generation and call ext4_mb_free_committed_blocks().
+ */
+static void ext4_mb_poll_new_transaction(struct super_block *sb,
+                                                handle_t *handle)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
+                /*
+                 * A new transaction has started, so it is time to close
+                 * the last one and free the blocks of the committed one.
+                 * Only one transaction can be active at a time: the
+                 * previous transaction may still be being logged, but the
+                 * one before it is known to be logged already.  Blocks
+                 * freed in any transaction older than the previous one
+                 * may therefore be released now.
+                 */
+                spin_lock(&sbi->s_md_lock);
+                /* test again now that s_md_lock is held */
+                if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
+                        mb_debug("new transaction %lu, old %lu\n",
+                                        (unsigned long) handle->h_transaction->t_tid,
+                                        (unsigned long) sbi->s_last_transaction);
+                        /* closed -> committed, then active -> closed */
+                        list_splice_init(&sbi->s_closed_transaction,
+                                        &sbi->s_committed_transaction);
+                        list_splice_init(&sbi->s_active_transaction,
+                                        &sbi->s_closed_transaction);
+                        sbi->s_last_transaction = handle->h_transaction->t_tid;
+                }
+                spin_unlock(&sbi->s_md_lock);
+                ext4_mb_free_committed_blocks(sb);
+        }
+}
+/*
+ * Record @count blocks, starting at @block, in ext4_free_metadata
+ * containers hanging off the group's ext4_group_info (bb_md_cur), one
+ * container per transaction id, instead of handing them to the buddy
+ * right away.  A new container is linked onto sbi->s_active_transaction.
+ *
+ * Returns 0 on success or -ENOMEM when a container cannot be allocated;
+ * in the -ENOMEM case the group lock has already been dropped and blocks
+ * recorded by earlier iterations stay recorded.
+ */
+static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
+                          ext4_group_t group, ext4_grpblk_t block, int count)
+{
+        struct ext4_group_info *db = e4b->bd_info;
+        struct super_block *sb = e4b->bd_sb;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_free_metadata *md;
+        int i;
+        BUG_ON(e4b->bd_bitmap_page == NULL);
+        BUG_ON(e4b->bd_buddy_page == NULL);
+        ext4_lock_group(sb, group);
+        for (i = 0; i < count; i++) {
+                md = db->bb_md_cur;
+                /* a container filled under a different transaction id
+                 * must not be reused: detach it and start a fresh one */
+                if (md && db->bb_tid != handle->h_transaction->t_tid) {
+                        db->bb_md_cur = NULL;
+                        md = NULL;
+                }
+                if (md == NULL) {
+                        /* the group lock is released around kmalloc() */
+                        ext4_unlock_group(sb, group);
+                        md = kmalloc(sizeof(*md), GFP_NOFS);
+                        if (md == NULL)
+                                return -ENOMEM;
+                        md->num = 0;
+                        md->group = group;
+                        ext4_lock_group(sb, group);
+                        /* bb_md_cur may have been set while the lock was
+                         * dropped; if so use that one and free ours */
+                        if (db->bb_md_cur == NULL) {
+                                spin_lock(&sbi->s_md_lock);
+                                list_add(&md->list, &sbi->s_active_transaction);
+                                spin_unlock(&sbi->s_md_lock);
+                                /* protect buddy cache from being freed,
+                                 * otherwise we'll refresh it from
+                                 * on-disk bitmap and lose not-yet-available
+                                 * blocks */
+                                page_cache_get(e4b->bd_buddy_page);
+                                page_cache_get(e4b->bd_bitmap_page);
+                                db->bb_md_cur = md;
+                                db->bb_tid = handle->h_transaction->t_tid;
+                                mb_debug("new md 0x%p for group %lu\n",
+                                                md, md->group);
+                        } else {
+                                kfree(md);
+                                md = db->bb_md_cur;
+                        }
+                }
+                BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
+                md->blocks[md->num] = block + i;
+                md->num++;
+                if (md->num == EXT4_BB_MAX_BLOCKS) {
+                        /* no more space: detach the full container (it is
+                         * already on the sb's list) so that the next block
+                         * allocates a new one */
+                        db->bb_md_cur = NULL;
+                }
+        }
+        ext4_unlock_group(sb, group);
+        return 0;
+}
+/*
+ * Main entry point into mballoc to free blocks
+ *
+ * @handle:   journal handle used for the bitmap and group descriptor updates
+ * @inode:    inode the blocks belonged to
+ * @block:    first block to free
+ * @count:    number of blocks to free
+ * @metadata: non-zero if the blocks held metadata; such blocks are queued
+ *            via ext4_mb_free_metadata() instead of going back to the
+ *            buddy immediately
+ * @freed:    out parameter, set to the number of blocks freed
+ *
+ * A range that crosses a block group boundary is processed one group at
+ * a time (see the do_more loop).  Errors are reported through
+ * ext4_std_error(); nothing is returned to the caller.
+ */
+void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
+                        unsigned long block, unsigned long count,
+                        int metadata, unsigned long *freed)
+{
+        struct buffer_head *bitmap_bh = NULL;
+        struct super_block *sb = inode->i_sb;
+        struct ext4_allocation_context ac;
+        struct ext4_group_desc *gdp;
+        struct ext4_super_block *es;
+        unsigned long overflow;
+        ext4_grpblk_t bit;
+        struct buffer_head *gd_bh;
+        ext4_group_t block_group;
+        struct ext4_sb_info *sbi;
+        struct ext4_buddy e4b;
+        int err = 0;
+        int ret;
+        *freed = 0;
+        ext4_mb_poll_new_transaction(sb, handle);
+        sbi = EXT4_SB(sb);
+        es = EXT4_SB(sb)->s_es;
+        /* reject ranges below the first data block, ranges that wrap,
+         * and ranges that run past the end of the filesystem */
+        if (block < le32_to_cpu(es->s_first_data_block) ||
+            block + count < block ||
+            block + count > ext4_blocks_count(es)) {
+                ext4_error(sb, __FUNCTION__,
+                            "Freeing blocks not in datazone - "
+                            "block = %lu, count = %lu", block, count);
+                goto error_return;
+        }
+        ext4_debug("freeing block %lu\n", block);
+        ac.ac_op = EXT4_MB_HISTORY_FREE;
+        ac.ac_inode = inode;
+        ac.ac_sb = sb;
+do_more:
+        overflow = 0;
+        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+        /*
+         * Check to see if we are freeing blocks across a group
+         * boundary.
+         */
+        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+                overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+                count -= overflow;
+        }
+        bitmap_bh = read_block_bitmap(sb, block_group);
+        if (!bitmap_bh)
+                goto error_return;
+        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
+        if (!gdp)
+                goto error_return;
+        /* the range must not touch the group's own bitmaps or inode
+         * table; this is reported but the free still proceeds */
+        if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
+            in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
+            in_range(block, ext4_inode_table(sb, gdp),
+                      EXT4_SB(sb)->s_itb_per_group) ||
+            in_range(block + count - 1, ext4_inode_table(sb, gdp),
+                      EXT4_SB(sb)->s_itb_per_group)) {
+                ext4_error(sb, __FUNCTION__,
+                           "Freeing blocks in system zone - "
+                           "Block = %lu, count = %lu", block, count);
+        }
+        BUFFER_TRACE(bitmap_bh, "getting write access");
+        err = ext4_journal_get_write_access(handle, bitmap_bh);
+        if (err)
+                goto error_return;
+        /*
+         * We are about to modify some metadata.  Call the journal APIs
+         * to unshare ->b_data if a currently-committing transaction is
+         * using it
+         */
+        BUFFER_TRACE(gd_bh, "get_write_access");
+        err = ext4_journal_get_write_access(handle, gd_bh);
+        if (err)
+                goto error_return;
+        err = ext4_mb_load_buddy(sb, block_group, &e4b);
+        if (err)
+                goto error_return;
+#ifdef AGGRESSIVE_CHECK
+        {
+                int i;
+                for (i = 0; i < count; i++)
+                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
+        }
+#endif
+        mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                        bit, count);
+        /* We dirtied the bitmap block */
+        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+        ac.ac_b_ex.fe_group = block_group;
+        ac.ac_b_ex.fe_start = bit;
+        ac.ac_b_ex.fe_len = count;
+        ext4_mb_store_history(&ac);
+        if (metadata) {
+                /* blocks being freed are metadata. these blocks shouldn't
+                 * be used until this transaction is committed */
+                /* NOTE(review): the return value (-ENOMEM is possible)
+                 * is dropped here - confirm that is intended */
+                ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+        } else {
+                ext4_lock_group(sb, block_group);
+                err = mb_free_blocks(inode, &e4b, bit, count);
+                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
+                ext4_unlock_group(sb, block_group);
+                BUG_ON(err != 0);
+        }
+        /* account the freed blocks in the group descriptor and in the
+         * filesystem-wide free block counter */
+        spin_lock(sb_bgl_lock(sbi, block_group));
+        gdp->bg_free_blocks_count =
+                cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+        spin_unlock(sb_bgl_lock(sbi, block_group));
+        percpu_counter_add(&sbi->s_freeblocks_counter, count);
+        ext4_mb_release_desc(&e4b);
+        *freed += count;
+        /* And the group descriptor block */
+        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+        ret = ext4_journal_dirty_metadata(handle, gd_bh);
+        if (!err)
+                err = ret;
+        /* the part of the range that spilled into the next group */
+        if (overflow && !err) {
+                block += count;
+                count = overflow;
+                put_bh(bitmap_bh);
+                goto do_more;
+        }
+        sb->s_dirt = 1;
+error_return:
+        brelse(bitmap_bh);
+        ext4_std_error(sb, err);
+        return;
+}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
new file mode 100644
index 000000000000..3ebc2332f52e
--- /dev/null
+++ b/fs/ext4/migrate.c
@@ -0,0 +1,560 @@
+/*
+ * Copyright IBM Corporation, 2007
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs_extents.h>
+/*
+ * One run of blocks that is contiguous both logically and physically,
+ * i.e. a run that a single extent can represent
+ */
+struct list_blocks_struct {
+        ext4_lblk_t first_block;        /* first logical block of the run */
+        ext4_lblk_t last_block;         /* last logical block of the run */
+        ext4_fsblk_t first_pblock;      /* first physical block; 0 = no run */
+        ext4_fsblk_t last_pblock;       /* last physical block of the run */
+};
+/*
+ * Insert the run of blocks collected in @lb into @inode's extent tree as
+ * one extent, extending or restarting @handle to get the credits the
+ * insertion needs.  Does nothing when no run is open (first_pblock == 0).
+ *
+ * The run is marked closed on every exit, and the path obtained from
+ * ext4_ext_find_extent() is released on every exit.
+ *
+ * Returns 0 on success or the error from the helper that failed.
+ */
+static int finish_range(handle_t *handle, struct inode *inode,
+                                struct list_blocks_struct *lb)
+{
+        int retval = 0, needed;
+        struct ext4_extent newext;
+        struct ext4_ext_path *path;
+        if (lb->first_pblock == 0)
+                return 0;
+        /* Add the extent to temp inode*/
+        newext.ee_block = cpu_to_le32(lb->first_block);
+        newext.ee_len   = cpu_to_le16(lb->last_block - lb->first_block + 1);
+        ext4_ext_store_pblock(&newext, lb->first_pblock);
+        path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+        if (IS_ERR(path)) {
+                retval = PTR_ERR(path);
+                /* nothing to release at err_out */
+                path = NULL;
+                goto err_out;
+        }
+        /*
+         * Calculate the credit needed to inserting this extent
+         * Since we are doing this in loop we may accumulate extra
+         * credit. But below we try to not accumulate too much
+         * of them by restarting the journal.
+         */
+        needed = ext4_ext_calc_credits_for_insert(inode, path);
+        /*
+         * Make sure the credit we accumulated is not really high
+         */
+        if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+                retval = ext4_journal_restart(handle, needed);
+                if (retval)
+                        goto err_out;
+        }
+        if (needed) {
+                retval = ext4_journal_extend(handle, needed);
+                if (retval != 0) {
+                        /*
+                         * IF not able to extend the journal restart the journal
+                         */
+                        retval = ext4_journal_restart(handle, needed);
+                        if (retval)
+                                goto err_out;
+                }
+        }
+        retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+err_out:
+        /*
+         * Release the lookup path; without this every call leaked it.
+         * NOTE(review): requires ext4_ext_drop_refs() to be declared in
+         * ext4_fs_extents.h and not static in extents.c - confirm.
+         */
+        if (path) {
+                ext4_ext_drop_refs(path);
+                kfree(path);
+        }
+        lb->first_pblock = 0;
+        return retval;
+}
+/*
+ * Account one mapped block (logical @blk_num, physical @pblock) in the
+ * run tracked by @lb.  A block that directly follows the open run, both
+ * logically and physically, extends it.  Anything else closes the open
+ * run through finish_range() and opens a new one-block run.
+ * Returns 0 or the result of finish_range().
+ */
+static int update_extent_range(handle_t *handle, struct inode *inode,
+                                ext4_fsblk_t pblock, ext4_lblk_t blk_num,
+                                struct list_blocks_struct *lb)
+{
+        int err;
+        int extends_run = lb->first_pblock &&
+                          (lb->last_pblock + 1 == pblock) &&
+                          (lb->last_block + 1 == blk_num);
+        if (!extends_run) {
+                /* flush the old run, then start over at this block */
+                err = finish_range(handle, inode, lb);
+                lb->first_pblock = pblock;
+                lb->last_pblock = pblock;
+                lb->first_block = blk_num;
+                lb->last_block = blk_num;
+                return err;
+        }
+        /* grow the open run by one block */
+        lb->last_pblock = pblock;
+        lb->last_block = blk_num;
+        return 0;
+}
+/*
+ * Pass every non-zero entry of the indirect block @pblock to
+ * update_extent_range(), advancing the logical block number kept in
+ * *@blk_nump by one per slot.  A zero @pblock only advances the logical
+ * block number by a full block's worth of slots.
+ * Returns 0, -EIO when sb_bread() fails, or the error reported by
+ * update_extent_range().
+ */
+static int update_ind_extent_range(handle_t *handle, struct inode *inode,
+                                   ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
+                                   struct list_blocks_struct *lb)
+{
+        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+        ext4_lblk_t lblk = *blk_nump;
+        struct buffer_head *bh;
+        __le32 *slots;
+        int i, retval = 0;
+        if (!pblock) {
+                /* nothing mapped here, just skip the logical range */
+                *blk_nump += max_entries;
+                return 0;
+        }
+        bh = sb_bread(inode->i_sb, pblock);
+        if (!bh)
+                return -EIO;
+        slots = (__le32 *)bh->b_data;
+        for (i = 0; i < max_entries; i++, lblk++) {
+                if (!slots[i])
+                        continue;
+                retval = update_extent_range(handle, inode,
+                                        le32_to_cpu(slots[i]), lblk, lb);
+                if (retval)
+                        break;
+        }
+        /* hand the advanced logical block number back to the caller */
+        *blk_nump = lblk;
+        put_bh(bh);
+        return retval;
+}
+/*
+ * Walk the double indirect block @pblock: each non-zero entry is handed
+ * to update_ind_extent_range(), each zero entry advances the logical
+ * block number by one indirect block's worth of slots.  A zero @pblock
+ * advances *@blk_nump by max_entries * max_entries.
+ * Returns 0, -EIO when sb_bread() fails, or the error from the level
+ * below.
+ */
+static int update_dind_extent_range(handle_t *handle, struct inode *inode,
+                                    ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
+                                    struct list_blocks_struct *lb)
+{
+        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+        ext4_lblk_t lblk = *blk_nump;
+        struct buffer_head *bh;
+        __le32 *slots;
+        int i, retval = 0;
+        if (!pblock) {
+                /* nothing mapped here, just skip the logical range */
+                *blk_nump += max_entries * max_entries;
+                return 0;
+        }
+        bh = sb_bread(inode->i_sb, pblock);
+        if (!bh)
+                return -EIO;
+        slots = (__le32 *)bh->b_data;
+        for (i = 0; i < max_entries; i++) {
+                if (!slots[i]) {
+                        /* unmapped indirect block: skip its range */
+                        lblk += max_entries;
+                        continue;
+                }
+                retval = update_ind_extent_range(handle, inode,
+                                        le32_to_cpu(slots[i]), &lblk, lb);
+                if (retval)
+                        break;
+        }
+        /* hand the advanced logical block number back to the caller */
+        *blk_nump = lblk;
+        put_bh(bh);
+        return retval;
+}
+/*
+ * Walk the triple indirect block @pblock: each non-zero entry is handed
+ * to update_dind_extent_range(), each zero entry advances the logical
+ * block number by max_entries * max_entries.  A zero @pblock advances
+ * *@blk_nump by max_entries cubed.
+ * Returns 0, -EIO when sb_bread() fails, or the error from the level
+ * below.
+ */
+static int update_tind_extent_range(handle_t *handle, struct inode *inode,
+                                     ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
+                                     struct list_blocks_struct *lb)
+{
+        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+        ext4_lblk_t lblk = *blk_nump;
+        struct buffer_head *bh;
+        __le32 *slots;
+        int i, retval = 0;
+        if (!pblock) {
+                /* nothing mapped here, just skip the logical range */
+                *blk_nump += max_entries * max_entries * max_entries;
+                return 0;
+        }
+        bh = sb_bread(inode->i_sb, pblock);
+        if (!bh)
+                return -EIO;
+        slots = (__le32 *)bh->b_data;
+        for (i = 0; i < max_entries; i++) {
+                if (!slots[i]) {
+                        /* unmapped double indirect block: skip its range */
+                        lblk += max_entries * max_entries;
+                        continue;
+                }
+                retval = update_dind_extent_range(handle, inode,
+                                        le32_to_cpu(slots[i]), &lblk, lb);
+                if (retval)
+                        break;
+        }
+        /* hand the advanced logical block number back to the caller */
+        *blk_nump = lblk;
+        put_bh(bh);
+        return retval;
+}
+/*
+ * Free every block referenced by a non-zero entry of the double indirect
+ * block @i_data, then free that block itself.
+ * Returns 0, or -EIO when the block cannot be read.
+ */
+static int free_dind_blocks(handle_t *handle,
+                                struct inode *inode, __le32 i_data)
+{
+        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+        struct buffer_head *bh;
+        __le32 *slot, *end;
+        bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+        if (!bh)
+                return -EIO;
+        slot = (__le32 *)bh->b_data;
+        end = slot + max_entries;
+        for (; slot < end; slot++) {
+                if (*slot)
+                        ext4_free_blocks(handle, inode,
+                                        le32_to_cpu(*slot), 1, 1);
+        }
+        put_bh(bh);
+        /* finally the double indirect block itself */
+        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+        return 0;
+}
+/*
+ * Run free_dind_blocks() on every non-zero entry of the triple indirect
+ * block @i_data, then free that block itself.  If a lower level fails,
+ * its error is returned and the triple indirect block is left allocated.
+ * Returns 0, -EIO when the block cannot be read, or the lower-level error.
+ */
+static int free_tind_blocks(handle_t *handle,
+                                struct inode *inode, __le32 i_data)
+{
+        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+        struct buffer_head *bh;
+        __le32 *slots;
+        int i, retval = 0;
+        bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+        if (!bh)
+                return -EIO;
+        slots = (__le32 *)bh->b_data;
+        for (i = 0; i < max_entries; i++) {
+                if (!slots[i])
+                        continue;
+                retval = free_dind_blocks(handle, inode, slots[i]);
+                if (retval)
+                        break;
+        }
+        put_bh(bh);
+        if (retval)
+                return retval;
+        /* all children are gone, release the triple indirect block */
+        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+        return 0;
+}
+/*
+ * Free the indirect, double indirect and triple indirect blocks that
+ * @inode's i_data[] points to, in that order, stopping at the first
+ * failure.  Returns 0 or the error from free_dind_blocks() /
+ * free_tind_blocks().
+ */
+static int free_ind_block(handle_t *handle, struct inode *inode)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        int retval = 0;
+        if (ei->i_data[EXT4_IND_BLOCK])
+                ext4_free_blocks(handle, inode,
+                                le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
+        if (ei->i_data[EXT4_DIND_BLOCK])
+                retval = free_dind_blocks(handle, inode,
+                                                ei->i_data[EXT4_DIND_BLOCK]);
+        /* only descend into the triple level if the double level was fine */
+        if (!retval && ei->i_data[EXT4_TIND_BLOCK])
+                retval = free_tind_blocks(handle, inode,
+                                                ei->i_data[EXT4_TIND_BLOCK]);
+        return retval;
+}
+/*
+ * Switch @inode over to the extent map that was built in @tmp_inode:
+ * free @inode's indirect blocks, get one more journal credit, set
+ * EXT4_EXTENTS_FL, copy @tmp_inode's i_data into @inode and add
+ * @tmp_inode's i_blocks to @inode's.
+ *
+ * The incoming value of @retval is not used; it is overwritten at once.
+ * Returns 0 on success or the error from the step that failed.
+ */
+static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
+                                struct inode *tmp_inode, int retval)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
+        retval = free_ind_block(handle, inode);
+        if (retval)
+                return retval;
+        /*
+         * Writing the i_data field of the original inode costs one
+         * credit; restart the handle if it cannot be extended
+         */
+        if (ext4_journal_extend(handle, 1) != 0) {
+                retval = ext4_journal_restart(handle, 1);
+                if (retval)
+                        return retval;
+        }
+        /*
+         * The extent map was built in the tmp inode; mark the original
+         * inode as extent mapped and copy the i_data across
+         */
+        ei->i_flags |= EXT4_EXTENTS_FL;
+        memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
+        /*
+         * Add the blocks that were allocated for extent index blocks
+         * while the extents were being inserted.
+         *
+         * The quota APIs are not involved for the original inode here:
+         * the quota update already happened via tmp_inode.
+         */
+        spin_lock(&inode->i_lock);
+        inode->i_blocks += tmp_inode->i_blocks;
+        spin_unlock(&inode->i_lock);
+        ext4_mark_inode_dirty(handle, inode);
+        return 0;
+}
+/*
+ * Free the extent tree block that index entry @ix points to.  If that
+ * block has a non-zero depth, the blocks below it are freed first by
+ * recursing over its index entries.  The block itself is freed even
+ * when a child reported an error.
+ * Returns 0, -EIO when the block cannot be read, or the first error
+ * from a child.
+ */
+static int free_ext_idx(handle_t *handle, struct inode *inode,
+                                        struct ext4_extent_idx *ix)
+{
+        ext4_fsblk_t block = idx_pblock(ix);
+        struct ext4_extent_header *eh;
+        struct ext4_extent_idx *child;
+        struct buffer_head *bh;
+        int i, retval = 0;
+        bh = sb_bread(inode->i_sb, block);
+        if (!bh)
+                return -EIO;
+        eh = (struct ext4_extent_header *)bh->b_data;
+        if (eh->eh_depth != 0) {
+                child = EXT_FIRST_INDEX(eh);
+                for (i = 0; i < le16_to_cpu(eh->eh_entries); i++) {
+                        retval = free_ext_idx(handle, inode, child + i);
+                        if (retval)
+                                break;
+                }
+        }
+        put_bh(bh);
+        ext4_free_blocks(handle, inode, block, 1, 1);
+        return retval;
+}
+/*
+ * Free the extent meta data blocks only: every block reachable from the
+ * index entries stored in @inode's i_data is released via free_ext_idx().
+ * Returns 0 or the first error from free_ext_idx().
+ */
+static int free_ext_block(handle_t *handle, struct inode *inode)
+{
+        struct ext4_extent_header *eh =
+                (struct ext4_extent_header *)EXT4_I(inode)->i_data;
+        struct ext4_extent_idx *first;
+        int i, retval = 0;
+        /* depth 0: no extra blocks were allocated for extent meta data */
+        if (eh->eh_depth == 0)
+                return 0;
+        first = EXT_FIRST_INDEX(eh);
+        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++) {
+                retval = free_ext_idx(handle, inode, first + i);
+                if (retval)
+                        break;
+        }
+        return retval;
+}
+/*
+ * Convert @inode from indirect block mapping to extent mapping.
+ *
+ * A temporary inode is created and an extent tree describing @inode's
+ * data blocks is built in it by walking @inode's i_data[].  On success
+ * ext4_ext_swap_inode_data() moves that tree into @inode; on failure
+ * free_ext_block() releases the extent blocks built so far.  Either way
+ * the temporary inode is emptied and its link count dropped to zero
+ * before the final iput().
+ *
+ * @filp, @cmd and @arg are not used by this function.
+ *
+ * Returns 0 on success, -EINVAL if the filesystem is mounted without
+ * extents or @inode already uses extents, otherwise the error from the
+ * step that failed.
+ */
+int ext4_ext_migrate(struct inode *inode, struct file *filp,
+                                unsigned int cmd, unsigned long arg)
+{
+        handle_t *handle;
+        int retval = 0, i;
+        __le32 *i_data;
+        ext4_lblk_t blk_count = 0;
+        struct ext4_inode_info *ei;
+        struct inode *tmp_inode = NULL;
+        struct list_blocks_struct lb;
+        unsigned long max_entries;
+        if (!test_opt(inode->i_sb, EXTENTS))
+                /*
+                 * if mounted with noextents we don't allow the migrate
+                 */
+                return -EINVAL;
+        if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+                return -EINVAL;
+        down_write(&EXT4_I(inode)->i_data_sem);
+        handle = ext4_journal_start(inode,
+                                        EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
+                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+                                        2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+                                        + 1);
+        if (IS_ERR(handle)) {
+                /*
+                 * No handle and no tmp_inode yet: skip the err_out
+                 * cleanup, which would dereference both
+                 */
+                retval = PTR_ERR(handle);
+                goto out;
+        }
+        tmp_inode = ext4_new_inode(handle,
+                                inode->i_sb->s_root->d_inode,
+                                S_IFREG);
+        if (IS_ERR(tmp_inode)) {
+                /* report the real error and skip the err_out cleanup */
+                retval = PTR_ERR(tmp_inode);
+                ext4_journal_stop(handle);
+                tmp_inode = NULL;
+                goto out;
+        }
+        i_size_write(tmp_inode, i_size_read(inode));
+        /*
+         * We don't want the inode to be reclaimed
+         * if we got interrupted in between. We have
+         * this tmp inode carrying reference to the
+         * data blocks of the original file. We set
+         * the i_nlink to zero at the last stage after
+         * switching the original file to extent format
+         */
+        tmp_inode->i_nlink = 1;
+        ext4_ext_tree_init(handle, tmp_inode);
+        ext4_orphan_add(handle, tmp_inode);
+        ext4_journal_stop(handle);
+        ei = EXT4_I(inode);
+        i_data = ei->i_data;
+        memset(&lb, 0, sizeof(lb));
+        /* 32 bit block address 4 bytes */
+        max_entries = inode->i_sb->s_blocksize >> 2;
+        /*
+         * start with one credit accounted for
+         * superblock modification.
+         *
+         * For the tmp_inode we already have committed the
+         * transaction that created the inode. Later as and
+         * when we add extents we extend the journal
+         */
+        handle = ext4_journal_start(inode, 1);
+        if (IS_ERR(handle)) {
+                /*
+                 * Nothing was added to tmp_inode's extent tree yet and
+                 * there is no handle for the err_out cleanup.  Empty the
+                 * in-core inode and drop its link count, as the normal
+                 * exit path does, then let iput() dispose of it.
+                 */
+                retval = PTR_ERR(handle);
+                i_size_write(tmp_inode, 0);
+                tmp_inode->i_nlink = 0;
+                goto out;
+        }
+        for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
+                if (i_data[i]) {
+                        retval = update_extent_range(handle, tmp_inode,
+                                                le32_to_cpu(i_data[i]),
+                                                blk_count, &lb);
+                        if (retval)
+                                goto err_out;
+                }
+        }
+        if (i_data[EXT4_IND_BLOCK]) {
+                retval = update_ind_extent_range(handle, tmp_inode,
+                                        le32_to_cpu(i_data[EXT4_IND_BLOCK]),
+                                        &blk_count, &lb);
+                if (retval)
+                        goto err_out;
+        } else
+                blk_count +=  max_entries;
+        if (i_data[EXT4_DIND_BLOCK]) {
+                retval = update_dind_extent_range(handle, tmp_inode,
+                                        le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
+                                        &blk_count, &lb);
+                if (retval)
+                        goto err_out;
+        } else
+                blk_count += max_entries * max_entries;
+        if (i_data[EXT4_TIND_BLOCK]) {
+                retval = update_tind_extent_range(handle, tmp_inode,
+                                        le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
+                                        &blk_count, &lb);
+                if (retval)
+                        goto err_out;
+        }
+        /*
+         * Build the last extent
+         */
+        retval = finish_range(handle, tmp_inode, &lb);
+err_out:
+        /*
+         * Reached only with a running handle and a valid tmp_inode.
+         *
+         * We are either freeing extent information or indirect
+         * blocks. During this we touch superblock, group descriptor
+         * and block bitmap. Later we mark the tmp_inode dirty
+         * via ext4_ext_tree_init. So allocate a credit of 4
+         * We may update quota (user and group).
+         *
+         * FIXME!! we may be touching bitmaps in different block groups.
+         */
+        if (ext4_journal_extend(handle,
+                        4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0)
+                ext4_journal_restart(handle,
+                                4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+        if (retval)
+                /*
+                 * Failure case delete the extent information with the
+                 * tmp_inode
+                 */
+                free_ext_block(handle, tmp_inode);
+        else
+                retval = ext4_ext_swap_inode_data(handle, inode,
+                                                        tmp_inode, retval);
+        /*
+         * Mark the tmp_inode as of size zero
+         */
+        i_size_write(tmp_inode, 0);
+        /*
+         * set the  i_blocks count to zero
+         * so that the ext4_delete_inode does the
+         * right job
+         *
+         * We don't need to take the i_lock because
+         * the inode is not visible to user space.
+         */
+        tmp_inode->i_blocks = 0;
+        /* Reset the extent details */
+        ext4_ext_tree_init(handle, tmp_inode);
+        /*
+         * Set the i_nlink to zero so that
+         * generic_drop_inode really deletes the
+         * inode
+         */
+        tmp_inode->i_nlink = 0;
+        ext4_journal_stop(handle);
+out:
+        up_write(&EXT4_I(inode)->i_data_sem);
+        if (tmp_inode)
+                iput(tmp_inode);
+        return retval;
+}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 94ee6f315dc1..67b6d8a1ceff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -51,7 +51,7 @@
 static struct buffer_head *ext4_append(handle_t *handle,
                                        struct inode *inode,
-                                        u32 *block, int *err)
+                                        ext4_lblk_t *block, int *err)
 {
        struct buffer_head *bh;
@@ -144,8 +144,8 @@ struct dx_map_entry
        u16 size;
 };
-static inline unsigned dx_get_block (struct dx_entry *entry);
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
-static void dx_set_block (struct dx_entry *entry, unsigned value);
+static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
 static inline unsigned dx_get_hash (struct dx_entry *entry);
 static void dx_set_hash (struct dx_entry *entry, unsigned value);
 static unsigned dx_get_count (struct dx_entry *entries);
@@ -166,7 +166,8 @@ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
                struct dx_map_entry *offsets, int count);
 static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+static void dx_insert_block(struct dx_frame *frame,
+                                        u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                                 struct dx_frame *frame,
                                 struct dx_frame *frames,
@@ -181,12 +182,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 * Mask them off for now.
 */
-static inline unsigned dx_get_block (struct dx_entry *entry)
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
 {
        return le32_to_cpu(entry->block) & 0x00ffffff;
 }
-static inline void dx_set_block (struct dx_entry *entry, unsigned value)
+static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
 {
        entry->block = cpu_to_le32(value);
 }
@@ -243,8 +244,8 @@ static void dx_show_index (char * label, struct dx_entry *entries)
        int i, n = dx_get_count (entries);
        printk("%s index ", label);
        for (i = 0; i < n; i++) {
-                printk("%x->%u ", i? dx_get_hash(entries + i) :
+                printk("%x->%lu ", i? dx_get_hash(entries + i) :
-                                0, dx_get_block(entries + i));
+                                0, (unsigned long)dx_get_block(entries + i));
        }
        printk("\n");
 }
@@ -280,7 +281,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
                        space += EXT4_DIR_REC_LEN(de->name_len);
                        names++;
                }
-                de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+                de = ext4_next_entry(de);
        }
        printk("(%i)\n", names);
        return (struct stats) { names, space, 1 };
@@ -297,7 +298,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
        printk("%i indexed blocks...\n", count);
        for (i = 0; i < count; i++, entries++)
        {
-                u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
+                ext4_lblk_t block = dx_get_block(entries);
+                ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
                u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
                struct stats stats;
                printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
@@ -551,7 +553,8 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 */
 static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
 {
-        return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
+        return (struct ext4_dir_entry_2 *)((char *)p +
+                ext4_rec_len_from_disk(p->rec_len));
 }
 /*
@@ -560,7 +563,7 @@ static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *
 * into the tree.  If there is an error it is returned in err.
 */
 static int htree_dirblock_to_tree(struct file *dir_file,
-                                  struct inode *dir, int block,
+                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash)
 {
@@ -568,7 +571,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
        struct ext4_dir_entry_2 *de, *top;
        int err, count = 0;
-        dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
+        dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
+                                                        (unsigned long)block));
        if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
                return err;
@@ -620,9 +624,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        struct ext4_dir_entry_2 *de;
        struct dx_frame frames[2], *frame;
        struct inode *dir;
-        int block, err;
+        ext4_lblk_t block;
        int count = 0;
-        int ret;
+        int ret, err;
        __u32 hashval;
        dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
@@ -720,7 +724,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
                        cond_resched();
                }
                /* XXX: do we need to check rec_len == 0 case? -Chris */
-                de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+                de = ext4_next_entry(de);
        }
        return count;
 }
@@ -752,7 +756,7 @@ static void dx_sort_map (struct dx_map_entry *map, unsigned count)
        } while(more);
 }
-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
+static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
 {
        struct dx_entry *entries = frame->entries;
        struct dx_entry *old = frame->at, *new = old + 1;
@@ -820,7 +824,7 @@ static inline int search_dirblock(struct buffer_head * bh,
                        return 1;
                }
                /* prevent looping on a bad block */
-                de_len = le16_to_cpu(de->rec_len);
+                de_len = ext4_rec_len_from_disk(de->rec_len);
                if (de_len <= 0)
                        return -1;
                offset += de_len;
@@ -847,23 +851,20 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
        struct super_block * sb;
        struct buffer_head * bh_use[NAMEI_RA_SIZE];
        struct buffer_head * bh, *ret = NULL;
-        unsigned long start, block, b;
+        ext4_lblk_t start, block, b;
        int ra_max = 0;         /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        int ra_ptr = 0;         /* Current index into readahead
                                   buffer */
        int num = 0;
-        int nblocks, i, err;
+        ext4_lblk_t  nblocks;
+        int i, err;
        struct inode *dir = dentry->d_parent->d_inode;
        int namelen;
-        const u8 *name;
-        unsigned blocksize;
        *res_dir = NULL;
        sb = dir->i_sb;
-        blocksize = sb->s_blocksize;
        namelen = dentry->d_name.len;
-        name = dentry->d_name.name;
        if (namelen > EXT4_NAME_LEN)
                return NULL;
        if (is_dx(dir)) {
@@ -914,7 +915,8 @@ restart:
                if (!buffer_uptodate(bh)) {
                        /* read error, skip block & hope for the best */
                        ext4_error(sb, __FUNCTION__, "reading directory #%lu "
-                                   "offset %lu", dir->i_ino, block);
+                                   "offset %lu", dir->i_ino,
+                                   (unsigned long)block);
                        brelse(bh);
                        goto next;
                }
@@ -961,7 +963,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
        struct dx_frame frames[2], *frame;
        struct ext4_dir_entry_2 *de, *top;
        struct buffer_head *bh;
-        unsigned long block;
+        ext4_lblk_t block;
        int retval;
        int namelen = dentry->d_name.len;
        const u8 *name = dentry->d_name.name;
@@ -1128,7 +1130,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
                ((struct ext4_dir_entry_2 *) to)->rec_len =
-                                cpu_to_le16(rec_len);
+                                ext4_rec_len_to_disk(rec_len);
                de->inode = 0;
                map++;
                to += rec_len;
@@ -1147,13 +1149,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
        prev = to = de;
        while ((char*)de < base + size) {
-                next = (struct ext4_dir_entry_2 *) ((char *) de +
+                next = ext4_next_entry(de);
-                                                    le16_to_cpu(de->rec_len));
                if (de->inode && de->name_len) {
                        rec_len = EXT4_DIR_REC_LEN(de->name_len);
                        if (de > to)
                                memmove(to, de, rec_len);
-                        to->rec_len = cpu_to_le16(rec_len);
+                        to->rec_len = ext4_rec_len_to_disk(rec_len);
                        prev = to;
                        to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
                }
@@ -1174,7 +1175,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        unsigned blocksize = dir->i_sb->s_blocksize;
        unsigned count, continued;
        struct buffer_head *bh2;
-        u32 newblock;
+        ext4_lblk_t newblock;
        u32 hash2;
        struct dx_map_entry *map;
        char *data1 = (*bh)->b_data, *data2;
@@ -1221,14 +1222,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        split = count - move;
        hash2 = map[split].hash;
        continued = hash2 == map[split - 1].hash;
-        dxtrace(printk("Split block %i at %x, %i/%i\n",
+        dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
-                dx_get_block(frame->at), hash2, split, count-split));
+                        (unsigned long)dx_get_block(frame->at),
+                                        hash2, split, count-split));
        /* Fancy dance to stay within two buffers */
        de2 = dx_move_dirents(data1, data2, map + split, count - split);
        de = dx_pack_dirents(data1,blocksize);
-        de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+        de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
-        de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+        de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
@@ -1297,7 +1299,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                                return -EEXIST;
                        }
                        nlen = EXT4_DIR_REC_LEN(de->name_len);
-                        rlen = le16_to_cpu(de->rec_len);
+                        rlen = ext4_rec_len_from_disk(de->rec_len);
                        if ((de->inode? rlen - nlen: rlen) >= reclen)
                                break;
                        de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1316,11 +1318,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        /* By now the buffer is marked for journaling */
        nlen = EXT4_DIR_REC_LEN(de->name_len);
-        rlen = le16_to_cpu(de->rec_len);
+        rlen = ext4_rec_len_from_disk(de->rec_len);
        if (de->inode) {
                struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-                de1->rec_len = cpu_to_le16(rlen - nlen);
+                de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
-                de->rec_len = cpu_to_le16(nlen);
+                de->rec_len = ext4_rec_len_to_disk(nlen);
                de = de1;
        }
        de->file_type = EXT4_FT_UNKNOWN;
@@ -1374,7 +1376,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        int             retval;
        unsigned        blocksize;
        struct dx_hash_info hinfo;
-        u32             block;
+        ext4_lblk_t  block;
        struct fake_dirent *fde;
        blocksize =  dir->i_sb->s_blocksize;
@@ -1397,17 +1399,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        /* The 0th block becomes the root, move the dirents out */
        fde = &root->dotdot;
-        de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
+        de = (struct ext4_dir_entry_2 *)((char *)fde +
+                ext4_rec_len_from_disk(fde->rec_len));
        len = ((char *) root) + blocksize - (char *) de;
        memcpy (data1, de, len);
        de = (struct ext4_dir_entry_2 *) data1;
        top = data1 + len;
-        while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
+        while ((char *)(de2 = ext4_next_entry(de)) < top)
                de = de2;
-        de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+        de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
        /* Initialize the root; the dot dirents already exist */
        de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-        de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
+        de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
        memset (&root->info, 0, sizeof(root->info));
        root->info.info_length = sizeof(root->info);
        root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1454,7 +1457,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
        int     retval;
        int     dx_fallback=0;
        unsigned blocksize;
-        u32 block, blocks;
+        ext4_lblk_t block, blocks;
        sb = dir->i_sb;
        blocksize = sb->s_blocksize;
@@ -1487,7 +1490,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
                return retval;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
-        de->rec_len = cpu_to_le16(blocksize);
+        de->rec_len = ext4_rec_len_to_disk(blocksize);
        return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
@@ -1531,7 +1534,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                       dx_get_count(entries), dx_get_limit(entries)));
        /* Need to split index? */
        if (dx_get_count(entries) == dx_get_limit(entries)) {
-                u32 newblock;
+                ext4_lblk_t newblock;
                unsigned icount = dx_get_count(entries);
                int levels = frame - frames;
                struct dx_entry *entries2;
@@ -1550,7 +1553,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        goto cleanup;
                node2 = (struct dx_node *)(bh2->b_data);
                entries2 = node2->entries;
-                node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+                node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
                node2->fake.inode = 0;
                BUFFER_TRACE(frame->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1648,9 +1651,9 @@ static int ext4_delete_entry (handle_t *handle,
                        BUFFER_TRACE(bh, "get_write_access");
                        ext4_journal_get_write_access(handle, bh);
                        if (pde)
-                                pde->rec_len =
+                                pde->rec_len = ext4_rec_len_to_disk(
-                                        cpu_to_le16(le16_to_cpu(pde->rec_len) +
+                                        ext4_rec_len_from_disk(pde->rec_len) +
-                                                    le16_to_cpu(de->rec_len));
+                                        ext4_rec_len_from_disk(de->rec_len));
                        else
                                de->inode = 0;
                        dir->i_version++;
@@ -1658,10 +1661,9 @@ static int ext4_delete_entry (handle_t *handle,
                        ext4_journal_dirty_metadata(handle, bh);
                        return 0;
                }
-                i += le16_to_cpu(de->rec_len);
+                i += ext4_rec_len_from_disk(de->rec_len);
                pde = de;
-                de = (struct ext4_dir_entry_2 *)
+                de = ext4_next_entry(de);
-                        ((char *) de + le16_to_cpu(de->rec_len));
        }
        return -ENOENT;
 }
@@ -1824,13 +1826,13 @@ retry:
        de = (struct ext4_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
-        de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
+        de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
        strcpy (de->name, ".");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-        de = (struct ext4_dir_entry_2 *)
+        de = ext4_next_entry(de);
-                        ((char *) de + le16_to_cpu(de->rec_len));
        de->inode = cpu_to_le32(dir->i_ino);
-        de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
+        de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
+                                                EXT4_DIR_REC_LEN(1));
        de->name_len = 2;
        strcpy (de->name, "..");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1882,8 +1884,7 @@ static int empty_dir (struct inode * inode)
                return 1;
        }
        de = (struct ext4_dir_entry_2 *) bh->b_data;
-        de1 = (struct ext4_dir_entry_2 *)
+        de1 = ext4_next_entry(de);
-                        ((char *) de + le16_to_cpu(de->rec_len));
        if (le32_to_cpu(de->inode) != inode->i_ino ||
                        !le32_to_cpu(de1->inode) ||
                        strcmp (".", de->name) ||
@@ -1894,9 +1895,9 @@ static int empty_dir (struct inode * inode)
                brelse (bh);
                return 1;
        }
-        offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+        offset = ext4_rec_len_from_disk(de->rec_len) +
-        de = (struct ext4_dir_entry_2 *)
+                 ext4_rec_len_from_disk(de1->rec_len);
-                        ((char *) de1 + le16_to_cpu(de1->rec_len));
+        de = ext4_next_entry(de1);
        while (offset < inode->i_size ) {
                if (!bh ||
                        (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1925,9 +1926,8 @@ static int empty_dir (struct inode * inode)
                        brelse (bh);
                        return 0;
                }
-                offset += le16_to_cpu(de->rec_len);
+                offset += ext4_rec_len_from_disk(de->rec_len);
-                de = (struct ext4_dir_entry_2 *)
+                de = ext4_next_entry(de);
-                                ((char *) de + le16_to_cpu(de->rec_len));
        }
        brelse (bh);
        return 1;
@@ -2282,8 +2282,7 @@ retry:
 }
 #define PARENT_INO(buffer) \
-        ((struct ext4_dir_entry_2 *) ((char *) buffer + \
+        (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
-        le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
 /*
 * Anybody can rename anything with this: the permission checks are left to the
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bd8a52bb3999..4fbba60816f4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -28,7 +28,7 @@ static int verify_group_input(struct super_block *sb,
        struct ext4_super_block *es = sbi->s_es;
        ext4_fsblk_t start = ext4_blocks_count(es);
        ext4_fsblk_t end = start + input->blocks_count;
-        unsigned group = input->group;
+        ext4_group_t group = input->group;
        ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
        unsigned overhead = ext4_bg_has_super(sb, group) ?
                (1 + ext4_bg_num_gdb(sb, group) +
@@ -206,7 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb,
        }
        if (ext4_bg_has_super(sb, input->group)) {
-                ext4_debug("mark backup superblock %#04lx (+0)\n", start);
+                ext4_debug("mark backup superblock %#04llx (+0)\n", start);
                ext4_set_bit(0, bh->b_data);
        }
@@ -215,7 +215,7 @@ static int setup_new_group_blocks(struct super_block *sb,
             i < gdblocks; i++, block++, bit++) {
                struct buffer_head *gdb;
-                ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
+                ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
                if ((err = extend_or_restart_transaction(handle, 1, bh)))
                        goto exit_bh;
@@ -243,7 +243,7 @@ static int setup_new_group_blocks(struct super_block *sb,
             i < reserved_gdb; i++, block++, bit++) {
                struct buffer_head *gdb;
-                ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
+                ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
                if ((err = extend_or_restart_transaction(handle, 1, bh)))
                        goto exit_bh;
@@ -256,10 +256,10 @@ static int setup_new_group_blocks(struct super_block *sb,
                ext4_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
-        ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
+        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
        ext4_set_bit(input->block_bitmap - start, bh->b_data);
-        ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
+        ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
                   input->inode_bitmap - start);
        ext4_set_bit(input->inode_bitmap - start, bh->b_data);
@@ -268,7 +268,7 @@ static int setup_new_group_blocks(struct super_block *sb,
             i < sbi->s_itb_per_group; i++, bit++, block++) {
                struct buffer_head *it;
-                ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
+                ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
                if ((err = extend_or_restart_transaction(handle, 1, bh)))
                        goto exit_bh;
@@ -291,7 +291,7 @@ static int setup_new_group_blocks(struct super_block *sb,
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
-        ext4_debug("clear inode bitmap %#04x (+%ld)\n",
+        ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
                   input->inode_bitmap, input->inode_bitmap - start);
        if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
                err = PTR_ERR(bh);
@@ -357,7 +357,7 @@ static int verify_reserved_gdb(struct super_block *sb,
                               struct buffer_head *primary)
 {
        const ext4_fsblk_t blk = primary->b_blocknr;
-        const unsigned long end = EXT4_SB(sb)->s_groups_count;
+        const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
        unsigned three = 1;
        unsigned five = 5;
        unsigned seven = 7;
@@ -656,12 +656,12 @@ static void update_backups(struct super_block *sb,
                           int blk_off, char *data, int size)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        const unsigned long last = sbi->s_groups_count;
+        const ext4_group_t last = sbi->s_groups_count;
        const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
        unsigned three = 1;
        unsigned five = 5;
        unsigned seven = 7;
-        unsigned group;
+        ext4_group_t group;
        int rest = sb->s_blocksize - size;
        handle_t *handle;
        int err = 0, err2;
@@ -716,7 +716,7 @@ static void update_backups(struct super_block *sb,
 exit_err:
        if (err) {
                ext4_warning(sb, __FUNCTION__,
-                             "can't update backup for group %d (err %d), "
+                             "can't update backup for group %lu (err %d), "
                             "forcing fsck on next reboot", group, err);
                sbi->s_mount_state &= ~EXT4_VALID_FS;
                sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -952,7 +952,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                      ext4_fsblk_t n_blocks_count)
 {
        ext4_fsblk_t o_blocks_count;
-        unsigned long o_groups_count;
+        ext4_group_t o_groups_count;
        ext4_grpblk_t last;
        ext4_grpblk_t add;
        struct buffer_head * bh;
@@ -1054,7 +1054,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
        sb->s_dirt = 1;
        unlock_super(sb);
-        ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count,
+        ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
        ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
        ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8031dc0e24e5..055a0cd0168e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -373,6 +373,66 @@ void ext4_update_dynamic_rev(struct super_block *sb)
         */
 }
+/*
+ * Set the COMPAT feature bit(s) @compat on @sb if not already set.
+ *
+ * When the feature is missing, write access to EXT4_SB(sb)->s_sbh is
+ * requested through @handle, the feature is set, sb->s_dirt and
+ * handle->h_sync are set to 1, and the buffer is passed to
+ * ext4_journal_dirty_metadata().
+ *
+ * Returns 0 if the feature was already present, otherwise the result of
+ * ext4_journal_get_write_access() (on failure) or of
+ * ext4_journal_dirty_metadata().
+ */
+int ext4_update_compat_feature(handle_t *handle,
+                                        struct super_block *sb, __u32 compat)
+{
+        int err = 0;
+        if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
+                err = ext4_journal_get_write_access(handle,
+                                EXT4_SB(sb)->s_sbh);
+                if (err)
+                        return err;
+                EXT4_SET_COMPAT_FEATURE(sb, compat);
+                sb->s_dirt = 1;
+                handle->h_sync = 1;
+                BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
+                                        "call ext4_journal_dirty_metadata");
+                err = ext4_journal_dirty_metadata(handle,
+                                EXT4_SB(sb)->s_sbh);
+        }
+        return err;
+}
+/*
+ * Set the RO_COMPAT feature bit(s) @rocompat on @sb if not already set.
+ *
+ * When the feature is missing, write access to EXT4_SB(sb)->s_sbh is
+ * requested through @handle, the feature is set, sb->s_dirt and
+ * handle->h_sync are set to 1, and the buffer is passed to
+ * ext4_journal_dirty_metadata().
+ *
+ * Returns 0 if the feature was already present, otherwise the result of
+ * ext4_journal_get_write_access() (on failure) or of
+ * ext4_journal_dirty_metadata().
+ */
+int ext4_update_rocompat_feature(handle_t *handle,
+                                        struct super_block *sb, __u32 rocompat)
+{
+        int err = 0;
+        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
+                err = ext4_journal_get_write_access(handle,
+                                EXT4_SB(sb)->s_sbh);
+                if (err)
+                        return err;
+                EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
+                sb->s_dirt = 1;
+                handle->h_sync = 1;
+                BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
+                                        "call ext4_journal_dirty_metadata");
+                err = ext4_journal_dirty_metadata(handle,
+                                EXT4_SB(sb)->s_sbh);
+        }
+        return err;
+}
+/*
+ * Set the INCOMPAT feature bit(s) @incompat on @sb if not already set.
+ *
+ * When the feature is missing, write access to EXT4_SB(sb)->s_sbh is
+ * requested through @handle, the feature is set, sb->s_dirt and
+ * handle->h_sync are set to 1, and the buffer is passed to
+ * ext4_journal_dirty_metadata().
+ *
+ * Returns 0 if the feature was already present, otherwise the result of
+ * ext4_journal_get_write_access() (on failure) or of
+ * ext4_journal_dirty_metadata().
+ */
+int ext4_update_incompat_feature(handle_t *handle,
+                                        struct super_block *sb, __u32 incompat)
+{
+        int err = 0;
+        if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
+                err = ext4_journal_get_write_access(handle,
+                                EXT4_SB(sb)->s_sbh);
+                if (err)
+                        return err;
+                EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
+                sb->s_dirt = 1;
+                handle->h_sync = 1;
+                BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
+                                        "call ext4_journal_dirty_metadata");
+                err = ext4_journal_dirty_metadata(handle,
+                                EXT4_SB(sb)->s_sbh);
+        }
+        return err;
+}
 /*
 * Open the external journal device
 */
@@ -443,6 +503,7 @@ static void ext4_put_super (struct super_block * sb)
        struct ext4_super_block *es = sbi->s_es;
        int i;
+        ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
        jbd2_journal_destroy(sbi->s_journal);
@@ -509,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_block_alloc_info = NULL;
        ei->vfs_inode.i_version = 1;
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+        INIT_LIST_HEAD(&ei->i_prealloc_list);
+        spin_lock_init(&ei->i_prealloc_lock);
        return &ei->vfs_inode;
 }
@@ -533,7 +596,7 @@ static void init_once(struct kmem_cache *cachep, void *foo)
 #ifdef CONFIG_EXT4DEV_FS_XATTR
        init_rwsem(&ei->xattr_sem);
 #endif
-        mutex_init(&ei->truncate_mutex);
+        init_rwsem(&ei->i_data_sem);
        inode_init_once(&ei->vfs_inode);
 }
@@ -605,18 +668,20 @@ static inline void ext4_show_quota_options(struct seq_file *seq, struct super_bl
 */
 static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
+        int def_errors;
+        unsigned long def_mount_opts;
        struct super_block *sb = vfs->mnt_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
-        unsigned long def_mount_opts;
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+        def_errors     = le16_to_cpu(es->s_errors);
        if (sbi->s_sb_block != 1)
                seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
        if (test_opt(sb, MINIX_DF))
                seq_puts(seq, ",minixdf");
-        if (test_opt(sb, GRPID))
+        if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
                seq_puts(seq, ",grpid");
        if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
                seq_puts(seq, ",nogrpid");
@@ -628,34 +693,33 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
            le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
                seq_printf(seq, ",resgid=%u", sbi->s_resgid);
        }
-        if (test_opt(sb, ERRORS_CONT)) {
+        if (test_opt(sb, ERRORS_RO)) {
-                int def_errors = le16_to_cpu(es->s_errors);
                if (def_errors == EXT4_ERRORS_PANIC ||
-                    def_errors == EXT4_ERRORS_RO) {
+                    def_errors == EXT4_ERRORS_CONTINUE) {
-                        seq_puts(seq, ",errors=continue");
+                        seq_puts(seq, ",errors=remount-ro");
                }
        }
-        if (test_opt(sb, ERRORS_RO))
+        if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
-                seq_puts(seq, ",errors=remount-ro");
+                seq_puts(seq, ",errors=continue");
-        if (test_opt(sb, ERRORS_PANIC))
+        if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
                seq_puts(seq, ",errors=panic");
-        if (test_opt(sb, NO_UID32))
+        if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
                seq_puts(seq, ",nouid32");
-        if (test_opt(sb, DEBUG))
+        if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
                seq_puts(seq, ",debug");
        if (test_opt(sb, OLDALLOC))
                seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4_FS_XATTR
+#ifdef CONFIG_EXT4DEV_FS_XATTR
-        if (test_opt(sb, XATTR_USER))
+        if (test_opt(sb, XATTR_USER) &&
+                !(def_mount_opts & EXT4_DEFM_XATTR_USER))
                seq_puts(seq, ",user_xattr");
        if (!test_opt(sb, XATTR_USER) &&
            (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
                seq_puts(seq, ",nouser_xattr");
        }
 #endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
-        if (test_opt(sb, POSIX_ACL))
+        if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
                seq_puts(seq, ",acl");
        if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
                seq_puts(seq, ",noacl");
@@ -672,7 +736,17 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",nobh");
        if (!test_opt(sb, EXTENTS))
                seq_puts(seq, ",noextents");
+        if (!test_opt(sb, MBALLOC))
+                seq_puts(seq, ",nomballoc");
+        if (test_opt(sb, I_VERSION))
+                seq_puts(seq, ",i_version");
+        if (sbi->s_stripe)
+                seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
+        /*
+         * The journal mode gets enabled in different ways,
+         * so just print the value even if we didn't specify it.
+         */
        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
                seq_puts(seq, ",data=journal");
        else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
@@ -681,7 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",data=writeback");
        ext4_show_quota_options(seq, sb);
        return 0;
 }
@@ -809,11 +882,13 @@ enum {
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
        Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-        Opt_grpquota, Opt_extents, Opt_noextents,
+        Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+        Opt_mballoc, Opt_nomballoc, Opt_stripe,
 };
 static match_table_t tokens = {
@@ -848,6 +923,8 @@ static match_table_t tokens = {
        {Opt_journal_update, "journal=update"},
        {Opt_journal_inum, "journal=%u"},
        {Opt_journal_dev, "journal_dev=%u"},
+        {Opt_journal_checksum, "journal_checksum"},
+        {Opt_journal_async_commit, "journal_async_commit"},
        {Opt_abort, "abort"},
        {Opt_data_journal, "data=journal"},
        {Opt_data_ordered, "data=ordered"},
@@ -865,6 +942,10 @@ static match_table_t tokens = {
        {Opt_barrier, "barrier=%u"},
        {Opt_extents, "extents"},
        {Opt_noextents, "noextents"},
+        {Opt_i_version, "i_version"},
+        {Opt_mballoc, "mballoc"},
+        {Opt_nomballoc, "nomballoc"},
+        {Opt_stripe, "stripe=%u"},
        {Opt_err, NULL},
        {Opt_resize, "resize"},
 };
@@ -1035,6 +1116,13 @@ static int parse_options (char *options, struct super_block *sb,
                                return 0;
                        *journal_devnum = option;
                        break;
+                case Opt_journal_checksum:
+                        set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+                        break;
+                case Opt_journal_async_commit:
+                        set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
+                        set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+                        break;
                case Opt_noload:
                        set_opt (sbi->s_mount_opt, NOLOAD);
                        break;
@@ -1203,6 +1291,23 @@ clear_qf_name:
                case Opt_noextents:
                        clear_opt (sbi->s_mount_opt, EXTENTS);
                        break;
+                case Opt_i_version:
+                        set_opt(sbi->s_mount_opt, I_VERSION);
+                        sb->s_flags |= MS_I_VERSION;
+                        break;
+                case Opt_mballoc:
+                        set_opt(sbi->s_mount_opt, MBALLOC);
+                        break;
+                case Opt_nomballoc:
+                        clear_opt(sbi->s_mount_opt, MBALLOC);
+                        break;
+                case Opt_stripe:
+                        if (match_int(&args[0], &option))
+                                return 0;
+                        if (option < 0)
+                                return 0;
+                        sbi->s_stripe = option;
+                        break;
                default:
                        printk (KERN_ERR
                                "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1364,7 +1469,7 @@ static int ext4_check_descriptors (struct super_block * sb)
        struct ext4_group_desc * gdp = NULL;
        int desc_block = 0;
        int flexbg_flag = 0;
-        int i;
+        ext4_group_t i;
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                flexbg_flag = 1;
@@ -1386,7 +1491,7 @@ static int ext4_check_descriptors (struct super_block * sb)
                if (block_bitmap < first_block || block_bitmap > last_block)
                {
                        ext4_error (sb, "ext4_check_descriptors",
-                                    "Block bitmap for group %d"
+                                    "Block bitmap for group %lu"
                                    " not in group (block %llu)!",
                                    i, block_bitmap);
                        return 0;
@@ -1395,7 +1500,7 @@ static int ext4_check_descriptors (struct super_block * sb)
                if (inode_bitmap < first_block || inode_bitmap > last_block)
                {
                        ext4_error (sb, "ext4_check_descriptors",
-                                    "Inode bitmap for group %d"
+                                    "Inode bitmap for group %lu"
                                    " not in group (block %llu)!",
                                    i, inode_bitmap);
                        return 0;
@@ -1405,17 +1510,16 @@ static int ext4_check_descriptors (struct super_block * sb)
                    inode_table + sbi->s_itb_per_group - 1 > last_block)
                {
                        ext4_error (sb, "ext4_check_descriptors",
-                                    "Inode table for group %d"
+                                    "Inode table for group %lu"
                                    " not in group (block %llu)!",
                                    i, inode_table);
                        return 0;
                }
                if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
                        ext4_error(sb, __FUNCTION__,
-                                   "Checksum for group %d failed (%u!=%u)\n", i,
+                                   "Checksum for group %lu failed (%u!=%u)\n",
-                                   le16_to_cpu(ext4_group_desc_csum(sbi, i,
+                                    i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
-                                                                    gdp)),
+                                    gdp)), le16_to_cpu(gdp->bg_checksum));
-                                   le16_to_cpu(gdp->bg_checksum));
                        return 0;
                }
                if (!flexbg_flag)
@@ -1429,7 +1533,6 @@ static int ext4_check_descriptors (struct super_block * sb)
        return 1;
 }
 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
 * the superblock) which were deleted from all directories, but held open by
 * a process at the time of a crash.  We walk the list and try to delete these
@@ -1542,20 +1645,95 @@ static void ext4_orphan_cleanup (struct super_block * sb,
 #endif
        sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
+/*
+ * Maximal extent format file size.
+ * Resulting logical blkno at s_maxbytes must fit in our on-disk
+ * extent format containers, within a sector_t, and within i_blocks
+ * in the vfs.  ext4 inode has 48 bits of i_blocks in fsblock units,
+ * so that won't be a limiting factor.
+ *
+ * Note, this does *not* consider any metadata overhead for vfs i_blocks.
+ */
+static loff_t ext4_max_size(int blkbits)
+{
+        loff_t res;
+        loff_t upper_limit = MAX_LFS_FILESIZE;
+        /* small i_blocks in vfs inode? */
+        if (sizeof(blkcnt_t) < sizeof(u64)) {
+                /*
+                 * If CONFIG_LSF is not enabled, the vfs inode
+                 * i_blocks counts blocks in 512-byte units;
+                 * 32 == size of vfs inode i_blocks * 8
+                 */
+                upper_limit = (1LL << 32) - 1;
+                /* total blocks in file system block size */
+                upper_limit >>= (blkbits - 9);
+                upper_limit <<= blkbits;
+        }
+        /* 32-bit extent-start container, ee_block */
+        res = 1LL << 32;
+        res <<= blkbits;
+        res -= 1;
+        /* Sanity check against vm- & vfs- imposed limits */
+        if (res > upper_limit)
+                res = upper_limit;
+        return res;
+}
 /*
- * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+ * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
- * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
+ * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
- * We need to be 1 filesystem block less than the 2^32 sector limit.
+ * We need to be 1 filesystem block less than the 2^48 sector limit.
 */
-static loff_t ext4_max_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits)
 {
        loff_t res = EXT4_NDIR_BLOCKS;
-        /* This constant is calculated to be the largest file size for a
+        int meta_blocks;
-         * dense, 4k-blocksize file such that the total number of
+        loff_t upper_limit;
+        /* This is calculated to be the largest file size for a
+         * dense, bitmapped file such that the total number of
         * sectors in the file, including data and all indirect blocks,
-         * does not exceed 2^32. */
+         * does not exceed 2^48 -1
-        const loff_t upper_limit = 0x1ff7fffd000LL;
+         * __u32 i_blocks_lo and __u16 i_blocks_high representing the
+         * total number of 512-byte blocks of the file
+         */
+        if (sizeof(blkcnt_t) < sizeof(u64)) {
+                /*
+                 * If CONFIG_LSF is not enabled, the vfs inode
+                 * i_blocks counts blocks in 512-byte units;
+                 * 32 == size of vfs inode i_blocks * 8
+                 */
+                upper_limit = (1LL << 32) - 1;
+                /* total blocks in file system block size */
+                upper_limit >>= (bits - 9);
+        } else {
+                /*
+                 * We use the 48-bit ext4_inode i_blocks.
+                 * With EXT4_HUGE_FILE_FL set, i_blocks
+                 * represents the total number of blocks in
+                 * units of the file system block size.
+                 */
+                upper_limit = (1LL << 48) - 1;
+        }
+        /* indirect blocks */
+        meta_blocks = 1;
+        /* double indirect blocks */
+        meta_blocks += 1 + (1LL << (bits-2));
+        /* triple indirect blocks */
+        meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+        upper_limit -= meta_blocks;
+        upper_limit <<= bits;
        res += 1LL << (bits-2);
        res += 1LL << (2*(bits-2));
@@ -1563,6 +1741,10 @@ static loff_t ext4_max_size(int bits)
        res <<= bits;
        if (res > upper_limit)
                res = upper_limit;
+        if (res > MAX_LFS_FILESIZE)
+                res = MAX_LFS_FILESIZE;
        return res;
 }
@@ -1570,7 +1752,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
                                ext4_fsblk_t logical_sb_block, int nr)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        unsigned long bg, first_meta_bg;
+        ext4_group_t bg, first_meta_bg;
        int has_super = 0;
        first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
@@ -1584,8 +1766,39 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
        return (has_super + ext4_group_first_block_no(sb, bg));
 }
+/**
+ * ext4_get_stripe_size: Get the stripe size.
+ * @sbi: In memory super block info
+ *
+ * If a stripe size was specified via mount option and it is not
+ * greater than the blocks per group, use the mount option value.
+ * Otherwise use the super block's RAID stripe width, or failing that
+ * its RAID stride, if that value is not greater than blocks per group;
+ * else return 0. The allocator needs it to not exceed blocks per group.
+ *
+ */
+static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
+{
+        unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
+        unsigned long stripe_width =
+                        le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+        if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
+                return sbi->s_stripe;
+        if (stripe_width <= sbi->s_blocks_per_group)
+                return stripe_width;
+        if (stride <= sbi->s_blocks_per_group)
+                return stride;
+        return 0;
+}
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
+                                __releases(kernel_sem)
+                                __acquires(kernel_sem)
 {
        struct buffer_head * bh;
        struct ext4_super_block *es = NULL;
@@ -1599,7 +1812,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
        unsigned long def_mount_opts;
        struct inode *root;
        int blocksize;
-        int hblock;
        int db_count;
        int i;
        int needs_recovery;
@@ -1624,6 +1836,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                goto out_fail;
        }
+        if (!sb_set_blocksize(sb, blocksize)) {
+                printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
+                goto out_fail;
+        }
        /*
         * The ext4 superblock will not be buffer aligned for other than 1kB
         * block sizes.  We need to calculate the offset from buffer start.
@@ -1674,10 +1891,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
        if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
                set_opt(sbi->s_mount_opt, ERRORS_PANIC);
-        else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
+        else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
-                set_opt(sbi->s_mount_opt, ERRORS_RO);
-        else
                set_opt(sbi->s_mount_opt, ERRORS_CONT);
+        else
+                set_opt(sbi->s_mount_opt, ERRORS_RO);
        sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
        sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -1689,6 +1906,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
         * User -o noextents to turn it off
         */
        set_opt(sbi->s_mount_opt, EXTENTS);
+        /*
+         * Turn on the mballoc feature by default in ext4 filesystems.
+         * Use -o nomballoc to turn it off.
+         */
+        set_opt(sbi->s_mount_opt, MBALLOC);
        if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
                            NULL, 0))
@@ -1723,6 +1945,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                       sb->s_id, le32_to_cpu(features));
                goto failed_mount;
        }
+        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+                /*
+                 * A file system with huge files enabled can only be
+                 * mounted read-write if the kernel is built with CONFIG_LSF
+                 */
+                if (sizeof(root->i_blocks) < sizeof(u64) &&
+                                !(sb->s_flags & MS_RDONLY)) {
+                        printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
+                                        "files cannot be mounted read-write "
+                                        "without CONFIG_LSF.\n", sb->s_id);
+                        goto failed_mount;
+                }
+        }
        blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
        if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -1733,20 +1968,16 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
-        hblock = bdev_hardsect_size(sb->s_bdev);
        if (sb->s_blocksize != blocksize) {
-                /*
-                 * Make sure the blocksize for the filesystem is larger
+                /* Validate the filesystem blocksize */
-                 * than the hardware sectorsize for the machine.
+                if (!sb_set_blocksize(sb, blocksize)) {
-                 */
+                        printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
-                if (blocksize < hblock) {
+                                        blocksize);
-                        printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
-                               "device blocksize %d.\n", blocksize, hblock);
                        goto failed_mount;
                }
                brelse (bh);
-                sb_set_blocksize(sb, blocksize);
                logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
                offset = do_div(logical_sb_block, blocksize);
                bh = sb_bread(sb, logical_sb_block);
@@ -1764,6 +1995,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                }
        }
+        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
@@ -1797,7 +2029,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-        if (EXT4_INODE_SIZE(sb) == 0)
+        if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
                goto cantfind_ext4;
        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
@@ -1838,6 +2070,17 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
        if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
                goto cantfind_ext4;
+        /* ensure blocks_count calculation below doesn't sign-extend */
+        if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
+            le32_to_cpu(es->s_first_data_block) + 1) {
+                printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
+                       "first data block %u, blocks per group %lu\n",
+                        ext4_blocks_count(es),
+                        le32_to_cpu(es->s_first_data_block),
+                        EXT4_BLOCKS_PER_GROUP(sb));
+                goto failed_mount;
+        }
        blocks_count = (ext4_blocks_count(es) -
                        le32_to_cpu(es->s_first_data_block) +
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
@@ -1900,6 +2143,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
        sbi->s_rsv_window_head.rsv_goal_size = 0;
        ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
+        sbi->s_stripe = ext4_get_stripe_size(sbi);
        /*
         * set up enough so that it can read an inode
         */
@@ -1944,6 +2189,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount4;
        }
+        if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+                jbd2_journal_set_features(sbi->s_journal,
+                                JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+        } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+                jbd2_journal_set_features(sbi->s_journal,
+                                JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+                jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+        } else {
+                jbd2_journal_clear_features(sbi->s_journal,
+                                JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+        }
        /* We have now updated the journal if required, so we can
         * validate the data journaling mode. */
        switch (test_opt(sb, DATA_FLAGS)) {
@@ -2044,6 +2304,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                "writeback");
        ext4_ext_init(sb);
+        ext4_mb_init(sb, needs_recovery);
        lock_kernel();
        return 0;
@@ -2673,7 +2934,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
        if (test_opt(sb, MINIX_DF)) {
                sbi->s_overhead_last = 0;
        } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
-                unsigned long ngroups = sbi->s_groups_count, i;
+                ext4_group_t ngroups = sbi->s_groups_count, i;
                ext4_fsblk_t overhead = 0;
                smp_rmb();
@@ -2909,7 +3170,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off)
 {
        struct inode *inode = sb_dqopt(sb)->files[type];
-        sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err = 0;
        int offset = off & (sb->s_blocksize - 1);
        int tocopy;
@@ -2947,7 +3208,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off)
 {
        struct inode *inode = sb_dqopt(sb)->files[type];
-        sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err = 0;
        int offset = off & (sb->s_blocksize - 1);
        int tocopy;
@@ -3002,7 +3263,6 @@ out:
                i_size_write(inode, off+len-towrite);
                EXT4_I(inode)->i_disksize = inode->i_size;
        }
-        inode->i_version++;
        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        ext4_mark_inode_dirty(handle, inode);
        mutex_unlock(&inode->i_mutex);
@@ -3027,9 +3287,15 @@ static struct file_system_type ext4dev_fs_type = {
 static int __init init_ext4_fs(void)
 {
-        int err = init_ext4_xattr();
+        int err;
+        err = init_ext4_mballoc();
        if (err)
                return err;
+        err = init_ext4_xattr();
+        if (err)
+                goto out2;
        err = init_inodecache();
        if (err)
                goto out1;
@@ -3041,6 +3307,8 @@ out:
        destroy_inodecache();
 out1:
        exit_ext4_xattr();
+out2:
+        exit_ext4_mballoc();
        return err;
 }
@@ -3049,6 +3317,7 @@ static void __exit exit_ext4_fs(void)
        unregister_filesystem(&ext4dev_fs_type);
        destroy_inodecache();
        exit_ext4_xattr();
+        exit_ext4_mballoc();
 }
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86387302c2a9..d7962139c010 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                ea_bdebug(bh, "refcount now=0; freeing");
                if (ce)
                        mb_cache_entry_free(ce);
-                ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
+                ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
                get_bh(bh);
                ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
        } else {
@@ -821,7 +821,7 @@ inserted:
                        new_bh = sb_getblk(sb, block);
                        if (!new_bh) {
 getblk_failed:
-                                ext4_free_blocks(handle, inode, block, 1);
+                                ext4_free_blocks(handle, inode, block, 1, 1);
                                error = -EIO;
                                goto cleanup;
                        }
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 2c1b73fb82ae..5fb366992b73 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -590,21 +590,49 @@ error:
 EXPORT_SYMBOL_GPL(fat_free_clusters);
+/* 128KB covers the whole FAT for FAT12 and FAT16 */
+#define FAT_READA_SIZE          (128 * 1024)
+static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
+                          unsigned long reada_blocks)
+{
+        struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+        sector_t blocknr;
+        int i, offset;
+        ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
+        for (i = 0; i < reada_blocks; i++)
+                sb_breadahead(sb, blocknr + i);
+}
 int fat_count_free_clusters(struct super_block *sb)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct fatent_operations *ops = sbi->fatent_ops;
        struct fat_entry fatent;
+        unsigned long reada_blocks, reada_mask, cur_block;
        int err = 0, free;
        lock_fat(sbi);
        if (sbi->free_clusters != -1)
                goto out;
+        reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
+        reada_mask = reada_blocks - 1;
+        cur_block = 0;
        free = 0;
        fatent_init(&fatent);
        fatent_set_entry(&fatent, FAT_START_ENT);
        while (fatent.entry < sbi->max_cluster) {
+                /* readahead of fat blocks */
+                if ((cur_block & reada_mask) == 0) {
+                        unsigned long rest = sbi->fat_length - cur_block;
+                        fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
+                }
+                cur_block++;
                err = fat_ent_read_block(sb, &fatent);
                if (err)
                        goto out;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0fca82021d76..300324bd563c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -482,8 +482,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
                if (wbc->nr_to_write <= 0)
                        break;
        }
-        if (!list_empty(&sb->s_more_io))
-                wbc->more_io = 1;
        return;         /* Leave any unwritten inodes on s_io */
 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3763757f9fe7..80d2f5292cf9 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -132,6 +132,21 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
        req->out.args[0].value = outarg;
 }
+static u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+        u64 curr_version;
+        /*
+         * The spin lock isn't actually needed on 64bit archs, but we
+         * don't yet care too much about such optimizations.
+         */
+        spin_lock(&fc->lock);
+        curr_version = fc->attr_version;
+        spin_unlock(&fc->lock);
+        return curr_version;
+}
 /*
 * Check whether the dentry is still valid
 *
@@ -171,9 +186,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                        return 0;
                }
-                spin_lock(&fc->lock);
+                attr_version = fuse_get_attr_version(fc);
-                attr_version = fc->attr_version;
-                spin_unlock(&fc->lock);
                parent = dget_parent(entry);
                fuse_lookup_init(req, parent->d_inode, entry, &outarg);
@@ -264,9 +277,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
                return ERR_PTR(PTR_ERR(forget_req));
        }
-        spin_lock(&fc->lock);
+        attr_version = fuse_get_attr_version(fc);
-        attr_version = fc->attr_version;
-        spin_unlock(&fc->lock);
        fuse_lookup_init(req, dir, entry, &outarg);
        request_send(fc, req);
@@ -646,6 +657,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (!err) {
+                /* ctime changes */
+                fuse_invalidate_attr(oldent->d_inode);
                fuse_invalidate_attr(olddir);
                if (olddir != newdir)
                        fuse_invalidate_attr(newdir);
@@ -733,9 +747,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
        if (IS_ERR(req))
                return PTR_ERR(req);
-        spin_lock(&fc->lock);
+        attr_version = fuse_get_attr_version(fc);
-        attr_version = fc->attr_version;
-        spin_unlock(&fc->lock);
        memset(&inarg, 0, sizeof(inarg));
        memset(&outarg, 0, sizeof(outarg));
@@ -775,6 +787,31 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
        return err;
 }
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+                           struct file *file, bool *refreshed)
+{
+        struct fuse_inode *fi = get_fuse_inode(inode);
+        int err;
+        bool r;
+        if (fi->i_time < get_jiffies_64()) {
+                r = true;
+                err = fuse_do_getattr(inode, stat, file);
+        } else {
+                r = false;
+                err = 0;
+                if (stat) {
+                        generic_fillattr(inode, stat);
+                        stat->mode = fi->orig_i_mode;
+                }
+        }
+        if (refreshed != NULL)
+                *refreshed = r;
+        return err;
+}
 /*
 * Calling into a user-controlled filesystem gives the filesystem
 * daemon ptrace-like capabilities over the requester process.  This
@@ -862,14 +899,9 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
         */
        if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
            ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
-                struct fuse_inode *fi = get_fuse_inode(inode);
+                err = fuse_update_attributes(inode, NULL, NULL, &refreshed);
-                if (fi->i_time < get_jiffies_64()) {
+                if (err)
-                        err = fuse_do_getattr(inode, NULL, NULL);
+                        return err;
-                        if (err)
-                                return err;
-                        refreshed = true;
-                }
        }
        if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
@@ -935,7 +967,6 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
        struct page *page;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct fuse_conn *fc = get_fuse_conn(inode);
-        struct fuse_file *ff = file->private_data;
        struct fuse_req *req;
        if (is_bad_inode(inode))
@@ -952,7 +983,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
        }
        req->num_pages = 1;
        req->pages[0] = page;
-        fuse_read_fill(req, ff, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+        fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
        request_send(fc, req);
        nbytes = req->out.args[0].size;
        err = req->out.h.error;
@@ -1173,22 +1204,12 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
                        struct kstat *stat)
 {
        struct inode *inode = entry->d_inode;
-        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_conn *fc = get_fuse_conn(inode);
-        int err;
        if (!fuse_allow_task(fc, current))
                return -EACCES;
-        if (fi->i_time < get_jiffies_64())
+        return fuse_update_attributes(inode, stat, NULL, NULL);
-                err = fuse_do_getattr(inode, stat, NULL);
-        else {
-                err = 0;
-                generic_fillattr(inode, stat);
-                stat->mode = fi->orig_i_mode;
-        }
-        return err;
 }
 static int fuse_setxattr(struct dentry *entry, const char *name,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0fcdba9d47c0..bb05d227cf30 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -55,9 +55,10 @@ struct fuse_file *fuse_file_alloc(void)
                if (!ff->reserved_req) {
                        kfree(ff);
                        ff = NULL;
+                } else {
+                        INIT_LIST_HEAD(&ff->write_entry);
+                        atomic_set(&ff->count, 0);
                }
-                INIT_LIST_HEAD(&ff->write_entry);
-                atomic_set(&ff->count, 0);
        }
        return ff;
 }
@@ -288,14 +289,16 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
        return fuse_fsync_common(file, de, datasync, 0);
 }
-void fuse_read_fill(struct fuse_req *req, struct fuse_file *ff,
+void fuse_read_fill(struct fuse_req *req, struct file *file,
                    struct inode *inode, loff_t pos, size_t count, int opcode)
 {
        struct fuse_read_in *inarg = &req->misc.read_in;
+        struct fuse_file *ff = file->private_data;
        inarg->fh = ff->fh;
        inarg->offset = pos;
        inarg->size = count;
+        inarg->flags = file->f_flags;
        req->in.h.opcode = opcode;
        req->in.h.nodeid = get_node_id(inode);
        req->in.numargs = 1;
@@ -312,9 +315,8 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
                             fl_owner_t owner)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
-        struct fuse_file *ff = file->private_data;
-        fuse_read_fill(req, ff, inode, pos, count, FUSE_READ);
+        fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
        if (owner != NULL) {
                struct fuse_read_in *inarg = &req->misc.read_in;
@@ -375,15 +377,16 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
        fuse_put_request(fc, req);
 }
-static void fuse_send_readpages(struct fuse_req *req, struct fuse_file *ff,
+static void fuse_send_readpages(struct fuse_req *req, struct file *file,
                                struct inode *inode)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
        loff_t pos = page_offset(req->pages[0]);
        size_t count = req->num_pages << PAGE_CACHE_SHIFT;
        req->out.page_zeroing = 1;
-        fuse_read_fill(req, ff, inode, pos, count, FUSE_READ);
+        fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
        if (fc->async_read) {
+                struct fuse_file *ff = file->private_data;
                req->ff = fuse_file_get(ff);
                req->end = fuse_readpages_end;
                request_send_background(fc, req);
@@ -395,7 +398,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct fuse_file *ff,
 struct fuse_fill_data {
        struct fuse_req *req;
-        struct fuse_file *ff;
+        struct file *file;
        struct inode *inode;
 };
@@ -410,7 +413,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
            (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
             (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
             req->pages[req->num_pages - 1]->index + 1 != page->index)) {
-                fuse_send_readpages(req, data->ff, inode);
+                fuse_send_readpages(req, data->file, inode);
                data->req = req = fuse_get_req(fc);
                if (IS_ERR(req)) {
                        unlock_page(page);
@@ -434,7 +437,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
        if (is_bad_inode(inode))
                goto out;
-        data.ff = file->private_data;
+        data.file = file;
        data.inode = inode;
        data.req = fuse_get_req(fc);
        err = PTR_ERR(data.req);
@@ -444,7 +447,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
        err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
        if (!err) {
                if (data.req->num_pages)
-                        fuse_send_readpages(data.req, data.ff, inode);
+                        fuse_send_readpages(data.req, file, inode);
                else
                        fuse_put_request(fc, data.req);
        }
@@ -452,11 +455,31 @@ out:
        return err;
 }
-static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
+static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+                                  unsigned long nr_segs, loff_t pos)
+{
+        struct inode *inode = iocb->ki_filp->f_mapping->host;
+        if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) {
+                int err;
+                /*
+                 * If trying to read past EOF, make sure the i_size
+                 * attribute is up-to-date.
+                 */
+                err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
+                if (err)
+                        return err;
+        }
+        return generic_file_aio_read(iocb, iov, nr_segs, pos);
+}
+static void fuse_write_fill(struct fuse_req *req, struct file *file,
                            struct inode *inode, loff_t pos, size_t count,
                            int writepage)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
        struct fuse_write_in *inarg = &req->misc.write.in;
        struct fuse_write_out *outarg = &req->misc.write.out;
@@ -465,6 +488,7 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
        inarg->offset = pos;
        inarg->size = count;
        inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
+        inarg->flags = file->f_flags;
        req->in.h.opcode = FUSE_WRITE;
        req->in.h.nodeid = get_node_id(inode);
        req->in.argpages = 1;
@@ -485,7 +509,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
                              fl_owner_t owner)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
-        fuse_write_fill(req, file->private_data, inode, pos, count, 0);
+        fuse_write_fill(req, file, inode, pos, count, 0);
        if (owner != NULL) {
                struct fuse_write_in *inarg = &req->misc.write.in;
                inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
@@ -886,7 +910,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 static const struct file_operations fuse_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
-        .aio_read       = generic_file_aio_read,
+        .aio_read       = fuse_file_aio_read,
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .mmap           = fuse_file_mmap,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6c5461de1a5f..3ab8a3048e8b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -447,7 +447,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 /**
 * Initialize READ or READDIR request
 */
-void fuse_read_fill(struct fuse_req *req, struct fuse_file *ff,
+void fuse_read_fill(struct fuse_req *req, struct file *file,
                    struct inode *inode, loff_t pos, size_t count, int opcode);
 /**
@@ -593,3 +593,6 @@ int fuse_valid_type(int m);
 int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task);
 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+                           struct file *file, bool *refreshed);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a68d6970845..e5e80d1a4687 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -56,6 +56,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
        fi->i_time = 0;
        fi->nodeid = 0;
        fi->nlookup = 0;
+        fi->attr_version = 0;
        INIT_LIST_HEAD(&fi->write_files);
        fi->forget_req = fuse_request_alloc();
        if (!fi->forget_req) {
@@ -562,8 +563,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
        arg->major = FUSE_KERNEL_VERSION;
        arg->minor = FUSE_KERNEL_MINOR_VERSION;
        arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
-        arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_FILE_OPS |
+        arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC;
-                FUSE_ATOMIC_O_TRUNC;
        req->in.h.opcode = FUSE_INIT;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(*arg);
@@ -744,9 +744,6 @@ static inline void unregister_fuseblk(void)
 }
 #endif
-static decl_subsys(fuse, NULL, NULL);
-static decl_subsys(connections, NULL, NULL);
 static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
 {
        struct inode * inode = foo;
@@ -791,32 +788,37 @@ static void fuse_fs_cleanup(void)
        kmem_cache_destroy(fuse_inode_cachep);
 }
+static struct kobject *fuse_kobj;
+static struct kobject *connections_kobj;
 static int fuse_sysfs_init(void)
 {
        int err;
-        kobj_set_kset_s(&fuse_subsys, fs_subsys);
+        fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
-        err = subsystem_register(&fuse_subsys);
+        if (!fuse_kobj) {
-        if (err)
+                err = -ENOMEM;
                goto out_err;
+        }
-        kobj_set_kset_s(&connections_subsys, fuse_subsys);
+        connections_kobj = kobject_create_and_add("connections", fuse_kobj);
-        err = subsystem_register(&connections_subsys);
+        if (!connections_kobj) {
-        if (err)
+                err = -ENOMEM;
                goto out_fuse_unregister;
+        }
        return 0;
 out_fuse_unregister:
-        subsystem_unregister(&fuse_subsys);
+        kobject_put(fuse_kobj);
 out_err:
        return err;
 }
 static void fuse_sysfs_cleanup(void)
 {
-        subsystem_unregister(&connections_subsys);
+        kobject_put(connections_kobj);
-        subsystem_unregister(&fuse_subsys);
+        kobject_put(fuse_kobj);
 }
 static int __init fuse_init(void)
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 04ad0caebedb..8fff11058cee 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
        glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
        mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
-        ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
+        ops_fstype.o ops_inode.o ops_super.o quota.o \
        recovery.o rgrp.o super.o sys.o trans.o util.o
 obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93fa427bb5f5..e4effc47abfc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -59,7 +59,6 @@ struct strip_mine {
 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
                               u64 block, struct page *page)
 {
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
        struct buffer_head *bh;
        int release = 0;
@@ -95,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
        set_buffer_uptodate(bh);
        if (!gfs2_is_jdata(ip))
                mark_buffer_dirty(bh);
-        if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+        if (!gfs2_is_writeback(ip))
                gfs2_trans_add_bh(ip->i_gl, bh, 0);
        if (release) {
@@ -453,8 +452,8 @@ static inline void bmap_unlock(struct inode *inode, int create)
 * Returns: errno
 */
-int gfs2_block_map(struct inode *inode, u64 lblock, int create,
+int gfs2_block_map(struct inode *inode, sector_t lblock,
-                   struct buffer_head *bh_map)
+                   struct buffer_head *bh_map, int create)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -470,6 +469,7 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
        unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
        struct metapath mp;
        u64 size;
+        struct buffer_head *dibh = NULL;
        BUG_ON(maxlen == 0);
@@ -500,6 +500,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
        error = gfs2_meta_inode_buffer(ip, &bh);
        if (error)
                goto out_fail;
+        dibh = bh;
+        get_bh(dibh);
        for (x = 0; x < end_of_metadata; x++) {
                lookup_block(ip, bh, x, &mp, create, &new, &dblock);
@@ -518,13 +520,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
                if (boundary)
                        set_buffer_boundary(bh_map);
                if (new) {
-                        struct buffer_head *dibh;
+                        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                        error = gfs2_meta_inode_buffer(ip, &dibh);
+                        gfs2_dinode_out(ip, dibh->b_data);
-                        if (!error) {
-                                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                                gfs2_dinode_out(ip, dibh->b_data);
-                                brelse(dibh);
-                        }
                        set_buffer_new(bh_map);
                        goto out_brelse;
                }
@@ -545,6 +542,8 @@ out_brelse:
 out_ok:
        error = 0;
 out_fail:
+        if (dibh)
+                brelse(dibh);
        bmap_unlock(inode, create);
        return error;
 }
@@ -560,7 +559,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
        BUG_ON(!new);
        bh.b_size = 1 << (inode->i_blkbits + 5);
-        ret = gfs2_block_map(inode, lblock, create, &bh);
+        ret = gfs2_block_map(inode, lblock, &bh, create);
        *extlen = bh.b_size >> inode->i_blkbits;
        *dblock = bh.b_blocknr;
        if (buffer_new(&bh))
@@ -684,7 +683,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        if (metadata)
                revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
-        error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+        error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
        if (error)
                return error;
@@ -786,7 +785,7 @@ out_rg_gunlock:
 out_rlist:
        gfs2_rlist_free(&rlist);
 out:
-        gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+        gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
        return error;
 }
@@ -879,7 +878,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
 {
        struct inode *inode = mapping->host;
        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_sbd *sdp = GFS2_SB(inode);
        loff_t from = inode->i_size;
        unsigned long index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -911,7 +909,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
        err = 0;
        if (!buffer_mapped(bh)) {
-                gfs2_get_block(inode, iblock, bh, 0);
+                gfs2_block_map(inode, iblock, bh, 0);
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh))
                        goto unlock;
@@ -931,7 +929,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
                err = 0;
        }
-        if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+        if (!gfs2_is_writeback(ip))
                gfs2_trans_add_bh(ip->i_gl, bh, 0);
        zero_user_page(page, offset, length, KM_USER0);
@@ -1224,8 +1222,13 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
                do_div(lblock_stop, bsize);
        } else {
                unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+                u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
                lblock = offset >> shift;
                lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+                if (lblock_stop > end_of_file) {
+                        *alloc_required = 1;
+                        return 0;
+                }
        }
        for (; lblock < lblock_stop; lblock += extlen) {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index ac2fd04370dc..4e6cde2943bd 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -15,7 +15,7 @@ struct gfs2_inode;
 struct page;
 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh);
+int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
 int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3731ab0771d5..e51991947d2c 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -83,56 +83,6 @@ int gfs2_recoverd(void *data)
 }
 /**
- * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
- * @sdp: Pointer to GFS2 superblock
- *
- * Also, periodically check to make sure that we're using the most recent
- * journal index.
- */
-int gfs2_logd(void *data)
-{
-        struct gfs2_sbd *sdp = data;
-        struct gfs2_holder ji_gh;
-        unsigned long t;
-        int need_flush;
-        while (!kthread_should_stop()) {
-                /* Advance the log tail */
-                t = sdp->sd_log_flush_time +
-                    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
-                gfs2_ail1_empty(sdp, DIO_ALL);
-                gfs2_log_lock(sdp);
-                need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
-                gfs2_log_unlock(sdp);
-                if (need_flush || time_after_eq(jiffies, t)) {
-                        gfs2_log_flush(sdp, NULL);
-                        sdp->sd_log_flush_time = jiffies;
-                }
-                /* Check for latest journal index */
-                t = sdp->sd_jindex_refresh_time +
-                    gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
-                if (time_after_eq(jiffies, t)) {
-                        if (!gfs2_jindex_hold(sdp, &ji_gh))
-                                gfs2_glock_dq_uninit(&ji_gh);
-                        sdp->sd_jindex_refresh_time = jiffies;
-                }
-                t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
-                if (freezing(current))
-                        refrigerator();
-                schedule_timeout_interruptible(t);
-        }
-        return 0;
-}
-/**
 * gfs2_quotad - Write cached quota changes into the quota file
 * @sdp: Pointer to GFS2 superblock
 *
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 0de9b3557955..4be084fb6a62 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -12,7 +12,6 @@
 int gfs2_glockd(void *data);
 int gfs2_recoverd(void *data);
-int gfs2_logd(void *data);
 int gfs2_quotad(void *data);
 #endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9949bb746a52..57e2ed932adc 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1876,7 +1876,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
        if (error)
                goto out;
-        error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+        error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
        if (error)
                goto out_qs;
@@ -1949,7 +1949,7 @@ out_rg_gunlock:
        gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
 out_rlist:
        gfs2_rlist_free(&rlist);
-        gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+        gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
 out_qs:
        gfs2_quota_unhold(dip);
 out:
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index aa8dbf303f6d..f114ba2b3557 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -56,46 +56,6 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
        return type;
 }
-static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-        struct inode *inode = &ip->i_inode;
-        int error = permission(inode, MAY_READ, NULL);
-        if (error)
-                return error;
-        return gfs2_ea_get_i(ip, er);
-}
-static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-        struct inode *inode = &ip->i_inode;
-        if (S_ISREG(inode->i_mode) ||
-            (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
-                int error = permission(inode, MAY_WRITE, NULL);
-                if (error)
-                        return error;
-        } else
-                return -EPERM;
-        return gfs2_ea_set_i(ip, er);
-}
-static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-        struct inode *inode = &ip->i_inode;
-        if (S_ISREG(inode->i_mode) ||
-            (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
-                int error = permission(inode, MAY_WRITE, NULL);
-                if (error)
-                        return error;
-        } else
-                return -EPERM;
-        return gfs2_ea_remove_i(ip, er);
-}
 static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
        if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
@@ -108,8 +68,6 @@ static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
             GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
                return -EOPNOTSUPP;
        return gfs2_ea_get_i(ip, er);
 }
@@ -170,40 +128,10 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
        return gfs2_ea_remove_i(ip, er);
 }
-static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-        struct inode *inode = &ip->i_inode;
-        int error = permission(inode, MAY_READ, NULL);
-        if (error)
-                return error;
-        return gfs2_ea_get_i(ip, er);
-}
-static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-        struct inode *inode = &ip->i_inode;
-        int error = permission(inode, MAY_WRITE, NULL);
-        if (error)
-                return error;
-        return gfs2_ea_set_i(ip, er);
-}
-static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-        struct inode *inode = &ip->i_inode;
-        int error = permission(inode, MAY_WRITE, NULL);
-        if (error)
-                return error;
-        return gfs2_ea_remove_i(ip, er);
-}
 static const struct gfs2_eattr_operations gfs2_user_eaops = {
-        .eo_get = user_eo_get,
+        .eo_get = gfs2_ea_get_i,
-        .eo_set = user_eo_set,
+        .eo_set = gfs2_ea_set_i,
-        .eo_remove = user_eo_remove,
+        .eo_remove = gfs2_ea_remove_i,
        .eo_name = "user",
 };
@@ -215,9 +143,9 @@ const struct gfs2_eattr_operations gfs2_system_eaops = {
 };
 static const struct gfs2_eattr_operations gfs2_security_eaops = {
-        .eo_get = security_eo_get,
+        .eo_get = gfs2_ea_get_i,
-        .eo_set = security_eo_set,
+        .eo_set = gfs2_ea_set_i,
-        .eo_remove = security_eo_remove,
+        .eo_remove = gfs2_ea_remove_i,
        .eo_name = "security",
 };
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 2a7435b5c4dc..bee99704ea10 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -1418,7 +1418,7 @@ out:
 static int ea_dealloc_block(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_rgrpd *rgd;
        struct buffer_head *dibh;
        int error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a37efe4aae6f..80e09c50590a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -217,7 +217,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
        if (atomic_dec_and_test(&gl->gl_ref)) {
                hlist_del(&gl->gl_list);
                write_unlock(gl_lock_addr(gl->gl_hash));
-                BUG_ON(spin_is_locked(&gl->gl_spin));
                gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
                gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
                gfs2_assert(sdp, list_empty(&gl->gl_holders));
@@ -346,7 +345,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        gl->gl_object = NULL;
        gl->gl_sbd = sdp;
        gl->gl_aspace = NULL;
-        lops_init_le(&gl->gl_le, &gfs2_glock_lops);
        INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
        /* If this glock protects actual on-disk data or metadata blocks,
@@ -461,7 +459,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
 static void gfs2_demote_wake(struct gfs2_glock *gl)
 {
-        BUG_ON(!spin_is_locked(&gl->gl_spin));
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
        clear_bit(GLF_DEMOTE, &gl->gl_flags);
        smp_mb__after_clear_bit();
@@ -507,21 +504,12 @@ static int rq_mutex(struct gfs2_holder *gh)
 static int rq_promote(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
        if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
                if (list_empty(&gl->gl_holders)) {
                        gl->gl_req_gh = gh;
                        set_bit(GLF_LOCK, &gl->gl_flags);
                        spin_unlock(&gl->gl_spin);
-                        if (atomic_read(&sdp->sd_reclaim_count) >
-                            gfs2_tune_get(sdp, gt_reclaim_limit) &&
-                            !(gh->gh_flags & LM_FLAG_PRIORITY)) {
-                                gfs2_reclaim_glock(sdp);
-                                gfs2_reclaim_glock(sdp);
-                        }
                        gfs2_glock_xmote_th(gh->gh_gl, gh);
                        spin_lock(&gl->gl_spin);
                }
@@ -567,7 +555,10 @@ static int rq_demote(struct gfs2_glock *gl)
                gfs2_demote_wake(gl);
                return 0;
        }
        set_bit(GLF_LOCK, &gl->gl_flags);
+        set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
        if (gl->gl_demote_state == LM_ST_UNLOCKED ||
            gl->gl_state != LM_ST_EXCLUSIVE) {
                spin_unlock(&gl->gl_spin);
@@ -576,7 +567,9 @@ static int rq_demote(struct gfs2_glock *gl)
                spin_unlock(&gl->gl_spin);
                gfs2_glock_xmote_th(gl, NULL);
        }
        spin_lock(&gl->gl_spin);
+        clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
        return 0;
 }
@@ -598,23 +591,18 @@ static void run_queue(struct gfs2_glock *gl)
                if (!list_empty(&gl->gl_waiters1)) {
                        gh = list_entry(gl->gl_waiters1.next,
                                        struct gfs2_holder, gh_list);
+                        blocked = rq_mutex(gh);
-                        if (test_bit(HIF_MUTEX, &gh->gh_iflags))
-                                blocked = rq_mutex(gh);
-                        else
-                                gfs2_assert_warn(gl->gl_sbd, 0);
                } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
                        blocked = rq_demote(gl);
+                        if (gl->gl_waiters2 && !blocked) {
+                                set_bit(GLF_DEMOTE, &gl->gl_flags);
+                                gl->gl_demote_state = LM_ST_UNLOCKED;
+                        }
+                        gl->gl_waiters2 = 0;
                } else if (!list_empty(&gl->gl_waiters3)) {
                        gh = list_entry(gl->gl_waiters3.next,
                                        struct gfs2_holder, gh_list);
+                        blocked = rq_promote(gh);
-                        if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
-                                blocked = rq_promote(gh);
-                        else
-                                gfs2_assert_warn(gl->gl_sbd, 0);
                } else
                        break;
@@ -632,27 +620,21 @@ static void run_queue(struct gfs2_glock *gl)
 static void gfs2_glmutex_lock(struct gfs2_glock *gl)
 {
-        struct gfs2_holder gh;
-        gfs2_holder_init(gl, 0, 0, &gh);
-        set_bit(HIF_MUTEX, &gh.gh_iflags);
-        if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
-                BUG();
        spin_lock(&gl->gl_spin);
        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+                struct gfs2_holder gh;
+                gfs2_holder_init(gl, 0, 0, &gh);
+                set_bit(HIF_WAIT, &gh.gh_iflags);
                list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+                spin_unlock(&gl->gl_spin);
+                wait_on_holder(&gh);
+                gfs2_holder_uninit(&gh);
        } else {
                gl->gl_owner_pid = current->pid;
                gl->gl_ip = (unsigned long)__builtin_return_address(0);
-                clear_bit(HIF_WAIT, &gh.gh_iflags);
+                spin_unlock(&gl->gl_spin);
-                smp_mb();
-                wake_up_bit(&gh.gh_iflags, HIF_WAIT);
        }
-        spin_unlock(&gl->gl_spin);
-        wait_on_holder(&gh);
-        gfs2_holder_uninit(&gh);
 }
 /**
@@ -691,7 +673,6 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
        gl->gl_owner_pid = 0;
        gl->gl_ip = 0;
        run_queue(gl);
-        BUG_ON(!spin_is_locked(&gl->gl_spin));
        spin_unlock(&gl->gl_spin);
 }
@@ -722,7 +703,10 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
                }
        } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
                        gl->gl_demote_state != state) {
-                gl->gl_demote_state = LM_ST_UNLOCKED;
+                if (test_bit(GLF_DEMOTE_IN_PROGRESS,  &gl->gl_flags)) 
+                        gl->gl_waiters2 = 1;
+                else 
+                        gl->gl_demote_state = LM_ST_UNLOCKED;
        }
        spin_unlock(&gl->gl_spin);
 }
@@ -943,8 +927,8 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        unsigned int ret;
-        if (glops->go_drop_th)
+        if (glops->go_xmote_th)
-                glops->go_drop_th(gl);
+                glops->go_xmote_th(gl);
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -1156,8 +1140,6 @@ restart:
                return -EIO;
        }
-        set_bit(HIF_PROMOTE, &gh->gh_iflags);
        spin_lock(&gl->gl_spin);
        add_to_queue(gh);
        run_queue(gl);
@@ -1248,12 +1230,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
        list_del_init(&gh->gh_list);
        if (list_empty(&gl->gl_holders)) {
-                spin_unlock(&gl->gl_spin);
+                if (glops->go_unlock) {
+                        spin_unlock(&gl->gl_spin);
-                if (glops->go_unlock)
                        glops->go_unlock(gh);
+                        spin_lock(&gl->gl_spin);
-                spin_lock(&gl->gl_spin);
+                }
                gl->gl_stamp = jiffies;
        }
@@ -1910,8 +1891,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
        print_dbg(gi, "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
        print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
        print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
-        print_dbg(gi, "  le = %s\n",
-                   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
        print_dbg(gi, "  reclaim = %s\n",
                   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
        if (gl->gl_aspace)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4670dcb2a877..c663b7a0f410 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,7 +56,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
                bd = list_entry(head->next, struct gfs2_bufdata,
                                bd_ail_gl_list);
                bh = bd->bd_bh;
-                gfs2_remove_from_ail(NULL, bd);
+                gfs2_remove_from_ail(bd);
                bd->bd_bh = NULL;
                bh->b_private = NULL;
                bd->bd_blkno = bh->b_blocknr;
@@ -86,15 +86,10 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
        if (!ip || !S_ISREG(inode->i_mode))
                return;
-        if (!test_bit(GIF_PAGED, &ip->i_flags))
-                return;
        unmap_shared_mapping_range(inode->i_mapping, 0, 0);
        if (test_bit(GIF_SW_PAGED, &ip->i_flags))
                set_bit(GLF_DIRTY, &gl->gl_flags);
-        clear_bit(GIF_SW_PAGED, &ip->i_flags);
 }
 /**
@@ -143,44 +138,34 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
 static void inode_go_sync(struct gfs2_glock *gl)
 {
        struct gfs2_inode *ip = gl->gl_object;
+        struct address_space *metamapping = gl->gl_aspace->i_mapping;
+        int error;
+        if (gl->gl_state != LM_ST_UNLOCKED)
+                gfs2_pte_inval(gl);
+        if (gl->gl_state != LM_ST_EXCLUSIVE)
+                return;
        if (ip && !S_ISREG(ip->i_inode.i_mode))
                ip = NULL;
        if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-                if (ip && !gfs2_is_jdata(ip))
-                        filemap_fdatawrite(ip->i_inode.i_mapping);
                gfs2_log_flush(gl->gl_sbd, gl);
-                if (ip && gfs2_is_jdata(ip))
+                filemap_fdatawrite(metamapping);
-                        filemap_fdatawrite(ip->i_inode.i_mapping);
-                gfs2_meta_sync(gl);
                if (ip) {
                        struct address_space *mapping = ip->i_inode.i_mapping;
-                        int error = filemap_fdatawait(mapping);
+                        filemap_fdatawrite(mapping);
+                        error = filemap_fdatawait(mapping);
                        mapping_set_error(mapping, error);
                }
+                error = filemap_fdatawait(metamapping);
+                mapping_set_error(metamapping, error);
                clear_bit(GLF_DIRTY, &gl->gl_flags);
                gfs2_ail_empty_gl(gl);
        }
 }
 /**
- * inode_go_xmote_th - promote/demote a glock
- * @gl: the glock
- * @state: the requested state
- * @flags:
- *
- */
-static void inode_go_xmote_th(struct gfs2_glock *gl)
-{
-        if (gl->gl_state != LM_ST_UNLOCKED)
-                gfs2_pte_inval(gl);
-        if (gl->gl_state == LM_ST_EXCLUSIVE)
-                inode_go_sync(gl);
-}
-/**
 * inode_go_xmote_bh - After promoting/demoting a glock
 * @gl: the glock
 *
@@ -201,22 +186,6 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl)
 }
 /**
- * inode_go_drop_th - unlock a glock
- * @gl: the glock
- *
- * Invoked from rq_demote().
- * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
- * is being purged from our node's glock cache; we're dropping lock.
- */
-static void inode_go_drop_th(struct gfs2_glock *gl)
-{
-        gfs2_pte_inval(gl);
-        if (gl->gl_state == LM_ST_EXCLUSIVE)
-                inode_go_sync(gl);
-}
-/**
 * inode_go_inval - prepare a inode glock to be released
 * @gl: the glock
 * @flags:
@@ -234,10 +203,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
                        set_bit(GIF_INVALID, &ip->i_flags);
        }
-        if (ip && S_ISREG(ip->i_inode.i_mode)) {
+        if (ip && S_ISREG(ip->i_inode.i_mode))
                truncate_inode_pages(ip->i_inode.i_mapping, 0);
-                clear_bit(GIF_PAGED, &ip->i_flags);
-        }
 }
 /**
@@ -294,23 +261,6 @@ static int inode_go_lock(struct gfs2_holder *gh)
 }
 /**
- * inode_go_unlock - operation done before an inode lock is unlocked by a
- *                   process
- * @gl: the glock
- * @flags:
- *
- */
-static void inode_go_unlock(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_inode *ip = gl->gl_object;
-        if (ip)
-                gfs2_meta_cache_flush(ip);
-}
-/**
 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
 * @gl: the glock
 *
@@ -350,14 +300,14 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 }
 /**
- * trans_go_xmote_th - promote/demote the transaction glock
+ * trans_go_sync - promote/demote the transaction glock
 * @gl: the glock
 * @state: the requested state
 * @flags:
 *
 */
-static void trans_go_xmote_th(struct gfs2_glock *gl)
+static void trans_go_sync(struct gfs2_glock *gl)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -384,7 +334,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
        if (gl->gl_state != LM_ST_UNLOCKED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-                gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
                j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
                error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -402,24 +351,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
 }
 /**
- * trans_go_drop_th - unlock the transaction glock
- * @gl: the glock
- *
- * We want to sync the device even with localcaching.  Remember
- * that localcaching journal replay only marks buffers dirty.
- */
-static void trans_go_drop_th(struct gfs2_glock *gl)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-                gfs2_meta_syncfs(sdp);
-                gfs2_log_shutdown(sdp);
-        }
-}
-/**
 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
 * @gl: the glock
 *
@@ -433,25 +364,21 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
 const struct gfs2_glock_operations gfs2_meta_glops = {
        .go_xmote_th = meta_go_sync,
-        .go_drop_th = meta_go_sync,
        .go_type = LM_TYPE_META,
 };
 const struct gfs2_glock_operations gfs2_inode_glops = {
-        .go_xmote_th = inode_go_xmote_th,
+        .go_xmote_th = inode_go_sync,
        .go_xmote_bh = inode_go_xmote_bh,
-        .go_drop_th = inode_go_drop_th,
        .go_inval = inode_go_inval,
        .go_demote_ok = inode_go_demote_ok,
        .go_lock = inode_go_lock,
-        .go_unlock = inode_go_unlock,
        .go_type = LM_TYPE_INODE,
        .go_min_hold_time = HZ / 10,
 };
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
        .go_xmote_th = meta_go_sync,
-        .go_drop_th = meta_go_sync,
        .go_inval = meta_go_inval,
        .go_demote_ok = rgrp_go_demote_ok,
        .go_lock = rgrp_go_lock,
@@ -461,9 +388,8 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 };
 const struct gfs2_glock_operations gfs2_trans_glops = {
-        .go_xmote_th = trans_go_xmote_th,
+        .go_xmote_th = trans_go_sync,
        .go_xmote_bh = trans_go_xmote_bh,
-        .go_drop_th = trans_go_drop_th,
        .go_type = LM_TYPE_NONDISK,
 };
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eaddfb5a8e6f..513aaf0dc0ab 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -131,7 +131,6 @@ struct gfs2_bufdata {
 struct gfs2_glock_operations {
        void (*go_xmote_th) (struct gfs2_glock *gl);
        void (*go_xmote_bh) (struct gfs2_glock *gl);
-        void (*go_drop_th) (struct gfs2_glock *gl);
        void (*go_inval) (struct gfs2_glock *gl, int flags);
        int (*go_demote_ok) (struct gfs2_glock *gl);
        int (*go_lock) (struct gfs2_holder *gh);
@@ -141,10 +140,6 @@ struct gfs2_glock_operations {
 };
 enum {
-        /* Actions */
-        HIF_MUTEX               = 0,
-        HIF_PROMOTE             = 1,
        /* States */
        HIF_HOLDER              = 6,
        HIF_FIRST               = 7,
@@ -171,6 +166,8 @@ enum {
        GLF_DEMOTE              = 3,
        GLF_PENDING_DEMOTE      = 4,
        GLF_DIRTY               = 5,
+        GLF_DEMOTE_IN_PROGRESS  = 6,
+        GLF_LFLUSH              = 7,
 };
 struct gfs2_glock {
@@ -190,6 +187,7 @@ struct gfs2_glock {
        struct list_head gl_holders;
        struct list_head gl_waiters1;   /* HIF_MUTEX */
        struct list_head gl_waiters3;   /* HIF_PROMOTE */
+        int gl_waiters2;                /* GIF_DEMOTE */
        const struct gfs2_glock_operations *gl_ops;
@@ -210,7 +208,6 @@ struct gfs2_glock {
        struct gfs2_sbd *gl_sbd;
        struct inode *gl_aspace;
-        struct gfs2_log_element gl_le;
        struct list_head gl_ail_list;
        atomic_t gl_ail_count;
        struct delayed_work gl_work;
@@ -239,7 +236,6 @@ struct gfs2_alloc {
 enum {
        GIF_INVALID             = 0,
        GIF_QD_LOCKED           = 1,
-        GIF_PAGED               = 2,
        GIF_SW_PAGED            = 3,
 };
@@ -268,14 +264,10 @@ struct gfs2_inode {
        struct gfs2_glock *i_gl; /* Move into i_gh? */
        struct gfs2_holder i_iopen_gh;
        struct gfs2_holder i_gh; /* for prepare/commit_write only */
-        struct gfs2_alloc i_alloc;
+        struct gfs2_alloc *i_alloc;
        u64 i_last_rg_alloc;
-        spinlock_t i_spin;
        struct rw_semaphore i_rw_mutex;
-        unsigned long i_last_pfault;
-        struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
 };
 /*
@@ -287,19 +279,12 @@ static inline struct gfs2_inode *GFS2_I(struct inode *inode)
        return container_of(inode, struct gfs2_inode, i_inode);
 }
-/* To be removed? */
+static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode)
-static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
 {
        return inode->i_sb->s_fs_info;
 }
-enum {
-        GFF_DID_DIRECT_ALLOC    = 0,
-        GFF_EXLOCK = 1,
-};
 struct gfs2_file {
-        unsigned long f_flags;          /* GFF_... */
        struct mutex f_fl_mutex;
        struct gfs2_holder f_fl_gh;
 };
@@ -373,8 +358,17 @@ struct gfs2_ail {
        u64 ai_sync_gen;
 };
+struct gfs2_journal_extent {
+        struct list_head extent_list;
+        unsigned int lblock; /* First logical block */
+        u64 dblock; /* First disk block */
+        u64 blocks;
+};
 struct gfs2_jdesc {
        struct list_head jd_list;
+        struct list_head extent_list;
        struct inode *jd_inode;
        unsigned int jd_jid;
@@ -421,13 +415,9 @@ struct gfs2_args {
 struct gfs2_tune {
        spinlock_t gt_spin;
-        unsigned int gt_ilimit;
-        unsigned int gt_ilimit_tries;
-        unsigned int gt_ilimit_min;
        unsigned int gt_demote_secs; /* Cache retention for unheld glock */
        unsigned int gt_incore_log_blocks;
        unsigned int gt_log_flush_secs;
-        unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
        unsigned int gt_recoverd_secs;
        unsigned int gt_logd_secs;
@@ -443,10 +433,8 @@ struct gfs2_tune {
        unsigned int gt_new_files_jdata;
        unsigned int gt_new_files_directio;
        unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
-        unsigned int gt_lockdump_size;
        unsigned int gt_stall_secs; /* Detects trouble! */
        unsigned int gt_complain_secs;
-        unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
        unsigned int gt_statfs_quantum;
        unsigned int gt_statfs_slow;
 };
@@ -539,7 +527,6 @@ struct gfs2_sbd {
        /* StatFS stuff */
        spinlock_t sd_statfs_spin;
-        struct mutex sd_statfs_mutex;
        struct gfs2_statfs_change_host sd_statfs_master;
        struct gfs2_statfs_change_host sd_statfs_local;
        unsigned long sd_statfs_sync_time;
@@ -602,20 +589,18 @@ struct gfs2_sbd {
        unsigned int sd_log_commited_databuf;
        unsigned int sd_log_commited_revoke;
-        unsigned int sd_log_num_gl;
        unsigned int sd_log_num_buf;
        unsigned int sd_log_num_revoke;
        unsigned int sd_log_num_rg;
        unsigned int sd_log_num_databuf;
-        struct list_head sd_log_le_gl;
        struct list_head sd_log_le_buf;
        struct list_head sd_log_le_revoke;
        struct list_head sd_log_le_rg;
        struct list_head sd_log_le_databuf;
        struct list_head sd_log_le_ordered;
-        unsigned int sd_log_blks_free;
+        atomic_t sd_log_blks_free;
        struct mutex sd_log_reserve_mutex;
        u64 sd_log_sequence;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5f6dc32946cd..728d3169e7bd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -31,7 +31,6 @@
 #include "log.h"
 #include "meta_io.h"
 #include "ops_address.h"
-#include "ops_file.h"
 #include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
@@ -132,15 +131,21 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
 void gfs2_set_iop(struct inode *inode)
 {
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
        umode_t mode = inode->i_mode;
        if (S_ISREG(mode)) {
                inode->i_op = &gfs2_file_iops;
-                inode->i_fop = &gfs2_file_fops;
+                if (sdp->sd_args.ar_localflocks)
-                inode->i_mapping->a_ops = &gfs2_file_aops;
+                        inode->i_fop = &gfs2_file_fops_nolock;
+                else
+                        inode->i_fop = &gfs2_file_fops;
        } else if (S_ISDIR(mode)) {
                inode->i_op = &gfs2_dir_iops;
-                inode->i_fop = &gfs2_dir_fops;
+                if (sdp->sd_args.ar_localflocks)
+                        inode->i_fop = &gfs2_dir_fops_nolock;
+                else
+                        inode->i_fop = &gfs2_dir_fops;
        } else if (S_ISLNK(mode)) {
                inode->i_op = &gfs2_symlink_iops;
        } else {
@@ -291,12 +296,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
        di->di_entries = be32_to_cpu(str->di_entries);
        di->di_eattr = be64_to_cpu(str->di_eattr);
-        return 0;
+        if (S_ISREG(ip->i_inode.i_mode))
-}
+                gfs2_set_aops(&ip->i_inode);
-static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh)
+        return 0;
-{
-        ip->i_cache[0] = bh;
 }
 /**
@@ -366,7 +369,8 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
        if (error)
                goto out_rg_gunlock;
-        gfs2_trans_add_gl(ip->i_gl);
+        set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+        set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
        gfs2_free_di(rgd, ip);
@@ -707,9 +711,10 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
        int error;
-        gfs2_alloc_get(dip);
+        if (gfs2_alloc_get(dip) == NULL)
+                return -ENOMEM;
-        dip->i_alloc.al_requested = RES_DINODE;
+        dip->i_alloc->al_requested = RES_DINODE;
        error = gfs2_inplace_reserve(dip);
        if (error)
                goto out;
@@ -855,7 +860,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
        error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
        if (alloc_required < 0)
-                goto fail;
+                goto fail_quota_locks;
        if (alloc_required) {
                error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
                if (error)
@@ -896,7 +901,7 @@ fail_end_trans:
        gfs2_trans_end(sdp);
 fail_ipreserv:
-        if (dip->i_alloc.al_rgd)
+        if (dip->i_alloc->al_rgd)
                gfs2_inplace_release(dip);
 fail_quota_locks:
@@ -966,7 +971,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
        struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
        int error;
        u64 generation;
-        struct buffer_head *bh=NULL;
+        struct buffer_head *bh = NULL;
        if (!name->len || name->len > GFS2_FNAMESIZE)
                return ERR_PTR(-ENAMETOOLONG);
@@ -1003,8 +1008,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
        if (IS_ERR(inode))
                goto fail_gunlock2;
-        gfs2_inode_bh(GFS2_I(inode), bh);
        error = gfs2_inode_refresh(GFS2_I(inode));
        if (error)
                goto fail_gunlock2;
@@ -1021,6 +1024,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
        if (error)
                goto fail_gunlock2;
+        if (bh)
+                brelse(bh);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        return inode;
@@ -1032,6 +1037,8 @@ fail_gunlock2:
 fail_gunlock:
        gfs2_glock_dq(ghs);
 fail:
+        if (bh)
+                brelse(bh);
        return ERR_PTR(error);
 }
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 351ac87ab384..d44650662615 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -20,6 +20,18 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
        return ip->i_di.di_flags & GFS2_DIF_JDATA;
 }
+static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
+{
+        const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
+}
+static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
+{
+        const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
+}
 static inline int gfs2_is_dir(const struct gfs2_inode *ip)
 {
        return S_ISDIR(ip->i_inode.i_mode);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 41c5b04caaba..f2efff424224 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -67,6 +67,11 @@ static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
        memset(data, 0, 256);
        strncpy(data, data_arg, 255);
+        if (!strlen(data)) {
+                log_error("no mount options, (u)mount helpers not installed");
+                return -EINVAL;
+        }
        for (options = data; (x = strsep(&options, ":")); ) {
                if (!*x)
                        continue;
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index 1f7b038530b4..2ebd374b3143 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -89,15 +89,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
        op->info.number         = name->ln_number;
        op->info.start          = fl->fl_start;
        op->info.end            = fl->fl_end;
-        op->info.owner          = (__u64)(long) fl->fl_owner;
        if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+                /* fl_owner is lockd which doesn't distinguish
+                   processes on the nfs client */
+                op->info.owner  = (__u64) fl->fl_pid;
                xop->callback   = fl->fl_lmops->fl_grant;
                locks_init_lock(&xop->flc);
                locks_copy_lock(&xop->flc, fl);
                xop->fl         = fl;
                xop->file       = file;
-        } else
+        } else {
+                op->info.owner  = (__u64)(long) fl->fl_owner;
                xop->callback   = NULL;
+        }
        send_op(op);
@@ -203,7 +207,10 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
        op->info.number         = name->ln_number;
        op->info.start          = fl->fl_start;
        op->info.end            = fl->fl_end;
-        op->info.owner          = (__u64)(long) fl->fl_owner;
+        if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+                op->info.owner  = (__u64) fl->fl_pid;
+        else
+                op->info.owner  = (__u64)(long) fl->fl_owner;
        send_op(op);
        wait_event(recv_wq, (op->done != 0));
@@ -242,7 +249,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
        op->info.number         = name->ln_number;
        op->info.start          = fl->fl_start;
        op->info.end            = fl->fl_end;
-        op->info.owner          = (__u64)(long) fl->fl_owner;
+        if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+                op->info.owner  = (__u64) fl->fl_pid;
+        else
+                op->info.owner  = (__u64)(long) fl->fl_owner;
        send_op(op);
        wait_event(recv_wq, (op->done != 0));
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index ae9e6a25fe2b..a87b09839761 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -189,51 +189,39 @@ static struct kobj_type gdlm_ktype = {
        .sysfs_ops     = &gdlm_attr_ops,
 };
-static struct kset gdlm_kset = {
+static struct kset *gdlm_kset;
-        .ktype  = &gdlm_ktype,
-};
 int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
 {
        int error;
-        error = kobject_set_name(&ls->kobj, "%s", "lock_module");
+        ls->kobj.kset = gdlm_kset;
-        if (error) {
+        error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
-                log_error("can't set kobj name %d", error);
+                                     "lock_module");
-                return error;
-        }
-        ls->kobj.kset = &gdlm_kset;
-        ls->kobj.ktype = &gdlm_ktype;
-        ls->kobj.parent = fskobj;
-        error = kobject_register(&ls->kobj);
        if (error)
                log_error("can't register kobj %d", error);
+        kobject_uevent(&ls->kobj, KOBJ_ADD);
        return error;
 }
 void gdlm_kobject_release(struct gdlm_ls *ls)
 {
-        kobject_unregister(&ls->kobj);
+        kobject_put(&ls->kobj);
 }
 int gdlm_sysfs_init(void)
 {
-        int error;
+        gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
+        if (!gdlm_kset) {
-        kobject_set_name(&gdlm_kset.kobj, "lock_dlm");
+                printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
-        kobj_set_kset_s(&gdlm_kset, kernel_subsys);
+                return -ENOMEM;
-        error = kset_register(&gdlm_kset);
+        }
-        if (error)
+        return 0;
-                printk("lock_dlm: cannot register kset %d\n", error);
-        return error;
 }
 void gdlm_sysfs_exit(void)
 {
-        kset_unregister(&gdlm_kset);
+        kset_unregister(gdlm_kset);
 }
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index bd938f06481d..521694fc19d6 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -273,18 +273,13 @@ static int gdlm_thread(void *data, int blist)
        struct gdlm_ls *ls = (struct gdlm_ls *) data;
        struct gdlm_lock *lp = NULL;
        uint8_t complete, blocking, submit, drop;
-        DECLARE_WAITQUEUE(wait, current);
        /* Only thread1 is allowed to do blocking callbacks since gfs
           may wait for a completion callback within a blocking cb. */
        while (!kthread_should_stop()) {
-                set_current_state(TASK_INTERRUPTIBLE);
+                wait_event_interruptible(ls->thread_wait,
-                add_wait_queue(&ls->thread_wait, &wait);
+                                !no_work(ls, blist) || kthread_should_stop());
-                if (no_work(ls, blist))
-                        schedule();
-                remove_wait_queue(&ls->thread_wait, &wait);
-                set_current_state(TASK_RUNNING);
                complete = blocking = submit = drop = 0;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 7df702473252..161ab6f2058e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -16,6 +16,8 @@
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -68,14 +70,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 *
 */
-void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd)
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 {
        bd->bd_ail = NULL;
        list_del_init(&bd->bd_ail_st_list);
        list_del_init(&bd->bd_ail_gl_list);
        atomic_dec(&bd->bd_gl->gl_ail_count);
-        if (mapping)
-                gfs2_meta_cache_flush(GFS2_I(mapping->host));
        brelse(bd->bd_bh);
 }
@@ -92,8 +92,6 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
        struct buffer_head *bh;
        int retry;
-        BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
        do {
                retry = 0;
@@ -210,7 +208,7 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
        gfs2_log_unlock(sdp);
 }
-int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
 {
        struct gfs2_ail *ai, *s;
        int ret;
@@ -248,7 +246,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
                bd = list_entry(head->prev, struct gfs2_bufdata,
                                bd_ail_st_list);
                gfs2_assert(sdp, bd->bd_ail == ai);
-                gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd);
+                gfs2_remove_from_ail(bd);
        }
 }
@@ -303,7 +301,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
        mutex_lock(&sdp->sd_log_reserve_mutex);
        gfs2_log_lock(sdp);
-        while(sdp->sd_log_blks_free <= (blks + reserved_blks)) {
+        while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
                gfs2_log_unlock(sdp);
                gfs2_ail1_empty(sdp, 0);
                gfs2_log_flush(sdp, NULL);
@@ -312,7 +310,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
                        gfs2_ail1_start(sdp, 0);
                gfs2_log_lock(sdp);
        }
-        sdp->sd_log_blks_free -= blks;
+        atomic_sub(blks, &sdp->sd_log_blks_free);
        gfs2_log_unlock(sdp);
        mutex_unlock(&sdp->sd_log_reserve_mutex);
@@ -332,27 +330,23 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
 {
        gfs2_log_lock(sdp);
-        sdp->sd_log_blks_free += blks;
+        atomic_add(blks, &sdp->sd_log_blks_free);
        gfs2_assert_withdraw(sdp,
-                             sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+                             atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
        gfs2_log_unlock(sdp);
        up_read(&sdp->sd_log_flush_lock);
 }
 static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 {
-        struct inode *inode = sdp->sd_jdesc->jd_inode;
+        struct gfs2_journal_extent *je;
-        int error;
-        struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+        list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
+                if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
-        bh_map.b_size = 1 << inode->i_blkbits;
+                        return je->dblock + lbn - je->lblock;
-        error = gfs2_block_map(inode, lbn, 0, &bh_map);
+        }
-        if (error || !bh_map.b_blocknr)
-                printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error,
+        return -1;
-                       (unsigned long long)bh_map.b_blocknr, lbn);
-        gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
-        return bh_map.b_blocknr;
 }
 /**
@@ -561,8 +555,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
        ail2_empty(sdp, new_tail);
        gfs2_log_lock(sdp);
-        sdp->sd_log_blks_free += dist;
+        atomic_add(dist, &sdp->sd_log_blks_free);
-        gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+        gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
        gfs2_log_unlock(sdp);
        sdp->sd_log_tail = new_tail;
@@ -652,7 +646,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
                get_bh(bh);
                gfs2_log_unlock(sdp);
                lock_buffer(bh);
-                if (test_clear_buffer_dirty(bh)) {
+                if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
                        bh->b_end_io = end_buffer_write_sync;
                        submit_bh(WRITE, bh);
                } else {
@@ -694,20 +688,16 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
 *
 */
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
        struct gfs2_ail *ai;
        down_write(&sdp->sd_log_flush_lock);
-        if (gl) {
+        /* Log might have been flushed while we waited for the flush lock */
-                gfs2_log_lock(sdp);
+        if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
-                if (list_empty(&gl->gl_le.le_list)) {
+                up_write(&sdp->sd_log_flush_lock);
-                        gfs2_log_unlock(sdp);
+                return;
-                        up_write(&sdp->sd_log_flush_lock);
-                        return;
-                }
-                gfs2_log_unlock(sdp);
        }
        ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
@@ -739,7 +729,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
                log_flush_commit(sdp);
        else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
                gfs2_log_lock(sdp);
-                sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */
+                atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
                gfs2_log_unlock(sdp);
                log_write_header(sdp, 0, PULL);
        }
@@ -767,7 +757,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
        unsigned int reserved;
-        unsigned int old;
+        unsigned int unused;
        gfs2_log_lock(sdp);
@@ -779,14 +769,11 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
        sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
        gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
        reserved = calc_reserved(sdp);
-        old = sdp->sd_log_blks_free;
+        unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
-        sdp->sd_log_blks_free += tr->tr_reserved -
+        gfs2_assert_withdraw(sdp, unused >= 0);
-                                 (reserved - sdp->sd_log_blks_reserved);
+        atomic_add(unused, &sdp->sd_log_blks_free);
+        gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
-        gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
-        gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <=
                             sdp->sd_jdesc->jd_blocks);
        sdp->sd_log_blks_reserved = reserved;
        gfs2_log_unlock(sdp);
@@ -825,7 +812,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
        down_write(&sdp->sd_log_flush_lock);
        gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
-        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
@@ -838,7 +824,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
        log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
                         (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);
-        gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
+        gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
        gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
        gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
@@ -866,3 +852,42 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
        }
 }
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @data: Pointer to the GFS2 superblock (a struct gfs2_sbd passed as void *)
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ */
+int gfs2_logd(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        unsigned long t;
+        int need_flush;
+        while (!kthread_should_stop()) {
+                /* Advance the log tail */
+                t = sdp->sd_log_flush_time +
+                    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+                gfs2_ail1_empty(sdp, DIO_ALL);
+                gfs2_log_lock(sdp);
+                need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
+                gfs2_log_unlock(sdp);
+                if (need_flush || time_after_eq(jiffies, t)) {
+                        gfs2_log_flush(sdp, NULL);
+                        sdp->sd_log_flush_time = jiffies;
+                }
+                t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+                if (freezing(current))
+                        refrigerator();
+                schedule_timeout_interruptible(t);
+        }
+        return 0;
+}
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index dae282400627..771152816508 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,8 +48,6 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
                            unsigned int ssize);
-int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
 void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
 void gfs2_log_incr_head(struct gfs2_sbd *sdp);
@@ -57,11 +55,19 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
 struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
                                      struct buffer_head *real);
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
+{
+        if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
+                __gfs2_log_flush(sbd, gl);
+}
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd);
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+int gfs2_logd(void *data);
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c27cea761c6..fae59d69d01a 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -87,6 +87,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
        }
        bd->bd_ail = ai;
        list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+        clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
        gfs2_log_unlock(sdp);
        unlock_buffer(bh);
 }
@@ -124,49 +125,6 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
        return bh;
 }
-static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
-        struct gfs2_glock *gl;
-        struct gfs2_trans *tr = current->journal_info;
-        tr->tr_touched = 1;
-        gl = container_of(le, struct gfs2_glock, gl_le);
-        if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
-                return;
-        if (!list_empty(&le->le_list))
-                return;
-        gfs2_glock_hold(gl);
-        set_bit(GLF_DIRTY, &gl->gl_flags);
-        sdp->sd_log_num_gl++;
-        list_add(&le->le_list, &sdp->sd_log_le_gl);
-}
-static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
-        gfs2_log_lock(sdp);
-        __glock_lo_add(sdp, le);
-        gfs2_log_unlock(sdp);
-}
-static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-{
-        struct list_head *head = &sdp->sd_log_le_gl;
-        struct gfs2_glock *gl;
-        while (!list_empty(head)) {
-                gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
-                list_del_init(&gl->gl_le.le_list);
-                sdp->sd_log_num_gl--;
-                gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
-                gfs2_glock_put(gl);
-        }
-        gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
-}
 static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
        struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
@@ -182,7 +140,8 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
        list_add(&bd->bd_list_tr, &tr->tr_list_buf);
        if (!list_empty(&le->le_list))
                goto out;
-        __glock_lo_add(sdp, &bd->bd_gl->gl_le);
+        set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+        set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
        gfs2_meta_check(sdp, bd->bd_bh);
        gfs2_pin(sdp, bd->bd_bh);
        sdp->sd_log_num_buf++;
@@ -556,17 +515,20 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
        lock_buffer(bd->bd_bh);
        gfs2_log_lock(sdp);
-        if (!list_empty(&bd->bd_list_tr))
+        if (tr) {
-                goto out;
+                if (!list_empty(&bd->bd_list_tr))
-        tr->tr_touched = 1;
+                        goto out;
-        if (gfs2_is_jdata(ip)) {
+                tr->tr_touched = 1;
-                tr->tr_num_buf++;
+                if (gfs2_is_jdata(ip)) {
-                list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+                        tr->tr_num_buf++;
+                        list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+                }
        }
        if (!list_empty(&le->le_list))
                goto out;
-        __glock_lo_add(sdp, &bd->bd_gl->gl_le);
+        set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+        set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
        if (gfs2_is_jdata(ip)) {
                gfs2_pin(sdp, bd->bd_bh);
                tr->tr_num_databuf_new++;
@@ -773,12 +735,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 }
-const struct gfs2_log_operations gfs2_glock_lops = {
-        .lo_add = glock_lo_add,
-        .lo_after_commit = glock_lo_after_commit,
-        .lo_name = "glock",
-};
 const struct gfs2_log_operations gfs2_buf_lops = {
        .lo_add = buf_lo_add,
        .lo_incore_commit = buf_lo_incore_commit,
@@ -816,7 +772,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
 };
 const struct gfs2_log_operations *gfs2_log_ops[] = {
-        &gfs2_glock_lops,
        &gfs2_databuf_lops,
        &gfs2_buf_lops,
        &gfs2_rg_lops,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7ecfe0d3a491..9c7765c12d62 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -29,9 +29,8 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
        struct gfs2_inode *ip = foo;
        inode_init_once(&ip->i_inode);
-        spin_lock_init(&ip->i_spin);
        init_rwsem(&ip->i_rw_mutex);
-        memset(ip->i_cache, 0, sizeof(ip->i_cache));
+        ip->i_alloc = NULL;
 }
 static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 4da423985e4f..85aea27b4a86 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -50,6 +50,7 @@ static int gfs2_aspace_writepage(struct page *page,
 static const struct address_space_operations aspace_aops = {
        .writepage = gfs2_aspace_writepage,
        .releasepage = gfs2_releasepage,
+        .sync_page = block_sync_page,
 };
 /**
@@ -221,13 +222,14 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
                   struct buffer_head **bhp)
 {
        *bhp = getbuf(gl, blkno, CREATE);
-        if (!buffer_uptodate(*bhp))
+        if (!buffer_uptodate(*bhp)) {
                ll_rw_block(READ_META, 1, bhp);
-        if (flags & DIO_WAIT) {
+                if (flags & DIO_WAIT) {
-                int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
+                        int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
-                if (error) {
+                        if (error) {
-                        brelse(*bhp);
+                                brelse(*bhp);
-                        return error;
+                                return error;
+                        }
                }
        }
@@ -282,7 +284,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
                return;
        }
-        bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL),
+        bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
        bd->bd_bh = bh;
        bd->bd_gl = gl;
@@ -317,7 +319,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
        }
        if (bd) {
                if (bd->bd_ail) {
-                        gfs2_remove_from_ail(NULL, bd);
+                        gfs2_remove_from_ail(bd);
                        bh->b_private = NULL;
                        bd->bd_bh = NULL;
                        bd->bd_blkno = bh->b_blocknr;
@@ -358,32 +360,6 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
 }
 /**
- * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
- * @ip: The GFS2 inode
- *
- * This releases buffers that are in the most-recently-used array of
- * blocks used for indirect block addressing for this inode.
- */
-void gfs2_meta_cache_flush(struct gfs2_inode *ip)
-{
-        struct buffer_head **bh_slot;
-        unsigned int x;
-        spin_lock(&ip->i_spin);
-        for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
-                bh_slot = &ip->i_cache[x];
-                if (*bh_slot) {
-                        brelse(*bh_slot);
-                        *bh_slot = NULL;
-                }
-        }
-        spin_unlock(&ip->i_spin);
-}
-/**
 * gfs2_meta_indirect_buffer - Get a metadata buffer
 * @ip: The GFS2 inode
 * @height: The level of this buf in the metadata (indir addr) tree (if any)
@@ -391,8 +367,6 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
 * @new: Non-zero if we may create a new buffer
 * @bhp: the buffer is returned here
 *
- * Try to use the gfs2_inode's MRU metadata tree cache.
- *
 * Returns: errno
 */
@@ -401,58 +375,25 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_glock *gl = ip->i_gl;
-        struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
+        struct buffer_head *bh;
-        int in_cache = 0;
+        int ret = 0;
-        BUG_ON(!gl);
-        BUG_ON(!sdp);
-        spin_lock(&ip->i_spin);
-        if (*bh_slot && (*bh_slot)->b_blocknr == num) {
-                bh = *bh_slot;
-                get_bh(bh);
-                in_cache = 1;
-        }
-        spin_unlock(&ip->i_spin);
-        if (!bh)
-                bh = getbuf(gl, num, CREATE);
-        if (!bh)
-                return -ENOBUFS;
        if (new) {
-                if (gfs2_assert_warn(sdp, height))
+                BUG_ON(height == 0);
-                        goto err;
+                bh = gfs2_meta_new(gl, num);
-                meta_prep_new(bh);
                gfs2_trans_add_bh(ip->i_gl, bh, 1);
                gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
                gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
        } else {
                u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
-                if (!buffer_uptodate(bh)) {
+                ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
-                        ll_rw_block(READ_META, 1, &bh);
+                if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
-                        if (gfs2_meta_wait(sdp, bh))
+                        brelse(bh);
-                                goto err;
+                        ret = -EIO;
                }
-                if (gfs2_metatype_check(sdp, bh, mtype))
-                        goto err;
-        }
-        if (!in_cache) {
-                spin_lock(&ip->i_spin);
-                if (*bh_slot)
-                        brelse(*bh_slot);
-                *bh_slot = bh;
-                get_bh(bh);
-                spin_unlock(&ip->i_spin);
        }
        *bhp = bh;
-        return 0;
+        return ret;
-err:
-        brelse(bh);
-        return -EIO;
 }
 /**
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b7048222ebb4..73e3b1c76fe1 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,7 +56,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
 void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_meta_cache_flush(struct gfs2_inode *ip);
 int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
                              int new, struct buffer_head **bhp);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 9679f8b9870d..38dbe99a30ed 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -20,6 +20,8 @@
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -32,7 +34,6 @@
 #include "quota.h"
 #include "trans.h"
 #include "rgrp.h"
-#include "ops_file.h"
 #include "super.h"
 #include "util.h"
 #include "glops.h"
@@ -58,22 +59,6 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
 }
 /**
- * gfs2_get_block - Fills in a buffer head with details about a block
- * @inode: The inode
- * @lblock: The block number to look up
- * @bh_result: The buffer head to return the result in
- * @create: Non-zero if we may add block to the file
- *
- * Returns: errno
- */
-int gfs2_get_block(struct inode *inode, sector_t lblock,
-                   struct buffer_head *bh_result, int create)
-{
-        return gfs2_block_map(inode, lblock, create, bh_result);
-}
-/**
 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
 * @inode: The inode
 * @lblock: The block number to look up
@@ -88,7 +73,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 {
        int error;
-        error = gfs2_block_map(inode, lblock, 0, bh_result);
+        error = gfs2_block_map(inode, lblock, bh_result, 0);
        if (error)
                return error;
        if (!buffer_mapped(bh_result))
@@ -99,20 +84,19 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
                                 struct buffer_head *bh_result, int create)
 {
-        return gfs2_block_map(inode, lblock, 0, bh_result);
+        return gfs2_block_map(inode, lblock, bh_result, 0);
 }
 /**
- * gfs2_writepage - Write complete page
+ * gfs2_writepage_common - Common bits of writepage
- * @page: Page to write
+ * @page: The page to be written
+ * @wbc: The writeback control
 *
- * Returns: errno
+ * Returns: 1 if the caller should write the page, 0 if it was unlocked instead
- *
- * Some of this is copied from block_write_full_page() although we still
- * call it to do most of the work.
 */
-static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
+static int gfs2_writepage_common(struct page *page,
+                                 struct writeback_control *wbc)
 {
        struct inode *inode = page->mapping->host;
        struct gfs2_inode *ip = GFS2_I(inode);
@@ -120,41 +104,133 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
        loff_t i_size = i_size_read(inode);
        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
        unsigned offset;
-        int error;
+        int ret = -EIO;
-        int done_trans = 0;
-        if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
+        if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
-                unlock_page(page);
+                goto out;
-                return -EIO;
+        ret = 0;
-        }
        if (current->journal_info)
-                goto out_ignore;
+                goto redirty;
        /* Is the page fully outside i_size? (truncate in progress) */
-        offset = i_size & (PAGE_CACHE_SIZE-1);
+        offset = i_size & (PAGE_CACHE_SIZE-1);
        if (page->index > end_index || (page->index == end_index && !offset)) {
                page->mapping->a_ops->invalidatepage(page, 0);
-                unlock_page(page);
+                goto out;
-                return 0; /* don't care */
+        }
+        return 1;
+redirty:
+        redirty_page_for_writepage(wbc, page);
+out:
+        unlock_page(page);
+        return 0;
+}
+/**
+ * gfs2_writeback_writepage - Write page for writeback mappings
+ * @page: The page
+ * @wbc: The writeback control
+ *
+ */
+static int gfs2_writeback_writepage(struct page *page,
+                                    struct writeback_control *wbc)
+{
+        int ret;
+        ret = gfs2_writepage_common(page, wbc);
+        if (ret <= 0)
+                return ret;
+        ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
+        if (ret == -EAGAIN)
+                ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+        return ret;
+}
+/**
+ * gfs2_ordered_writepage - Write page for ordered data files
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ */
+static int gfs2_ordered_writepage(struct page *page,
+                                  struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        int ret;
+        ret = gfs2_writepage_common(page, wbc);
+        if (ret <= 0)
+                return ret;
+        if (!page_has_buffers(page)) {
+                create_empty_buffers(page, inode->i_sb->s_blocksize,
+                                     (1 << BH_Dirty)|(1 << BH_Uptodate));
        }
+        gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
+        return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
-        if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) &&
+/**
-            PageChecked(page)) {
+ * __gfs2_jdata_writepage - The core of jdata writepage
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ * This is shared between writepage and writepages and implements the
+ * core of the writepage operation. If a transaction is required then
+ * PageChecked will have been set and the transaction will have
+ * already been started before this is called.
+ */
+static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        if (PageChecked(page)) {
                ClearPageChecked(page);
-                error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
-                if (error)
-                        goto out_ignore;
                if (!page_has_buffers(page)) {
                        create_empty_buffers(page, inode->i_sb->s_blocksize,
                                             (1 << BH_Dirty)|(1 << BH_Uptodate));
                }
                gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+        }
+        return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
+/**
+ * gfs2_jdata_writepage - Write complete page
+ * @page: Page to write
+ *
+ * Returns: errno
+ *
+ */
+static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        int error;
+        int done_trans = 0;
+        error = gfs2_writepage_common(page, wbc);
+        if (error <= 0)
+                return error;
+        if (PageChecked(page)) {
+                if (wbc->sync_mode != WB_SYNC_ALL)
+                        goto out_ignore;
+                error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+                if (error)
+                        goto out_ignore;
                done_trans = 1;
        }
-        error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+        error = __gfs2_jdata_writepage(page, wbc);
        if (done_trans)
                gfs2_trans_end(sdp);
-        gfs2_meta_cache_flush(ip);
        return error;
 out_ignore:
@@ -164,29 +240,190 @@ out_ignore:
 }
 /**
- * gfs2_writepages - Write a bunch of dirty pages back to disk
+ * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
 * @mapping: The mapping to write
 * @wbc: Write-back control
 *
- * For journaled files and/or ordered writes this just falls back to the
+ * For the data=writeback case we can already ignore buffer heads
- * kernel's default writepages path for now. We will probably want to change
- * that eventually (i.e. when we look at allocate on flush).
- *
- * For the data=writeback case though we can already ignore buffer heads
 * and write whole extents at once. This is a big reduction in the
 * number of I/O requests we send and the bmap calls we make in this case.
 */
-static int gfs2_writepages(struct address_space *mapping,
+static int gfs2_writeback_writepages(struct address_space *mapping,
-                           struct writeback_control *wbc)
+                                     struct writeback_control *wbc)
+{
+        return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+}
+/**
+ * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
+ * @mapping: The mapping
+ * @wbc: The writeback control
+ * @pvec: The vector of pages
+ * @nr_pages: The number of pages to write
+ * @end: The last page index to write (not applied if wbc->range_cyclic is set)
+ *
+ * Returns: non-zero if loop should terminate, zero otherwise
+ */
+static int gfs2_write_jdata_pagevec(struct address_space *mapping,
+                                    struct writeback_control *wbc,
+                                    struct pagevec *pvec,
+                                    int nr_pages, pgoff_t end)
 {
        struct inode *inode = mapping->host;
-        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        loff_t i_size = i_size_read(inode);
+        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+        unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
+        unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+        struct backing_dev_info *bdi = mapping->backing_dev_info;
+        int i;
+        int ret;
+        ret = gfs2_trans_begin(sdp, nrblocks, 0);
+        if (ret < 0)
+                return ret;
+        for(i = 0; i < nr_pages; i++) {
+                struct page *page = pvec->pages[i];
+                lock_page(page);
+                if (unlikely(page->mapping != mapping)) {
+                        unlock_page(page);
+                        continue;
+                }
+                if (!wbc->range_cyclic && page->index > end) {
+                        ret = 1;
+                        unlock_page(page);
+                        continue;
+                }
+                if (wbc->sync_mode != WB_SYNC_NONE)
+                        wait_on_page_writeback(page);
+                if (PageWriteback(page) ||
+                    !clear_page_dirty_for_io(page)) {
+                        unlock_page(page);
+                        continue;
+                }
+                /* Is the page fully outside i_size? (truncate in progress) */
+                if (page->index > end_index || (page->index == end_index && !offset)) {
+                        page->mapping->a_ops->invalidatepage(page, 0);
+                        unlock_page(page);
+                        continue;
+                }
+                ret = __gfs2_jdata_writepage(page, wbc);
+                if (ret || (--(wbc->nr_to_write) <= 0))
+                        ret = 1;
+                if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                        wbc->encountered_congestion = 1;
+                        ret = 1;
+                }
+        }
+        gfs2_trans_end(sdp);
+        return ret;
+}
+/**
+ * gfs2_write_cache_jdata - Like write_cache_pages but different
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ *
+ * The reason that we use our own function here is that we need to
+ * start transactions before we grab page locks. This allows us
+ * to get the ordering right.
+ */
+static int gfs2_write_cache_jdata(struct address_space *mapping,
+                                  struct writeback_control *wbc)
+{
+        struct backing_dev_info *bdi = mapping->backing_dev_info;
+        int ret = 0;
+        int done = 0;
+        struct pagevec pvec;
+        int nr_pages;
+        pgoff_t index;
+        pgoff_t end;
+        int scanned = 0;
+        int range_whole = 0;
+        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                wbc->encountered_congestion = 1;
+                return 0;
+        }
+        pagevec_init(&pvec, 0);
+        if (wbc->range_cyclic) {
+                index = mapping->writeback_index; /* Start from prev offset */
+                end = -1;
+        } else {
+                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
+                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                        range_whole = 1;
+                scanned = 1;
+        }
-        if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip))
+retry:
-                return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+         while (!done && (index <= end) &&
+                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                               PAGECACHE_TAG_DIRTY,
+                                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+                scanned = 1;
+                ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
+                if (ret)
+                        done = 1;
+                if (ret > 0)
+                        ret = 0;
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+        if (!scanned && !done) {
+                /*
+                 * We hit the last page and there is more work to be done: wrap
+                 * back to the start of the file
+                 */
+                scanned = 1;
+                index = 0;
+                goto retry;
+        }
+        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+                mapping->writeback_index = index;
+        return ret;
+}
+/**
+ * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ * 
+ */
-        return generic_writepages(mapping, wbc);
+static int gfs2_jdata_writepages(struct address_space *mapping,
+                                 struct writeback_control *wbc)
+{
+        struct gfs2_inode *ip = GFS2_I(mapping->host);
+        struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+        int ret;
+        ret = gfs2_write_cache_jdata(mapping, wbc);
+        if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
+                gfs2_log_flush(sdp, ip->i_gl);
+                ret = gfs2_write_cache_jdata(mapping, wbc);
+        }
+        return ret;
 }
 /**
@@ -231,62 +468,107 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 /**
- * gfs2_readpage - readpage with locking
+ * __gfs2_readpage - readpage
- * @file: The file to read a page for. N.B. This may be NULL if we are
+ * @file: The file to read a page for
- * reading an internal file.
 * @page: The page to read
 *
- * Returns: errno
+ * This is the core of gfs2's readpage. It's used by the internal file
+ * reading code as in that case we already hold the glock. Also it's
+ * called by gfs2_readpage() once the required lock has been granted.
+ *
 */
-static int gfs2_readpage(struct file *file, struct page *page)
+static int __gfs2_readpage(void *file, struct page *page)
 {
        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
        struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-        struct gfs2_file *gf = NULL;
-        struct gfs2_holder gh;
        int error;
-        int do_unlock = 0;
-        if (likely(file != &gfs2_internal_file_sentinel)) {
-                if (file) {
-                        gf = file->private_data;
-                        if (test_bit(GFF_EXLOCK, &gf->f_flags))
-                                /* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
-                                goto skip_lock;
-                }
-                gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
-                do_unlock = 1;
-                error = gfs2_glock_nq_atime(&gh);
-                if (unlikely(error))
-                        goto out_unlock;
-        }
-skip_lock:
        if (gfs2_is_stuffed(ip)) {
                error = stuffed_readpage(ip, page);
                unlock_page(page);
-        } else
+        } else {
-                error = mpage_readpage(page, gfs2_get_block);
+                error = mpage_readpage(page, gfs2_block_map);
+        }
        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                error = -EIO;
+                return -EIO;
+        return error;
+}
+/**
+ * gfs2_readpage - read a page of a file
+ * @file: The file to read
+ * @page: The page of the file
+ *
+ * This deals with the locking required. We use a trylock in order to
+ * avoid the page lock / glock ordering problems, returning AOP_TRUNCATED_PAGE
+ * in the event that we are unable to get the lock.
+ */
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        struct gfs2_holder gh;
+        int error;
-        if (do_unlock) {
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
-                gfs2_glock_dq_m(1, &gh);
+        error = gfs2_glock_nq_atime(&gh);
-                gfs2_holder_uninit(&gh);
+        if (unlikely(error)) {
+                unlock_page(page);
+                goto out;
        }
+        error = __gfs2_readpage(file, page);
+        gfs2_glock_dq(&gh);
 out:
-        return error;
+        gfs2_holder_uninit(&gh);
-out_unlock:
-        unlock_page(page);
        if (error == GLR_TRYFAILED) {
-                error = AOP_TRUNCATED_PAGE;
                yield();
+                return AOP_TRUNCATED_PAGE;
        }
-        if (do_unlock)
+        return error;
-                gfs2_holder_uninit(&gh);
+}
-        goto out;
+/**
+ * gfs2_internal_read - read an internal file
+ * @ip: The gfs2 inode
+ * @ra_state: The readahead state (or NULL for no readahead)
+ * @buf: The buffer to fill
+ * @pos: The file position
+ * @size: The amount to read
+ *
+ */
+int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
+                       char *buf, loff_t *pos, unsigned size)
+{
+        struct address_space *mapping = ip->i_inode.i_mapping;
+        unsigned long index = *pos / PAGE_CACHE_SIZE;
+        unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+        unsigned copied = 0;
+        unsigned amt;
+        struct page *page;
+        void *p;
+        do {
+                amt = size - copied;
+                if (offset + size > PAGE_CACHE_SIZE)
+                        amt = PAGE_CACHE_SIZE - offset;
+                page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
+                if (IS_ERR(page))
+                        return PTR_ERR(page);
+                p = kmap_atomic(page, KM_USER0);
+                memcpy(buf + copied, p + offset, amt);
+                kunmap_atomic(p, KM_USER0);
+                mark_page_accessed(page);
+                page_cache_release(page);
+                copied += amt;
+                index++;
+                offset = 0;
+        } while(copied < size);
+        (*pos) += size;
+        return size;
 }
 /**
@@ -300,10 +582,9 @@ out_unlock:
 *    Any I/O we ignore at this time will be done via readpage later.
 * 2. We don't handle stuffed files here we let readpage do the honours.
 * 3. mpage_readpages() does most of the heavy lifting in the common case.
- * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
+ * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
- * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
- *    well as read-ahead.
 */
 static int gfs2_readpages(struct file *file, struct address_space *mapping,
                          struct list_head *pages, unsigned nr_pages)
 {
@@ -311,42 +592,20 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct gfs2_holder gh;
-        int ret = 0;
+        int ret;
-        int do_unlock = 0;
-        if (likely(file != &gfs2_internal_file_sentinel)) {
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-                if (file) {
+        ret = gfs2_glock_nq_atime(&gh);
-                        struct gfs2_file *gf = file->private_data;
+        if (unlikely(ret))
-                        if (test_bit(GFF_EXLOCK, &gf->f_flags))
+                goto out_uninit;
-                                goto skip_lock;
-                }
-                gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
-                                 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
-                do_unlock = 1;
-                ret = gfs2_glock_nq_atime(&gh);
-                if (ret == GLR_TRYFAILED)
-                        goto out_noerror;
-                if (unlikely(ret))
-                        goto out_unlock;
-        }
-skip_lock:
        if (!gfs2_is_stuffed(ip))
-                ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
+                ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
+        gfs2_glock_dq(&gh);
-        if (do_unlock) {
+out_uninit:
-                gfs2_glock_dq_m(1, &gh);
+        gfs2_holder_uninit(&gh);
-                gfs2_holder_uninit(&gh);
-        }
-out:
        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                ret = -EIO;
        return ret;
-out_noerror:
-        ret = 0;
-out_unlock:
-        if (do_unlock)
-                gfs2_holder_uninit(&gh);
-        goto out;
 }
 /**
@@ -382,20 +641,11 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        if (unlikely(error))
                goto out_uninit;
-        error = -ENOMEM;
-        page = __grab_cache_page(mapping, index);
-        *pagep = page;
-        if (!page)
-                goto out_unlock;
        gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
        error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
        if (error)
-                goto out_putpage;
+                goto out_unlock;
-        ip->i_alloc.al_requested = 0;
        if (alloc_required) {
                al = gfs2_alloc_get(ip);
@@ -424,40 +674,47 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        if (error)
                goto out_trans_fail;
+        error = -ENOMEM;
+        page = __grab_cache_page(mapping, index);
+        *pagep = page;
+        if (unlikely(!page))
+                goto out_endtrans;
        if (gfs2_is_stuffed(ip)) {
+                error = 0;
                if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
                        error = gfs2_unstuff_dinode(ip, page);
                        if (error == 0)
                                goto prepare_write;
-                } else if (!PageUptodate(page))
+                } else if (!PageUptodate(page)) {
                        error = stuffed_readpage(ip, page);
+                }
                goto out;
        }
 prepare_write:
-        error = block_prepare_write(page, from, to, gfs2_get_block);
+        error = block_prepare_write(page, from, to, gfs2_block_map);
 out:
-        if (error) {
+        if (error == 0)
-                gfs2_trans_end(sdp);
+                return 0;
+        page_cache_release(page);
+        if (pos + len > ip->i_inode.i_size)
+                vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+out_endtrans:
+        gfs2_trans_end(sdp);
 out_trans_fail:
-                if (alloc_required) {
+        if (alloc_required) {
-                        gfs2_inplace_release(ip);
+                gfs2_inplace_release(ip);
 out_qunlock:
-                        gfs2_quota_unlock(ip);
+                gfs2_quota_unlock(ip);
 out_alloc_put:
-                        gfs2_alloc_put(ip);
+                gfs2_alloc_put(ip);
-                }
+        }
-out_putpage:
-                page_cache_release(page);
-                if (pos + len > ip->i_inode.i_size)
-                        vmtruncate(&ip->i_inode, ip->i_inode.i_size);
 out_unlock:
-                gfs2_glock_dq_m(1, &ip->i_gh);
+        gfs2_glock_dq(&ip->i_gh);
 out_uninit:
-                gfs2_holder_uninit(&ip->i_gh);
+        gfs2_holder_uninit(&ip->i_gh);
-        }
        return error;
 }
@@ -565,7 +822,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct buffer_head *dibh;
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_dinode *di;
        unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
        unsigned int to = from + len;
@@ -585,19 +842,16 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        if (gfs2_is_stuffed(ip))
                return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
-        if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+        if (!gfs2_is_writeback(ip))
                gfs2_page_add_databufs(ip, page, from, to);
        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-        if (likely(ret >= 0)) {
+        if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
-                copied = ret;
+                di = (struct gfs2_dinode *)dibh->b_data;
-                if  ((pos + copied) > inode->i_size) {
+                ip->i_di.di_size = inode->i_size;
-                        di = (struct gfs2_dinode *)dibh->b_data;
+                di->di_size = cpu_to_be64(inode->i_size);
-                        ip->i_di.di_size = inode->i_size;
+                mark_inode_dirty(inode);
-                        di->di_size = cpu_to_be64(inode->i_size);
-                        mark_inode_dirty(inode);
-                }
        }
        if (inode == sdp->sd_rindex)
@@ -606,7 +860,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        brelse(dibh);
        gfs2_trans_end(sdp);
 failed:
-        if (al->al_requested) {
+        if (al) {
                gfs2_inplace_release(ip);
                gfs2_quota_unlock(ip);
                gfs2_alloc_put(ip);
@@ -625,11 +879,7 @@ failed:
 
 static int gfs2_set_page_dirty(struct page *page)
 {
-        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        SetPageChecked(page);
-        struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-        if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
-                SetPageChecked(page);
        return __set_page_dirty_buffers(page);
 }
@@ -653,7 +903,7 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
                return 0;
        if (!gfs2_is_stuffed(ip))
-                dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
+                dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
        gfs2_glock_dq_uninit(&i_gh);
@@ -719,13 +969,9 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
 {
        /*
         * Should we return an error here? I can't see that O_DIRECT for
-         * a journaled file makes any sense. For now we'll silently fall
+         * a stuffed file makes any sense. For now we'll silently fall
-         * back to buffered I/O, likewise we do the same for stuffed
+         * back to buffered I/O
-         * files since they are (a) small and (b) unaligned.
         */
-        if (gfs2_is_jdata(ip))
-                return 0;
        if (gfs2_is_stuffed(ip))
                return 0;
@@ -836,9 +1082,23 @@ cannot_release:
        return 0;
 }
-const struct address_space_operations gfs2_file_aops = {
+static const struct address_space_operations gfs2_writeback_aops = {
-        .writepage = gfs2_writepage,
+        .writepage = gfs2_writeback_writepage,
-        .writepages = gfs2_writepages,
+        .writepages = gfs2_writeback_writepages,
+        .readpage = gfs2_readpage,
+        .readpages = gfs2_readpages,
+        .sync_page = block_sync_page,
+        .write_begin = gfs2_write_begin,
+        .write_end = gfs2_write_end,
+        .bmap = gfs2_bmap,
+        .invalidatepage = gfs2_invalidatepage,
+        .releasepage = gfs2_releasepage,
+        .direct_IO = gfs2_direct_IO,
+        .migratepage = buffer_migrate_page,
+};
+static const struct address_space_operations gfs2_ordered_aops = {
+        .writepage = gfs2_ordered_writepage,
        .readpage = gfs2_readpage,
        .readpages = gfs2_readpages,
        .sync_page = block_sync_page,
@@ -849,5 +1109,34 @@ const struct address_space_operations gfs2_file_aops = {
        .invalidatepage = gfs2_invalidatepage,
        .releasepage = gfs2_releasepage,
        .direct_IO = gfs2_direct_IO,
+        .migratepage = buffer_migrate_page,
 };
+static const struct address_space_operations gfs2_jdata_aops = { /* a_ops table installed by gfs2_set_aops() when gfs2_is_jdata() is true */
+        .writepage = gfs2_jdata_writepage,
+        .writepages = gfs2_jdata_writepages,
+        .readpage = gfs2_readpage,
+        .readpages = gfs2_readpages,
+        .sync_page = block_sync_page,
+        .write_begin = gfs2_write_begin,
+        .write_end = gfs2_write_end,
+        .set_page_dirty = gfs2_set_page_dirty, /* sets PageChecked before dirtying the page's buffers */
+        .bmap = gfs2_bmap,
+        .invalidatepage = gfs2_invalidatepage,
+        .releasepage = gfs2_releasepage, /* NOTE(review): no .direct_IO or .migratepage entry in this table, unlike gfs2_writeback_aops - presumably deliberate for journaled data; confirm */
+};
+/**
+ * gfs2_set_aops - install the address space operations for an inode
+ * @inode: The inode whose mapping is to be set up
+ *
+ * Selects the writeback, ordered or jdata operations table according
+ * to which of gfs2_is_writeback(), gfs2_is_ordered() or gfs2_is_jdata()
+ * first reports true for the inode. It is a BUG() if none of them does.
+ */
+void gfs2_set_aops(struct inode *inode)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        if (gfs2_is_writeback(ip)) {
+                inode->i_mapping->a_ops = &gfs2_writeback_aops;
+                return;
+        }
+        if (gfs2_is_ordered(ip)) {
+                inode->i_mapping->a_ops = &gfs2_ordered_aops;
+                return;
+        }
+        if (gfs2_is_jdata(ip)) {
+                inode->i_mapping->a_ops = &gfs2_jdata_aops;
+                return;
+        }
+        /* No data mode matched: a_ops is left untouched */
+        BUG();
+}
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index fa1b5b3d28b9..5da21285bba4 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -14,9 +14,10 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
-extern const struct address_space_operations gfs2_file_aops;
-extern int gfs2_get_block(struct inode *inode, sector_t lblock,
-                          struct buffer_head *bh_result, int create);
 extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+                              struct file_ra_state *ra_state,
+                              char *buf, loff_t *pos, unsigned size);
+extern void gfs2_set_aops(struct inode *inode);
 #endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index bb11fd6752d3..f4842f2548cd 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -33,57 +33,12 @@
 #include "lm.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_file.h"
-#include "ops_vm.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
 #include "eaops.h"
+#include "ops_address.h"
-/*
- * Most fields left uninitialised to catch anybody who tries to
- * use them. f_flags set to prevent file_accessed() from touching
- * any other part of this. Its use is purely as a flag so that we
- * know (in readpage()) whether or not do to locking.
- */
-struct file gfs2_internal_file_sentinel = {
-        .f_flags = O_NOATIME|O_RDONLY,
-};
-static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
-                           unsigned long offset, unsigned long size)
-{
-        char *kaddr;
-        unsigned long count = desc->count;
-        if (size > count)
-                size = count;
-        kaddr = kmap(page);
-        memcpy(desc->arg.data, kaddr + offset, size);
-        kunmap(page);
-        desc->count = count - size;
-        desc->written += size;
-        desc->arg.buf += size;
-        return size;
-}
-int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
-                       char *buf, loff_t *pos, unsigned size)
-{
-        struct inode *inode = &ip->i_inode;
-        read_descriptor_t desc;
-        desc.written = 0;
-        desc.arg.data = buf;
-        desc.count = size;
-        desc.error = 0;
-        do_generic_mapping_read(inode->i_mapping, ra_state,
-                                &gfs2_internal_file_sentinel, pos, &desc,
-                                gfs2_read_actor);
-        return desc.written ? desc.written : desc.error;
-}
 /**
 * gfs2_llseek - seek to a location in a file
@@ -214,7 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
        if (put_user(fsflags, ptr))
                error = -EFAULT;
-        gfs2_glock_dq_m(1, &gh);
+        gfs2_glock_dq(&gh);
        gfs2_holder_uninit(&gh);
        return error;
 }
@@ -291,7 +246,16 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
                if (error)
                        goto out;
        }
+        if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
+                if (flags & GFS2_DIF_JDATA)
+                        gfs2_log_flush(sdp, ip->i_gl);
+                error = filemap_fdatawrite(inode->i_mapping);
+                if (error)
+                        goto out;
+                error = filemap_fdatawait(inode->i_mapping);
+                if (error)
+                        goto out;
+        }
        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
        if (error)
                goto out;
@@ -303,6 +267,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
        gfs2_dinode_out(ip, bh->b_data);
        brelse(bh);
        gfs2_set_inode_flags(inode);
+        gfs2_set_aops(inode);
 out_trans_end:
        gfs2_trans_end(sdp);
 out:
@@ -338,6 +303,128 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        return -ENOTTY;
 }
+/**
+ * gfs2_allocate_page_backing - Use bmap to allocate blocks
+ * @page: The (locked) page to allocate backing for
+ *
+ * We try to allocate all the blocks required for the page in
+ * one go. This might fail for various reasons, so we keep
+ * trying until all the blocks to back this page are allocated.
+ * If some of the blocks are already allocated, that's ok too.
+ *
+ * Returns: 0 once the whole page is mapped, or -EIO if a call to
+ *          gfs2_block_map() leaves the buffer unmapped
+ */
+static int gfs2_allocate_page_backing(struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct buffer_head bh;
+        unsigned long size = PAGE_CACHE_SIZE;
+        /*
+         * Widen the index before shifting so that the logical block
+         * number cannot wrap where page->index is narrower than 64 bits.
+         */
+        u64 lblock = (u64)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        do {
+                /* Ask for everything still outstanding in one request */
+                bh.b_state = 0;
+                bh.b_size = size;
+                /*
+                 * The return value is not examined here; failure is
+                 * detected through the buffer being left unmapped.
+                 */
+                gfs2_block_map(inode, lblock, &bh, 1);
+                if (!buffer_mapped(&bh))
+                        return -EIO;
+                /* b_size now holds the length that was actually mapped */
+                size -= bh.b_size;
+                lblock += (bh.b_size >> inode->i_blkbits);
+        } while(size > 0);
+        return 0;
+}
+/**
+ * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
+ * @vma: The virtual memory area
+ * @page: The page which is about to become writable
+ *
+ * When the page becomes writable, we need to ensure that we have
+ * blocks allocated on disk to back that page.
+ *
+ * Returns: 0 if no allocation was required, if the page turned out not
+ *          to be uptodate or no longer to belong to this mapping, or if
+ *          the backing blocks were allocated; -EINVAL if the page lies
+ *          beyond the last page of the file; otherwise the error from
+ *          the step that failed
+ */
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        unsigned long last_index;
+        /*
+         * Byte offset of the page within the file. This is handed to
+         * gfs2_write_alloc_required() together with a byte length, so
+         * the index must be scaled by the page shift (and widened first
+         * so the result cannot wrap).
+         */
+        u64 pos = (u64)page->index << PAGE_CACHE_SHIFT;
+        unsigned int data_blocks, ind_blocks, rblocks;
+        int alloc_required = 0;
+        struct gfs2_holder gh;
+        struct gfs2_alloc *al;
+        int ret;
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
+        ret = gfs2_glock_nq_atime(&gh);
+        if (ret)
+                goto out;
+        set_bit(GIF_SW_PAGED, &ip->i_flags);
+        gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+        ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
+        /* Nothing to do if the page is already fully backed (ret == 0) */
+        if (ret || !alloc_required)
+                goto out_unlock;
+        ret = -ENOMEM;
+        al = gfs2_alloc_get(ip);
+        if (al == NULL)
+                goto out_unlock;
+        ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (ret)
+                goto out_alloc_put;
+        ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+        if (ret)
+                goto out_quota_unlock;
+        al->al_requested = data_blocks + ind_blocks;
+        ret = gfs2_inplace_reserve(ip);
+        if (ret)
+                goto out_quota_unlock;
+        /* Size the transaction: dinode, indirect blocks, journaled data */
+        rblocks = RES_DINODE + ind_blocks;
+        if (gfs2_is_jdata(ip))
+                rblocks += data_blocks ? data_blocks : 1;
+        if (ind_blocks || data_blocks)
+                rblocks += RES_STATFS + RES_QUOTA;
+        ret = gfs2_trans_begin(sdp, rblocks, 0);
+        if (ret)
+                goto out_trans_fail;
+        lock_page(page);
+        /* Refuse pages that lie beyond the last page of the file */
+        ret = -EINVAL;
+        last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
+        if (page->index > last_index)
+                goto out_unlock_page;
+        /* Page not uptodate or moved to another mapping: return 0 */
+        ret = 0;
+        if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
+                goto out_unlock_page;
+        if (gfs2_is_stuffed(ip)) {
+                ret = gfs2_unstuff_dinode(ip, page);
+                if (ret)
+                        goto out_unlock_page;
+        }
+        ret = gfs2_allocate_page_backing(page);
+out_unlock_page:
+        unlock_page(page);
+        gfs2_trans_end(sdp);
+out_trans_fail:
+        gfs2_inplace_release(ip);
+out_quota_unlock:
+        gfs2_quota_unlock(ip);
+out_alloc_put:
+        gfs2_alloc_put(ip);
+out_unlock:
+        gfs2_glock_dq(&gh);
+out:
+        gfs2_holder_uninit(&gh);
+        return ret;
+}
+static struct vm_operations_struct gfs2_vm_ops = { /* assigned to vma->vm_ops by gfs2_mmap() */
+        .fault = filemap_fault,
+        .page_mkwrite = gfs2_page_mkwrite, /* allocates on-disk backing before a shared page becomes writable */
+};
 /**
 * gfs2_mmap -
@@ -360,14 +447,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
                return error;
        }
-        /* This is VM_MAYWRITE instead of VM_WRITE because a call
+        vma->vm_ops = &gfs2_vm_ops;
-           to mprotect() can turn on VM_WRITE later. */
-        if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
-            (VM_MAYSHARE | VM_MAYWRITE))
-                vma->vm_ops = &gfs2_vm_ops_sharewrite;
-        else
-                vma->vm_ops = &gfs2_vm_ops_private;
        gfs2_glock_dq_uninit(&i_gh);
@@ -538,15 +618,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
        if (__mandatory_lock(&ip->i_inode))
                return -ENOLCK;
-        if (sdp->sd_args.ar_localflocks) {
-                if (IS_GETLK(cmd)) {
-                        posix_test_lock(file, fl);
-                        return 0;
-                } else {
-                        return posix_lock_file_wait(file, fl);
-                }
-        }
        if (cmd == F_CANCELLK) {
                /* Hack: */
                cmd = F_SETLK;
@@ -632,16 +703,12 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-        struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
        if (__mandatory_lock(&ip->i_inode))
                return -ENOLCK;
-        if (sdp->sd_args.ar_localflocks)
-                return flock_lock_file_wait(file, fl);
        if (fl->fl_type == F_UNLCK) {
                do_unflock(file, fl);
                return 0;
@@ -678,3 +745,27 @@ const struct file_operations gfs2_dir_fops = {
        .flock          = gfs2_flock,
 };
+const struct file_operations gfs2_file_fops_nolock = { /* file operations table with no .lock or .flock entry; NOTE(review): presumably chosen for mounts that use local file locks - confirm against the code that selects it */
+        .llseek         = gfs2_llseek,
+        .read           = do_sync_read,
+        .aio_read       = generic_file_aio_read,
+        .write          = do_sync_write,
+        .aio_write      = generic_file_aio_write,
+        .unlocked_ioctl = gfs2_ioctl,
+        .mmap           = gfs2_mmap,
+        .open           = gfs2_open,
+        .release        = gfs2_close,
+        .fsync          = gfs2_fsync,
+        .splice_read    = generic_file_splice_read,
+        .splice_write   = generic_file_splice_write,
+        .setlease       = gfs2_setlease,
+};
+const struct file_operations gfs2_dir_fops_nolock = { /* directory operations table; unlike gfs2_dir_fops it has no .flock entry */
+        .readdir        = gfs2_readdir,
+        .unlocked_ioctl = gfs2_ioctl,
+        .open           = gfs2_open,
+        .release        = gfs2_close,
+        .fsync          = gfs2_fsync,
+};
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
deleted file mode 100644
index 7e5d8ec9c846..000000000000
--- a/fs/gfs2/ops_file.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __OPS_FILE_DOT_H__
-#define __OPS_FILE_DOT_H__
-#include <linux/fs.h>
-struct gfs2_inode;
-extern struct file gfs2_internal_file_sentinel;
-extern int gfs2_internal_read(struct gfs2_inode *ip,
-                              struct file_ra_state *ra_state,
-                              char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_inode_flags(struct inode *inode);
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 17de58e83d92..43d511bba52d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -21,6 +21,7 @@
 #include "gfs2.h"
 #include "incore.h"
+#include "bmap.h"
 #include "daemon.h"
 #include "glock.h"
 #include "glops.h"
@@ -59,7 +60,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        mutex_init(&sdp->sd_inum_mutex);
        spin_lock_init(&sdp->sd_statfs_spin);
-        mutex_init(&sdp->sd_statfs_mutex);
        spin_lock_init(&sdp->sd_rindex_spin);
        mutex_init(&sdp->sd_rindex_mutex);
@@ -77,7 +77,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        spin_lock_init(&sdp->sd_log_lock);
-        INIT_LIST_HEAD(&sdp->sd_log_le_gl);
        INIT_LIST_HEAD(&sdp->sd_log_le_buf);
        INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
        INIT_LIST_HEAD(&sdp->sd_log_le_rg);
@@ -303,6 +302,67 @@ out:
        return error;
 }
+/**
+ * map_journal_extents - build the logical-to-physical extent list for
+ * this node's journal (sdp->sd_jdesc), so that the mapping can be reused
+ * when writing journal blocks.
+ * @sdp: The GFS2 superblock
+ *
+ * Every logical block of the journal inode is looked up in turn with
+ * gfs2_block_map().  A block that directly follows the previous disk
+ * block extends the current extent; any other block starts a new extent,
+ * which is appended to the journal descriptor's extent_list.  A journal
+ * laid out sequentially therefore ends up with a single extent, while
+ * one whose blocks are scattered ends up with several.
+ *
+ * TODO: This should be done in bigger chunks rather than one block at a time,
+ *       but since it's only done at mount time, I'm not worried about the
+ *       time it takes.
+ *
+ * Returns: 0 on success, the error from gfs2_block_map(), or -ENOMEM.
+ *          Extents added before a failure are left on the list.
+ */
+static int map_journal_extents(struct gfs2_sbd *sdp)
+{
+        struct gfs2_jdesc *jd = sdp->sd_jdesc;
+        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+        struct gfs2_journal_extent *ext = NULL; /* extent currently being grown */
+        struct buffer_head bh;
+        unsigned int lblk;              /* logical block */
+        u64 dblk;                       /* disk block backing lblk */
+        u64 last_dblk = 0;              /* disk block of the previous lblk */
+        int error = 0;
+        for (lblk = 0; lblk < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lblk++) {
+                /* Look up a single block, without allocating */
+                bh.b_state = 0;
+                bh.b_blocknr = 0;
+                bh.b_size = 1 << ip->i_inode.i_blkbits;
+                error = gfs2_block_map(jd->jd_inode, lblk, &bh, 0);
+                dblk = bh.b_blocknr;
+                if (error || !dblk) {
+                        printk(KERN_INFO "GFS2 journal mapping error %d: lb="
+                               "%u db=%llu\n", error, lblk, (unsigned long long)dblk);
+                        break;
+                }
+                /* Physically contiguous with the previous block: grow */
+                if (last_dblk && dblk == last_dblk + 1) {
+                        ext->blocks++;
+                        last_dblk = dblk;
+                        continue;
+                }
+                /* First block, or a discontinuity: open a new extent */
+                ext = kzalloc(sizeof(struct gfs2_journal_extent),
+                              GFP_KERNEL);
+                if (!ext) {
+                        printk(KERN_INFO "GFS2 error: out of memory "
+                               "mapping journal extents.\n");
+                        error = -ENOMEM;
+                        break;
+                }
+                ext->dblock = dblk;
+                ext->lblock = lblk;
+                ext->blocks = 1;
+                list_add_tail(&ext->extent_list, &jd->extent_list);
+                last_dblk = dblk;
+        }
+        return error;
+}
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
        struct gfs2_holder ji_gh;
@@ -340,7 +400,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
        if (sdp->sd_args.ar_spectator) {
                sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
-                sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+                atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
        } else {
                if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
                        fs_err(sdp, "can't mount journal #%u\n",
@@ -377,7 +437,10 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
                               sdp->sd_jdesc->jd_jid, error);
                        goto fail_jinode_gh;
                }
-                sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+                atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+                /* Map the extents for this journal's blocks */
+                map_journal_extents(sdp);
        }
        if (sdp->sd_lockstruct.ls_first) {
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 291f0c7eaa3b..9f71372c1757 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -61,7 +61,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
                inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
                if (!IS_ERR(inode)) {
                        gfs2_trans_end(sdp);
-                        if (dip->i_alloc.al_rgd)
+                        if (dip->i_alloc->al_rgd)
                                gfs2_inplace_release(dip);
                        gfs2_quota_unlock(dip);
                        gfs2_alloc_put(dip);
@@ -113,8 +113,18 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
        if (inode && IS_ERR(inode))
                return ERR_PTR(PTR_ERR(inode));
-        if (inode)
+        if (inode) {
+                struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+                struct gfs2_holder gh;
+                int error;
+                error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+                if (error) {
+                        iput(inode);
+                        return ERR_PTR(error);
+                }
+                gfs2_glock_dq_uninit(&gh);
                return d_splice_alias(inode, dentry);
+        }
        d_add(dentry, inode);
        return NULL;
@@ -366,7 +376,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
        }
        gfs2_trans_end(sdp);
-        if (dip->i_alloc.al_rgd)
+        if (dip->i_alloc->al_rgd)
                gfs2_inplace_release(dip);
        gfs2_quota_unlock(dip);
        gfs2_alloc_put(dip);
@@ -442,7 +452,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
        gfs2_trans_end(sdp);
-        if (dip->i_alloc.al_rgd)
+        if (dip->i_alloc->al_rgd)
                gfs2_inplace_release(dip);
        gfs2_quota_unlock(dip);
        gfs2_alloc_put(dip);
@@ -548,7 +558,7 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
        }
        gfs2_trans_end(sdp);
-        if (dip->i_alloc.al_rgd)
+        if (dip->i_alloc->al_rgd)
                gfs2_inplace_release(dip);
        gfs2_quota_unlock(dip);
        gfs2_alloc_put(dip);
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index 34f0caac1a03..fd8cee231e1d 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -16,5 +16,11 @@ extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
 extern const struct inode_operations gfs2_dev_iops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
+extern void gfs2_set_inode_flags(struct inode *inode);
 #endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 950f31460e8b..5e524217944a 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -487,7 +487,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
        if (ip) {
                ip->i_flags = 0;
                ip->i_gl = NULL;
-                ip->i_last_pfault = jiffies;
        }
        return &ip->i_inode;
 }
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
deleted file mode 100644
index 927d739d4685..000000000000
--- a/fs/gfs2/ops_vm.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-#include "gfs2.h"
-#include "incore.h"
-#include "bmap.h"
-#include "glock.h"
-#include "inode.h"
-#include "ops_vm.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-        struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
-        set_bit(GIF_PAGED, &ip->i_flags);
-        return filemap_fault(vma, vmf);
-}
-static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
-{
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        unsigned long index = page->index;
-        u64 lblock = index << (PAGE_CACHE_SHIFT -
-                                    sdp->sd_sb.sb_bsize_shift);
-        unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
-        struct gfs2_alloc *al;
-        unsigned int data_blocks, ind_blocks;
-        unsigned int x;
-        int error;
-        al = gfs2_alloc_get(ip);
-        error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-        if (error)
-                goto out;
-        error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-        if (error)
-                goto out_gunlock_q;
-        gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
-        al->al_requested = data_blocks + ind_blocks;
-        error = gfs2_inplace_reserve(ip);
-        if (error)
-                goto out_gunlock_q;
-        error = gfs2_trans_begin(sdp, al->al_rgd->rd_length +
-                                 ind_blocks + RES_DINODE +
-                                 RES_STATFS + RES_QUOTA, 0);
-        if (error)
-                goto out_ipres;
-        if (gfs2_is_stuffed(ip)) {
-                error = gfs2_unstuff_dinode(ip, NULL);
-                if (error)
-                        goto out_trans;
-        }
-        for (x = 0; x < blocks; ) {
-                u64 dblock;
-                unsigned int extlen;
-                int new = 1;
-                error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
-                if (error)
-                        goto out_trans;
-                lblock += extlen;
-                x += extlen;
-        }
-        gfs2_assert_warn(sdp, al->al_alloced);
-out_trans:
-        gfs2_trans_end(sdp);
-out_ipres:
-        gfs2_inplace_release(ip);
-out_gunlock_q:
-        gfs2_quota_unlock(ip);
-out:
-        gfs2_alloc_put(ip);
-        return error;
-}
-static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
-                                                struct vm_fault *vmf)
-{
-        struct file *file = vma->vm_file;
-        struct gfs2_file *gf = file->private_data;
-        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-        struct gfs2_holder i_gh;
-        int alloc_required;
-        int error;
-        int ret = 0;
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
-        if (error)
-                goto out;
-        set_bit(GIF_PAGED, &ip->i_flags);
-        set_bit(GIF_SW_PAGED, &ip->i_flags);
-        error = gfs2_write_alloc_required(ip,
-                                        (u64)vmf->pgoff << PAGE_CACHE_SHIFT,
-                                        PAGE_CACHE_SIZE, &alloc_required);
-        if (error) {
-                ret = VM_FAULT_OOM; /* XXX: are these right? */
-                goto out_unlock;
-        }
-        set_bit(GFF_EXLOCK, &gf->f_flags);
-        ret = filemap_fault(vma, vmf);
-        clear_bit(GFF_EXLOCK, &gf->f_flags);
-        if (ret & VM_FAULT_ERROR)
-                goto out_unlock;
-        if (alloc_required) {
-                /* XXX: do we need to drop page lock around alloc_page_backing?*/
-                error = alloc_page_backing(ip, vmf->page);
-                if (error) {
-                        /*
-                         * VM_FAULT_LOCKED should always be the case for
-                         * filemap_fault, but it may not be in a future
-                         * implementation.
-                         */
-                        if (ret & VM_FAULT_LOCKED)
-                                unlock_page(vmf->page);
-                        page_cache_release(vmf->page);
-                        ret = VM_FAULT_OOM;
-                        goto out_unlock;
-                }
-                set_page_dirty(vmf->page);
-        }
-out_unlock:
-        gfs2_glock_dq_uninit(&i_gh);
-out:
-        return ret;
-}
-struct vm_operations_struct gfs2_vm_ops_private = {
-        .fault = gfs2_private_fault,
-};
-struct vm_operations_struct gfs2_vm_ops_sharewrite = {
-        .fault = gfs2_sharewrite_fault,
-};
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
deleted file mode 100644
index 4ae8f43ed5e3..000000000000
--- a/fs/gfs2/ops_vm.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __OPS_VM_DOT_H__
-#define __OPS_VM_DOT_H__
-#include <linux/mm.h>
-extern struct vm_operations_struct gfs2_vm_ops_private;
-extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
-#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index addb51e0f135..a08dabd6ce90 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -59,7 +59,6 @@
 #include "super.h"
 #include "trans.h"
 #include "inode.h"
-#include "ops_file.h"
 #include "ops_address.h"
 #include "util.h"
@@ -274,10 +273,10 @@ static int bh_get(struct gfs2_quota_data *qd)
        }
        block = qd->qd_slot / sdp->sd_qc_per_block;
-        offset = qd->qd_slot % sdp->sd_qc_per_block;;
+        offset = qd->qd_slot % sdp->sd_qc_per_block;
        bh_map.b_size = 1 << ip->i_inode.i_blkbits;
-        error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map);
+        error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
        if (error)
                goto fail;
        error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
@@ -454,7 +453,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
 int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_quota_data **qd = al->al_qd;
        int error;
@@ -502,7 +501,7 @@ out:
 void gfs2_quota_unhold(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        unsigned int x;
        gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
@@ -646,7 +645,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        }
        if (!buffer_mapped(bh)) {
-                gfs2_get_block(inode, iblock, bh, 1);
+                gfs2_block_map(inode, iblock, bh, 1);
                if (!buffer_mapped(bh))
                        goto unlock;
        }
@@ -793,11 +792,9 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
        struct gfs2_holder i_gh;
        struct gfs2_quota_host q;
        char buf[sizeof(struct gfs2_quota)];
-        struct file_ra_state ra_state;
        int error;
        struct gfs2_quota_lvb *qlvb;
-        file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
 restart:
        error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
        if (error)
@@ -820,8 +817,8 @@ restart:
                memset(buf, 0, sizeof(struct gfs2_quota));
                pos = qd2offset(qd);
-                error = gfs2_internal_read(ip, &ra_state, buf,
+                error = gfs2_internal_read(ip, NULL, buf, &pos,
-                                           &pos, sizeof(struct gfs2_quota));
+                                           sizeof(struct gfs2_quota));
                if (error < 0)
                        goto fail_gunlock;
@@ -856,7 +853,7 @@ fail:
 int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        unsigned int x;
        int error = 0;
@@ -924,7 +921,7 @@ static int need_sync(struct gfs2_quota_data *qd)
 void gfs2_quota_unlock(struct gfs2_inode *ip)
 {
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_quota_data *qda[4];
        unsigned int count = 0;
        unsigned int x;
@@ -972,7 +969,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
 int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_quota_data *qd;
        s64 value;
        unsigned int x;
@@ -1016,10 +1013,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
                       u32 uid, u32 gid)
 {
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_quota_data *qd;
        unsigned int x;
-        unsigned int found = 0;
        if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
                return;
@@ -1032,7 +1028,6 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
                if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
                    (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
                        do_qc(qd, change);
-                        found++;
                }
        }
 }
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index beb6c7ac0086..b249e294a95b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -391,7 +391,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
        lblock = head->lh_blkno;
        gfs2_replay_incr_blk(sdp, &lblock);
        bh_map.b_size = 1 << ip->i_inode.i_blkbits;
-        error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map);
+        error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0);
        if (error)
                return error;
        if (!bh_map.b_blocknr) {
@@ -504,13 +504,21 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
                        if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
                                ro = 1;
                } else {
-                        if (sdp->sd_vfs->s_flags & MS_RDONLY)
+                        if (sdp->sd_vfs->s_flags & MS_RDONLY) {
-                                ro = 1;
+                                /* check if device itself is read-only */
+                                ro = bdev_read_only(sdp->sd_vfs->s_bdev);
+                                if (!ro) {
+                                        fs_info(sdp, "recovery required on "
+                                                "read-only filesystem.\n");
+                                        fs_info(sdp, "write access will be "
+                                                "enabled during recovery.\n");
+                                }
+                        }
                }
                if (ro) {
-                        fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
+                        fs_warn(sdp, "jid=%u: Can't replay: read-only block "
-                                jd->jd_jid);
+                                "device\n", jd->jd_jid);
                        error = -EROFS;
                        goto fail_gunlock_tr;
                }
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 708c287e1d0e..3552110b2e5f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -25,10 +25,10 @@
 #include "rgrp.h"
 #include "super.h"
 #include "trans.h"
-#include "ops_file.h"
 #include "util.h"
 #include "log.h"
 #include "inode.h"
+#include "ops_address.h"
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
@@ -126,41 +126,43 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
 * Return: the block number (bitmap buffer scope) that was found
 */
-static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
-                            unsigned int buflen, u32 goal,
+                       unsigned char old_state)
-                            unsigned char old_state)
 {
-        unsigned char *byte, *end, alloc;
+        unsigned char *byte;
        u32 blk = goal;
-        unsigned int bit;
+        unsigned int bit, bitlong;
+        unsigned long *plong, plong55;
        byte = buffer + (goal / GFS2_NBBY);
+        plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
        bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-        end = buffer + buflen;
+        bitlong = bit;
-        alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0;
+#if BITS_PER_LONG == 32
+        plong55 = 0x55555555;
-        while (byte < end) {
+#else
-                /* If we're looking for a free block we can eliminate all
+        plong55 = 0x5555555555555555;
-                   bitmap settings with 0x55, which represents four data
+#endif
-                   blocks in a row.  If we're looking for a data block, we can
+        while (byte < buffer + buflen) {
-                   eliminate 0x00 which corresponds to four free blocks. */
-                if ((*byte & 0x55) == alloc) {
+                if (bitlong == 0 && old_state == 0 && *plong == plong55) {
-                        blk += (8 - bit) >> 1;
+                        plong++;
+                        byte += sizeof(unsigned long);
-                        bit = 0;
+                        blk += sizeof(unsigned long) * GFS2_NBBY;
-                        byte++;
                        continue;
                }
                if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
                        return blk;
                bit += GFS2_BIT_SIZE;
                if (bit >= 8) {
                        bit = 0;
                        byte++;
                }
+                bitlong += GFS2_BIT_SIZE;
+                if (bitlong >= sizeof(unsigned long) * 8) {
+                        bitlong = 0;
+                        plong++;
+                }
                blk++;
        }
@@ -817,11 +819,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
 {
-        struct gfs2_alloc *al = &ip->i_alloc;
+        BUG_ON(ip->i_alloc != NULL);
+        ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL);
-        /* FIXME: Should assert that the correct locks are held here... */
+        return ip->i_alloc;
-        memset(al, 0, sizeof(*al));
-        return al;
 }
 /**
@@ -1059,26 +1059,34 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
        struct inode *inode = NULL;
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd, *begin = NULL;
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        int flags = LM_FLAG_TRY;
        int skipped = 0;
        int loops = 0;
-        int error;
+        int error, rg_locked;
        /* Try recently successful rgrps */
        rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
        while (rgd) {
-                error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+                rg_locked = 0;
-                                           LM_FLAG_TRY, &al->al_rgd_gh);
+                if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+                        rg_locked = 1;
+                        error = 0;
+                } else {
+                        error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+                                                   LM_FLAG_TRY, &al->al_rgd_gh);
+                }
                switch (error) {
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
                        if (rgd->rd_flags & GFS2_RDF_CHECK)
                                inode = try_rgrp_unlink(rgd, last_unlinked);
-                        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+                        if (!rg_locked)
+                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
                                return inode;
                        rgd = recent_rgrp_next(rgd, 1);
@@ -1098,15 +1106,23 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
        begin = rgd = forward_rgrp_get(sdp);
        for (;;) {
-                error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
+                rg_locked = 0;
-                                          &al->al_rgd_gh);
+                if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+                        rg_locked = 1;
+                        error = 0;
+                } else {
+                        error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
+                                                   &al->al_rgd_gh);
+                }
                switch (error) {
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
                        if (rgd->rd_flags & GFS2_RDF_CHECK)
                                inode = try_rgrp_unlink(rgd, last_unlinked);
-                        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+                        if (!rg_locked)
+                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
                                return inode;
                        break;
@@ -1158,7 +1174,7 @@ out:
 int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct inode *inode;
        int error = 0;
        u64 last_unlinked = NO_BLOCK;
@@ -1204,7 +1220,7 @@ try_again:
 void gfs2_inplace_release(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
                fs_warn(sdp, "al_alloced = %u, al_requested = %u "
@@ -1213,7 +1229,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
                             al->al_line);
        al->al_rgd = NULL;
-        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+        if (al->al_rgd_gh.gh_gl)
+                gfs2_glock_dq_uninit(&al->al_rgd_gh);
        if (ip != GFS2_I(sdp->sd_rindex))
                gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
@@ -1301,11 +1318,10 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
                /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
                   bitmaps, so we must search the originals for that. */
                if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
-                        blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
+                        blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
                                          bi->bi_len, goal, old_state);
                else
-                        blk = gfs2_bitfit(rgd,
+                        blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
-                                          bi->bi_bh->b_data + bi->bi_offset,
                                          bi->bi_len, goal, old_state);
                if (blk != BFITNOENT)
                        break;
@@ -1394,7 +1410,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 u64 gfs2_alloc_data(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_rgrpd *rgd = al->al_rgd;
        u32 goal, blk;
        u64 block;
@@ -1439,7 +1455,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_alloc *al = ip->i_alloc;
        struct gfs2_rgrpd *rgd = al->al_rgd;
        u32 goal, blk;
        u64 block;
@@ -1485,7 +1501,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-        struct gfs2_alloc *al = &dip->i_alloc;
+        struct gfs2_alloc *al = dip->i_alloc;
        struct gfs2_rgrpd *rgd = al->al_rgd;
        u32 blk;
        u64 block;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4c6adfc6f2e..149bb161f4b6 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -32,7 +32,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
 struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 {
-        return; /* So we can see where ip->i_alloc is used */
+        BUG_ON(ip->i_alloc == NULL);
+        kfree(ip->i_alloc);
+        ip->i_alloc = NULL;
 }
 int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index dd3e737f528e..ef0562c3bc71 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -51,13 +51,9 @@ void gfs2_tune_init(struct gfs2_tune *gt)
 {
        spin_lock_init(&gt->gt_spin);
-        gt->gt_ilimit = 100;
-        gt->gt_ilimit_tries = 3;
-        gt->gt_ilimit_min = 1;
        gt->gt_demote_secs = 300;
        gt->gt_incore_log_blocks = 1024;
        gt->gt_log_flush_secs = 60;
-        gt->gt_jindex_refresh_secs = 60;
        gt->gt_recoverd_secs = 60;
        gt->gt_logd_secs = 1;
        gt->gt_quotad_secs = 5;
@@ -71,10 +67,8 @@ void gfs2_tune_init(struct gfs2_tune *gt)
        gt->gt_new_files_jdata = 0;
        gt->gt_new_files_directio = 0;
        gt->gt_max_readahead = 1 << 18;
-        gt->gt_lockdump_size = 131072;
        gt->gt_stall_secs = 600;
        gt->gt_complain_secs = 10;
-        gt->gt_reclaim_limit = 5000;
        gt->gt_statfs_quantum = 30;
        gt->gt_statfs_slow = 0;
 }
@@ -393,6 +387,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
                if (!jd)
                        break;
+                INIT_LIST_HEAD(&jd->extent_list);
                jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
                if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
                        if (!jd->jd_inode)
@@ -422,8 +417,9 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 void gfs2_jindex_free(struct gfs2_sbd *sdp)
 {
-        struct list_head list;
+        struct list_head list, *head;
        struct gfs2_jdesc *jd;
+        struct gfs2_journal_extent *jext;
        spin_lock(&sdp->sd_jindex_spin);
        list_add(&list, &sdp->sd_jindex_list);
@@ -433,6 +429,14 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
        while (!list_empty(&list)) {
                jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+                head = &jd->extent_list;
+                while (!list_empty(head)) {
+                        jext = list_entry(head->next,
+                                          struct gfs2_journal_extent,
+                                          extent_list);
+                        list_del(&jext->extent_list);
+                        kfree(jext);
+                }
                list_del(&jd->jd_list);
                iput(jd->jd_inode);
                kfree(jd);
@@ -543,7 +547,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
        if (error)
                return error;
-        gfs2_meta_cache_flush(ip);
        j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
        error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -686,9 +689,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
        if (error)
                return;
-        mutex_lock(&sdp->sd_statfs_mutex);
        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
-        mutex_unlock(&sdp->sd_statfs_mutex);
        spin_lock(&sdp->sd_statfs_spin);
        l_sc->sc_total += total;
@@ -736,9 +737,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
        if (error)
                goto out_bh2;
-        mutex_lock(&sdp->sd_statfs_mutex);
        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
-        mutex_unlock(&sdp->sd_statfs_mutex);
        spin_lock(&sdp->sd_statfs_spin);
        m_sc->sc_total += l_sc->sc_total;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 06e0b7768d97..eaa3b7b2f99e 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -32,7 +32,8 @@ spinlock_t gfs2_sys_margs_lock;
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
-        return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
+        return snprintf(buf, PAGE_SIZE, "%u:%u\n",
+                        MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev));
 }
 static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
@@ -221,9 +222,7 @@ static struct kobj_type gfs2_ktype = {
        .sysfs_ops     = &gfs2_attr_ops,
 };
-static struct kset gfs2_kset = {
+static struct kset *gfs2_kset;
-        .ktype  = &gfs2_ktype,
-};
 /*
 * display struct lm_lockstruct fields
@@ -427,13 +426,11 @@ TUNE_ATTR_2(name, name##_store)
 TUNE_ATTR(demote_secs, 0);
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
-TUNE_ATTR(jindex_refresh_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
 TUNE_ATTR(atime_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
-TUNE_ATTR(reclaim_limit, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(new_files_directio, 0);
@@ -450,13 +447,11 @@ static struct attribute *tune_attrs[] = {
        &tune_attr_demote_secs.attr,
        &tune_attr_incore_log_blocks.attr,
        &tune_attr_log_flush_secs.attr,
-        &tune_attr_jindex_refresh_secs.attr,
        &tune_attr_quota_warn_period.attr,
        &tune_attr_quota_quantum.attr,
        &tune_attr_atime_quantum.attr,
        &tune_attr_max_readahead.attr,
        &tune_attr_complain_secs.attr,
-        &tune_attr_reclaim_limit.attr,
        &tune_attr_statfs_slow.attr,
        &tune_attr_quota_simul_sync.attr,
        &tune_attr_quota_cache_secs.attr,
@@ -495,14 +490,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 {
        int error;
-        sdp->sd_kobj.kset = &gfs2_kset;
+        sdp->sd_kobj.kset = gfs2_kset;
-        sdp->sd_kobj.ktype = &gfs2_ktype;
+        error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
+                                     "%s", sdp->sd_table_name);
-        error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
-        if (error)
-                goto fail;
-        error = kobject_register(&sdp->sd_kobj);
        if (error)
                goto fail;
@@ -522,6 +512,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
        if (error)
                goto fail_args;
+        kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
        return 0;
 fail_args:
@@ -531,7 +522,7 @@ fail_counters:
 fail_lockstruct:
        sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
-        kobject_unregister(&sdp->sd_kobj);
+        kobject_put(&sdp->sd_kobj);
 fail:
        fs_err(sdp, "error %d adding sysfs files", error);
        return error;
@@ -543,21 +534,22 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
        sysfs_remove_group(&sdp->sd_kobj, &args_group);
        sysfs_remove_group(&sdp->sd_kobj, &counters_group);
        sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
-        kobject_unregister(&sdp->sd_kobj);
+        kobject_put(&sdp->sd_kobj);
 }
 int gfs2_sys_init(void)
 {
        gfs2_sys_margs = NULL;
        spin_lock_init(&gfs2_sys_margs_lock);
-        kobject_set_name(&gfs2_kset.kobj, "gfs2");
+        gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
-        kobj_set_kset_s(&gfs2_kset, fs_subsys);
+        if (!gfs2_kset)
-        return kset_register(&gfs2_kset);
+                return -ENOMEM;
+        return 0;
 }
 void gfs2_sys_uninit(void)
 {
        kfree(gfs2_sys_margs);
-        kset_unregister(&gfs2_kset);
+        kset_unregister(gfs2_kset);
 }
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 717983e2c2ae..73e5d92a657c 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -114,11 +114,6 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
                gfs2_log_flush(sdp, NULL);
 }
-void gfs2_trans_add_gl(struct gfs2_glock *gl)
-{
-        lops_add(gl->gl_sbd, &gl->gl_le);
-}
 /**
 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
 * @gl: the glock the buffer belongs to
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 043d5f4b9c4c..e826f0dab80a 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -30,7 +30,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 void gfs2_trans_end(struct gfs2_sbd *sdp);
-void gfs2_trans_add_gl(struct gfs2_glock *gl);
 void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
 void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index f13f1494d4fe..f8452a0eab56 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
                rec = (e + b) / 2;
                len = hfs_brec_lenoff(bnode, rec, &off);
                keylen = hfs_brec_keylen(bnode, rec);
+                if (keylen == HFS_BAD_KEYLEN) {
+                        res = -EINVAL;
+                        goto done;
+                }
                hfs_bnode_read(bnode, fd->key, off, keylen);
                cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
                if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
        if (rec != e && e >= 0) {
                len = hfs_brec_lenoff(bnode, e, &off);
                keylen = hfs_brec_keylen(bnode, e);
+                if (keylen == HFS_BAD_KEYLEN) {
+                        res = -EINVAL;
+                        goto done;
+                }
                hfs_bnode_read(bnode, fd->key, off, keylen);
        }
 done:
@@ -198,6 +206,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
        len = hfs_brec_lenoff(bnode, fd->record, &off);
        keylen = hfs_brec_keylen(bnode, fd->record);
+        if (keylen == HFS_BAD_KEYLEN) {
+                res = -EINVAL;
+                goto out;
+        }
        fd->keyoffset = off;
        fd->keylength = keylen;
        fd->entryoffset = off + keylen;
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 5c87cf4801fc..8626ee375ea8 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -44,10 +44,21 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
                recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
                if (!recoff)
                        return 0;
-                if (node->tree->attributes & HFS_TREE_BIGKEYS)
+                if (node->tree->attributes & HFS_TREE_BIGKEYS) {
                        retval = hfs_bnode_read_u16(node, recoff) + 2;
-                else
+                        if (retval > node->tree->max_key_len + 2) {
+                                printk(KERN_ERR "hfs: keylen %d too large\n",
+                                        retval);
+                                retval = HFS_BAD_KEYLEN;
+                        }
+                } else {
                        retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+                        if (retval > node->tree->max_key_len + 1) {
+                                printk(KERN_ERR "hfs: keylen %d too large\n",
+                                        retval);
+                                retval = HFS_BAD_KEYLEN;
+                        }
+                }
        }
        return retval;
 }
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 8a3a650abc87..110dd3515dc8 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -61,7 +61,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
        mapping = tree->inode->i_mapping;
        page = read_mapping_page(mapping, 0, NULL);
        if (IS_ERR(page))
-                goto free_tree;
+                goto free_inode;
        /* Load the header */
        head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -81,6 +81,17 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
                goto fail_page;
        if (!tree->node_count)
                goto fail_page;
+        if ((id == HFS_EXT_CNID) && (tree->max_key_len != HFS_MAX_EXT_KEYLEN)) {
+                printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+                        tree->max_key_len);
+                goto fail_page;
+        }
+        if ((id == HFS_CAT_CNID) && (tree->max_key_len != HFS_MAX_CAT_KEYLEN)) {
+                printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+                        tree->max_key_len);
+                goto fail_page;
+        }
        tree->node_size_shift = ffs(size) - 1;
        tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -88,11 +99,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
        page_cache_release(page);
        return tree;
- fail_page:
+fail_page:
-        tree->inode->i_mapping->a_ops = &hfs_aops;
        page_cache_release(page);
- free_tree:
+free_inode:
+        tree->inode->i_mapping->a_ops = &hfs_aops;
        iput(tree->inode);
+free_tree:
        kfree(tree);
        return NULL;
 }
diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h
index 1445e3a56ed4..c6aae61adfe6 100644
--- a/fs/hfs/hfs.h
+++ b/fs/hfs/hfs.h
@@ -28,6 +28,8 @@
 #define HFS_MAX_NAMELEN         128
 #define HFS_MAX_VALENCE         32767U
+#define HFS_BAD_KEYLEN          0xFF
 /* Meanings of the drAtrb field of the MDB,
 * Reference: _Inside Macintosh: Files_ p. 2-61
 */
@@ -167,6 +169,9 @@ typedef union hfs_btree_key {
        struct hfs_ext_key ext;
 } hfs_btree_key;
+#define HFS_MAX_CAT_KEYLEN      (sizeof(struct hfs_cat_key) - sizeof(u8))
+#define HFS_MAX_EXT_KEYLEN      (sizeof(struct hfs_ext_key) - sizeof(u8))
 typedef union hfs_btree_key btree_key;
 struct hfs_extent {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 12aca8ed605f..09ee07f02663 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -364,7 +364,6 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
                        ++next;
                        truncate_huge_page(page);
                        unlock_page(page);
-                        hugetlb_put_quota(mapping);
                        freed++;
                }
                huge_pagevec_release(&pvec);
@@ -859,15 +858,15 @@ out_free:
        return -ENOMEM;
 }
-int hugetlb_get_quota(struct address_space *mapping)
+int hugetlb_get_quota(struct address_space *mapping, long delta)
 {
        int ret = 0;
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
        if (sbinfo->free_blocks > -1) {
                spin_lock(&sbinfo->stat_lock);
-                if (sbinfo->free_blocks > 0)
+                if (sbinfo->free_blocks - delta >= 0)
-                        sbinfo->free_blocks--;
+                        sbinfo->free_blocks -= delta;
                else
                        ret = -ENOMEM;
                spin_unlock(&sbinfo->stat_lock);
@@ -876,13 +875,13 @@ int hugetlb_get_quota(struct address_space *mapping)
        return ret;
 }
-void hugetlb_put_quota(struct address_space *mapping)
+void hugetlb_put_quota(struct address_space *mapping, long delta)
 {
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
        if (sbinfo->free_blocks > -1) {
                spin_lock(&sbinfo->stat_lock);
-                sbinfo->free_blocks++;
+                sbinfo->free_blocks += delta;
                spin_unlock(&sbinfo->stat_lock);
        }
 }
diff --git a/fs/inode.c b/fs/inode.c
index ed35383d0b6c..276ffd6b6fdd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1276,6 +1276,11 @@ void file_update_time(struct file *file)
                sync_it = 1;
        }
+        if (IS_I_VERSION(inode)) {
+                inode_inc_iversion(inode);
+                sync_it = 1;
+        }
        if (sync_it)
                mark_inode_dirty_sync(inode);
 }
diff --git a/fs/ioprio.c b/fs/ioprio.c
index d6ff77e8e7ec..c4a1c3c65aac 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -41,18 +41,28 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
                return err;
        task_lock(task);
+        do {
+                ioc = task->io_context;
+                /* see wmb() in current_io_context() */
+                smp_read_barrier_depends();
+                if (ioc)
+                        break;
-        task->ioprio = ioprio;
+                ioc = alloc_io_context(GFP_ATOMIC, -1);
+                if (!ioc) {
-        ioc = task->io_context;
+                        err = -ENOMEM;
-        /* see wmb() in current_io_context() */
+                        break;
-        smp_read_barrier_depends();
+                }
+                task->io_context = ioc;
+        } while (1);
-        if (ioc)
+        if (!err) {
+                ioc->ioprio = ioprio;
                ioc->ioprio_changed = 1;
+        }
        task_unlock(task);
-        return 0;
+        return err;
 }
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
@@ -75,8 +85,10 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
                        break;
                case IOPRIO_CLASS_IDLE:
-                        if (!capable(CAP_SYS_ADMIN))
+                        break;
-                                return -EPERM;
+                case IOPRIO_CLASS_NONE:
+                        if (data)
+                                return -EINVAL;
                        break;
                default:
                        return -EINVAL;
@@ -144,7 +156,9 @@ static int get_task_ioprio(struct task_struct *p)
        ret = security_task_getioprio(p);
        if (ret)
                goto out;
-        ret = p->ioprio;
+        ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+        if (p->io_context)
+                ret = p->io_context->ioprio;
 out:
        return ret;
 }
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 47552d4a6324..a5432bbbfb88 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -347,7 +347,8 @@ restart:
                                break;
                        }
                        retry = __process_buffer(journal, jh, bhs,&batch_count);
-                        if (!retry && lock_need_resched(&journal->j_list_lock)){
+                        if (!retry && (need_resched() ||
+                                spin_needbreak(&journal->j_list_lock))) {
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                                break;
@@ -602,15 +603,15 @@ int __journal_remove_checkpoint(struct journal_head *jh)
        /*
         * There is one special case to worry about: if we have just pulled the
-         * buffer off a committing transaction's forget list, then even if the
+         * buffer off a running or committing transaction's checkpoint list,
-         * checkpoint list is empty, the transaction obviously cannot be
+         * then even if the checkpoint list is empty, the transaction obviously
-         * dropped!
+         * cannot be dropped!
         *
-         * The locking here around j_committing_transaction is a bit sleazy.
+         * The locking here around t_state is a bit sleazy.
         * See the comment at the end of journal_commit_transaction().
         */
-        if (transaction == journal->j_committing_transaction) {
+        if (transaction->t_state != T_FINISHED) {
-                JBUFFER_TRACE(jh, "belongs to committing transaction");
+                JBUFFER_TRACE(jh, "belongs to running/committing transaction");
                goto out;
        }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 8f1f2aa5fb39..31853eb65b4c 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -265,7 +265,7 @@ write_out_data:
                        put_bh(bh);
                }
-                if (lock_need_resched(&journal->j_list_lock)) {
+                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
@@ -858,10 +858,10 @@ restart_loop:
        }
        spin_unlock(&journal->j_list_lock);
        /*
-         * This is a bit sleazy.  We borrow j_list_lock to protect
+         * This is a bit sleazy.  We use j_list_lock to protect transition
-         * journal->j_committing_transaction in __journal_remove_checkpoint.
+         * of a transaction into T_FINISHED state and calling
-         * Really, __journal_remove_checkpoint should be using j_state_lock but
+         * __journal_drop_transaction(). Otherwise we could race with
-         * it's a bit hassle to hold that across __journal_remove_checkpoint
+         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 08ff6c7028cc..038ed7436199 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -288,10 +288,12 @@ handle_t *journal_start(journal_t *journal, int nblocks)
                jbd_free_handle(handle);
                current->journal_info = NULL;
                handle = ERR_PTR(err);
+                goto out;
        }
        lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+out:
        return handle;
 }
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 3fccde7ba008..6914598022ce 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
-                        struct buffer_head **bhs, int *batch_count)
+                        struct buffer_head **bhs, int *batch_count,
+                        transaction_t *transaction)
 {
        struct buffer_head *bh = jh2bh(jh);
        int ret = 0;
@@ -250,6 +251,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                transaction_t *t = jh->b_transaction;
                tid_t tid = t->t_tid;
+                transaction->t_chp_stats.cs_forced_to_close++;
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                jbd2_log_start_commit(journal, tid);
@@ -279,6 +281,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                bhs[*batch_count] = bh;
                __buffer_relink_io(jh);
                jbd_unlock_bh_state(bh);
+                transaction->t_chp_stats.cs_written++;
                (*batch_count)++;
                if (*batch_count == NR_BATCH) {
                        spin_unlock(&journal->j_list_lock);
@@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
        if (!journal->j_checkpoint_transactions)
                goto out;
        transaction = journal->j_checkpoint_transactions;
+        if (transaction->t_chp_stats.cs_chp_time == 0)
+                transaction->t_chp_stats.cs_chp_time = jiffies;
        this_tid = transaction->t_tid;
 restart:
        /*
@@ -346,8 +351,10 @@ restart:
                                retry = 1;
                                break;
                        }
-                        retry = __process_buffer(journal, jh, bhs,&batch_count);
+                        retry = __process_buffer(journal, jh, bhs, &batch_count,
-                        if (!retry && lock_need_resched(&journal->j_list_lock)){
+                                                 transaction);
+                        if (!retry && (need_resched() ||
+                                spin_needbreak(&journal->j_list_lock))) {
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                                break;
@@ -602,15 +609,15 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
        /*
         * There is one special case to worry about: if we have just pulled the
-         * buffer off a committing transaction's forget list, then even if the
+         * buffer off a running or committing transaction's checkpoint list,
-         * checkpoint list is empty, the transaction obviously cannot be
+         * then even if the checkpoint list is empty, the transaction obviously
-         * dropped!
+         * cannot be dropped!
         *
-         * The locking here around j_committing_transaction is a bit sleazy.
+         * The locking here around t_state is a bit sleazy.
         * See the comment at the end of jbd2_journal_commit_transaction().
         */
-        if (transaction == journal->j_committing_transaction) {
+        if (transaction->t_state != T_FINISHED) {
-                JBUFFER_TRACE(jh, "belongs to committing transaction");
+                JBUFFER_TRACE(jh, "belongs to running/committing transaction");
                goto out;
        }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6986f334c643..4f302d279279 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -20,6 +20,8 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/jiffies.h>
+#include <linux/crc32.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -92,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh)
        return 1;
 }
-/* Done it all: now write the commit record.  We should have
+/*
+ * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
-static int journal_write_commit_record(journal_t *journal,
+static int journal_submit_commit_record(journal_t *journal,
-                                        transaction_t *commit_transaction)
+                                        transaction_t *commit_transaction,
+                                        struct buffer_head **cbh,
+                                        __u32 crc32_sum)
 {
        struct journal_head *descriptor;
+        struct commit_header *tmp;
        struct buffer_head *bh;
-        int i, ret;
+        int ret;
        int barrier_done = 0;
        if (is_journal_aborted(journal))
@@ -116,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal,
        bh = jh2bh(descriptor);
-        /* AKPM: buglet - add `i' to tmp! */
+        tmp = (struct commit_header *)bh->b_data;
-        for (i = 0; i < bh->b_size; i += 512) {
+        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
-                journal_header_t *tmp = (journal_header_t*)bh->b_data;
+        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
-                tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-                tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
-                tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+        if (JBD2_HAS_COMPAT_FEATURE(journal,
+                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+                tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
+                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
+                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
        }
-        JBUFFER_TRACE(descriptor, "write commit block");
+        JBUFFER_TRACE(descriptor, "submit commit block");
+        lock_buffer(bh);
        set_buffer_dirty(bh);
-        if (journal->j_flags & JBD2_BARRIER) {
+        set_buffer_uptodate(bh);
+        bh->b_end_io = journal_end_buffer_io_sync;
+        if (journal->j_flags & JBD2_BARRIER &&
+                !JBD2_HAS_COMPAT_FEATURE(journal,
+                                         JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
-        ret = sync_dirty_buffer(bh);
+        ret = submit_bh(WRITE, bh);
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
@@ -151,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal,
                clear_buffer_ordered(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
-                ret = sync_dirty_buffer(bh);
+                ret = submit_bh(WRITE, bh);
        }
-        put_bh(bh);             /* One for getblk() */
+        *cbh = bh;
-        jbd2_journal_put_journal_head(descriptor);
+        return ret;
+}
+/*
+ * Second half of the asynchronous commit record write; the first half is
+ * journal_submit_commit_record(), which submits the buffer.
+ *
+ * @bh: the commit block buffer to wait on.
+ *
+ * Clears the buffer's dirty bit, waits on the buffer, and then drops one
+ * buffer reference and the journal head reference.
+ *
+ * Returns 0 on success, or -EIO if the buffer is not uptodate after the wait.
+ */
+static int journal_wait_on_commit_record(struct buffer_head *bh)
+{
+        int ret = 0;
+        clear_buffer_dirty(bh);
+        wait_on_buffer(bh);
+        if (unlikely(!buffer_uptodate(bh)))
+                ret = -EIO;
+        put_bh(bh);            /* One for getblk() */
+        jbd2_journal_put_journal_head(bh2jh(bh));
-        return (ret == -EIO);
+        return ret;
 }
+/*
+ * Wait for all submitted IO to complete.
+ *
+ * Drains commit_transaction->t_locked_list: each buffer is waited on if it
+ * is still locked, and is then unfiled if it is still a journalled buffer
+ * on the BJ_Locked list.
+ *
+ * The callers visible in this patch take journal->j_list_lock before the
+ * call and release it afterwards; the lock is dropped and retaken here
+ * around wait_on_buffer().
+ *
+ * Returns 0, or -EIO if any waited-on buffer was not uptodate afterwards.
+ */
+static int journal_wait_on_locked_list(journal_t *journal,
+                                       transaction_t *commit_transaction)
+{
+        int ret = 0;
+        struct journal_head *jh;
+        while (commit_transaction->t_locked_list) {
+                struct buffer_head *bh;
+                jh = commit_transaction->t_locked_list->b_tprev;
+                bh = jh2bh(jh);
+                get_bh(bh);
+                if (buffer_locked(bh)) {
+                        spin_unlock(&journal->j_list_lock);
+                        wait_on_buffer(bh);
+                        if (unlikely(!buffer_uptodate(bh)))
+                                ret = -EIO;
+                        spin_lock(&journal->j_list_lock);
+                }
+                if (!inverted_lock(journal, bh)) {
+                        put_bh(bh);
+                        spin_lock(&journal->j_list_lock);       /* presumably dropped by inverted_lock() on failure - confirm */
+                        continue;
+                }
+                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+                        __jbd2_journal_unfile_buffer(jh);
+                        jbd_unlock_bh_state(bh);
+                        jbd2_journal_remove_journal_head(bh);
+                        put_bh(bh);
+                } else {
+                        jbd_unlock_bh_state(bh);
+                }
+                put_bh(bh);     /* pairs with the get_bh() at the top of the loop */
+                cond_resched_lock(&journal->j_list_lock);
+        }
+        return ret;
+  }
 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 {
        int i;
@@ -265,7 +341,7 @@ write_out_data:
                        put_bh(bh);
                }
-                if (lock_need_resched(&journal->j_list_lock)) {
+                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
@@ -274,7 +350,21 @@ write_out_data:
        journal_do_submit_data(wbuf, bufs);
 }
-static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
+static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
+{       /*
+         * Fold the data of one buffer into a running CRC.
+         *
+         * @crc32_sum: checksum accumulated so far, used as the crc32_be() seed.
+         * @bh: buffer whose b_size bytes are checksummed.
+         *
+         * Returns the updated checksum.
+         */
+        struct page *page = bh->b_page;
+        char *addr;
+        __u32 checksum;
+        addr = kmap_atomic(page, KM_USER0);     /* map the page holding bh's data */
+        checksum = crc32_be(crc32_sum,
+                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);       /* start at bh's offset inside the page */
+        kunmap_atomic(addr, KM_USER0);
+        return checksum;
+}
+static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                                   unsigned long long block)
 {
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
@@ -290,6 +380,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 */
 void jbd2_journal_commit_transaction(journal_t *journal)
 {
+        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
@@ -305,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);
+        struct buffer_head *cbh = NULL; /* For transactional checksums */
+        __u32 crc32_sum = ~0;
        /*
         * First job: lock down the current transaction and wait for
@@ -337,6 +430,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;
+        stats.u.run.rs_wait = commit_transaction->t_max_wait;
+        stats.u.run.rs_locked = jiffies;
+        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
+                                                stats.u.run.rs_locked);
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);
@@ -407,6 +505,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         */
        jbd2_journal_switch_revoke_table(journal);
+        stats.u.run.rs_flushing = jiffies;
+        stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
+                                               stats.u.run.rs_flushing);
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
@@ -440,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        journal_submit_data_buffers(journal, commit_transaction);
        /*
-         * Wait for all previously submitted IO to complete.
+         * Wait for all previously submitted IO to complete if commit
+         * record is to be written synchronously.
         */
        spin_lock(&journal->j_list_lock);
-        while (commit_transaction->t_locked_list) {
+        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-                struct buffer_head *bh;
+                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+                err = journal_wait_on_locked_list(journal,
+                                                commit_transaction);
-                jh = commit_transaction->t_locked_list->b_tprev;
-                bh = jh2bh(jh);
-                get_bh(bh);
-                if (buffer_locked(bh)) {
-                        spin_unlock(&journal->j_list_lock);
-                        wait_on_buffer(bh);
-                        if (unlikely(!buffer_uptodate(bh)))
-                                err = -EIO;
-                        spin_lock(&journal->j_list_lock);
-                }
-                if (!inverted_lock(journal, bh)) {
-                        put_bh(bh);
-                        spin_lock(&journal->j_list_lock);
-                        continue;
-                }
-                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        put_bh(bh);
-                } else {
-                        jbd_unlock_bh_state(bh);
-                }
-                put_bh(bh);
-                cond_resched_lock(&journal->j_list_lock);
-        }
        spin_unlock(&journal->j_list_lock);
        if (err)
@@ -498,6 +577,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         */
        commit_transaction->t_state = T_COMMIT;
+        stats.u.run.rs_logging = jiffies;
+        stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
+                                                 stats.u.run.rs_logging);
+        stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
+        stats.u.run.rs_blocks_logged = 0;
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {
@@ -639,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
+                                /*
+                                 * Compute checksum.
+                                 */
+                                if (JBD2_HAS_COMPAT_FEATURE(journal,
+                                        JBD2_FEATURE_COMPAT_CHECKSUM)) {
+                                        crc32_sum =
+                                            jbd2_checksum_data(crc32_sum, bh);
+                                }
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
@@ -646,6 +740,7 @@ start_journal_io:
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();
+                        stats.u.run.rs_blocks_logged += bufs;
                        /* Force a new descriptor to be generated next
                           time round the loop. */
@@ -654,6 +749,23 @@ start_journal_io:
                }
        }
+        /* Done it all: now write the commit record asynchronously. */
+        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+                err = journal_submit_commit_record(journal, commit_transaction,
+                                                 &cbh, crc32_sum);
+                if (err)
+                        __jbd2_journal_abort_hard(journal);
+                spin_lock(&journal->j_list_lock);
+                err = journal_wait_on_locked_list(journal,
+                                                commit_transaction);
+                spin_unlock(&journal->j_list_lock);
+                if (err)
+                        __jbd2_journal_abort_hard(journal);
+        }
        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
@@ -753,8 +865,14 @@ wait_for_iobuf:
        jbd_debug(3, "JBD: commit phase 6\n");
-        if (journal_write_commit_record(journal, commit_transaction))
+        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-                err = -EIO;
+                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+                err = journal_submit_commit_record(journal, commit_transaction,
+                                                &cbh, crc32_sum);
+                if (err)
+                        __jbd2_journal_abort_hard(journal);
+        }
+        err = journal_wait_on_commit_record(cbh);
        if (err)
                jbd2_journal_abort(journal, err);
@@ -816,6 +934,7 @@ restart_loop:
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
+                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }
@@ -867,10 +986,10 @@ restart_loop:
        }
        spin_unlock(&journal->j_list_lock);
        /*
-         * This is a bit sleazy.  We borrow j_list_lock to protect
+         * This is a bit sleazy.  We use j_list_lock to protect transition
-         * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
+         * of a transaction into T_FINISHED state and calling
-         * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
+         * __jbd2_journal_drop_transaction(). Otherwise we could race with
-         * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint
+         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
@@ -890,6 +1009,36 @@ restart_loop:
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
+        commit_transaction->t_start = jiffies;
+        stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
+                                                commit_transaction->t_start);
+        /*
+         * File the transaction for history
+         */
+        stats.ts_type = JBD2_STATS_RUN;
+        stats.ts_tid = commit_transaction->t_tid;
+        stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
+        spin_lock(&journal->j_history_lock);
+        memcpy(journal->j_history + journal->j_history_cur, &stats,
+                        sizeof(stats));
+        if (++journal->j_history_cur == journal->j_history_max)
+                journal->j_history_cur = 0;
+        /*
+         * Calculate overall stats
+         */
+        journal->j_stats.ts_tid++;
+        journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
+        journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
+        journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
+        journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
+        journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
+        journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
+        journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
+        journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
+        spin_unlock(&journal->j_history_lock);
        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6ddc5531587c..96ba846992e9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -36,6 +36,7 @@
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -640,6 +641,312 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
        return jbd2_journal_add_journal_head(bh);
 }
+/*
+ * Per-open state for the jbd2 statistics proc files.  The open handlers
+ * allocate one of these, fill it with a private copy of the statistics and
+ * hang it off seq_file->private; the release handlers free it.
+ */
+struct jbd2_stats_proc_session {
+        /* journal the copy was taken from (only set by the "info" open) */
+        journal_t *journal;
+        /* kmalloc'ed copy of the statistics being displayed */
+        struct transaction_stats_s *stats;
+        /* index in stats[] at which walking the history copy begins */
+        int start;
+        /* number of entries in the history copy */
+        int max;
+};
+/*
+ * Find the next used slot of the history snapshot at or after @ts.
+ *
+ * The snapshot is walked as a ring: stepping past the last slot wraps to
+ * the first one.  Slots whose ts_type is 0 are skipped.  Returns NULL once
+ * the walk arrives (back) at the starting slot, except that on the very
+ * first call (@first != 0) the starting slot itself is still examined.
+ */
+static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
+                                        struct transaction_stats_s *ts,
+                                        int first)
+{
+        struct transaction_stats_s *const ring_end = s->stats + s->max;
+        struct transaction_stats_s *const origin = s->stats + s->start;
+        if (ts == ring_end)
+                ts = s->stats;
+        if (ts == origin && !first)
+                return NULL;
+        for (;;) {
+                if (ts->ts_type != 0)
+                        return ts;
+                if (++ts == ring_end)
+                        ts = s->stats;
+                if (ts == origin)
+                        return NULL;
+        }
+}
+/*
+ * seq_file ->start for the "history" file.
+ *
+ * Position 0 is the header line (SEQ_START_TOKEN); position N > 0 is the
+ * N-th used record of the snapshot, counted from the starting slot.
+ * Returns NULL when the snapshot holds fewer records than that.
+ */
+static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
+{
+        struct jbd2_stats_proc_session *s = seq->private;
+        struct transaction_stats_s *ts;
+        int remaining = *pos;
+        if (remaining == 0)
+                return SEQ_START_TOKEN;
+        ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
+        /* the first record accounts for one position; step over the rest */
+        for (remaining--; ts && remaining; remaining--)
+                ts = jbd2_history_skip_empty(s, ts + 1, 0);
+        return ts;
+}
+/*
+ * seq_file ->next for the "history" file: advance the position and return
+ * the record following @v, or NULL when the walk is complete.  After the
+ * header token the walk begins at the snapshot's starting slot.
+ */
+static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+        struct jbd2_stats_proc_session *s = seq->private;
+        struct transaction_stats_s *from;
+        int first = (v == SEQ_START_TOKEN);
+        (*pos)++;
+        if (first)
+                from = s->stats + s->start;
+        else
+                from = (struct transaction_stats_s *)v + 1;
+        return jbd2_history_skip_empty(s, from, first);
+}
+/*
+ * seq_file ->show for the "history" file.
+ *
+ * For SEQ_START_TOKEN the column header is printed.  Otherwise @v is one
+ * record of the snapshot and is printed as an "R" row (JBD2_STATS_RUN) or a
+ * "C" row (JBD2_STATS_CHECKPOINT).  Durations stored in jiffies are shown
+ * in milliseconds.  Always returns 0.
+ */
+static int jbd2_seq_history_show(struct seq_file *seq, void *v)
+{
+        struct transaction_stats_s *ts = v;
+        if (v == SEQ_START_TOKEN) {
+                seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
+                                "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
+                                "wait", "run", "lock", "flush", "log", "hndls",
+                                "block", "inlog", "ctime", "write", "drop",
+                                "close");
+                return 0;
+        }
+        /* run record: fills the tid .. inlog columns */
+        if (ts->ts_type == JBD2_STATS_RUN)
+                seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
+                                "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
+                                jiffies_to_msecs(ts->u.run.rs_wait),
+                                jiffies_to_msecs(ts->u.run.rs_running),
+                                jiffies_to_msecs(ts->u.run.rs_locked),
+                                jiffies_to_msecs(ts->u.run.rs_flushing),
+                                jiffies_to_msecs(ts->u.run.rs_logging),
+                                ts->u.run.rs_handle_count,
+                                ts->u.run.rs_blocks,
+                                ts->u.run.rs_blocks_logged);
+        /*
+         * checkpoint record: the 48-wide blank field presumably pads past
+         * the run-only columns so the values land under ctime .. close
+         */
+        else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
+                seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
+                                "C", ts->ts_tid, " ",
+                                jiffies_to_msecs(ts->u.chp.cs_chp_time),
+                                ts->u.chp.cs_written, ts->u.chp.cs_dropped,
+                                ts->u.chp.cs_forced_to_close);
+        else
+                /* any other record type is unexpected here */
+                J_ASSERT(0);
+        return 0;
+}
+/*
+ * seq_file ->stop for the "history" file.  Intentionally empty: ->start and
+ * ->next acquire nothing, and the snapshot is freed in the release handler.
+ */
+static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
+{
+}
+/* Iterator callbacks for the "history" file; passed to seq_open() on open. */
+static struct seq_operations jbd2_seq_history_ops = {
+        .start  = jbd2_seq_history_start,
+        .next   = jbd2_seq_history_next,
+        .stop   = jbd2_seq_history_stop,
+        .show   = jbd2_seq_history_show,
+};
+/*
+ * Open handler for the per-journal "history" proc file.
+ *
+ * Takes a private copy of the journal's history array under
+ * j_history_lock, so that the seq_file iterator can walk it without
+ * holding any lock, and attaches it to the seq_file.  The copy is freed by
+ * jbd2_seq_history_release().
+ *
+ * Returns 0 on success, -ENOMEM if the journal has no history buffer or an
+ * allocation fails, or the error returned by seq_open().
+ */
+static int jbd2_seq_history_open(struct inode *inode, struct file *file)
+{
+        journal_t *journal = PDE(inode)->data;
+        struct jbd2_stats_proc_session *s;
+        int rc, size;
+        /*
+         * journal_init_stats() leaves j_history_max at 0 when no history
+         * buffer could be set up.  Bail out here: otherwise the modulo
+         * below divides by zero and memcpy() reads a NULL j_history.
+         */
+        if (!journal->j_history_max || !journal->j_history)
+                return -ENOMEM;
+        s = kmalloc(sizeof(*s), GFP_KERNEL);
+        if (s == NULL)
+                return -ENOMEM;
+        size = sizeof(struct transaction_stats_s) * journal->j_history_max;
+        s->stats = kmalloc(size, GFP_KERNEL);
+        if (s->stats == NULL) {
+                kfree(s);
+                return -ENOMEM;
+        }
+        spin_lock(&journal->j_history_lock);
+        memcpy(s->stats, journal->j_history, size);
+        /* kmalloc() does not zero: leave no field of the session undefined */
+        s->journal = journal;
+        s->max = journal->j_history_max;
+        /* j_history_cur is the next slot to be overwritten, i.e. the oldest */
+        s->start = journal->j_history_cur % s->max;
+        spin_unlock(&journal->j_history_lock);
+        rc = seq_open(file, &jbd2_seq_history_ops);
+        if (rc == 0) {
+                struct seq_file *m = file->private_data;
+                m->private = s;
+        } else {
+                kfree(s->stats);
+                kfree(s);
+        }
+        return rc;
+}
+/*
+ * Release handler for the "history" file: free the snapshot and session
+ * allocated at open time, then let seq_release() tear down the seq_file.
+ */
+static int jbd2_seq_history_release(struct inode *inode, struct file *file)
+{
+        struct jbd2_stats_proc_session *session =
+                ((struct seq_file *)file->private_data)->private;
+        kfree(session->stats);
+        kfree(session);
+        return seq_release(inode, file);
+}
+/*
+ * File operations for the "history" proc entry; installed by
+ * jbd2_stats_proc_init().  Reading and seeking go through the generic
+ * seq_file helpers.
+ */
+static struct file_operations jbd2_seq_history_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jbd2_seq_history_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = jbd2_seq_history_release,
+};
+/*
+ * seq_file ->start for the "info" file.  The file consists of a single
+ * record, represented by SEQ_START_TOKEN at position 0; any other position
+ * is past the end.
+ */
+static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
+{
+        if (*pos != 0)
+                return NULL;
+        return SEQ_START_TOKEN;
+}
+/*
+ * seq_file ->next for the "info" file.  There is only one record, so there
+ * is never a next one.  The position is still advanced, as the seq_file
+ * iterator contract expects of ->next, so that a subsequent ->start sees a
+ * non-zero position and reports end of file instead of restarting.
+ */
+static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+        ++*pos;
+        return NULL;
+}
+/*
+ * seq_file ->show for the "info" file.
+ *
+ * Prints the transaction count held in the snapshot's ts_tid and the
+ * journal's j_max_transaction_buffers, followed by per-transaction
+ * averages: each accumulated total is divided by ts_tid, and the
+ * jiffies-based ones are converted to milliseconds.  Always returns 0.
+ */
+static int jbd2_seq_info_show(struct seq_file *seq, void *v)
+{
+        struct jbd2_stats_proc_session *s = seq->private;
+        if (v != SEQ_START_TOKEN)
+                return 0;
+        seq_printf(seq, "%lu transaction, each upto %u blocks\n",
+                        s->stats->ts_tid,
+                        s->journal->j_max_transaction_buffers);
+        /* nothing recorded yet: skip the averages and avoid dividing by 0 */
+        if (s->stats->ts_tid == 0)
+                return 0;
+        seq_printf(seq, "average: \n  %ums waiting for transaction\n",
+            jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
+        seq_printf(seq, "  %ums running transaction\n",
+            jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
+        seq_printf(seq, "  %ums transaction was being locked\n",
+            jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
+        seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
+            jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
+        seq_printf(seq, "  %ums logging transaction\n",
+            jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+        seq_printf(seq, "  %lu handles per transaction\n",
+            s->stats->u.run.rs_handle_count / s->stats->ts_tid);
+        seq_printf(seq, "  %lu blocks per transaction\n",
+            s->stats->u.run.rs_blocks / s->stats->ts_tid);
+        seq_printf(seq, "  %lu logged blocks per transaction\n",
+            s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
+        return 0;
+}
+/*
+ * seq_file ->stop for the "info" file.  Intentionally empty: ->start
+ * acquires nothing that would need releasing.
+ */
+static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
+{
+}
+/* Iterator callbacks for the "info" file; passed to seq_open() on open. */
+static struct seq_operations jbd2_seq_info_ops = {
+        .start  = jbd2_seq_info_start,
+        .next   = jbd2_seq_info_next,
+        .stop   = jbd2_seq_info_stop,
+        .show   = jbd2_seq_info_show,
+};
+/*
+ * Open handler for the per-journal "info" proc file.
+ *
+ * Copies the journal's accumulated statistics (j_stats) under
+ * j_history_lock into a freshly allocated session and attaches that
+ * session to the seq_file.  Returns 0 on success, -ENOMEM when an
+ * allocation fails, or the error returned by seq_open().
+ */
+static int jbd2_seq_info_open(struct inode *inode, struct file *file)
+{
+        journal_t *journal = PDE(inode)->data;
+        struct jbd2_stats_proc_session *session;
+        struct seq_file *m;
+        int err;
+        session = kmalloc(sizeof(*session), GFP_KERNEL);
+        if (!session)
+                return -ENOMEM;
+        session->stats = kmalloc(sizeof(struct transaction_stats_s),
+                                 GFP_KERNEL);
+        if (!session->stats) {
+                err = -ENOMEM;
+                goto free_session;
+        }
+        spin_lock(&journal->j_history_lock);
+        memcpy(session->stats, &journal->j_stats,
+               sizeof(struct transaction_stats_s));
+        session->journal = journal;
+        spin_unlock(&journal->j_history_lock);
+        err = seq_open(file, &jbd2_seq_info_ops);
+        if (err)
+                goto free_stats;
+        m = file->private_data;
+        m->private = session;
+        return 0;
+free_stats:
+        kfree(session->stats);
+free_session:
+        kfree(session);
+        return err;
+}
+/*
+ * Release handler for the "info" file: free the statistics copy and the
+ * session allocated at open time, then let seq_release() finish the job.
+ */
+static int jbd2_seq_info_release(struct inode *inode, struct file *file)
+{
+        struct jbd2_stats_proc_session *session =
+                ((struct seq_file *)file->private_data)->private;
+        kfree(session->stats);
+        kfree(session);
+        return seq_release(inode, file);
+}
+/*
+ * File operations for the "info" proc entry; installed by
+ * jbd2_stats_proc_init().  Reading and seeking go through the generic
+ * seq_file helpers.
+ */
+static struct file_operations jbd2_seq_info_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jbd2_seq_info_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = jbd2_seq_info_release,
+};
+/*
+ * Parent proc directory under which each journal gets its own statistics
+ * directory; NULL when it has not been (or could not be) created.
+ */
+static struct proc_dir_entry *proc_jbd2_stats;
+/*
+ * Publish the statistics files for @journal: a directory named after the
+ * journal's block device is created below proc_jbd2_stats, holding the
+ * read-only files "history" and "info".  The directory is remembered in
+ * journal->j_proc_entry (NULL if it could not be created).  Failures are
+ * not reported to the caller; the journal simply runs without the files.
+ */
+static void jbd2_stats_proc_init(journal_t *journal)
+{
+        char name[BDEVNAME_SIZE];
+        struct proc_dir_entry *p;
+        /*
+         * journal_init_stats() sets up no history when the parent directory
+         * is missing, so do not publish files that would try to read it.
+         */
+        if (!proc_jbd2_stats) {
+                journal->j_proc_entry = NULL;
+                return;
+        }
+        /*
+         * bdevname() formats the device name straight into @name.  Feeding
+         * its return value back through snprintf() into the same buffer
+         * would copy between overlapping objects, which is undefined.
+         */
+        bdevname(journal->j_dev, name);
+        journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+        if (!journal->j_proc_entry)
+                return;
+        p = create_proc_entry("history", S_IRUGO, journal->j_proc_entry);
+        if (!p)
+                return;
+        p->proc_fops = &jbd2_seq_history_fops;
+        p->data = journal;
+        p = create_proc_entry("info", S_IRUGO, journal->j_proc_entry);
+        if (p) {
+                p->proc_fops = &jbd2_seq_info_fops;
+                p->data = journal;
+        }
+}
+/*
+ * Remove the statistics files of @journal: first the "info" and "history"
+ * entries, then the per-device directory that contained them.
+ */
+static void jbd2_stats_proc_exit(journal_t *journal)
+{
+        char name[BDEVNAME_SIZE];
+        /*
+         * bdevname() already fills @name; copying its result back into the
+         * same buffer with snprintf() would be an overlapping copy.
+         */
+        bdevname(journal->j_dev, name);
+        remove_proc_entry("info", journal->j_proc_entry);
+        remove_proc_entry("history", journal->j_proc_entry);
+        remove_proc_entry(name, proc_jbd2_stats);
+}
+/*
+ * Set up the per-journal transaction history: a zeroed array of 100
+ * transaction_stats_s records.  When the proc directory is unavailable or
+ * the allocation fails, the journal is left without history and
+ * j_history_max is 0 in the allocation-failure case.
+ */
+static void journal_init_stats(journal_t *journal)
+{
+        int size;
+        /*
+         * Initialise the lock before any early return: the commit code and
+         * the proc open handlers take j_history_lock without first checking
+         * whether the history was actually set up.
+         */
+        spin_lock_init(&journal->j_history_lock);
+        if (!proc_jbd2_stats)
+                return;
+        journal->j_history_max = 100;
+        size = sizeof(struct transaction_stats_s) * journal->j_history_max;
+        journal->j_history = kzalloc(size, GFP_KERNEL);
+        if (!journal->j_history)
+                journal->j_history_max = 0;
+}
 /*
 * Management for journal control blocks: functions to create and
 * destroy journal_t structures, and to initialise and read existing
@@ -681,6 +988,9 @@ static journal_t * journal_init_common (void)
                kfree(journal);
                goto fail;
        }
+        journal_init_stats(journal);
        return journal;
 fail:
        return NULL;
@@ -735,6 +1045,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
        journal->j_fs_dev = fs_dev;
        journal->j_blk_offset = start;
        journal->j_maxlen = len;
+        jbd2_stats_proc_init(journal);
        bh = __getblk(journal->j_dev, start, journal->j_blocksize);
        J_ASSERT(bh != NULL);
@@ -773,6 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
        journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
        journal->j_blocksize = inode->i_sb->s_blocksize;
+        jbd2_stats_proc_init(journal);
        /* journal descriptor can store up to n blocks -bzzz */
        n = journal->j_blocksize / sizeof(journal_block_tag_t);
@@ -1153,6 +1465,8 @@ void jbd2_journal_destroy(journal_t *journal)
                brelse(journal->j_sb_buffer);
        }
+        if (journal->j_proc_entry)
+                jbd2_stats_proc_exit(journal);
        if (journal->j_inode)
                iput(journal->j_inode);
        if (journal->j_revoke)
@@ -1264,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
        return 1;
 }
+/**
+ * jbd2_journal_clear_features() - clear journal feature bits in the
+ *                                 in-memory superblock
+ * @journal: journal to act on
+ * @compat: bitmask of compatible features to clear
+ * @ro: bitmask of read-only-compatible features to clear
+ * @incompat: bitmask of incompatible features to clear
+ *
+ * Each mask is converted to big-endian and its bits are removed from the
+ * corresponding feature word of journal->j_superblock.
+ */
+void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
+                                unsigned long ro, unsigned long incompat)
+{
+        journal_superblock_t *jsb = journal->j_superblock;
+        jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+                  compat, ro, incompat);
+        jsb->s_feature_compat = jsb->s_feature_compat & ~cpu_to_be32(compat);
+        jsb->s_feature_ro_compat = jsb->s_feature_ro_compat & ~cpu_to_be32(ro);
+        jsb->s_feature_incompat = jsb->s_feature_incompat &
+                                        ~cpu_to_be32(incompat);
+}
+EXPORT_SYMBOL(jbd2_journal_clear_features);
 /**
 * int jbd2_journal_update_format () - Update on-disk journal structure.
@@ -1633,7 +1973,7 @@ static int journal_init_jbd2_journal_head_cache(void)
        jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
                                sizeof(struct journal_head),
                                0,              /* offset */
-                                0,              /* flags */
+                                SLAB_TEMPORARY, /* flags */
                                NULL);          /* ctor */
        retval = 0;
        if (jbd2_journal_head_cache == 0) {
@@ -1900,6 +2240,28 @@ static void __exit jbd2_remove_debugfs_entry(void)
 #endif
+#ifdef CONFIG_PROC_FS
+/* Name of the top-level statistics directory, created with a NULL parent. */
+#define JBD2_STATS_PROC_NAME "fs/jbd2"
+/*
+ * Create the top-level statistics directory and remember it in
+ * proc_jbd2_stats, which stays NULL if proc_mkdir() hands back NULL.
+ */
+static void __init jbd2_create_jbd_stats_proc_entry(void)
+{
+        proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
+}
+/* Remove the top-level statistics directory, if it was created. */
+static void __exit jbd2_remove_jbd_stats_proc_entry(void)
+{
+        if (proc_jbd2_stats)
+                remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
+}
+#else
+/* Without procfs both helpers compile away to nothing. */
+#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
+#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)
+#endif
 struct kmem_cache *jbd2_handle_cache;
 static int __init journal_init_handle_cache(void)
@@ -1907,7 +2269,7 @@ static int __init journal_init_handle_cache(void)
        jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
                                sizeof(handle_t),
                                0,              /* offset */
-                                0,              /* flags */
+                                SLAB_TEMPORARY, /* flags */
                                NULL);          /* ctor */
        if (jbd2_handle_cache == NULL) {
                printk(KERN_EMERG "JBD: failed to create handle cache\n");
@@ -1955,6 +2317,7 @@ static int __init journal_init(void)
        if (ret != 0)
                jbd2_journal_destroy_caches();
        jbd2_create_debugfs_entry();
+        jbd2_create_jbd_stats_proc_entry();
        return ret;
 }
@@ -1966,6 +2329,7 @@ static void __exit journal_exit(void)
                printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
 #endif
        jbd2_remove_debugfs_entry();
+        jbd2_remove_jbd_stats_proc_entry();
        jbd2_journal_destroy_caches();
 }
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d0ce627539ef..921680663fa2 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
 #include <linux/jbd2.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/crc32.h>
 #endif
 /*
@@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
        return block;
 }
+/*
+ * calc_chksums calculates the checksums for the blocks described in the
+ * descriptor block.
+ *
+ * @journal:        journal being scanned
+ * @bh:             buffer holding the descriptor block
+ * @next_log_block: in/out log position; advanced (with wrap-around) past
+ *                  every block read
+ * @crc32_sum:      in/out running CRC32; the descriptor block and then each
+ *                  block it describes are folded in
+ *
+ * Returns 0 on success, 1 if one of the described blocks cannot be read.
+ */
+static int calc_chksums(journal_t *journal, struct buffer_head *bh,
+                        unsigned long *next_log_block, __u32 *crc32_sum)
+{
+        int i, num_blks, err;
+        unsigned long io_block;
+        struct buffer_head *obh;
+        num_blks = count_tags(journal, bh);
+        /* Calculate checksum of the descriptor block. */
+        *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
+        for (i = 0; i < num_blks; i++) {
+                io_block = (*next_log_block)++;
+                wrap(journal, *next_log_block);
+                err = jread(&obh, journal, io_block);
+                if (err) {
+                        printk(KERN_ERR "JBD: IO error %d recovering block "
+                                "%lu in log\n", err, io_block);
+                        return 1;
+                }
+                *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
+                             obh->b_size);
+                /*
+                 * A successful jread() hands back a referenced buffer head;
+                 * drop the reference once the data has been checksummed, or
+                 * one buffer is leaked per described block.
+                 */
+                put_bh(obh);
+        }
+        return 0;
+}
 static int do_one_pass(journal_t *journal,
                        struct recovery_info *info, enum passtype pass)
 {
@@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal,
        unsigned int            sequence;
        int                     blocktype;
        int                     tag_bytes = journal_tag_bytes(journal);
+        __u32                   crc32_sum = ~0; /* Transactional Checksums */
        /* Precompute the maximum metadata descriptors in a descriptor block */
        int                     MAX_BLOCKS_PER_DESC;
@@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal,
                switch(blocktype) {
                case JBD2_DESCRIPTOR_BLOCK:
                        /* If it is a valid descriptor block, replay it
-                         * in pass REPLAY; otherwise, just skip over the
+                         * in pass REPLAY; if journal_checksums enabled, then
-                         * blocks it describes. */
+                         * calculate checksums in PASS_SCAN, otherwise,
+                         * just skip over the blocks it describes. */
                        if (pass != PASS_REPLAY) {
+                                if (pass == PASS_SCAN &&
+                                    JBD2_HAS_COMPAT_FEATURE(journal,
+                                            JBD2_FEATURE_COMPAT_CHECKSUM) &&
+                                    !info->end_transaction) {
+                                        if (calc_chksums(journal, bh,
+                                                        &next_log_block,
+                                                        &crc32_sum)) {
+                                                put_bh(bh);
+                                                break;
+                                        }
+                                        put_bh(bh);
+                                        continue;
+                                }
                                next_log_block += count_tags(journal, bh);
                                wrap(journal, next_log_block);
-                                brelse(bh);
+                                put_bh(bh);
                                continue;
                        }
@@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal,
                        continue;
                case JBD2_COMMIT_BLOCK:
-                        /* Found an expected commit block: not much to
+                        /*     How to differentiate between interrupted commit
-                         * do other than move on to the next sequence
+                         *               and journal corruption ?
+                         *
+                         * {nth transaction}
+                         *        Checksum Verification Failed
+                         *                       |
+                         *               ____________________
+                         *              |                    |
+                         *      async_commit             sync_commit
+                         *              |                    |
+                         *              | GO TO NEXT    "Journal Corruption"
+                         *              | TRANSACTION
+                         *              |
+                         * {(n+1)th transaction}
+                         *              |
+                         *       _______|______________
+                         *      |                     |
+                         * Commit block found   Commit block not found
+                         *      |                     |
+                         * "Journal Corruption"       |
+                         *               _____________|_________
+                         *              |                       |
+                         *      nth trans corrupt       OR   nth trans
+                         *      and (n+1)th interrupted     interrupted
+                         *      before commit block
+                         *      could reach the disk.
+                         *      (Cannot find the difference in above
+                         *       mentioned conditions. Hence assume
+                         *       "Interrupted Commit".)
+                         */
+                        /* Found an expected commit block: if checksums
+                         * are present verify them in PASS_SCAN; else not
+                         * much to do other than move on to the next sequence
                         * number. */
+                        if (pass == PASS_SCAN &&
+                            JBD2_HAS_COMPAT_FEATURE(journal,
+                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+                                int chksum_err, chksum_seen;
+                                struct commit_header *cbh =
+                                        (struct commit_header *)bh->b_data;
+                                unsigned found_chksum =
+                                        be32_to_cpu(cbh->h_chksum[0]);
+                                chksum_err = chksum_seen = 0;
+                                if (info->end_transaction) {
+                                        printk(KERN_ERR "JBD: Transaction %u "
+                                                "found to be corrupt.\n",
+                                                next_commit_ID - 1);
+                                        brelse(bh);
+                                        break;
+                                }
+                                if (crc32_sum == found_chksum &&
+                                    cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
+                                    cbh->h_chksum_size ==
+                                                JBD2_CRC32_CHKSUM_SIZE)
+                                       chksum_seen = 1;
+                                else if (!(cbh->h_chksum_type == 0 &&
+                                             cbh->h_chksum_size == 0 &&
+                                             found_chksum == 0 &&
+                                             !chksum_seen))
+                                /*
+                                 * If fs is mounted using an old kernel and then
+                                 * kernel with journal_chksum is used then we
+                                 * get a situation where the journal flag has
+                                 * checksum flag set but checksums are not
+                                 * present i.e chksum = 0, in the individual
+                                 * commit blocks.
+                                 * Hence to avoid checksum failures, in this
+                                 * situation, this extra check is added.
+                                 */
+                                                chksum_err = 1;
+                                if (chksum_err) {
+                                        info->end_transaction = next_commit_ID;
+                                        if (!JBD2_HAS_COMPAT_FEATURE(journal,
+                                           JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+                                                printk(KERN_ERR
+                                                       "JBD: Transaction %u "
+                                                       "found to be corrupt.\n",
+                                                       next_commit_ID);
+                                                brelse(bh);
+                                                break;
+                                        }
+                                }
+                                crc32_sum = ~0;
+                        }
                        brelse(bh);
                        next_commit_ID++;
                        continue;
@@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal,
         * transaction marks the end of the valid log.
         */
-        if (pass == PASS_SCAN)
+        if (pass == PASS_SCAN) {
-                info->end_transaction = next_commit_ID;
+                if (!info->end_transaction)
-        else {
+                        info->end_transaction = next_commit_ID;
+        } else {
                /* It's really bad news if different passes end up at
                 * different places (but possible due to IO errors). */
                if (info->end_transaction != next_commit_ID) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 3595fd432d5b..df36f42e19e1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -171,13 +171,15 @@ int __init jbd2_journal_init_revoke_caches(void)
 {
        jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
                                           sizeof(struct jbd2_revoke_record_s),
-                                           0, SLAB_HWCACHE_ALIGN, NULL);
+                                           0,
+                                           SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
+                                           NULL);
        if (jbd2_revoke_record_cache == 0)
                return -ENOMEM;
        jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
                                           sizeof(struct jbd2_revoke_table_s),
-                                           0, 0, NULL);
+                                           0, SLAB_TEMPORARY, NULL);
        if (jbd2_revoke_table_cache == 0) {
                kmem_cache_destroy(jbd2_revoke_record_cache);
                jbd2_revoke_record_cache = NULL;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b1fcf2b3dca3..b9b0b6f899b9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -54,11 +54,13 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
        spin_lock_init(&transaction->t_handle_lock);
        /* Set up the commit timer for the new transaction. */
-        journal->j_commit_timer.expires = transaction->t_expires;
+        journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
        add_timer(&journal->j_commit_timer);
        J_ASSERT(journal->j_running_transaction == NULL);
        journal->j_running_transaction = transaction;
+        transaction->t_max_wait = 0;
+        transaction->t_start = jiffies;
        return transaction;
 }
@@ -85,6 +87,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
        int nblocks = handle->h_buffer_credits;
        transaction_t *new_transaction = NULL;
        int ret = 0;
+        unsigned long ts = jiffies;
        if (nblocks > journal->j_max_transaction_buffers) {
                printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -217,6 +220,12 @@ repeat_locked:
        /* OK, account for the buffers that this operation expects to
         * use and add the handle to the running transaction. */
+        if (time_after(transaction->t_start, ts)) {
+                ts = jbd2_time_diff(ts, transaction->t_start);
+                if (ts > transaction->t_max_wait)
+                        transaction->t_max_wait = ts;
+        }
        handle->h_transaction = transaction;
        transaction->t_outstanding_credits += nblocks;
        transaction->t_updates++;
@@ -232,6 +241,8 @@ out:
        return ret;
 }
+static struct lock_class_key jbd2_handle_key;
 /* Allocate a new handle.  This should probably be in a slab... */
 static handle_t *new_handle(int nblocks)
 {
@@ -242,6 +253,9 @@ static handle_t *new_handle(int nblocks)
        handle->h_buffer_credits = nblocks;
        handle->h_ref = 1;
+        lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
+                                                &jbd2_handle_key, 0);
        return handle;
 }
@@ -284,7 +298,11 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
                jbd2_free_handle(handle);
                current->journal_info = NULL;
                handle = ERR_PTR(err);
+                goto out;
        }
+        lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+out:
        return handle;
 }
@@ -1164,7 +1182,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
        }
        /* That test should have eliminated the following case: */
-        J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+        J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
        JBUFFER_TRACE(jh, "file as BJ_Metadata");
        spin_lock(&journal->j_list_lock);
@@ -1410,6 +1428,8 @@ int jbd2_journal_stop(handle_t *handle)
                spin_unlock(&journal->j_state_lock);
        }
+        lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
        jbd2_free_handle(handle);
        return err;
 }
@@ -1512,7 +1532,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        if (jh->b_jlist != BJ_None)
-                J_ASSERT_JH(jh, transaction != 0);
+                J_ASSERT_JH(jh, transaction != NULL);
        switch (jh->b_jlist) {
        case BJ_None:
@@ -1581,11 +1601,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
        if (buffer_locked(bh) || buffer_dirty(bh))
                goto out;
-        if (jh->b_next_transaction != 0)
+        if (jh->b_next_transaction != NULL)
                goto out;
        spin_lock(&journal->j_list_lock);
-        if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+        if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
                if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
                        /* A written-back ordered data buffer */
                        JBUFFER_TRACE(jh, "release data");
@@ -1593,7 +1613,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
                        jbd2_journal_remove_journal_head(bh);
                        __brelse(bh);
                }
-        } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+        } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
                /* written-back checkpointed metadata buffer */
                if (jh->b_jlist == BJ_None) {
                        JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1953,7 +1973,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        J_ASSERT_JH(jh, jh->b_transaction == transaction ||
-                                jh->b_transaction == 0);
+                                jh->b_transaction == NULL);
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index d568ae846741..8adebd3e43c6 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -105,7 +105,7 @@ static int jffs2_garbage_collect_thread(void *_c)
                /* Put_super will send a SIGKILL and then wait on the sem.
                 */
-                while (signal_pending(current)) {
+                while (signal_pending(current) || freezing(current)) {
                        siginfo_t info;
                        unsigned long signr;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 3c8663bea98c..dfda12a073e1 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -79,6 +79,9 @@ int jfs_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
                if (!S_ISDIR(inode->i_mode))
                        flags &= ~JFS_DIRSYNC_FL;
+                /* Is it quota file? Do not allow user to mess with it */
+                if (IS_NOQUOTA(inode))
+                        return -EPERM;
                jfs_get_inode_flags(jfs_inode);
                oldflags = jfs_inode->mode2;
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index df25ecc418af..4dcc05819998 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -284,11 +284,11 @@ static struct dir_table_slot *find_index(struct inode *ip, u32 index,
                        release_metapage(*mp);
                        *mp = NULL;
                }
-                if (*mp == 0) {
+                if (!(*mp)) {
                        *lblock = blkno;
                        *mp = read_index_page(ip, blkno);
                }
-                if (*mp == 0) {
+                if (!(*mp)) {
                        jfs_err("free_index: error reading directory table");
                        return NULL;
                }
@@ -413,7 +413,8 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
                }
                ip->i_size = PSIZE;
-                if ((mp = get_index_page(ip, 0)) == 0) {
+                mp = get_index_page(ip, 0);
+                if (!mp) {
                        jfs_err("add_index: get_metapage failed!");
                        xtTruncate(tid, ip, 0, COMMIT_PWMAP);
                        memcpy(&jfs_ip->i_dirtable, temp_table,
@@ -461,7 +462,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
        } else
                mp = read_index_page(ip, blkno);
-        if (mp == 0) {
+        if (!mp) {
                jfs_err("add_index: get/read_metapage failed!");
                goto clean_up;
        }
@@ -499,7 +500,7 @@ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
        dirtab_slot = find_index(ip, index, &mp, &lblock);
-        if (dirtab_slot == 0)
+        if (!dirtab_slot)
                return;
        dirtab_slot->flag = DIR_INDEX_FREE;
@@ -526,7 +527,7 @@ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
        dirtab_slot = find_index(ip, index, mp, lblock);
-        if (dirtab_slot == 0)
+        if (!dirtab_slot)
                return;
        DTSaddress(dirtab_slot, bn);
@@ -552,7 +553,7 @@ static int read_index(struct inode *ip, u32 index,
        struct dir_table_slot *slot;
        slot = find_index(ip, index, &mp, &lblock);
-        if (slot == 0) {
+        if (!slot) {
                return -EIO;
        }
@@ -592,10 +593,8 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
        struct component_name ciKey;
        struct super_block *sb = ip->i_sb;
-        ciKey.name =
+        ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
-            (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+        if (!ciKey.name) {
-                                GFP_NOFS);
-        if (ciKey.name == 0) {
                rc = -ENOMEM;
                goto dtSearch_Exit2;
        }
@@ -957,10 +956,8 @@ static int dtSplitUp(tid_t tid,
        smp = split->mp;
        sp = DT_PAGE(ip, smp);
-        key.name =
+        key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
-            (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
+        if (!key.name) {
-                                GFP_NOFS);
-        if (key.name == 0) {
                DT_PUTPAGE(smp);
                rc = -ENOMEM;
                goto dtSplitUp_Exit;
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 8561c6ecece0..cdac2d5bafeb 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -74,7 +74,7 @@ struct idtentry {
 #define DTIHDRDATALEN   11
 /* compute number of slots for entry */
-#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 )
+#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15))
 /*
@@ -133,7 +133,7 @@ struct dir_table_slot {
        ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
 /* compute number of slots for entry */
-#define NDTLEAF_LEGACY(klen)    ( ((2 + (klen)) + (15 - 1)) / 15 )
+#define NDTLEAF_LEGACY(klen)    (DIV_ROUND_UP((2 + (klen)), 15))
 #define NDTLEAF NDTINTERNAL
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3870ba8b9086..9bf29f771737 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -381,7 +381,7 @@ int diRead(struct inode *ip)
        /* read the page of disk inode */
        mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
-        if (mp == 0) {
+        if (!mp) {
                jfs_err("diRead: read_metapage failed");
                return -EIO;
        }
@@ -654,7 +654,7 @@ int diWrite(tid_t tid, struct inode *ip)
        /* read the page of disk inode */
      retry:
        mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
-        if (mp == 0)
+        if (!mp)
                return -EIO;
        /* get the pointer to the disk inode */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 15a3974cdeeb..325a9679b95a 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -208,6 +208,17 @@ static struct lmStat {
 } lmStat;
 #endif
+static void write_special_inodes(struct jfs_log *log,
+                                 int (*writer)(struct address_space *))
+{       /*
+         * Apply @writer to the i_mapping of the ipbmap, ipimap and
+         * direct_inode inodes of every jfs_sb_info linked on
+         * @log->sb_list through its log_list member.
+         *
+         * The value returned by @writer is discarded, so a failed
+         * write is not reported to the caller.
+         */
+        struct jfs_sb_info *sbi;
+        list_for_each_entry(sbi, &log->sb_list, log_list) {
+                writer(sbi->ipbmap->i_mapping);
+                writer(sbi->ipimap->i_mapping);
+                writer(sbi->direct_inode->i_mapping);
+        }
+}
 /*
 * NAME:        lmLog()
@@ -935,22 +946,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
        struct lrd lrd;
        int lsn;
        struct logsyncblk *lp;
-        struct jfs_sb_info *sbi;
        unsigned long flags;
        /* push dirty metapages out to disk */
        if (hard_sync)
-                list_for_each_entry(sbi, &log->sb_list, log_list) {
+                write_special_inodes(log, filemap_fdatawrite);
-                        filemap_fdatawrite(sbi->ipbmap->i_mapping);
-                        filemap_fdatawrite(sbi->ipimap->i_mapping);
-                        filemap_fdatawrite(sbi->direct_inode->i_mapping);
-                }
        else
-                list_for_each_entry(sbi, &log->sb_list, log_list) {
+                write_special_inodes(log, filemap_flush);
-                        filemap_flush(sbi->ipbmap->i_mapping);
-                        filemap_flush(sbi->ipimap->i_mapping);
-                        filemap_flush(sbi->direct_inode->i_mapping);
-                }
        /*
         *      forward syncpt
@@ -1536,7 +1538,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
 {
        int i;
        struct tblock *target = NULL;
-        struct jfs_sb_info *sbi;
        /* jfs_write_inode may call us during read-only mount */
        if (!log)
@@ -1598,11 +1599,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
        if (wait < 2)
                return;
-        list_for_each_entry(sbi, &log->sb_list, log_list) {
+        write_special_inodes(log, filemap_fdatawrite);
-                filemap_fdatawrite(sbi->ipbmap->i_mapping);
-                filemap_fdatawrite(sbi->ipimap->i_mapping);
-                filemap_fdatawrite(sbi->direct_inode->i_mapping);
-        }
        /*
         * If there was recent activity, we may need to wait
@@ -1611,6 +1608,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
        if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
                for (i = 0; i < 200; i++) {     /* Too much? */
                        msleep(250);
+                        write_special_inodes(log, filemap_fdatawrite);
                        if (list_empty(&log->cqueue) &&
                            list_empty(&log->synclist))
                                break;
@@ -2347,7 +2345,7 @@ int jfsIOWait(void *arg)
        do {
                spin_lock_irq(&log_redrive_lock);
-                while ((bp = log_redrive_list) != 0) {
+                while ((bp = log_redrive_list)) {
                        log_redrive_list = bp->l_redrive_next;
                        bp->l_redrive_next = NULL;
                        spin_unlock_irq(&log_redrive_lock);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index f5cd8d38af7a..d1e64f2f2fcd 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -39,11 +39,11 @@ static struct {
 #endif
 #define metapage_locked(mp) test_bit(META_locked, &(mp)->flag)
-#define trylock_metapage(mp) test_and_set_bit(META_locked, &(mp)->flag)
+#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag)
 static inline void unlock_metapage(struct metapage *mp)
 {
-        clear_bit(META_locked, &mp->flag);
+        clear_bit_unlock(META_locked, &mp->flag);
        wake_up(&mp->wait);
 }
@@ -88,7 +88,7 @@ struct meta_anchor {
 };
 #define mp_anchor(page) ((struct meta_anchor *)page_private(page))
-static inline struct metapage *page_to_mp(struct page *page, uint offset)
+static inline struct metapage *page_to_mp(struct page *page, int offset)
 {
        if (!PagePrivate(page))
                return NULL;
@@ -153,7 +153,7 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 }
 #else
-static inline struct metapage *page_to_mp(struct page *page, uint offset)
+static inline struct metapage *page_to_mp(struct page *page, int offset)
 {
        return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
@@ -249,7 +249,7 @@ static inline void drop_metapage(struct page *page, struct metapage *mp)
 */
 static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
-                                    unsigned int *len)
+                                    int *len)
 {
        int rc = 0;
        int xflag;
@@ -352,25 +352,27 @@ static void metapage_write_end_io(struct bio *bio, int err)
 static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct bio *bio = NULL;
-        unsigned int block_offset;      /* block offset of mp within page */
+        int block_offset;       /* block offset of mp within page */
        struct inode *inode = page->mapping->host;
-        unsigned int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
+        int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
-        unsigned int len;
+        int len;
-        unsigned int xlen;
+        int xlen;
        struct metapage *mp;
        int redirty = 0;
        sector_t lblock;
+        int nr_underway = 0;
        sector_t pblock;
        sector_t next_block = 0;
        sector_t page_start;
        unsigned long bio_bytes = 0;
        unsigned long bio_offset = 0;
-        unsigned int offset;
+        int offset;
        page_start = (sector_t)page->index <<
                     (PAGE_CACHE_SHIFT - inode->i_blkbits);
        BUG_ON(!PageLocked(page));
        BUG_ON(PageWriteback(page));
+        set_page_writeback(page);
        for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
                mp = page_to_mp(page, offset);
@@ -413,11 +415,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
                        if (!bio->bi_size)
                                goto dump_bio;
                        submit_bio(WRITE, bio);
+                        nr_underway++;
                        bio = NULL;
-                } else {
+                } else
-                        set_page_writeback(page);
                        inc_io(page);
-                }
                xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
                pblock = metapage_get_blocks(inode, lblock, &xlen);
                if (!pblock) {
@@ -427,7 +428,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
                        continue;
                }
                set_bit(META_io, &mp->flag);
-                len = min(xlen, (uint) JFS_SBI(inode->i_sb)->nbperpage);
+                len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
                bio = bio_alloc(GFP_NOFS, 1);
                bio->bi_bdev = inode->i_sb->s_bdev;
@@ -449,12 +450,16 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
                        goto dump_bio;
                submit_bio(WRITE, bio);
+                nr_underway++;
        }
        if (redirty)
                redirty_page_for_writepage(wbc, page);
        unlock_page(page);
+        if (nr_underway == 0)
+                end_page_writeback(page);
        return 0;
 add_failed:
        /* We should never reach here, since we're only adding one vec */
@@ -475,13 +480,13 @@ static int metapage_readpage(struct file *fp, struct page *page)
 {
        struct inode *inode = page->mapping->host;
        struct bio *bio = NULL;
-        unsigned int block_offset;
+        int block_offset;
-        unsigned int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+        int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
        sector_t page_start;    /* address of page in fs blocks */
        sector_t pblock;
-        unsigned int xlen;
+        int xlen;
        unsigned int len;
-        unsigned int offset;
+        int offset;
        BUG_ON(!PageLocked(page));
        page_start = (sector_t)page->index <<
@@ -530,7 +535,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 {
        struct metapage *mp;
        int ret = 1;
-        unsigned int offset;
+        int offset;
        for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
                mp = page_to_mp(page, offset);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 644429acb8c0..7b698f2ec45a 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -147,7 +147,7 @@ int jfs_mount(struct super_block *sb)
         */
        if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
                ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
-                if (ipaimap2 == 0) {
+                if (!ipaimap2) {
                        jfs_err("jfs_mount: Faild to read AGGREGATE_I");
                        rc = -EIO;
                        goto errout35;
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 7971f37534a3..adcf92d3b603 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
                /*
                 * Wait for outstanding transactions to be written to log:
                 */
-                jfs_flush_journal(log, 2);
+                jfs_flush_journal(log, 1);
        /*
         * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
         *
         * remove file system from log active file system list.
         */
-        jfs_flush_journal(log, 2);
+        jfs_flush_journal(log, 1);
        /*
         * Make sure all metadata makes it to disk
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4e0a8493cef6..f8718de3505e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1103,8 +1103,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * Make sure dest inode number (if any) is what we think it is
         */
        rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
-        if (rc == 0) {
+        if (!rc) {
-                if ((new_ip == 0) || (ino != new_ip->i_ino)) {
+                if ((!new_ip) || (ino != new_ip->i_ino)) {
                        rc = -ESTALE;
                        goto out3;
                }
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 71984ee95346..7f24a0bb08ca 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -172,7 +172,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
         */
        t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
            << L2BPERDMAP;
-        t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50;
+        t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50;
        newFSCKSize = t32 << sbi->l2nbperpage;
        newFSCKAddress = newLogAddress - newFSCKSize;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 314bb4ff1ba8..70a14001c98f 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -598,6 +598,12 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_printf(seq, ",umask=%03o", sbi->umask);
        if (sbi->flag & JFS_NOINTEGRITY)
                seq_puts(seq, ",nointegrity");
+        if (sbi->nls_tab)
+                seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
+        if (sbi->flag & JFS_ERR_CONTINUE)
+                seq_printf(seq, ",errors=continue");
+        if (sbi->flag & JFS_ERR_PANIC)
+                seq_printf(seq, ",errors=panic");
 #ifdef CONFIG_QUOTA
        if (sbi->flag & JFS_USRQUOTA)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d070b18e539d..0b45fd3a4bfd 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -41,6 +41,48 @@ struct nlm_wait {
 static LIST_HEAD(nlm_blocked);
+/**
+ * nlmclnt_init - Set up per-NFS mount point lockd data structures
+ * @nlm_init: pointer to arguments structure; its protocol, nfs_version,
+ *            address and hostname members are read here
+ *
+ * Calls lockd_up() for the requested protocol and then looks up an
+ * nlm_host with nlmclnt_lookup_host().  NLM version 1 is requested when
+ * nfs_version is 2; any other nfs_version value requests NLM version 4.
+ * The hostname member is handed to strlen(), so it has to point at a
+ * NUL-terminated string.
+ *
+ * Returns pointer to an appropriate nlm_host struct,
+ * or an ERR_PTR value: the negative status from lockd_up() when that
+ * call fails, or -ENOLCK when the host lookup returns NULL.
+ */
+struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
+{
+        struct nlm_host *host;
+        u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
+        int status;
+        status = lockd_up(nlm_init->protocol);
+        if (status < 0)
+                return ERR_PTR(status);
+        host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+                                   nlm_init->protocol, nlm_version,
+                                   nlm_init->hostname,
+                                   strlen(nlm_init->hostname));
+        if (host == NULL) {
+                lockd_down();   /* balance the successful lockd_up() above */
+                return ERR_PTR(-ENOLCK);
+        }
+        return host;
+}
+EXPORT_SYMBOL_GPL(nlmclnt_init);
+/**
+ * nlmclnt_done - Release resources allocated by nlmclnt_init()
+ * @host: nlm_host structure reserved by nlmclnt_init()
+ *
+ * Hands @host to nlm_release_host() and then calls lockd_down(), the
+ * counterpart of the lockd_up() call made in nlmclnt_init().
+ */
+void nlmclnt_done(struct nlm_host *host)
+{
+        nlm_release_host(host);
+        lockd_down();
+}
+EXPORT_SYMBOL_GPL(nlmclnt_done);
 /*
 * Queue up a lock for blocking so that the GRANTED request can see it
 */
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index a10343bed160..b6b74a60e1eb 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -145,34 +145,21 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
        BUG_ON(req->a_args.lock.fl.fl_ops != NULL);
 }
-/*
+/**
- * This is the main entry point for the NLM client.
+ * nlmclnt_proc - Perform a single client-side lock request
+ * @host: address of a valid nlm_host context representing the NLM server
+ * @cmd: fcntl-style file lock operation to perform
+ * @fl: address of arguments for the lock operation
+ *
 */
-int
+int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
-nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 {
-        struct rpc_clnt         *client = NFS_CLIENT(inode);
-        struct sockaddr_in      addr;
-        struct nfs_server       *nfssrv = NFS_SERVER(inode);
-        struct nlm_host         *host;
        struct nlm_rqst         *call;
        sigset_t                oldset;
        unsigned long           flags;
-        int                     status, vers;
+        int                     status;
-        vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1;
-        if (NFS_PROTO(inode)->version > 3) {
-                printk(KERN_NOTICE "NFSv4 file locking not implemented!\n");
-                return -ENOLCK;
-        }
-        rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
-        host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
-                                   nfssrv->nfs_client->cl_hostname,
-                                   strlen(nfssrv->nfs_client->cl_hostname));
-        if (host == NULL)
-                return -ENOLCK;
+        nlm_get_host(host);
        call = nlm_alloc_call(host);
        if (call == NULL)
                return -ENOMEM;
@@ -219,7 +206,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
        dprintk("lockd: clnt proc returns %d\n", status);
        return status;
 }
-EXPORT_SYMBOL(nlmclnt_proc);
+EXPORT_SYMBOL_GPL(nlmclnt_proc);
 /*
 * Allocate an NLM RPC call struct
@@ -257,7 +244,7 @@ void nlm_release_call(struct nlm_rqst *call)
 static void nlmclnt_rpc_release(void *data)
 {
-        return nlm_release_call(data);
+        nlm_release_call(data);
 }
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 572601e98dcd..ca6b16fc3101 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -34,10 +34,10 @@ static DEFINE_MUTEX(nlm_host_mutex);
 static void                     nlm_gc_hosts(void);
 static struct nsm_handle *      __nsm_find(const struct sockaddr_in *,
-                                        const char *, int, int);
+                                        const char *, unsigned int, int);
 static struct nsm_handle *      nsm_find(const struct sockaddr_in *sin,
                                         const char *hostname,
-                                         int hostname_len);
+                                         unsigned int hostname_len);
 /*
 * Common host lookup routine for server & client
@@ -45,7 +45,8 @@ static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
 static struct nlm_host *
 nlm_lookup_host(int server, const struct sockaddr_in *sin,
                int proto, int version, const char *hostname,
-                int hostname_len, const struct sockaddr_in *ssin)
+                unsigned int hostname_len,
+                const struct sockaddr_in *ssin)
 {
        struct hlist_head *chain;
        struct hlist_node *pos;
@@ -176,7 +177,7 @@ nlm_destroy_host(struct nlm_host *host)
 */
 struct nlm_host *
 nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
-                        const char *hostname, int hostname_len)
+                        const char *hostname, unsigned int hostname_len)
 {
        struct sockaddr_in ssin = {0};
@@ -189,7 +190,7 @@ nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
 */
 struct nlm_host *
 nlmsvc_lookup_host(struct svc_rqst *rqstp,
-                        const char *hostname, int hostname_len)
+                        const char *hostname, unsigned int hostname_len)
 {
        struct sockaddr_in ssin = {0};
@@ -307,7 +308,8 @@ void nlm_release_host(struct nlm_host *host)
 * Release all resources held by that peer.
 */
 void nlm_host_rebooted(const struct sockaddr_in *sin,
-                                const char *hostname, int hostname_len,
+                                const char *hostname,
+                                unsigned int hostname_len,
                                u32 new_state)
 {
        struct hlist_head *chain;
@@ -377,8 +379,13 @@ nlm_shutdown_hosts(void)
        /* First, make all hosts eligible for gc */
        dprintk("lockd: nuking all hosts...\n");
        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-                hlist_for_each_entry(host, pos, chain, h_hash)
+                hlist_for_each_entry(host, pos, chain, h_hash) {
                        host->h_expires = jiffies - 1;
+                        if (host->h_rpcclnt) {
+                                rpc_shutdown_client(host->h_rpcclnt);
+                                host->h_rpcclnt = NULL;
+                        }
+                }
        }
        /* Then, perform a garbage collection pass */
@@ -449,7 +456,7 @@ static DEFINE_MUTEX(nsm_mutex);
 static struct nsm_handle *
 __nsm_find(const struct sockaddr_in *sin,
-                const char *hostname, int hostname_len,
+                const char *hostname, unsigned int hostname_len,
                int create)
 {
        struct nsm_handle *nsm = NULL;
@@ -503,7 +510,8 @@ out:
 }
 static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
+nsm_find(const struct sockaddr_in *sin, const char *hostname,
+         unsigned int hostname_len)
 {
        return __nsm_find(sin, hostname, hostname_len, 1);
 }
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 82e2192a0d5c..08226464e563 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -219,19 +219,6 @@ lockd(struct svc_rqst *rqstp)
        module_put_and_exit(0);
 }
-static int find_socket(struct svc_serv *serv, int proto)
-{
-        struct svc_sock *svsk;
-        int found = 0;
-        list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
-                if (svsk->sk_sk->sk_protocol == proto) {
-                        found = 1;
-                        break;
-                }
-        return found;
-}
 /*
 * Make any sockets that are needed but not present.
 * If nlm_udpport or nlm_tcpport were set as module
@@ -240,17 +227,25 @@ static int find_socket(struct svc_serv *serv, int proto)
 static int make_socks(struct svc_serv *serv, int proto)
 {
        static int warned;
+        struct svc_xprt *xprt;
        int err = 0;
-        if (proto == IPPROTO_UDP || nlm_udpport)
+        if (proto == IPPROTO_UDP || nlm_udpport) {
-                if (!find_socket(serv, IPPROTO_UDP))
+                xprt = svc_find_xprt(serv, "udp", 0, 0);
-                        err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport,
+                if (!xprt)
-                                                SVC_SOCK_DEFAULTS);
+                        err = svc_create_xprt(serv, "udp", nlm_udpport,
-        if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport))
+                                              SVC_SOCK_DEFAULTS);
-                if (!find_socket(serv, IPPROTO_TCP))
+                else
-                        err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport,
+                        svc_xprt_put(xprt);
-                                                SVC_SOCK_DEFAULTS);
+        }
+        if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+                xprt = svc_find_xprt(serv, "tcp", 0, 0);
+                if (!xprt)
+                        err = svc_create_xprt(serv, "tcp", nlm_tcpport,
+                                              SVC_SOCK_DEFAULTS);
+                else
+                        svc_xprt_put(xprt);
+        }
        if (err >= 0) {
                warned = 0;
                err = 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bf27b6c6cb6b..385437e3387d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -84,6 +84,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+        int rc = rpc_success;
        dprintk("lockd: TEST4        called\n");
        resp->cookie = argp->cookie;
@@ -91,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept test requests during grace period */
        if (nlmsvc_grace_period) {
                resp->status = nlm_lck_denied_grace_period;
-                return rpc_success;
+                return rc;
        }
        /* Obtain client and file */
@@ -101,12 +102,13 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Now check for conflicting locks */
        resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
        if (resp->status == nlm_drop_reply)
-                return rpc_drop_reply;
+                rc = rpc_drop_reply;
+        else
+                dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
-        dprintk("lockd: TEST4          status %d\n", ntohl(resp->status));
        nlm_release_host(host);
        nlm_release_file(file);
-        return rpc_success;
+        return rc;
 }
 static __be32
@@ -115,6 +117,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+        int rc = rpc_success;
        dprintk("lockd: LOCK          called\n");
@@ -123,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept new lock requests during grace period */
        if (nlmsvc_grace_period && !argp->reclaim) {
                resp->status = nlm_lck_denied_grace_period;
-                return rpc_success;
+                return rc;
        }
        /* Obtain client and file */
@@ -146,12 +149,13 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
                                        argp->block, &argp->cookie);
        if (resp->status == nlm_drop_reply)
-                return rpc_drop_reply;
+                rc = rpc_drop_reply;
+        else
+                dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
-        dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
        nlm_release_host(host);
        nlm_release_file(file);
-        return rpc_success;
+        return rc;
 }
 static __be32
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d120ec39bcb0..2f4d8fa66689 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -501,25 +501,29 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
                        block, block->b_flags, block->b_fl);
                if (block->b_flags & B_TIMED_OUT) {
                        nlmsvc_unlink_block(block);
-                        return nlm_lck_denied;
+                        ret = nlm_lck_denied;
+                        goto out;
                }
                if (block->b_flags & B_GOT_CALLBACK) {
+                        nlmsvc_unlink_block(block);
                        if (block->b_fl != NULL
                                        && block->b_fl->fl_type != F_UNLCK) {
                                lock->fl = *block->b_fl;
                                goto conf_lock;
-                        }
+                        } else {
-                        else {
+                                ret = nlm_granted;
-                                nlmsvc_unlink_block(block);
+                                goto out;
-                                return nlm_granted;
                        }
                }
-                return nlm_drop_reply;
+                ret = nlm_drop_reply;
+                goto out;
        }
        error = vfs_test_lock(file->f_file, &lock->fl);
-        if (error == -EINPROGRESS)
+        if (error == -EINPROGRESS) {
-                return nlmsvc_defer_lock_rqst(rqstp, block);
+                ret = nlmsvc_defer_lock_rqst(rqstp, block);
+                goto out;
+        }
        if (error) {
                ret = nlm_lck_denied_nolocks;
                goto out;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 9cd5c8b37593..88379cc6e0b1 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -113,6 +113,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+        int rc = rpc_success;
        dprintk("lockd: TEST          called\n");
        resp->cookie = argp->cookie;
@@ -120,7 +121,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept test requests during grace period */
        if (nlmsvc_grace_period) {
                resp->status = nlm_lck_denied_grace_period;
-                return rpc_success;
+                return rc;
        }
        /* Obtain client and file */
@@ -130,13 +131,14 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Now check for conflicting locks */
        resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
        if (resp->status == nlm_drop_reply)
-                return rpc_drop_reply;
+                rc = rpc_drop_reply;
+        else
+                dprintk("lockd: TEST          status %d vers %d\n",
+                        ntohl(resp->status), rqstp->rq_vers);
-        dprintk("lockd: TEST          status %d vers %d\n",
-                ntohl(resp->status), rqstp->rq_vers);
        nlm_release_host(host);
        nlm_release_file(file);
-        return rpc_success;
+        return rc;
 }
 static __be32
@@ -145,6 +147,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+        int rc = rpc_success;
        dprintk("lockd: LOCK          called\n");
@@ -153,7 +156,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept new lock requests during grace period */
        if (nlmsvc_grace_period && !argp->reclaim) {
                resp->status = nlm_lck_denied_grace_period;
-                return rpc_success;
+                return rc;
        }
        /* Obtain client and file */
@@ -176,12 +179,13 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
                                               argp->block, &argp->cookie));
        if (resp->status == nlm_drop_reply)
-                return rpc_drop_reply;
+                rc = rpc_drop_reply;
+        else
+                dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
-        dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
        nlm_release_host(host);
        nlm_release_file(file);
-        return rpc_success;
+        return rc;
 }
 static __be32
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 84ebba33b98d..dbbefbcd6712 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -87,7 +87,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
        unsigned int    hash;
        __be32          nfserr;
-        nlm_debug_print_fh("nlm_file_lookup", f);
+        nlm_debug_print_fh("nlm_lookup_file", f);
        hash = file_hash(f);
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 633653bff944..3e459e18cc31 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -612,8 +612,7 @@ const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
         * called with BKL held.
         */
        static char buf[2*NLM_MAXCOOKIELEN+1];
-        int i;
+        unsigned int i, len = sizeof(buf);
-        int len = sizeof(buf);
        char *p = buf;
        len--;  /* allow for trailing \0 */
diff --git a/fs/locks.c b/fs/locks.c
index 0127a2846819..8b8388eca05e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -696,17 +696,28 @@ EXPORT_SYMBOL(posix_test_lock);
 * Note: the above assumption may not be true when handling lock requests
 * from a broken NFS client. But broken NFS clients have a lot more to
 * worry about than proper deadlock detection anyway... --okir
+ *
+ * However, the failure of this assumption (also possible in the case of
+ * multiple tasks sharing the same open file table) also means there's no
+ * guarantee that the loop below will terminate.  As a hack, we give up
+ * after a few iterations.
 */
+#define MAX_DEADLK_ITERATIONS 10
 static int posix_locks_deadlock(struct file_lock *caller_fl,
                                struct file_lock *block_fl)
 {
        struct file_lock *fl;
+        int i = 0;
 next_task:
        if (posix_same_owner(caller_fl, block_fl))
                return 1;
        list_for_each_entry(fl, &blocked_list, fl_link) {
                if (posix_same_owner(fl, block_fl)) {
+                        if (i++ > MAX_DEADLK_ITERATIONS)
+                                return 0;
                        fl = fl->fl_next;
                        block_fl = fl;
                        goto next_task;
diff --git a/fs/namei.c b/fs/namei.c
index 3b993db26cee..73e2e665817a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1605,7 +1605,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
        if (S_ISLNK(inode->i_mode))
                return -ELOOP;
        
-        if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
+        if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
                return -EISDIR;
        /*
@@ -1620,7 +1620,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
                        return -EACCES;
                flag &= ~O_TRUNC;
-        } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
+        } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
                return -EROFS;
        error = vfs_permission(nd, acc_mode);
diff --git a/fs/namespace.c b/fs/namespace.c
index 06083885b21e..61bf376e29e8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,8 +41,8 @@ static struct kmem_cache *mnt_cache __read_mostly;
 static struct rw_semaphore namespace_sem;
 /* /sys/fs */
-decl_subsys(fs, NULL, NULL);
+struct kobject *fs_kobj;
-EXPORT_SYMBOL_GPL(fs_subsys);
+EXPORT_SYMBOL_GPL(fs_kobj);
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -1861,10 +1861,9 @@ void __init mnt_init(void)
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __FUNCTION__, err);
-        err = subsystem_register(&fs_subsys);
+        fs_kobj = kobject_create_and_add("fs", NULL);
-        if (err)
+        if (!fs_kobj)
-                printk(KERN_WARNING "%s: subsystem_register error: %d\n",
+                printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__);
-                        __FUNCTION__, err);
        init_rootfs();
        init_mount_tree();
 }
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a796be5051bf..bd185a572a23 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -73,8 +73,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
        complete(&nfs_callback_info.started);
        for(;;) {
-                char buf[RPC_MAX_ADDRBUFLEN];
                if (signalled()) {
                        if (nfs_callback_info.users == 0)
                                break;
@@ -92,8 +90,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
                                        __FUNCTION__, -err);
                        break;
                }
-                dprintk("%s: request from %s\n", __FUNCTION__,
-                                svc_print_addr(rqstp, buf, sizeof(buf)));
                svc_process(rqstp);
        }
@@ -123,8 +119,8 @@ int nfs_callback_up(void)
        if (!serv)
                goto out_err;
-        ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport,
+        ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
-                                                        SVC_SOCK_ANONYMOUS);
+                              SVC_SOCK_ANONYMOUS);
        if (ret <= 0)
                goto out_destroy;
        nfs_callback_tcpport = ret;
@@ -168,12 +164,11 @@ void nfs_callback_down(void)
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
-        struct sockaddr_in *addr = svc_addr_in(rqstp);
        struct nfs_client *clp;
        char buf[RPC_MAX_ADDRBUFLEN];
        /* Don't talk to strangers */
-        clp = nfs_find_client(addr, 4);
+        clp = nfs_find_client(svc_addr(rqstp), 4);
        if (clp == NULL)
                return SVC_DROP;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index c2bb14e053e1..bb25d2135ff1 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,7 +38,7 @@ struct cb_compound_hdr_res {
 };
 struct cb_getattrargs {
-        struct sockaddr_in *addr;
+        struct sockaddr *addr;
        struct nfs_fh fh;
        uint32_t bitmap[2];
 };
@@ -53,7 +53,7 @@ struct cb_getattrres {
 };
 struct cb_recallargs {
-        struct sockaddr_in *addr;
+        struct sockaddr *addr;
        struct nfs_fh fh;
        nfs4_stateid stateid;
        uint32_t truncate;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 72e55d83756d..15f7785048d3 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,7 +12,9 @@
 #include "delegation.h"
 #include "internal.h"
+#ifdef NFS_DEBUG
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
+#endif
 
 __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
 {
@@ -20,12 +22,16 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
        struct nfs_delegation *delegation;
        struct nfs_inode *nfsi;
        struct inode *inode;
-        
        res->bitmap[0] = res->bitmap[1] = 0;
        res->status = htonl(NFS4ERR_BADHANDLE);
        clp = nfs_find_client(args->addr, 4);
        if (clp == NULL)
                goto out;
+        dprintk("NFS: GETATTR callback request from %s\n",
+                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
        inode = nfs_delegation_find_inode(clp, &args->fh);
        if (inode == NULL)
                goto out_putclient;
@@ -65,23 +71,32 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
        clp = nfs_find_client(args->addr, 4);
        if (clp == NULL)
                goto out;
-        inode = nfs_delegation_find_inode(clp, &args->fh);
-        if (inode == NULL)
+        dprintk("NFS: RECALL callback request from %s\n",
-                goto out_putclient;
+                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
-        /* Set up a helper thread to actually return the delegation */
-        switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
+        do {
-                case 0:
+                struct nfs_client *prev = clp;
-                        res = 0;
-                        break;
+                inode = nfs_delegation_find_inode(clp, &args->fh);
-                case -ENOENT:
+                if (inode != NULL) {
-                        res = htonl(NFS4ERR_BAD_STATEID);
+                        /* Set up a helper thread to actually return the delegation */
-                        break;
+                        switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
-                default:
+                                case 0:
-                        res = htonl(NFS4ERR_RESOURCE);
+                                        res = 0;
-        }
+                                        break;
-        iput(inode);
+                                case -ENOENT:
-out_putclient:
+                                        if (res != 0)
-        nfs_put_client(clp);
+                                                res = htonl(NFS4ERR_BAD_STATEID);
+                                        break;
+                                default:
+                                        res = htonl(NFS4ERR_RESOURCE);
+                        }
+                        iput(inode);
+                }
+                clp = nfs_find_client_next(prev);
+                nfs_put_client(prev);
+        } while (clp != NULL);
 out:
        dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
        return res;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 058ade7efe79..c63eb720b68b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -139,7 +139,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
        if (unlikely(status != 0))
                return status;
        /* We do not like overly long tags! */
-        if (hdr->taglen > CB_OP_TAGLEN_MAXSZ-12 || hdr->taglen < 0) {
+        if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
                printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
                                __FUNCTION__, hdr->taglen);
                return htonl(NFS4ERR_RESOURCE);
@@ -176,7 +176,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
        status = decode_fh(xdr, &args->fh);
        if (unlikely(status != 0))
                goto out;
-        args->addr = svc_addr_in(rqstp);
+        args->addr = svc_addr(rqstp);
        status = decode_bitmap(xdr, args->bitmap);
 out:
        dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status));
@@ -188,7 +188,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
        __be32 *p;
        __be32 status;
-        args->addr = svc_addr_in(rqstp);
+        args->addr = svc_addr(rqstp);
        status = decode_stateid(xdr, &args->stateid);
        if (unlikely(status != 0))
                goto out;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 70587f383f10..c5c0175898f6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -34,6 +34,8 @@
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <asm/system.h>
@@ -93,22 +95,30 @@ struct rpc_program		nfsacl_program = {
 };
 #endif  /* CONFIG_NFS_V3_ACL */
+struct nfs_client_initdata {   /* parameters used to look up or allocate an nfs_client */
+        const char *hostname;  /* may be NULL; nfs_alloc_client() kstrdup()s it when set */
+        const struct sockaddr *addr;   /* addrlen bytes are copied into clp->cl_addr */
+        size_t addrlen;        /* saved as clp->cl_addrlen */
+        const struct nfs_rpc_ops *rpc_ops;     /* saved as clp->rpc_ops; ->version == 4 brings up the callback service */
+        int proto;     /* saved as clp->cl_proto, later used as the rpc_create() protocol */
+};
 /*
 * Allocate a shared client record
 *
 * Since these are allocated/deallocated very rarely, we don't
 * bother putting them in a slab cache...
 */
-static struct nfs_client *nfs_alloc_client(const char *hostname,
+static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
-                                           const struct sockaddr_in *addr,
-                                           int nfsversion)
 {
        struct nfs_client *clp;
        if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
                goto error_0;
-        if (nfsversion == 4) {
+        clp->rpc_ops = cl_init->rpc_ops;
+        if (cl_init->rpc_ops->version == 4) {
                if (nfs_callback_up() < 0)
                        goto error_2;
                __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
@@ -117,11 +127,11 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
        atomic_set(&clp->cl_count, 1);
        clp->cl_cons_state = NFS_CS_INITING;
-        clp->cl_nfsversion = nfsversion;
+        memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
-        memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
+        clp->cl_addrlen = cl_init->addrlen;
-        if (hostname) {
+        if (cl_init->hostname) {
-                clp->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+                clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
                if (!clp->cl_hostname)
                        goto error_3;
        }
@@ -129,6 +139,8 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
        INIT_LIST_HEAD(&clp->cl_superblocks);
        clp->cl_rpcclient = ERR_PTR(-EINVAL);
+        clp->cl_proto = cl_init->proto;
 #ifdef CONFIG_NFS_V4
        init_rwsem(&clp->cl_sem);
        INIT_LIST_HEAD(&clp->cl_delegations);
@@ -166,7 +178,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 */
 static void nfs_free_client(struct nfs_client *clp)
 {
-        dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion);
+        dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
        nfs4_shutdown_client(clp);
@@ -203,76 +215,148 @@ void nfs_put_client(struct nfs_client *clp)
        }
 }
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
+                                 const struct sockaddr_in *sa2)
+{      /* IPv4 helper: non-zero when both sin_addr values are equal */
+        return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;   /* sin_port is not compared */
+}
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1,
+                                 const struct sockaddr_in6 *sa2)
+{      /* IPv6 helper: returns ipv6_addr_equal() of the two sin6_addr fields */
+        return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr);      /* no other sockaddr_in6 field is compared */
+}
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+                                 const struct sockaddr *sa2)
+{      /* compare the IP address parts of two socket addresses via the per-family helpers */
+        switch (sa1->sa_family) {      /* only sa1's family is inspected; sa2 is cast to the same type */
+        case AF_INET:
+                return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
+                                (const struct sockaddr_in *)sa2);
+        case AF_INET6:
+                return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
+                                (const struct sockaddr_in6 *)sa2);
+        }
+        BUG(); /* reached for any family other than AF_INET or AF_INET6 */
+}
 /*
- * Find a client by address
+ * Find a client by IP address and protocol version
- * - caller must hold nfs_client_lock
+ * - returns NULL if no such client
  */
-static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port)
+struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
 {
         struct nfs_client *clp;
+        spin_lock(&nfs_client_lock);   /* the list walk below runs under nfs_client_lock, taken here */
         list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+                struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
                 /* Don't match clients that failed to initialise properly */
-                if (clp->cl_cons_state < 0)
+                if (clp->cl_cons_state != NFS_CS_READY)        /* only clients in the READY state are candidates */
                         continue;
                 /* Different NFS versions cannot share the same nfs_client */
-                if (clp->cl_nfsversion != nfsversion)
+                if (clp->rpc_ops->version != nfsversion)
                         continue;
-                if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr,
+                if (addr->sa_family != clap->sa_family)        /* families must agree before the per-family compare */
-                           sizeof(clp->cl_addr.sin_addr)) != 0)
+                        continue;
+                /* Match only the IP address, not the port number */
+                if (!nfs_sockaddr_match_ipaddr(addr, clap))
                         continue;
-                if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
+                atomic_inc(&clp->cl_count);    /* take a reference on the match before dropping the lock */
-                        goto found;
+                spin_unlock(&nfs_client_lock);
+                return clp;
         }
+        spin_unlock(&nfs_client_lock);
         return NULL;
-found:
-        atomic_inc(&clp->cl_count);
-        return clp;
 }
 /*
  * Find a client by IP address and protocol version
  * - returns NULL if no such client
  */
-struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
 {
-        struct nfs_client *clp;
+        struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;       /* search key: address of the client passed in */
+        u32 nfsvers = clp->rpc_ops->version;   /* search key: NFS version of the client passed in */
         spin_lock(&nfs_client_lock);
-        clp = __nfs_find_client(addr, nfsversion, 0);
+        list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {   /* resume the walk after the client passed in; clp is reused as the cursor */
+                struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+                /* Don't match clients that failed to initialise properly */
+                if (clp->cl_cons_state != NFS_CS_READY)
+                        continue;
+                /* Different NFS versions cannot share the same nfs_client */
+                if (clp->rpc_ops->version != nfsvers)
+                        continue;
+                if (sap->sa_family != clap->sa_family)
+                        continue;
+                /* Match only the IP address, not the port number */
+                if (!nfs_sockaddr_match_ipaddr(sap, clap))
+                        continue;
+                atomic_inc(&clp->cl_count);    /* reference on the new match only; the starting client's count is not touched here */
+                spin_unlock(&nfs_client_lock);
+                return clp;
+        }
         spin_unlock(&nfs_client_lock);
-        if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) {
+        return NULL;
-                nfs_put_client(clp);
+}
-                clp = NULL;
+/*
+ * Find an nfs_client on the list that matches the initialisation data
+ * that is supplied.  Takes no lock itself; nfs_get_client() calls it with nfs_client_lock held.
+ */
+static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
+{
+        struct nfs_client *clp;
+        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+                /* Don't match clients that failed to initialise properly */
+                if (clp->cl_cons_state < 0)    /* unlike nfs_find_client(), not restricted to NFS_CS_READY */
+                        continue;
+                /* Different NFS versions cannot share the same nfs_client */
+                if (clp->rpc_ops != data->rpc_ops)     /* pointer comparison of the op vectors, not of ->version */
+                        continue;
+                if (clp->cl_proto != data->proto)      /* the transport protocol must match as well */
+                        continue;
+                /* Match the full socket address */
+                if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)      /* bytewise over all of cl_addr, not just addrlen bytes */
+                        continue;
+                atomic_inc(&clp->cl_count);    /* the match is returned with its count raised */
+                return clp;
         }
-        return clp;
+        return NULL;
 }
 /*
 * Look up a client by IP address and protocol version
 * - creates a new record if one doesn't yet exist
 */
-static struct nfs_client *nfs_get_client(const char *hostname,
+static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
-                                         const struct sockaddr_in *addr,
-                                         int nfsversion)
 {
        struct nfs_client *clp, *new = NULL;
        int error;
-        dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n",
+        dprintk("--> nfs_get_client(%s,v%u)\n",
-                hostname ?: "", NIPQUAD(addr->sin_addr),
+                cl_init->hostname ?: "", cl_init->rpc_ops->version);
-                addr->sin_port, nfsversion);
        /* see if the client already exists */
        do {
                spin_lock(&nfs_client_lock);
-                clp = __nfs_find_client(addr, nfsversion, 1);
+                clp = nfs_match_client(cl_init);
                if (clp)
                        goto found_client;
                if (new)
@@ -280,7 +364,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
                spin_unlock(&nfs_client_lock);
-                new = nfs_alloc_client(hostname, addr, nfsversion);
+                new = nfs_alloc_client(cl_init);
        } while (new);
        return ERR_PTR(-ENOMEM);
@@ -302,7 +386,7 @@ found_client:
        if (new)
                nfs_free_client(new);
-        error = wait_event_interruptible(nfs_client_active_wq,
+        error = wait_event_killable(nfs_client_active_wq,
                                clp->cl_cons_state != NFS_CS_INITING);
        if (error < 0) {
                nfs_put_client(clp);
@@ -344,12 +428,16 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
        switch (proto) {
        case XPRT_TRANSPORT_TCP:
        case XPRT_TRANSPORT_RDMA:
-                if (!to->to_initval)
+                if (to->to_initval == 0)
                        to->to_initval = 60 * HZ;
                if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
                        to->to_initval = NFS_MAX_TCP_TIMEOUT;
                to->to_increment = to->to_initval;
                to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+                if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
+                        to->to_maxval = NFS_MAX_TCP_TIMEOUT;
+                if (to->to_maxval < to->to_initval)
+                        to->to_maxval = to->to_initval;
                to->to_exponential = 0;
                break;
        case XPRT_TRANSPORT_UDP:
@@ -367,19 +455,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 /*
 * Create an RPC client handle
 */
-static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
+static int nfs_create_rpc_client(struct nfs_client *clp,
-                                                unsigned int timeo,
+                                 const struct rpc_timeout *timeparms,
-                                                unsigned int retrans,
+                                 rpc_authflavor_t flavor,
-                                                rpc_authflavor_t flavor,
+                                 int flags)
-                                                int flags)
 {
-        struct rpc_timeout      timeparms;
        struct rpc_clnt         *clnt = NULL;
        struct rpc_create_args args = {
-                .protocol       = proto,
+                .protocol       = clp->cl_proto,
                .address        = (struct sockaddr *)&clp->cl_addr,
-                .addrsize       = sizeof(clp->cl_addr),
+                .addrsize       = clp->cl_addrlen,
-                .timeout        = &timeparms,
+                .timeout        = timeparms,
                .servername     = clp->cl_hostname,
                .program        = &nfs_program,
                .version        = clp->rpc_ops->version,
@@ -390,10 +476,6 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
        if (!IS_ERR(clp->cl_rpcclient))
                return 0;
-        nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
-        clp->retrans_timeo = timeparms.to_initval;
-        clp->retrans_count = timeparms.to_retries;
        clnt = rpc_create(&args);
        if (IS_ERR(clnt)) {
                dprintk("%s: cannot create RPC client. Error = %ld\n",
@@ -410,11 +492,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
 */
 static void nfs_destroy_server(struct nfs_server *server)
 {
-        if (!IS_ERR(server->client_acl))
-                rpc_shutdown_client(server->client_acl);
         if (!(server->flags & NFS_MOUNT_NONLM))
-                lockd_down();   /* release rpc.lockd */
+                nlmclnt_done(server->nlm_host);        /* done with the nlm_host that nfs_start_lockd() stored */
 }
 /*
@@ -422,20 +501,29 @@ static void nfs_destroy_server(struct nfs_server *server)
 */
 static int nfs_start_lockd(struct nfs_server *server)
 {
-        int error = 0;
+        struct nlm_host *host;
+        struct nfs_client *clp = server->nfs_client;
+        struct nlmclnt_initdata nlm_init = {   /* arguments for nlmclnt_init(), filled from the nfs_client and mount flags */
+                .hostname       = clp->cl_hostname,
+                .address        = (struct sockaddr *)&clp->cl_addr,
+                .addrlen        = clp->cl_addrlen,
+                .protocol       = server->flags & NFS_MOUNT_TCP ?
+                                                IPPROTO_TCP : IPPROTO_UDP,
+                .nfs_version    = clp->rpc_ops->version,
+        };
-        if (server->nfs_client->cl_nfsversion > 3)
+        if (nlm_init.nfs_version > 3)
-                goto out;
+                return 0;      /* no NLM host is set up for versions above 3 */
         if (server->flags & NFS_MOUNT_NONLM)
-                goto out;
+                return 0;      /* NFS_MOUNT_NONLM set: nothing to set up */
-        error = lockd_up((server->flags & NFS_MOUNT_TCP) ?
-                        IPPROTO_TCP : IPPROTO_UDP);
+        host = nlmclnt_init(&nlm_init);
-        if (error < 0)
+        if (IS_ERR(host))
-                server->flags |= NFS_MOUNT_NONLM;
+                return PTR_ERR(host);  /* error goes back to the caller; NFS_MOUNT_NONLM is no longer set on failure */
-        else
-                server->destroy = nfs_destroy_server;
+        server->nlm_host = host;       /* kept so nfs_destroy_server() can pass it to nlmclnt_done() */
-out:
+        server->destroy = nfs_destroy_server;
-        return error;
+        return 0;
 }
 /*
@@ -444,7 +532,7 @@ out:
 #ifdef CONFIG_NFS_V3_ACL
 static void nfs_init_server_aclclient(struct nfs_server *server)
 {
-        if (server->nfs_client->cl_nfsversion != 3)
+        if (server->nfs_client->rpc_ops->version != 3)
                goto out_noacl;
        if (server->flags & NFS_MOUNT_NOACL)
                goto out_noacl;
@@ -471,7 +559,9 @@ static inline void nfs_init_server_aclclient(struct nfs_server *server)
 /*
 * Create a general RPC client
 */
-static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t pseudoflavour)
+static int nfs_init_server_rpcclient(struct nfs_server *server,
+                const struct rpc_timeout *timeo,
+                rpc_authflavor_t pseudoflavour)
 {
        struct nfs_client *clp = server->nfs_client;
@@ -481,6 +571,11 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
                return PTR_ERR(server->client);
        }
+        memcpy(&server->client->cl_timeout_default,
+                        timeo,
+                        sizeof(server->client->cl_timeout_default));
+        server->client->cl_timeout = &server->client->cl_timeout_default;
        if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
                struct rpc_auth *auth;
@@ -494,10 +589,6 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
        if (server->flags & NFS_MOUNT_SOFT)
                server->client->cl_softrtry = 1;
-        server->client->cl_intr = 0;
-        if (server->flags & NFS4_MOUNT_INTR)
-                server->client->cl_intr = 1;
        return 0;
 }
@@ -505,6 +596,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
 * Initialise an NFS2 or NFS3 client
 */
 static int nfs_init_client(struct nfs_client *clp,
+                           const struct rpc_timeout *timeparms,
                           const struct nfs_parsed_mount_data *data)
 {
        int error;
@@ -515,18 +607,11 @@ static int nfs_init_client(struct nfs_client *clp,
                return 0;
        }
-        /* Check NFS protocol revision and initialize RPC op vector */
-        clp->rpc_ops = &nfs_v2_clientops;
-#ifdef CONFIG_NFS_V3
-        if (clp->cl_nfsversion == 3)
-                clp->rpc_ops = &nfs_v3_clientops;
-#endif
        /*
         * Create a client RPC handle for doing FSSTAT with UNIX auth only
         * - RFC 2623, sec 2.3.2
         */
-        error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
+        error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0);
-                                data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
        if (error < 0)
                goto error;
        nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -544,25 +629,34 @@ error:
 static int nfs_init_server(struct nfs_server *server,
                           const struct nfs_parsed_mount_data *data)
 {
+        struct nfs_client_initdata cl_init = {
+                .hostname = data->nfs_server.hostname,
+                .addr = (const struct sockaddr *)&data->nfs_server.address,
+                .addrlen = data->nfs_server.addrlen,
+                .rpc_ops = &nfs_v2_clientops,
+                .proto = data->nfs_server.protocol,
+        };
+        struct rpc_timeout timeparms;
        struct nfs_client *clp;
-        int error, nfsvers = 2;
+        int error;
        dprintk("--> nfs_init_server()\n");
 #ifdef CONFIG_NFS_V3
        if (data->flags & NFS_MOUNT_VER3)
-                nfsvers = 3;
+                cl_init.rpc_ops = &nfs_v3_clientops;
 #endif
        /* Allocate or find a client reference we can use */
-        clp = nfs_get_client(data->nfs_server.hostname,
+        clp = nfs_get_client(&cl_init);
-                                &data->nfs_server.address, nfsvers);
        if (IS_ERR(clp)) {
                dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
                return PTR_ERR(clp);
        }
-        error = nfs_init_client(clp, data);
+        nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+                        data->timeo, data->retrans);
+        error = nfs_init_client(clp, &timeparms, data);
        if (error < 0)
                goto error;
@@ -586,7 +680,7 @@ static int nfs_init_server(struct nfs_server *server,
        if (error < 0)
                goto error;
-        error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
+        error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
        if (error < 0)
                goto error;
@@ -732,6 +826,9 @@ static struct nfs_server *nfs_alloc_server(void)
        INIT_LIST_HEAD(&server->client_link);
        INIT_LIST_HEAD(&server->master_link);
+        init_waitqueue_head(&server->active_wq);
+        atomic_set(&server->active, 0);
        server->io_stats = nfs_alloc_iostats();
        if (!server->io_stats) {
                kfree(server);
@@ -755,6 +852,9 @@ void nfs_free_server(struct nfs_server *server)
        if (server->destroy != NULL)
                server->destroy(server);
+        if (!IS_ERR(server->client_acl))
+                rpc_shutdown_client(server->client_acl);
        if (!IS_ERR(server->client))
                rpc_shutdown_client(server->client);
@@ -840,7 +940,7 @@ error:
 * Initialise an NFS4 client record
 */
 static int nfs4_init_client(struct nfs_client *clp,
-                int proto, int timeo, int retrans,
+                const struct rpc_timeout *timeparms,
                const char *ip_addr,
                rpc_authflavor_t authflavour)
 {
@@ -855,7 +955,7 @@ static int nfs4_init_client(struct nfs_client *clp,
        /* Check NFS protocol revision and initialize RPC op vector */
        clp->rpc_ops = &nfs_v4_clientops;
-        error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour,
+        error = nfs_create_rpc_client(clp, timeparms, authflavour,
                                        RPC_CLNT_CREATE_DISCRTRY);
        if (error < 0)
                goto error;
@@ -882,23 +982,32 @@ error:
 * Set up an NFS4 client
 */
 static int nfs4_set_client(struct nfs_server *server,
-                const char *hostname, const struct sockaddr_in *addr,
+                const char *hostname,
+                const struct sockaddr *addr,
+                const size_t addrlen,
                const char *ip_addr,
                rpc_authflavor_t authflavour,
-                int proto, int timeo, int retrans)
+                int proto, const struct rpc_timeout *timeparms)
 {
+        struct nfs_client_initdata cl_init = {
+                .hostname = hostname,
+                .addr = addr,
+                .addrlen = addrlen,
+                .rpc_ops = &nfs_v4_clientops,
+                .proto = proto,
+        };
        struct nfs_client *clp;
        int error;
        dprintk("--> nfs4_set_client()\n");
        /* Allocate or find a client reference we can use */
-        clp = nfs_get_client(hostname, addr, 4);
+        clp = nfs_get_client(&cl_init);
        if (IS_ERR(clp)) {
                error = PTR_ERR(clp);
                goto error;
        }
-        error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour);
+        error = nfs4_init_client(clp, timeparms, ip_addr, authflavour);
        if (error < 0)
                goto error_put;
@@ -919,10 +1028,26 @@ error:
 static int nfs4_init_server(struct nfs_server *server,
                const struct nfs_parsed_mount_data *data)
 {
+        struct rpc_timeout timeparms;
        int error;
        dprintk("--> nfs4_init_server()\n");
+        nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+                        data->timeo, data->retrans);
+        /* Get a client record */
+        error = nfs4_set_client(server,
+                        data->nfs_server.hostname,
+                        (const struct sockaddr *)&data->nfs_server.address,
+                        data->nfs_server.addrlen,
+                        data->client_address,
+                        data->auth_flavors[0],
+                        data->nfs_server.protocol,
+                        &timeparms);
+        if (error < 0)
+                goto error;
        /* Initialise the client representation from the mount data */
        server->flags = data->flags & NFS_MOUNT_FLAGMASK;
        server->caps |= NFS_CAP_ATOMIC_OPEN;
@@ -937,8 +1062,9 @@ static int nfs4_init_server(struct nfs_server *server,
        server->acdirmin = data->acdirmin * HZ;
        server->acdirmax = data->acdirmax * HZ;
-        error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
+        error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+error:
        /* Done */
        dprintk("<-- nfs4_init_server() = %d\n", error);
        return error;
@@ -961,17 +1087,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
        if (!server)
                return ERR_PTR(-ENOMEM);
-        /* Get a client record */
-        error = nfs4_set_client(server,
-                        data->nfs_server.hostname,
-                        &data->nfs_server.address,
-                        data->client_address,
-                        data->auth_flavors[0],
-                        data->nfs_server.protocol,
-                        data->timeo, data->retrans);
-        if (error < 0)
-                goto error;
        /* set up the general RPC client */
        error = nfs4_init_server(server, data);
        if (error < 0)
@@ -1039,12 +1154,13 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
        /* Get a client representation.
         * Note: NFSv4 always uses TCP, */
-        error = nfs4_set_client(server, data->hostname, data->addr,
+        error = nfs4_set_client(server, data->hostname,
-                        parent_client->cl_ipaddr,
+                                data->addr,
-                        data->authflavor,
+                                data->addrlen,
-                        parent_server->client->cl_xprt->prot,
+                                parent_client->cl_ipaddr,
-                        parent_client->retrans_timeo,
+                                data->authflavor,
-                        parent_client->retrans_count);
+                                parent_server->client->cl_xprt->prot,
+                                parent_server->client->cl_timeout);
        if (error < 0)
                goto error;
@@ -1052,7 +1168,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
        nfs_server_copy_userdata(server, parent_server);
        server->caps |= NFS_CAP_ATOMIC_OPEN;
-        error = nfs_init_server_rpcclient(server, data->authflavor);
+        error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
        if (error < 0)
                goto error;
@@ -1121,7 +1237,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
        server->fsid = fattr->fsid;
-        error = nfs_init_server_rpcclient(server, source->client->cl_auth->au_flavor);
+        error = nfs_init_server_rpcclient(server,
+                        source->client->cl_timeout,
+                        source->client->cl_auth->au_flavor);
        if (error < 0)
                goto out_free_server;
        if (!IS_ERR(source->client_acl))
@@ -1263,10 +1381,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
        /* display one transport per line on subsequent lines */
        clp = list_entry(v, struct nfs_client, cl_share_link);
-        seq_printf(m, "v%d %02x%02x%02x%02x %4hx %3d %s\n",
+        seq_printf(m, "v%u %s %s %3d %s\n",
-                   clp->cl_nfsversion,
+                   clp->rpc_ops->version,
-                   NIPQUAD(clp->cl_addr.sin_addr),
+                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
-                   ntohs(clp->cl_addr.sin_port),
+                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
                   atomic_read(&clp->cl_count),
                   clp->cl_hostname);
@@ -1342,10 +1460,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
                 (unsigned long long) server->fsid.major,
                 (unsigned long long) server->fsid.minor);
-        seq_printf(m, "v%d %02x%02x%02x%02x %4hx %-7s %-17s\n",
+        seq_printf(m, "v%u %s %s %-7s %-17s\n",
-                   clp->cl_nfsversion,
+                   clp->rpc_ops->version,
-                   NIPQUAD(clp->cl_addr.sin_addr),
+                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
-                   ntohs(clp->cl_addr.sin_port),
+                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
                   dev,
                   fsid);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 11833f4caeaa..b9eadd18ba70 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,32 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
        put_rpccred(oldcred);
 }
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+        int res = 0;
+        res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); /* issync is handed through unchanged */
+        nfs_free_delegation(delegation); /* freed whatever nfs4_proc_delegreturn() returned */
+        return res; /* the result of nfs4_proc_delegreturn() */
+}
+static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
+{
+        struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+        if (delegation == NULL) /* no delegation attached to this inode */
+                goto nomatch;
+        if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
+                                sizeof(delegation->stateid.data)) != 0) /* a NULL stateid matches any delegation */
+                goto nomatch;
+        list_del_rcu(&delegation->super_list); /* unlink from the list it is queued on via super_list */
+        nfsi->delegation_state = 0;
+        rcu_assign_pointer(nfsi->delegation, NULL);
+        return delegation; /* detached; callers in this patch pass it on to nfs_do_return_delegation() */
+nomatch:
+        return NULL; /* nothing attached, or the stateid did not match */
+}
 /*
 * Set up a delegation on an inode
 */
@@ -133,6 +159,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
+        struct nfs_delegation *freeme = NULL;
        int status = 0;
        delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
@@ -147,41 +174,45 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
        delegation->inode = inode;
        spin_lock(&clp->cl_lock);
-        if (rcu_dereference(nfsi->delegation) == NULL) {
+        if (rcu_dereference(nfsi->delegation) != NULL) {
-                list_add_rcu(&delegation->super_list, &clp->cl_delegations);
-                nfsi->delegation_state = delegation->type;
-                rcu_assign_pointer(nfsi->delegation, delegation);
-                delegation = NULL;
-        } else {
                if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
-                                        sizeof(delegation->stateid)) != 0 ||
+                                        sizeof(delegation->stateid)) == 0 &&
-                                delegation->type != nfsi->delegation->type) {
+                                delegation->type == nfsi->delegation->type) {
-                        printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n",
+                        goto out;
-                                        __FUNCTION__, NIPQUAD(clp->cl_addr.sin_addr));
+                }
-                        status = -EIO;
+                /*
+                 * Deal with broken servers that hand out two
+                 * delegations for the same file.
+                 */
+                dfprintk(FILE, "%s: server %s handed out "
+                                "a duplicate delegation!\n",
+                                __FUNCTION__, clp->cl_hostname);
+                if (delegation->type <= nfsi->delegation->type) {
+                        freeme = delegation;
+                        delegation = NULL;
+                        goto out;
                }
+                freeme = nfs_detach_delegation_locked(nfsi, NULL);
        }
+        list_add_rcu(&delegation->super_list, &clp->cl_delegations);
+        nfsi->delegation_state = delegation->type;
+        rcu_assign_pointer(nfsi->delegation, delegation);
+        delegation = NULL;
        /* Ensure we revalidate the attributes and page cache! */
        spin_lock(&inode->i_lock);
        nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
        spin_unlock(&inode->i_lock);
+out:
        spin_unlock(&clp->cl_lock);
        if (delegation != NULL)
                nfs_free_delegation(delegation);
+        if (freeme != NULL)
+                nfs_do_return_delegation(inode, freeme, 0);
        return status;
 }
-static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
-{
-        int res = 0;
-        res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
-        nfs_free_delegation(delegation);
-        return res;
-}
 /* Sync all data to disk upon delegation return */
 static void nfs_msync_inode(struct inode *inode)
 {
@@ -207,24 +238,28 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
        up_read(&clp->cl_sem);
        nfs_msync_inode(inode);
-        return nfs_do_return_delegation(inode, delegation);
+        return nfs_do_return_delegation(inode, delegation, 1);
 }
-static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
+/*
+ * This function returns the delegation without reclaiming opens
+ * or protecting against delegation reclaims.
+ * It is therefore really only safe to be called from
+ * nfs4_clear_inode()
+ */
+void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 {
-        struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+        struct nfs_inode *nfsi = NFS_I(inode);
+        struct nfs_delegation *delegation;
-        if (delegation == NULL)
+        if (rcu_dereference(nfsi->delegation) != NULL) { /* unlocked peek; nfs_detach_delegation_locked() checks again under cl_lock */
-                goto nomatch;
+                spin_lock(&clp->cl_lock);
-        if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
+                delegation = nfs_detach_delegation_locked(nfsi, NULL); /* NULL stateid: detach whichever delegation is attached */
-                                sizeof(delegation->stateid.data)) != 0)
+                spin_unlock(&clp->cl_lock);
-                goto nomatch;
+                if (delegation != NULL)
-        list_del_rcu(&delegation->super_list);
+                        nfs_do_return_delegation(inode, delegation, 0); /* issync = 0; the return value is ignored */
-        nfsi->delegation_state = 0;
+        }
-        rcu_assign_pointer(nfsi->delegation, NULL);
-        return delegation;
-nomatch:
-        return NULL;
 }
 int nfs_inode_return_delegation(struct inode *inode)
@@ -314,8 +349,9 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
        __module_get(THIS_MODULE);
        atomic_inc(&clp->cl_count);
        task = kthread_run(nfs_do_expire_all_delegations, clp,
-                        "%u.%u.%u.%u-delegreturn",
+                                "%s-delegreturn",
-                        NIPQUAD(clp->cl_addr.sin_addr));
+                                rpc_peeraddr2str(clp->cl_rpcclient,
+                                                        RPC_DISPLAY_ADDR));
        if (!IS_ERR(task))
                return;
        nfs_put_client(clp);
@@ -386,7 +422,7 @@ static int recall_thread(void *data)
        nfs_msync_inode(inode);
        if (delegation != NULL)
-                nfs_do_return_delegation(inode, delegation);
+                nfs_do_return_delegation(inode, delegation, 1);
        iput(inode);
        module_put_and_exit(0);
 }
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5874ce7fdbae..f1c5e2a5d88e 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -29,6 +29,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
 void nfs_return_all_delegations(struct super_block *sb);
@@ -39,7 +40,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 /* NFSv4 delegation-related procedures */
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
 int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 35334539d947..476cb0f837fd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -38,6 +38,7 @@
 #include "nfs4_fs.h"
 #include "delegation.h"
 #include "iostat.h"
+#include "internal.h"
 /* #define NFS_DEBUG_VERBOSE 1 */
@@ -191,7 +192,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
                /* We requested READDIRPLUS, but the server doesn't grok it */
                if (error == -ENOTSUPP && desc->plus) {
                        NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
-                        clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+                        clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        desc->plus = 0;
                        goto again;
                }
@@ -536,12 +537,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        lock_kernel();
-        res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
-        if (res < 0) {
-                unlock_kernel();
-                return res;
-        }
        /*
         * filp->f_pos points to the dirent entry number.
         * *desc->dir_cookie has the cookie for the next entry. We have
@@ -563,6 +558,10 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        desc->entry = &my_entry;
        nfs_block_sillyrename(dentry);
+        res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
+        if (res < 0)
+                goto out;
        while(!desc->entry->eof) {
                res = readdir_search_pagecache(desc);
@@ -578,7 +577,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        break;
                }
                if (res == -ETOOSMALL && desc->plus) {
-                        clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+                        clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        nfs_zap_caches(inode);
                        desc->plus = 0;
                        desc->entry->eof = 0;
@@ -593,6 +592,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        break;
                }
        }
+out:
        nfs_unblock_sillyrename(dentry);
        unlock_kernel();
        if (res > 0)
@@ -638,6 +638,21 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
        return 0;
 }
+/**
+ * nfs_force_lookup_revalidate - Mark the directory as having changed
+ * @dir: pointer to directory inode
+ *
+ * This forces the revalidation code in nfs_lookup_revalidate() to do a
+ * full lookup on all child dentries of 'dir' whenever a change occurs
+ * on the server that might have invalidated our dcache.
+ *
+ * The caller should be holding dir->i_lock
+ */
+void nfs_force_lookup_revalidate(struct inode *dir)
+{
+        NFS_I(dir)->cache_change_attribute = jiffies; /* the current jiffies value becomes the new change attribute */
+}
 /*
 * A check for whether or not the parent directory has changed.
 * In the case it has, we assume that the dentries are untrustworthy
@@ -826,6 +841,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
                dentry->d_parent->d_name.name, dentry->d_name.name,
                dentry->d_flags);
+        /* Unhash any dentry with a stale inode */
+        if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
+                return 1;
        if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
                /* Unhash it, so that ->d_iput() would be called */
                return 1;
@@ -845,7 +864,6 @@ static int nfs_dentry_delete(struct dentry *dentry)
 */
 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
-        nfs_inode_return_delegation(inode);
        if (S_ISDIR(inode->i_mode))
                /* drop any readdir cache as it could easily be old */
                NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
@@ -1267,6 +1285,12 @@ out_err:
        return error;
 }
+static void nfs_dentry_handle_enoent(struct dentry *dentry)
+{
+        if (dentry->d_inode != NULL && !d_unhashed(dentry)) /* act only on a dentry that still has an inode and is still hashed */
+                d_delete(dentry); /* callers in this patch invoke this helper when an operation returned -ENOENT */
+}
 static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        int error;
@@ -1279,6 +1303,8 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
        /* Ensure the VFS deletes this inode */
        if (error == 0 && dentry->d_inode != NULL)
                clear_nlink(dentry->d_inode);
+        else if (error == -ENOENT)
+                nfs_dentry_handle_enoent(dentry);
        unlock_kernel();
        return error;
@@ -1385,6 +1411,8 @@ static int nfs_safe_remove(struct dentry *dentry)
                nfs_mark_for_revalidate(inode);
        } else
                error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+        if (error == -ENOENT)
+                nfs_dentry_handle_enoent(dentry);
 out:
        return error;
 }
@@ -1421,7 +1449,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
        error = nfs_safe_remove(dentry);
-        if (!error) {
+        if (!error || error == -ENOENT) {
                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        } else if (need_rehash)
                d_rehash(dentry);
@@ -1634,7 +1662,8 @@ out:
                d_move(old_dentry, new_dentry);
                nfs_set_verifier(new_dentry,
                                        nfs_save_change_attribute(new_dir));
-        }
+        } else if (error == -ENOENT)
+                nfs_dentry_handle_enoent(old_dentry);
        /* new dentry created? */
        if (dentry)
@@ -1665,13 +1694,19 @@ int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
 restart:
        spin_lock(&nfs_access_lru_lock);
        list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+                struct rw_semaphore *s_umount;
                struct inode *inode;
                if (nr_to_scan-- == 0)
                        break;
+                s_umount = &nfsi->vfs_inode.i_sb->s_umount;
+                if (!down_read_trylock(s_umount))
+                        continue;
                inode = igrab(&nfsi->vfs_inode);
-                if (inode == NULL)
+                if (inode == NULL) {
+                        up_read(s_umount);
                        continue;
+                }
                spin_lock(&inode->i_lock);
                if (list_empty(&nfsi->access_cache_entry_lru))
                        goto remove_lru_entry;
@@ -1690,6 +1725,7 @@ remove_lru_entry:
                spin_unlock(&inode->i_lock);
                spin_unlock(&nfs_access_lru_lock);
                iput(inode);
+                up_read(s_umount);
                goto restart;
        }
        spin_unlock(&nfs_access_lru_lock);
@@ -1730,7 +1766,7 @@ static void __nfs_access_zap_cache(struct inode *inode)
 void nfs_access_zap_cache(struct inode *inode)
 {
        /* Remove from global LRU init */
-        if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+        if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
                spin_lock(&nfs_access_lru_lock);
                list_del_init(&NFS_I(inode)->access_cache_inode_lru);
                spin_unlock(&nfs_access_lru_lock);
@@ -1844,7 +1880,7 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
        smp_mb__after_atomic_inc();
        /* Add inode to global LRU list */
-        if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+        if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
                spin_lock(&nfs_access_lru_lock);
                list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
                spin_unlock(&nfs_access_lru_lock);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index afcab007a22b..16844f98f50e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -193,7 +193,7 @@ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
        if (dreq->iocb)
                goto out;
-        result = wait_for_completion_interruptible(&dreq->completion);
+        result = wait_for_completion_killable(&dreq->completion);
        if (!result)
                result = dreq->error;
@@ -263,17 +263,29 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 * handled automatically by nfs_direct_read_result().  Otherwise, if
 * no requests have been sent, just return an error.
 */
-static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+                                                const struct iovec *iov,
+                                                loff_t pos)
 {
        struct nfs_open_context *ctx = dreq->ctx;
        struct inode *inode = ctx->path.dentry->d_inode;
+        unsigned long user_addr = (unsigned long)iov->iov_base;
+        size_t count = iov->iov_len;
        size_t rsize = NFS_SERVER(inode)->rsize;
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_cred = ctx->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = NFS_CLIENT(inode),
+                .rpc_message = &msg,
+                .callback_ops = &nfs_read_direct_ops,
+                .flags = RPC_TASK_ASYNC,
+        };
        unsigned int pgbase;
        int result;
        ssize_t started = 0;
-        get_dreq(dreq);
        do {
                struct nfs_read_data *data;
                size_t bytes;
@@ -309,7 +321,7 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
                data->req = (struct nfs_page *) dreq;
                data->inode = inode;
-                data->cred = ctx->cred;
+                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
                data->args.context = ctx;
                data->args.offset = pos;
@@ -319,14 +331,16 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
                data->res.fattr = &data->fattr;
                data->res.eof = 0;
                data->res.count = bytes;
+                msg.rpc_argp = &data->args;
+                msg.rpc_resp = &data->res;
-                rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
+                task_setup_data.task = &data->task;
-                                &nfs_read_direct_ops, data);
+                task_setup_data.callback_data = data;
-                NFS_PROTO(inode)->read_setup(data);
+                NFS_PROTO(inode)->read_setup(data, &msg);
-                data->task.tk_cookie = (unsigned long) inode;
+                task = rpc_run_task(&task_setup_data);
+                if (!IS_ERR(task))
-                rpc_execute(&data->task);
+                        rpc_put_task(task);
                dprintk("NFS: %5u initiated direct read call "
                        "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -347,20 +361,49 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
                count -= bytes;
        } while (count != 0);
+        if (started)
+                return started;
+        return result < 0 ? (ssize_t) result : -EFAULT;
+}
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+                                              const struct iovec *iov,
+                                              unsigned long nr_segs,
+                                              loff_t pos)
+{
+        ssize_t result = -EINVAL; /* what is returned when nr_segs is 0 and the loop never runs */
+        size_t requested_bytes = 0; /* running total of what the per-segment calls reported as started */
+        unsigned long seg;
+        get_dreq(dreq); /* paired with the put_dreq() after the loop */
+        for (seg = 0; seg < nr_segs; seg++) {
+                const struct iovec *vec = &iov[seg];
+                result = nfs_direct_read_schedule_segment(dreq, vec, pos);
+                if (result < 0) /* scheduling this segment failed: stop, keep what was already started */
+                        break;
+                requested_bytes += result;
+                if ((size_t)result < vec->iov_len) /* segment only partly scheduled: do not go on to the next one */
+                        break;
+                pos += vec->iov_len; /* next segment continues at the following file offset */
+        }
         if (put_dreq(dreq))
                 nfs_direct_complete(dreq);
-        if (started)
+        if (requested_bytes != 0) /* at least part of the request was scheduled: report 0 */
                 return 0;
-        return result < 0 ? (ssize_t) result : -EFAULT;
+        if (result < 0) /* nothing scheduled: hand back the segment's error (or the initial -EINVAL) */
+                return result;
+        return -EIO;
 }
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
+                               unsigned long nr_segs, loff_t pos)
 {
        ssize_t result = 0;
-        sigset_t oldset;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
-        struct rpc_clnt *clnt = NFS_CLIENT(inode);
        struct nfs_direct_req *dreq;
        dreq = nfs_direct_req_alloc();
@@ -372,12 +415,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
-        nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
+        result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
-        rpc_clnt_sigmask(clnt, &oldset);
-        result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
        if (!result)
                result = nfs_direct_wait(dreq);
-        rpc_clnt_sigunmask(clnt, &oldset);
        nfs_direct_req_release(dreq);
        return result;
@@ -399,6 +439,15 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
        struct inode *inode = dreq->inode;
        struct list_head *p;
        struct nfs_write_data *data;
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_cred = dreq->ctx->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = NFS_CLIENT(inode),
+                .callback_ops = &nfs_write_direct_ops,
+                .flags = RPC_TASK_ASYNC,
+        };
        dreq->count = 0;
        get_dreq(dreq);
@@ -408,6 +457,9 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
                get_dreq(dreq);
+                /* Use stable writes */
+                data->args.stable = NFS_FILE_SYNC;
                /*
                 * Reset data->res.
                 */
@@ -419,17 +471,18 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
                 * Reuse data->task; data->args should not have changed
                 * since the original request was sent.
                 */
-                rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
+                task_setup_data.task = &data->task;
-                                &nfs_write_direct_ops, data);
+                task_setup_data.callback_data = data;
-                NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
+                msg.rpc_argp = &data->args;
+                msg.rpc_resp = &data->res;
-                data->task.tk_priority = RPC_PRIORITY_NORMAL;
+                NFS_PROTO(inode)->write_setup(data, &msg);
-                data->task.tk_cookie = (unsigned long) inode;
                /*
                 * We're called via an RPC callback, so BKL is already held.
                 */
-                rpc_execute(&data->task);
+                task = rpc_run_task(&task_setup_data);
+                if (!IS_ERR(task))
+                        rpc_put_task(task);
                dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
                                data->task.tk_pid,
@@ -472,9 +525,23 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
        struct nfs_write_data *data = dreq->commit_data;
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_argp = &data->args,
+                .rpc_resp = &data->res,
+                .rpc_cred = dreq->ctx->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .task = &data->task,
+                .rpc_client = NFS_CLIENT(dreq->inode),
+                .rpc_message = &msg,
+                .callback_ops = &nfs_commit_direct_ops,
+                .callback_data = data,
+                .flags = RPC_TASK_ASYNC,
+        };
        data->inode = dreq->inode;
-        data->cred = dreq->ctx->cred;
+        data->cred = msg.rpc_cred;
        data->args.fh = NFS_FH(data->inode);
        data->args.offset = 0;
@@ -483,18 +550,16 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
        data->res.fattr = &data->fattr;
        data->res.verf = &data->verf;
-        rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
+        NFS_PROTO(data->inode)->commit_setup(data, &msg);
-                                &nfs_commit_direct_ops, data);
-        NFS_PROTO(data->inode)->commit_setup(data, 0);
-        data->task.tk_priority = RPC_PRIORITY_NORMAL;
-        data->task.tk_cookie = (unsigned long)data->inode;
        /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
        dreq->commit_data = NULL;
        dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-        rpc_execute(&data->task);
+        task = rpc_run_task(&task_setup_data);
+        if (!IS_ERR(task))
+                rpc_put_task(task);
 }
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
@@ -601,17 +666,29 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 * handled automatically by nfs_direct_write_result().  Otherwise, if
 * no requests have been sent, just return an error.
 */
-static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+                                                 const struct iovec *iov,
+                                                 loff_t pos, int sync)
 {
        struct nfs_open_context *ctx = dreq->ctx;
        struct inode *inode = ctx->path.dentry->d_inode;
+        unsigned long user_addr = (unsigned long)iov->iov_base;
+        size_t count = iov->iov_len;
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_cred = ctx->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = NFS_CLIENT(inode),
+                .rpc_message = &msg,
+                .callback_ops = &nfs_write_direct_ops,
+                .flags = RPC_TASK_ASYNC,
+        };
        size_t wsize = NFS_SERVER(inode)->wsize;
        unsigned int pgbase;
        int result;
        ssize_t started = 0;
-        get_dreq(dreq);
        do {
                struct nfs_write_data *data;
                size_t bytes;
@@ -649,25 +726,27 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
                data->req = (struct nfs_page *) dreq;
                data->inode = inode;
-                data->cred = ctx->cred;
+                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
                data->args.context = ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
                data->args.count = bytes;
+                data->args.stable = sync;
                data->res.fattr = &data->fattr;
                data->res.count = bytes;
                data->res.verf = &data->verf;
-                rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
+                task_setup_data.task = &data->task;
-                                &nfs_write_direct_ops, data);
+                task_setup_data.callback_data = data;
-                NFS_PROTO(inode)->write_setup(data, sync);
+                msg.rpc_argp = &data->args;
+                msg.rpc_resp = &data->res;
+                NFS_PROTO(inode)->write_setup(data, &msg);
-                data->task.tk_priority = RPC_PRIORITY_NORMAL;
+                task = rpc_run_task(&task_setup_data);
-                data->task.tk_cookie = (unsigned long) inode;
+                if (!IS_ERR(task))
+                        rpc_put_task(task);
-                rpc_execute(&data->task);
                dprintk("NFS: %5u initiated direct write call "
                        "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -689,23 +768,54 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
                count -= bytes;
        } while (count != 0);
+        if (started)
+                return started;
+        return result < 0 ? (ssize_t) result : -EFAULT;
+}
+static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
+                                               const struct iovec *iov,
+                                               unsigned long nr_segs,
+                                               loff_t pos, int sync)
+{
+        ssize_t result = 0;
+        size_t requested_bytes = 0;
+        unsigned long seg;
+        get_dreq(dreq);
+        for (seg = 0; seg < nr_segs; seg++) {
+                const struct iovec *vec = &iov[seg];
+                result = nfs_direct_write_schedule_segment(dreq, vec,
+                                                           pos, sync);
+                if (result < 0)
+                        break;
+                requested_bytes += result;
+                if ((size_t)result < vec->iov_len)
+                        break;
+                pos += vec->iov_len;
+        }
        if (put_dreq(dreq))
-                nfs_direct_write_complete(dreq, inode);
+                nfs_direct_write_complete(dreq, dreq->inode);
-        if (started)
+        if (requested_bytes != 0)
                return 0;
-        return result < 0 ? (ssize_t) result : -EFAULT;
+        if (result < 0)
+                return result;
+        return -EIO;
 }
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t pos,
+                                size_t count)
 {
        ssize_t result = 0;
-        sigset_t oldset;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
-        struct rpc_clnt *clnt = NFS_CLIENT(inode);
        struct nfs_direct_req *dreq;
        size_t wsize = NFS_SERVER(inode)->wsize;
-        int sync = 0;
+        int sync = NFS_UNSTABLE;
        dreq = nfs_direct_req_alloc();
        if (!dreq)
@@ -713,20 +823,16 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
        nfs_alloc_commit_data(dreq);
        if (dreq->commit_data == NULL || count < wsize)
-                sync = FLUSH_STABLE;
+                sync = NFS_FILE_SYNC;
        dreq->inode = inode;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
-        nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);
+        result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
-        rpc_clnt_sigmask(clnt, &oldset);
-        result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
        if (!result)
                result = nfs_direct_wait(dreq);
-        rpc_clnt_sigunmask(clnt, &oldset);
        nfs_direct_req_release(dreq);
        return result;
@@ -759,21 +865,16 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
        ssize_t retval = -EINVAL;
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
-        /* XXX: temporary */
+        size_t count;
-        const char __user *buf = iov[0].iov_base;
-        size_t count = iov[0].iov_len;
-        dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
+        count = iov_length(iov, nr_segs);
+        nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+        dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n",
                file->f_path.dentry->d_parent->d_name.name,
                file->f_path.dentry->d_name.name,
-                (unsigned long) count, (long long) pos);
+                count, (long long) pos);
-        if (nr_segs != 1)
-                goto out;
-        retval = -EFAULT;
-        if (!access_ok(VERIFY_WRITE, buf, count))
-                goto out;
        retval = 0;
        if (!count)
                goto out;
@@ -782,7 +883,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
        if (retval)
                goto out;
-        retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
+        retval = nfs_direct_read(iocb, iov, nr_segs, pos);
        if (retval > 0)
                iocb->ki_pos = pos + retval;
@@ -821,17 +922,15 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
        ssize_t retval = -EINVAL;
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
-        /* XXX: temporary */
+        size_t count;
-        const char __user *buf = iov[0].iov_base;
-        size_t count = iov[0].iov_len;
+        count = iov_length(iov, nr_segs);
+        nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
-        dprintk("nfs: direct write(%s/%s, %lu@%Ld)\n",
+        dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n",
                file->f_path.dentry->d_parent->d_name.name,
                file->f_path.dentry->d_name.name,
-                (unsigned long) count, (long long) pos);
+                count, (long long) pos);
-        if (nr_segs != 1)
-                goto out;
        retval = generic_write_checks(file, &pos, &count, 0);
        if (retval)
@@ -844,15 +943,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
        if (!count)
                goto out;
-        retval = -EFAULT;
-        if (!access_ok(VERIFY_READ, buf, count))
-                goto out;
        retval = nfs_sync_mapping(mapping);
        if (retval)
                goto out;
-        retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
+        retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
        if (retval > 0)
                iocb->ki_pos = pos + retval;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index b3bb89f7d5d2..ef57a5ae5904 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -349,7 +349,9 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
        unlock_page(page);
        page_cache_release(page);
-        return status < 0 ? status : copied;
+        if (status < 0)
+                return status;
+        return copied;
 }
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
@@ -392,35 +394,27 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        struct file *filp = vma->vm_file;
        unsigned pagelen;
        int ret = -EINVAL;
-        void *fsdata;
        struct address_space *mapping;
-        loff_t offset;
        lock_page(page);
        mapping = page->mapping;
-        if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) {
+        if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
-                unlock_page(page);
+                goto out_unlock;
-                return -EINVAL;
-        }
+        ret = 0;
        pagelen = nfs_page_length(page);
-        offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+        if (pagelen == 0)
-        unlock_page(page);
+                goto out_unlock;
-        /*
+        ret = nfs_flush_incompatible(filp, page);
-         * we can use mapping after releasing the page lock, because:
+        if (ret != 0)
-         * we hold mmap_sem on the fault path, which should pin the vma
+                goto out_unlock;
-         * which should pin the file, which pins the dentry which should
-         * hold a reference on inode.
-         */
-        if (pagelen) {
+        ret = nfs_updatepage(filp, page, 0, pagelen);
-                struct page *page2 = NULL;
+        if (ret == 0)
-                ret = nfs_write_begin(filp, mapping, offset, pagelen,
+                ret = pagelen;
-                                0, &page2, &fsdata);
+out_unlock:
-                if (!ret)
+        unlock_page(page);
-                        ret = nfs_write_end(filp, mapping, offset, pagelen,
-                                        pagelen, page2, fsdata);
-        }
        return ret;
 }
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 522e5ad4d8ad..e6242cdbaf91 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -43,6 +43,36 @@
 #define NFSDBG_FACILITY         NFSDBG_CLIENT
 /*
+ * Set the superblock root dentry.
+ * Note that this function frees the inode in case of error.
+ */
+static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode)
+{
+        /* The mntroot acts as the dummy root dentry for this superblock */
+        if (sb->s_root == NULL) {
+                sb->s_root = d_alloc_root(inode);
+                if (sb->s_root == NULL) {
+                        iput(inode);
+                        return -ENOMEM;
+                }
+                /* Circumvent igrab(): we know the inode is not being freed */
+                atomic_inc(&inode->i_count);
+                /*
+                 * Ensure that this dentry is invisible to d_find_alias().
+                 * Otherwise, it may be spliced into the tree by
+                 * d_materialise_unique if a parent directory from the same
+                 * filesystem gets mounted at a later time.
+                 * This again causes shrink_dcache_for_umount_subtree() to
+                 * Oops, since the test for IS_ROOT() will fail.
+                 */
+                spin_lock(&dcache_lock);
+                list_del_init(&sb->s_root->d_alias);
+                spin_unlock(&dcache_lock);
+        }
+        return 0;
+}
+/*
 * get an NFS2/NFS3 root dentry from the root filehandle
 */
 struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
@@ -54,33 +84,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
        struct inode *inode;
        int error;
-        /* create a dummy root dentry with dummy inode for this superblock */
-        if (!sb->s_root) {
-                struct nfs_fh dummyfh;
-                struct dentry *root;
-                struct inode *iroot;
-                memset(&dummyfh, 0, sizeof(dummyfh));
-                memset(&fattr, 0, sizeof(fattr));
-                nfs_fattr_init(&fattr);
-                fattr.valid = NFS_ATTR_FATTR;
-                fattr.type = NFDIR;
-                fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
-                fattr.nlink = 2;
-                iroot = nfs_fhget(sb, &dummyfh, &fattr);
-                if (IS_ERR(iroot))
-                        return ERR_PTR(PTR_ERR(iroot));
-                root = d_alloc_root(iroot);
-                if (!root) {
-                        iput(iroot);
-                        return ERR_PTR(-ENOMEM);
-                }
-                sb->s_root = root;
-        }
        /* get the actual root for this mount */
        fsinfo.fattr = &fattr;
@@ -96,6 +99,10 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
                return ERR_PTR(PTR_ERR(inode));
        }
+        error = nfs_superblock_set_dummy_root(sb, inode);
+        if (error != 0)
+                return ERR_PTR(error);
        /* root dentries normally start off anonymous and get spliced in later
         * if the dentry tree reaches them; however if the dentry already
         * exists, we'll pick it up at this point and use it as the root
@@ -241,33 +248,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
        dprintk("--> nfs4_get_root()\n");
-        /* create a dummy root dentry with dummy inode for this superblock */
-        if (!sb->s_root) {
-                struct nfs_fh dummyfh;
-                struct dentry *root;
-                struct inode *iroot;
-                memset(&dummyfh, 0, sizeof(dummyfh));
-                memset(&fattr, 0, sizeof(fattr));
-                nfs_fattr_init(&fattr);
-                fattr.valid = NFS_ATTR_FATTR;
-                fattr.type = NFDIR;
-                fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
-                fattr.nlink = 2;
-                iroot = nfs_fhget(sb, &dummyfh, &fattr);
-                if (IS_ERR(iroot))
-                        return ERR_PTR(PTR_ERR(iroot));
-                root = d_alloc_root(iroot);
-                if (!root) {
-                        iput(iroot);
-                        return ERR_PTR(-ENOMEM);
-                }
-                sb->s_root = root;
-        }
        /* get the info about the server and filesystem */
        error = nfs4_server_capabilities(server, mntfh);
        if (error < 0) {
@@ -289,6 +269,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
                return ERR_PTR(PTR_ERR(inode));
        }
+        error = nfs_superblock_set_dummy_root(sb, inode);
+        if (error != 0)
+                return ERR_PTR(error);
        /* root dentries normally start off anonymous and get spliced in later
         * if the dentry tree reaches them; however if the dentry already
         * exists, we'll pick it up at this point and use it as the root
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index d11eb055265c..8ae5dba2d4e5 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -72,39 +72,39 @@ module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
                 &nfs_idmap_cache_timeout, 0644);
 struct idmap_hashent {
-        unsigned long ih_expires;
+        unsigned long           ih_expires;
-        __u32 ih_id;
+        __u32                   ih_id;
-        int ih_namelen;
+        size_t                  ih_namelen;
-        char ih_name[IDMAP_NAMESZ];
+        char                    ih_name[IDMAP_NAMESZ];
 };
 struct idmap_hashtable {
-        __u8 h_type;
+        __u8                    h_type;
-        struct idmap_hashent h_entries[IDMAP_HASH_SZ];
+        struct idmap_hashent    h_entries[IDMAP_HASH_SZ];
 };
 struct idmap {
-        struct dentry        *idmap_dentry;
+        struct dentry           *idmap_dentry;
-        wait_queue_head_t     idmap_wq;
+        wait_queue_head_t       idmap_wq;
-        struct idmap_msg      idmap_im;
+        struct idmap_msg        idmap_im;
-        struct mutex          idmap_lock;    /* Serializes upcalls */
+        struct mutex            idmap_lock;     /* Serializes upcalls */
-        struct mutex          idmap_im_lock; /* Protects the hashtable */
+        struct mutex            idmap_im_lock;  /* Protects the hashtable */
-        struct idmap_hashtable idmap_user_hash;
+        struct idmap_hashtable  idmap_user_hash;
-        struct idmap_hashtable idmap_group_hash;
+        struct idmap_hashtable  idmap_group_hash;
 };
-static ssize_t   idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-                     char __user *, size_t);
+                                 char __user *, size_t);
-static ssize_t   idmap_pipe_downcall(struct file *, const char __user *,
+static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
-                     size_t);
+                                   size_t);
-static void      idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
+static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 static unsigned int fnvhash32(const void *, size_t);
 static struct rpc_pipe_ops idmap_upcall_ops = {
-        .upcall         = idmap_pipe_upcall,
+        .upcall         = idmap_pipe_upcall,
-        .downcall       = idmap_pipe_downcall,
+        .downcall       = idmap_pipe_downcall,
-        .destroy_msg    = idmap_pipe_destroy_msg,
+        .destroy_msg    = idmap_pipe_destroy_msg,
 };
 int
@@ -115,19 +115,20 @@ nfs_idmap_new(struct nfs_client *clp)
        BUG_ON(clp->cl_idmap != NULL);
-        if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
+        idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
-                return -ENOMEM;
+        if (idmap == NULL)
+                return -ENOMEM;
-        idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
+        idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
-            idmap, &idmap_upcall_ops, 0);
+                                         idmap, &idmap_upcall_ops, 0);
-        if (IS_ERR(idmap->idmap_dentry)) {
+        if (IS_ERR(idmap->idmap_dentry)) {
                error = PTR_ERR(idmap->idmap_dentry);
                kfree(idmap);
                return error;
        }
-        mutex_init(&idmap->idmap_lock);
+        mutex_init(&idmap->idmap_lock);
-        mutex_init(&idmap->idmap_im_lock);
+        mutex_init(&idmap->idmap_im_lock);
        init_waitqueue_head(&idmap->idmap_wq);
        idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
        idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -192,7 +193,7 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
 * pretty trivial.
 */
 static inline struct idmap_hashent *
-idmap_alloc_name(struct idmap_hashtable *h, char *name, unsigned len)
+idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
 {
        return idmap_name_hash(h, name, len);
 }
@@ -285,7 +286,7 @@ nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
        memset(im, 0, sizeof(*im));
        mutex_unlock(&idmap->idmap_im_lock);
        mutex_unlock(&idmap->idmap_lock);
-        return (ret);
+        return ret;
 }
 /*
@@ -354,42 +355,40 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 /* RPC pipefs upcall/downcall routines */
 static ssize_t
 idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-    char __user *dst, size_t buflen)
+                  char __user *dst, size_t buflen)
 {
-        char *data = (char *)msg->data + msg->copied;
+        char *data = (char *)msg->data + msg->copied;
-        ssize_t mlen = msg->len - msg->copied;
+        size_t mlen = min(msg->len, buflen);
-        ssize_t left;
+        unsigned long left;
-        if (mlen > buflen)
+        left = copy_to_user(dst, data, mlen);
-                mlen = buflen;
+        if (left == mlen) {
+                msg->errno = -EFAULT;
-        left = copy_to_user(dst, data, mlen);
+                return -EFAULT;
-        if (left < 0) {
-                msg->errno = left;
-                return left;
        }
        mlen -= left;
        msg->copied += mlen;
        msg->errno = 0;
-        return mlen;
+        return mlen;
 }
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
-        struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+        struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
        struct idmap *idmap = (struct idmap *)rpci->private;
        struct idmap_msg im_in, *im = &idmap->idmap_im;
        struct idmap_hashtable *h;
        struct idmap_hashent *he = NULL;
-        int namelen_in;
+        size_t namelen_in;
        int ret;
-        if (mlen != sizeof(im_in))
+        if (mlen != sizeof(im_in))
-                return (-ENOSPC);
+                return -ENOSPC;
-        if (copy_from_user(&im_in, src, mlen) != 0)
+        if (copy_from_user(&im_in, src, mlen) != 0)
-                return (-EFAULT);
+                return -EFAULT;
        mutex_lock(&idmap->idmap_im_lock);
@@ -487,7 +486,7 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
                hash ^= (unsigned int)*p;
        }
-        return (hash);
+        return hash;
 }
 int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index db5d96dc6107..966a8850aa30 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -192,7 +192,7 @@ void nfs_invalidate_atime(struct inode *inode)
 */
 static void nfs_invalidate_inode(struct inode *inode)
 {
-        set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+        set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
        nfs_zap_caches_locked(inode);
 }
@@ -229,7 +229,7 @@ nfs_init_locked(struct inode *inode, void *opaque)
        struct nfs_find_desc    *desc = (struct nfs_find_desc *)opaque;
        struct nfs_fattr        *fattr = desc->fattr;
-        NFS_FILEID(inode) = fattr->fileid;
+        set_nfs_fileid(inode, fattr->fileid);
        nfs_copy_fh(NFS_FH(inode), desc->fh);
        return 0;
 }
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                        inode->i_fop = &nfs_dir_operations;
                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
                            && fattr->size <= NFS_LIMIT_READDIRPLUS)
-                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        /* Deal with crossing mountpoints */
                        if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
                                if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
@@ -433,15 +433,11 @@ static int nfs_wait_schedule(void *word)
 */
 static int nfs_wait_on_inode(struct inode *inode)
 {
-        struct rpc_clnt *clnt = NFS_CLIENT(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
-        sigset_t oldmask;
        int error;
-        rpc_clnt_sigmask(clnt, &oldmask);
        error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
-                                        nfs_wait_schedule, TASK_INTERRUPTIBLE);
+                                        nfs_wait_schedule, TASK_KILLABLE);
-        rpc_clnt_sigunmask(clnt, &oldmask);
        return error;
 }
@@ -461,9 +457,18 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
        int err;
-        /* Flush out writes to the server in order to update c/mtime */
+        /*
-        if (S_ISREG(inode->i_mode))
+         * Flush out writes to the server in order to update c/mtime.
+         *
+         * Hold the i_mutex to suspend application writes temporarily;
+         * this prevents long-running writing applications from blocking
+         * nfs_wb_nocommit.
+         */
+        if (S_ISREG(inode->i_mode)) {
+                mutex_lock(&inode->i_mutex);
                nfs_wb_nocommit(inode);
+                mutex_unlock(&inode->i_mutex);
+        }
        /*
         * We may force a getattr if the user cares about atime.
@@ -659,7 +664,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                if (status == -ESTALE) {
                        nfs_zap_caches(inode);
                        if (!S_ISDIR(inode->i_mode))
-                                set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+                                set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
                }
                goto out;
        }
@@ -814,8 +819,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        if (S_ISDIR(inode->i_mode))
                                nfsi->cache_validity |= NFS_INO_INVALID_DATA;
                }
-                if (inode->i_size == fattr->pre_size && nfsi->npages == 0)
+                if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
-                        inode->i_size = fattr->size;
+                    nfsi->npages == 0)
+                        inode->i_size = nfs_size_to_loff_t(fattr->size);
        }
 }
@@ -1019,7 +1025,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        dprintk("NFS: mtime change on server for file %s/%ld\n",
                                        inode->i_sb->s_id, inode->i_ino);
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-                        nfsi->cache_change_attribute = now;
+                        if (S_ISDIR(inode->i_mode))
+                                nfs_force_lookup_revalidate(inode);
                }
                /* If ctime has changed we should definitely clear access+acl caches */
                if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
@@ -1028,7 +1035,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                dprintk("NFS: change_attr change on server for file %s/%ld\n",
                                inode->i_sb->s_id, inode->i_ino);
                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-                nfsi->cache_change_attribute = now;
+                if (S_ISDIR(inode->i_mode))
+                        nfs_force_lookup_revalidate(inode);
        }
        /* Check if our cached file size is stale */
@@ -1133,7 +1141,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 void nfs4_clear_inode(struct inode *inode)
 {
        /* If we are holding a delegation, return it! */
-        nfs_inode_return_delegation(inode);
+        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
        nfs_clear_inode(inode);
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f3acf48412be..0f5619611b8d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -21,7 +21,8 @@ struct nfs_clone_mount {
        struct nfs_fattr *fattr;
        char *hostname;
        char *mnt_path;
-        struct sockaddr_in *addr;
+        struct sockaddr *addr;
+        size_t addrlen;
        rpc_authflavor_t authflavor;
 };
@@ -41,19 +42,19 @@ struct nfs_parsed_mount_data {
        char                    *client_address;
        struct {
-                struct sockaddr_in      address;
+                struct sockaddr_storage address;
+                size_t                  addrlen;
                char                    *hostname;
-                unsigned int            program;
                unsigned int            version;
                unsigned short          port;
                int                     protocol;
        } mount_server;
        struct {
-                struct sockaddr_in      address;
+                struct sockaddr_storage address;
+                size_t                  addrlen;
                char                    *hostname;
                char                    *export_path;
-                unsigned int            program;
                int                     protocol;
        } nfs_server;
 };
@@ -62,7 +63,8 @@ struct nfs_parsed_mount_data {
 extern struct rpc_program nfs_program;
 extern void nfs_put_client(struct nfs_client *);
-extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
+extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
+extern struct nfs_client *nfs_find_client_next(struct nfs_client *);
 extern struct nfs_server *nfs_create_server(
                                        const struct nfs_parsed_mount_data *,
                                        struct nfs_fh *);
@@ -160,6 +162,8 @@ extern struct rpc_stat nfs_rpcstat;
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
+extern void nfs_sb_active(struct nfs_server *server);
+extern void nfs_sb_deactive(struct nfs_server *server);
 /* namespace.c */
 extern char *nfs_path(const char *base,
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 8afd9f7e7a97..49c7cd0502cc 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -56,7 +56,7 @@ int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path,
                .program        = &mnt_program,
                .version        = version,
                .authflavor     = RPC_AUTH_UNIX,
-                .flags          = RPC_CLNT_CREATE_INTR,
+                .flags          = 0,
        };
        struct rpc_clnt         *mnt_clnt;
        int                     status;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index acfc56f9edc0..be4ce1c3a3d8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -188,7 +188,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 {
 #ifdef CONFIG_NFS_V4
        struct vfsmount *mnt = NULL;
-        switch (server->nfs_client->cl_nfsversion) {
+        switch (server->nfs_client->rpc_ops->version) {
                case 2:
                case 3:
                        mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 668ab96c7b59..1f7ea675e0c5 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -262,7 +262,9 @@ static int
 nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
        struct kvec *iov = req->rq_rcv_buf.head;
-        int     status, count, recvd, hdrlen;
+        size_t hdrlen;
+        u32 count, recvd;
+        int status;
        if ((status = ntohl(*p++)))
                return -nfs_stat_to_errno(status);
@@ -273,7 +275,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READ reply header overflowed:"
-                                "length %d > %Zu\n", hdrlen, iov->iov_len);
+                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -283,11 +285,11 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
                dprintk("NFS: server cheating in read reply: "
-                        "count %d > recvd %d\n", count, recvd);
+                        "count %u > recvd %u\n", count, recvd);
                count = recvd;
        }
-        dprintk("RPC:      readres OK count %d\n", count);
+        dprintk("RPC:      readres OK count %u\n", count);
        if (count < res->count)
                res->count = count;
@@ -423,9 +425,10 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
        struct page **page;
-        int hdrlen, recvd;
+        size_t hdrlen;
+        unsigned int pglen, recvd;
+        u32 len;
        int status, nr;
-        unsigned int len, pglen;
        __be32 *end, *entry, *kaddr;
        if ((status = ntohl(*p++)))
@@ -434,7 +437,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READDIR reply header overflowed:"
-                                "length %d > %Zu\n", hdrlen, iov->iov_len);
+                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -576,7 +579,8 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
-        int hdrlen, len, recvd;
+        size_t hdrlen;
+        u32 len, recvd;
        char    *kaddr;
        int     status;
@@ -584,14 +588,14 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
                return -nfs_stat_to_errno(status);
        /* Convert length of symlink */
        len = ntohl(*p++);
-        if (len >= rcvbuf->page_len || len <= 0) {
+        if (len >= rcvbuf->page_len) {
                dprintk("nfs: server returned giant symlink!\n");
                return -ENAMETOOLONG;
        }
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READLINK reply header overflowed:"
-                                "length %d > %Zu\n", hdrlen, iov->iov_len);
+                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4cdc2361a669..549dbce714a4 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -27,17 +27,14 @@
 static int
 nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 {
-        sigset_t oldset;
        int res;
-        rpc_clnt_sigmask(clnt, &oldset);
        do {
                res = rpc_call_sync(clnt, msg, flags);
                if (res != -EJUKEBOX)
                        break;
-                schedule_timeout_interruptible(NFS_JUKEBOX_RETRY_TIME);
+                schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
                res = -ERESTARTSYS;
-        } while (!signalled());
+        } while (!fatal_signal_pending(current));
-        rpc_clnt_sigunmask(clnt, &oldset);
        return res;
 }
@@ -732,16 +729,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
        return 0;
 }
-static void nfs3_proc_read_setup(struct nfs_read_data *data)
+static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-        struct rpc_message      msg = {
+        msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
-                .rpc_proc       = &nfs3_procedures[NFS3PROC_READ],
-                .rpc_argp       = &data->args,
-                .rpc_resp       = &data->res,
-                .rpc_cred       = data->cred,
-        };
-        rpc_call_setup(&data->task, &msg, 0);
 }
 static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -753,24 +743,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
-static void nfs3_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-        struct rpc_message      msg = {
+        msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
-                .rpc_proc       = &nfs3_procedures[NFS3PROC_WRITE],
-                .rpc_argp       = &data->args,
-                .rpc_resp       = &data->res,
-                .rpc_cred       = data->cred,
-        };
-        data->args.stable = NFS_UNSTABLE;
-        if (how & FLUSH_STABLE) {
-                data->args.stable = NFS_FILE_SYNC;
-                if (NFS_I(data->inode)->ncommit)
-                        data->args.stable = NFS_DATA_SYNC;
-        }
-        /* Finalize the task. */
-        rpc_call_setup(&data->task, &msg, 0);
 }
 static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -781,22 +756,17 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
-static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-        struct rpc_message      msg = {
+        msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
-                .rpc_proc       = &nfs3_procedures[NFS3PROC_COMMIT],
-                .rpc_argp       = &data->args,
-                .rpc_resp       = &data->res,
-                .rpc_cred       = data->cred,
-        };
-        rpc_call_setup(&data->task, &msg, 0);
 }
 static int
 nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-        return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl);
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
 const struct nfs_rpc_ops nfs_v3_clientops = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 616d3267b7e7..3917e2fa4e40 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -506,9 +506,9 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
        struct page **page;
-        int hdrlen, recvd;
+        size_t hdrlen;
+        u32 len, recvd, pglen;
        int status, nr;
-        unsigned int len, pglen;
        __be32 *entry, *end, *kaddr;
        status = ntohl(*p++);
@@ -527,7 +527,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READDIR reply header overflowed:"
-                                "length %d > %Zu\n", hdrlen, iov->iov_len);
+                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -549,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
                len = ntohl(*p++);              /* string length */
                p += XDR_QUADLEN(len) + 2;      /* name + cookie */
                if (len > NFS3_MAXNAMLEN) {
-                        dprintk("NFS: giant filename in readdir (len %x)!\n",
+                        dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
                                                len);
                        goto err_unmap;
                }
@@ -570,7 +570,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
                                len = ntohl(*p++);
                                if (len > NFS3_FHSIZE) {
                                        dprintk("NFS: giant filehandle in "
-                                                "readdir (len %x)!\n", len);
+                                                "readdir (len 0x%x)!\n", len);
                                        goto err_unmap;
                                }
                                p += XDR_QUADLEN(len);
@@ -815,7 +815,8 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
-        int hdrlen, len, recvd;
+        size_t hdrlen;
+        u32 len, recvd;
        char    *kaddr;
        int     status;
@@ -827,7 +828,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
        /* Convert length of symlink */
        len = ntohl(*p++);
-        if (len >= rcvbuf->page_len || len <= 0) {
+        if (len >= rcvbuf->page_len) {
                dprintk("nfs: server returned giant symlink!\n");
                return -ENAMETOOLONG;
        }
@@ -835,7 +836,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READLINK reply header overflowed:"
-                                "length %d > %Zu\n", hdrlen, iov->iov_len);
+                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READLINK header is short. "
@@ -863,7 +864,9 @@ static int
 nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
        struct kvec *iov = req->rq_rcv_buf.head;
-        int     status, count, ocount, recvd, hdrlen;
+        size_t hdrlen;
+        u32 count, ocount, recvd;
+        int status;
        status = ntohl(*p++);
        p = xdr_decode_post_op_attr(p, res->fattr);
@@ -871,7 +874,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        if (status != 0)
                return -nfs_stat_to_errno(status);
-        /* Decode reply could and EOF flag. NFSv3 is somewhat redundant
+        /* Decode reply count and EOF flag. NFSv3 is somewhat redundant
         * in that it puts the count both in the res struct and in the
         * opaque data count. */
        count    = ntohl(*p++);
@@ -886,7 +889,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READ reply header overflowed:"
-                                "length %d > %Zu\n", hdrlen, iov->iov_len);
+                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -896,7 +899,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
                dprintk("NFS: server cheating in read reply: "
-                        "count %d > recvd %d\n", count, recvd);
+                        "count %u > recvd %u\n", count, recvd);
                count = recvd;
                res->eof = 0;
        }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b35069a2aa9e..bd1b9d663fb9 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -115,6 +115,7 @@ struct nfs4_lock_state {
 #define NFS_LOCK_INITIALIZED 1
        int                     ls_flags;
        struct nfs_seqid_counter        ls_seqid;
+        struct rpc_sequence     ls_sequence;
        struct nfs_unique_id    ls_id;
        nfs4_stateid            ls_stateid;
        atomic_t                ls_count;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index dd5fef20c702..5f9ba41ed5bf 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -114,10 +114,7 @@ static inline int valid_ipaddr4(const char *buf)
 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
 * @mnt_parent - mountpoint of parent directory
 * @dentry - parent directory
- * @fspath - fs path returned in fs_locations
+ * @locations - array of NFSv4 server location information
- * @mntpath - mount path to new server
- * @hostname - hostname of new server
- * @addr - host addr of new server
 *
 */
 static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
@@ -131,7 +128,8 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
                .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
        };
        char *page = NULL, *page2 = NULL;
-        int loc, s, error;
+        unsigned int s;
+        int loc, error;
        if (locations == NULL || locations->nlocations <= 0)
                goto out;
@@ -174,7 +172,10 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
                s = 0;
                while (s < location->nservers) {
-                        struct sockaddr_in addr = {};
+                        struct sockaddr_in addr = {
+                                .sin_family     = AF_INET,
+                                .sin_port       = htons(NFS_PORT),
+                        };
                        if (location->servers[s].len <= 0 ||
                            valid_ipaddr4(location->servers[s].data) < 0) {
@@ -183,10 +184,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
                        }
                        mountdata.hostname = location->servers[s].data;
-                        addr.sin_addr.s_addr = in_aton(mountdata.hostname);
+                        addr.sin_addr.s_addr = in_aton(mountdata.hostname);
-                        addr.sin_family = AF_INET;
+                        mountdata.addr = (struct sockaddr *)&addr;
-                        addr.sin_port = htons(NFS_PORT);
+                        mountdata.addrlen = sizeof(addr);
-                        mountdata.addr = &addr;
                        snprintf(page, PAGE_SIZE, "%s:%s",
                                        mountdata.hostname,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f03d9d5f5ba4..027e1095256e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -210,7 +210,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
        spin_lock(&dir->i_lock);
        nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
        if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
-                nfsi->cache_change_attribute = jiffies;
+                nfs_force_lookup_revalidate(dir);
        nfsi->change_attr = cinfo->after;
        spin_unlock(&dir->i_lock);
 }
@@ -316,12 +316,9 @@ static void nfs4_opendata_put(struct nfs4_opendata *p)
 static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
 {
-        sigset_t oldset;
        int ret;
-        rpc_clnt_sigmask(task->tk_client, &oldset);
        ret = rpc_wait_for_completion_task(task);
-        rpc_clnt_sigunmask(task->tk_client, &oldset);
        return ret;
 }
@@ -718,19 +715,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
        return err;
 }
-static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
-{
-        struct nfs4_opendata *data = calldata;
-        struct  rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
-                .rpc_argp = &data->c_arg,
-                .rpc_resp = &data->c_res,
-                .rpc_cred = data->owner->so_cred,
-        };
-        data->timestamp = jiffies;
-        rpc_call_setup(task, &msg, 0);
-}
 static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_opendata *data = calldata;
@@ -741,10 +725,10 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
        if (data->rpc_status == 0) {
                memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
                                sizeof(data->o_res.stateid.data));
+                nfs_confirm_seqid(&data->owner->so_seqid, 0);
                renew_lease(data->o_res.server, data->timestamp);
                data->rpc_done = 1;
        }
-        nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status);
        nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
 }
@@ -759,7 +743,6 @@ static void nfs4_open_confirm_release(void *calldata)
        /* In case of error, no cleanup! */
        if (!data->rpc_done)
                goto out_free;
-        nfs_confirm_seqid(&data->owner->so_seqid, 0);
        state = nfs4_opendata_to_nfs4_state(data);
        if (!IS_ERR(state))
                nfs4_close_state(&data->path, state, data->o_arg.open_flags);
@@ -768,7 +751,6 @@ out_free:
 }
 static const struct rpc_call_ops nfs4_open_confirm_ops = {
-        .rpc_call_prepare = nfs4_open_confirm_prepare,
        .rpc_call_done = nfs4_open_confirm_done,
        .rpc_release = nfs4_open_confirm_release,
 };
@@ -780,12 +762,26 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
 {
        struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
        struct rpc_task *task;
+        struct  rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+                .rpc_argp = &data->c_arg,
+                .rpc_resp = &data->c_res,
+                .rpc_cred = data->owner->so_cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = server->client,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_open_confirm_ops,
+                .callback_data = data,
+                .flags = RPC_TASK_ASYNC,
+        };
        int status;
        kref_get(&data->kref);
        data->rpc_done = 0;
        data->rpc_status = 0;
-        task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data);
+        data->timestamp = jiffies;
+        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = nfs4_wait_for_completion_rpc_task(task);
@@ -802,13 +798,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs4_opendata *data = calldata;
        struct nfs4_state_owner *sp = data->owner;
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
-                .rpc_argp = &data->o_arg,
-                .rpc_resp = &data->o_res,
-                .rpc_cred = sp->so_cred,
-        };
-        
        if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
                return;
        /*
@@ -833,11 +823,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
        data->o_arg.id = sp->so_owner_id.id;
        data->o_arg.clientid = sp->so_client->cl_clientid;
        if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
-                msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+                task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
                nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
        }
        data->timestamp = jiffies;
-        rpc_call_setup(task, &msg, 0);
+        rpc_call_start(task);
        return;
 out_no_action:
        task->tk_action = NULL;
@@ -886,7 +876,6 @@ static void nfs4_open_release(void *calldata)
        /* In case we need an open_confirm, no cleanup! */
        if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)
                goto out_free;
-        nfs_confirm_seqid(&data->owner->so_seqid, 0);
        state = nfs4_opendata_to_nfs4_state(data);
        if (!IS_ERR(state))
                nfs4_close_state(&data->path, state, data->o_arg.open_flags);
@@ -910,13 +899,26 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
        struct nfs_openargs *o_arg = &data->o_arg;
        struct nfs_openres *o_res = &data->o_res;
        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
+                .rpc_argp = o_arg,
+                .rpc_resp = o_res,
+                .rpc_cred = data->owner->so_cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = server->client,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_open_ops,
+                .callback_data = data,
+                .flags = RPC_TASK_ASYNC,
+        };
        int status;
        kref_get(&data->kref);
        data->rpc_done = 0;
        data->rpc_status = 0;
        data->cancelled = 0;
-        task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data);
+        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = nfs4_wait_for_completion_rpc_task(task);
@@ -1246,12 +1248,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
        struct nfs4_closedata *calldata = data;
        struct nfs4_state *state = calldata->state;
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
-                .rpc_argp = &calldata->arg,
-                .rpc_resp = &calldata->res,
-                .rpc_cred = state->owner->so_cred,
-        };
        int clear_rd, clear_wr, clear_rdwr;
        if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
@@ -1278,14 +1274,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
        }
        nfs_fattr_init(calldata->res.fattr);
        if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
-                msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+                task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
                calldata->arg.open_flags = FMODE_READ;
        } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
-                msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+                task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
                calldata->arg.open_flags = FMODE_WRITE;
        }
        calldata->timestamp = jiffies;
-        rpc_call_setup(task, &msg, 0);
+        rpc_call_start(task);
 }
 static const struct rpc_call_ops nfs4_close_ops = {
@@ -1311,6 +1307,16 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        struct nfs4_closedata *calldata;
        struct nfs4_state_owner *sp = state->owner;
        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
+                .rpc_cred = state->owner->so_cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = server->client,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_close_ops,
+                .flags = RPC_TASK_ASYNC,
+        };
        int status = -ENOMEM;
        calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
@@ -1330,7 +1336,10 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        calldata->path.mnt = mntget(path->mnt);
        calldata->path.dentry = dget(path->dentry);
-        task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_close_ops, calldata);
+        msg.rpc_argp = &calldata->arg;
+        msg.rpc_resp = &calldata->res;
+        task_setup_data.callback_data = calldata;
+        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = 0;
@@ -2416,18 +2425,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
        return 0;
 }
-static void nfs4_proc_read_setup(struct nfs_read_data *data)
+static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ],
-                .rpc_argp = &data->args,
-                .rpc_resp = &data->res,
-                .rpc_cred = data->cred,
-        };
        data->timestamp   = jiffies;
+        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
-        rpc_call_setup(&data->task, &msg, 0);
 }
 static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2445,33 +2446,15 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
-static void nfs4_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-        struct rpc_message msg = {
+        struct nfs_server *server = NFS_SERVER(data->inode);
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE],
-                .rpc_argp = &data->args,
-                .rpc_resp = &data->res,
-                .rpc_cred = data->cred,
-        };
-        struct inode *inode = data->inode;
-        struct nfs_server *server = NFS_SERVER(inode);
-        int stable;
-        
-        if (how & FLUSH_STABLE) {
-                if (!NFS_I(inode)->ncommit)
-                        stable = NFS_FILE_SYNC;
-                else
-                        stable = NFS_DATA_SYNC;
-        } else
-                stable = NFS_UNSTABLE;
-        data->args.stable = stable;
        data->args.bitmask = server->attr_bitmask;
        data->res.server = server;
        data->timestamp   = jiffies;
-        /* Finalize the task. */
+        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
-        rpc_call_setup(&data->task, &msg, 0);
 }
 static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2486,20 +2469,13 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
-static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
-                .rpc_argp = &data->args,
-                .rpc_resp = &data->res,
-                .rpc_cred = data->cred,
-        };      
        struct nfs_server *server = NFS_SERVER(data->inode);
        
        data->args.bitmask = server->attr_bitmask;
        data->res.server = server;
+        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
-        rpc_call_setup(&data->task, &msg, 0);
 }
 /*
@@ -2806,9 +2782,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
        return 0;
 }
-static int nfs4_wait_bit_interruptible(void *word)
+static int nfs4_wait_bit_killable(void *word)
 {
-        if (signal_pending(current))
+        if (fatal_signal_pending(current))
                return -ERESTARTSYS;
        schedule();
        return 0;
@@ -2816,18 +2792,14 @@ static int nfs4_wait_bit_interruptible(void *word)
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
 {
-        sigset_t oldset;
        int res;
        might_sleep();
        rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
-        rpc_clnt_sigmask(clnt, &oldset);
        res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
-                        nfs4_wait_bit_interruptible,
+                        nfs4_wait_bit_killable, TASK_KILLABLE);
-                        TASK_INTERRUPTIBLE);
-        rpc_clnt_sigunmask(clnt, &oldset);
        rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
        return res;
@@ -2835,7 +2807,6 @@ static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
 static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 {
-        sigset_t oldset;
        int res = 0;
        might_sleep();
@@ -2844,14 +2815,9 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
                *timeout = NFS4_POLL_RETRY_MIN;
        if (*timeout > NFS4_POLL_RETRY_MAX)
                *timeout = NFS4_POLL_RETRY_MAX;
-        rpc_clnt_sigmask(clnt, &oldset);
+        schedule_timeout_killable(*timeout);
-        if (clnt->cl_intr) {
+        if (fatal_signal_pending(current))
-                schedule_timeout_interruptible(*timeout);
+                res = -ERESTARTSYS;
-                if (signalled())
-                        res = -ERESTARTSYS;
-        } else
-                schedule_timeout_uninterruptible(*timeout);
-        rpc_clnt_sigunmask(clnt, &oldset);
        *timeout <<= 1;
        return res;
 }
@@ -2912,14 +2878,20 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
        for(;;) {
                setclientid.sc_name_len = scnprintf(setclientid.sc_name,
-                                sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u",
+                                sizeof(setclientid.sc_name), "%s/%s %s %s %u",
-                                clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr),
+                                clp->cl_ipaddr,
+                                rpc_peeraddr2str(clp->cl_rpcclient,
+                                                        RPC_DISPLAY_ADDR),
+                                rpc_peeraddr2str(clp->cl_rpcclient,
+                                                        RPC_DISPLAY_PROTO),
                                cred->cr_ops->cr_name,
                                clp->cl_id_uniquifier);
                setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-                                sizeof(setclientid.sc_netid), "tcp");
+                                sizeof(setclientid.sc_netid),
+                                rpc_peeraddr2str(clp->cl_rpcclient,
+                                                        RPC_DISPLAY_NETID));
                setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
-                                sizeof(setclientid.sc_uaddr), "%s.%d.%d",
+                                sizeof(setclientid.sc_uaddr), "%s.%u.%u",
                                clp->cl_ipaddr, port >> 8, port & 255);
                status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
@@ -2983,25 +2955,11 @@ struct nfs4_delegreturndata {
        struct nfs4_delegreturnres res;
        struct nfs_fh fh;
        nfs4_stateid stateid;
-        struct rpc_cred *cred;
        unsigned long timestamp;
        struct nfs_fattr fattr;
        int rpc_status;
 };
-static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata)
-{
-        struct nfs4_delegreturndata *data = calldata;
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
-                .rpc_argp = &data->args,
-                .rpc_resp = &data->res,
-                .rpc_cred = data->cred,
-        };
-        nfs_fattr_init(data->res.fattr);
-        rpc_call_setup(task, &msg, 0);
-}
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_delegreturndata *data = calldata;
@@ -3012,24 +2970,30 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 static void nfs4_delegreturn_release(void *calldata)
 {
-        struct nfs4_delegreturndata *data = calldata;
-        put_rpccred(data->cred);
        kfree(calldata);
 }
 static const struct rpc_call_ops nfs4_delegreturn_ops = {
-        .rpc_call_prepare = nfs4_delegreturn_prepare,
        .rpc_call_done = nfs4_delegreturn_done,
        .rpc_release = nfs4_delegreturn_release,
 };
-static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
+static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
 {
        struct nfs4_delegreturndata *data;
        struct nfs_server *server = NFS_SERVER(inode);
        struct rpc_task *task;
-        int status;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
+                .rpc_cred = cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = server->client,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_delegreturn_ops,
+                .flags = RPC_TASK_ASYNC,
+        };
+        int status = 0;
        data = kmalloc(sizeof(*data), GFP_KERNEL);
        if (data == NULL)
@@ -3041,30 +3005,37 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        memcpy(&data->stateid, stateid, sizeof(data->stateid));
        data->res.fattr = &data->fattr;
        data->res.server = server;
-        data->cred = get_rpccred(cred);
+        nfs_fattr_init(data->res.fattr);
        data->timestamp = jiffies;
        data->rpc_status = 0;
-        task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data);
+        task_setup_data.callback_data = data;
+        msg.rpc_argp = &data->args,
+        msg.rpc_resp = &data->res,
+        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
+        if (!issync)
+                goto out;
        status = nfs4_wait_for_completion_rpc_task(task);
-        if (status == 0) {
+        if (status != 0)
-                status = data->rpc_status;
+                goto out;
-                if (status == 0)
+        status = data->rpc_status;
-                        nfs_refresh_inode(inode, &data->fattr);
+        if (status != 0)
-        }
+                goto out;
+        nfs_refresh_inode(inode, &data->fattr);
+out:
        rpc_put_task(task);
        return status;
 }
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
 {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs4_exception exception = { };
        int err;
        do {
-                err = _nfs4_proc_delegreturn(inode, cred, stateid);
+                err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
                switch (err) {
                        case -NFS4ERR_STALE_STATEID:
                        case -NFS4ERR_EXPIRED:
@@ -3085,7 +3056,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-        schedule_timeout_interruptible(timeout);
+        schedule_timeout_killable(timeout);
        timeout <<= 1;
        if (timeout > NFS4_LOCK_MAXTIMEOUT)
                return NFS4_LOCK_MAXTIMEOUT;
@@ -3232,12 +3203,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 {
        struct nfs4_unlockdata *calldata = data;
-        struct rpc_message msg = {
-                .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
-                .rpc_argp       = &calldata->arg,
-                .rpc_resp       = &calldata->res,
-                .rpc_cred       = calldata->lsp->ls_state->owner->so_cred,
-        };
        if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
                return;
@@ -3247,7 +3212,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
                return;
        }
        calldata->timestamp = jiffies;
-        rpc_call_setup(task, &msg, 0);
+        rpc_call_start(task);
 }
 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -3262,6 +3227,16 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
                struct nfs_seqid *seqid)
 {
        struct nfs4_unlockdata *data;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
+                .rpc_cred = ctx->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = NFS_CLIENT(lsp->ls_state->inode),
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_locku_ops,
+                .flags = RPC_TASK_ASYNC,
+        };
        /* Ensure this is an unlock - when canceling a lock, the
         * canceled lock is passed in, and it won't be an unlock.
@@ -3274,7 +3249,10 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
                return ERR_PTR(-ENOMEM);
        }
-        return rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data);
+        msg.rpc_argp = &data->arg,
+        msg.rpc_resp = &data->res,
+        task_setup_data.callback_data = data;
+        return rpc_run_task(&task_setup_data);
 }
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
@@ -3333,9 +3311,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->arg.fh = NFS_FH(inode);
        p->arg.fl = &p->fl;
+        p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+        if (p->arg.open_seqid == NULL)
+                goto out_free;
        p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
        if (p->arg.lock_seqid == NULL)
-                goto out_free;
+                goto out_free_seqid;
        p->arg.lock_stateid = &lsp->ls_stateid;
        p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
        p->arg.lock_owner.id = lsp->ls_id.id;
@@ -3344,6 +3325,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->ctx = get_nfs_open_context(ctx);
        memcpy(&p->fl, fl, sizeof(p->fl));
        return p;
+out_free_seqid:
+        nfs_free_seqid(p->arg.open_seqid);
 out_free:
        kfree(p);
        return NULL;
@@ -3353,31 +3336,20 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs4_lockdata *data = calldata;
        struct nfs4_state *state = data->lsp->ls_state;
-        struct nfs4_state_owner *sp = state->owner;
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
-                .rpc_argp = &data->arg,
-                .rpc_resp = &data->res,
-                .rpc_cred = sp->so_cred,
-        };
+        dprintk("%s: begin!\n", __FUNCTION__);
        if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
                return;
-        dprintk("%s: begin!\n", __FUNCTION__);
        /* Do we need to do an open_to_lock_owner? */
        if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
-                data->arg.open_seqid = nfs_alloc_seqid(&sp->so_seqid);
+                if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
-                if (data->arg.open_seqid == NULL) {
+                        return;
-                        data->rpc_status = -ENOMEM;
-                        task->tk_action = NULL;
-                        goto out;
-                }
                data->arg.open_stateid = &state->stateid;
                data->arg.new_lock_owner = 1;
-        }
+        } else
+                data->arg.new_lock_owner = 0;
        data->timestamp = jiffies;
-        rpc_call_setup(task, &msg, 0);
+        rpc_call_start(task);
-out:
        dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
 }
@@ -3413,8 +3385,7 @@ static void nfs4_lock_release(void *calldata)
        struct nfs4_lockdata *data = calldata;
        dprintk("%s: begin!\n", __FUNCTION__);
-        if (data->arg.open_seqid != NULL)
+        nfs_free_seqid(data->arg.open_seqid);
-                nfs_free_seqid(data->arg.open_seqid);
        if (data->cancelled != 0) {
                struct rpc_task *task;
                task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
@@ -3440,6 +3411,16 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 {
        struct nfs4_lockdata *data;
        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
+                .rpc_cred = state->owner->so_cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = NFS_CLIENT(state->inode),
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_lock_ops,
+                .flags = RPC_TASK_ASYNC,
+        };
        int ret;
        dprintk("%s: begin!\n", __FUNCTION__);
@@ -3451,8 +3432,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
                data->arg.block = 1;
        if (reclaim != 0)
                data->arg.reclaim = 1;
-        task = rpc_run_task(NFS_CLIENT(state->inode), RPC_TASK_ASYNC,
+        msg.rpc_argp = &data->arg,
-                        &nfs4_lock_ops, data);
+        msg.rpc_resp = &data->res,
+        task_setup_data.callback_data = data;
+        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        ret = nfs4_wait_for_completion_rpc_task(task);
@@ -3625,10 +3608,6 @@ int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
        if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
                return -EOPNOTSUPP;
-        if (!S_ISREG(inode->i_mode) &&
-            (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
-                return -EPERM;
        return nfs4_proc_set_acl(inode, buf, buflen);
 }
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3ea352d82eba..5e2e4af1a0e6 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -133,9 +133,7 @@ nfs4_renewd_prepare_shutdown(struct nfs_server *server)
 void
 nfs4_kill_renewd(struct nfs_client *clp)
 {
-        down_read(&clp->cl_sem);
        cancel_delayed_work_sync(&clp->cl_renewd);
-        up_read(&clp->cl_sem);
 }
 /*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 23a9a36556bf..f9c7432471dc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -509,7 +509,10 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
        lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
        if (lsp == NULL)
                return NULL;
-        lsp->ls_seqid.sequence = &state->owner->so_sequence;
+        rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
+        spin_lock_init(&lsp->ls_sequence.lock);
+        INIT_LIST_HEAD(&lsp->ls_sequence.list);
+        lsp->ls_seqid.sequence = &lsp->ls_sequence;
        atomic_set(&lsp->ls_count, 1);
        lsp->ls_owner = fl_owner;
        spin_lock(&clp->cl_lock);
@@ -641,27 +644,26 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
 {
-        struct rpc_sequence *sequence = counter->sequence;
        struct nfs_seqid *new;
        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (new != NULL) {
                new->sequence = counter;
-                spin_lock(&sequence->lock);
+                INIT_LIST_HEAD(&new->list);
-                list_add_tail(&new->list, &sequence->list);
-                spin_unlock(&sequence->lock);
        }
        return new;
 }
 void nfs_free_seqid(struct nfs_seqid *seqid)
 {
-        struct rpc_sequence *sequence = seqid->sequence->sequence;
+        if (!list_empty(&seqid->list)) {
+                struct rpc_sequence *sequence = seqid->sequence->sequence;
-        spin_lock(&sequence->lock);
+                spin_lock(&sequence->lock);
-        list_del(&seqid->list);
+                list_del(&seqid->list);
-        spin_unlock(&sequence->lock);
+                spin_unlock(&sequence->lock);
-        rpc_wake_up(&sequence->wait);
+                rpc_wake_up(&sequence->wait);
+        }
        kfree(seqid);
 }
@@ -672,6 +674,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
 */
 static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 {
+        BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid);
        switch (status) {
                case 0:
                        break;
@@ -723,15 +726,15 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
        struct rpc_sequence *sequence = seqid->sequence->sequence;
        int status = 0;
-        if (sequence->list.next == &seqid->list)
-                goto out;
        spin_lock(&sequence->lock);
-        if (sequence->list.next != &seqid->list) {
+        if (list_empty(&seqid->list))
-                rpc_sleep_on(&sequence->wait, task, NULL, NULL);
+                list_add_tail(&seqid->list, &sequence->list);
-                status = -EAGAIN;
+        if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
-        }
+                goto unlock;
+        rpc_sleep_on(&sequence->wait, task, NULL, NULL);
+        status = -EAGAIN;
+unlock:
        spin_unlock(&sequence->lock);
-out:
        return status;
 }
@@ -755,8 +758,9 @@ static void nfs4_recover_state(struct nfs_client *clp)
        __module_get(THIS_MODULE);
        atomic_inc(&clp->cl_count);
-        task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim",
+        task = kthread_run(reclaimer, clp, "%s-reclaim",
-                        NIPQUAD(clp->cl_addr.sin_addr));
+                                rpc_peeraddr2str(clp->cl_rpcclient,
+                                                        RPC_DISPLAY_ADDR));
        if (!IS_ERR(task))
                return;
        nfs4_clear_recover_bit(clp);
@@ -967,8 +971,8 @@ out:
        module_put_and_exit(0);
        return 0;
 out_error:
-        printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
+        printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
-                                NIPQUAD(clp->cl_addr.sin_addr), -status);
+                        " with error %d\n", clp->cl_hostname, -status);
        set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
        goto out;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 51dd3804866f..db1ed9c46ede 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -116,10 +116,12 @@ static int nfs4_stat_to_errno(int);
 #define decode_renew_maxsz      (op_decode_hdr_maxsz)
 #define encode_setclientid_maxsz \
                                (op_encode_hdr_maxsz + \
-                                4 /*server->ip_addr*/ + \
+                                XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
-                                1 /*Netid*/ + \
+                                XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
-                                6 /*uaddr*/ + \
+                                1 /* sc_prog */ + \
-                                6 + (NFS4_VERIFIER_SIZE >> 2))
+                                XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+                                XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
+                                1) /* sc_cb_ident */
 #define decode_setclientid_maxsz \
                                (op_decode_hdr_maxsz + \
                                2 + \
@@ -2515,14 +2517,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 {
-        int n;
+        u32 n;
        __be32 *p;
        int status = 0;
        READ_BUF(4);
        READ32(n);
-        if (n < 0)
-                goto out_eio;
        if (n == 0)
                goto root_path;
        dprintk("path ");
@@ -2579,13 +2579,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
                goto out_eio;
        res->nlocations = 0;
        while (res->nlocations < n) {
-                int m;
+                u32 m;
                struct nfs4_fs_location *loc = &res->locations[res->nlocations];
                READ_BUF(4);
                READ32(m);
-                if (m <= 0)
-                        goto out_eio;
                loc->nservers = 0;
                dprintk("%s: servers ", __FUNCTION__);
@@ -2598,8 +2596,12 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
                        if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
                                loc->nservers++;
                        else {
-                                int i;
+                                unsigned int i;
-                                dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+                                dprintk("%s: using first %u of %u servers "
+                                        "returned for location %u\n",
+                                                __FUNCTION__,
+                                                NFS4_FS_LOCATION_MAXSERVERS,
+                                                m, res->nlocations);
                                for (i = loc->nservers; i < m; i++) {
                                        unsigned int len;
                                        char *data;
@@ -3476,10 +3478,11 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
        struct xdr_buf  *rcvbuf = &req->rq_rcv_buf;
        struct page     *page = *rcvbuf->pages;
        struct kvec     *iov = rcvbuf->head;
-        unsigned int    nr, pglen = rcvbuf->page_len;
+        size_t          hdrlen;
+        u32             recvd, pglen = rcvbuf->page_len;
        __be32          *end, *entry, *p, *kaddr;
-        uint32_t        len, attrlen, xlen;
+        unsigned int    nr;
-        int             hdrlen, recvd, status;
+        int             status;
        status = decode_op_hdr(xdr, OP_READDIR);
        if (status)
@@ -3503,6 +3506,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
        end = p + ((pglen + readdir->pgbase) >> 2);
        entry = p;
        for (nr = 0; *p++; nr++) {
+                u32 len, attrlen, xlen;
                if (end - p < 3)
                        goto short_pkt;
                dprintk("cookie = %Lu, ", *((unsigned long long *)p));
@@ -3551,7 +3555,8 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 {
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
-        int hdrlen, len, recvd;
+        size_t hdrlen;
+        u32 len, recvd;
        __be32 *p;
        char *kaddr;
        int status;
@@ -3646,7 +3651,8 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
        if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
                return -EIO;
        if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
-                int hdrlen, recvd;
+                size_t hdrlen;
+                u32 recvd;
                /* We ignore &savep and don't do consistency checks on
                 * the attr length.  Let userspace figure it out.... */
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 4b0334590ee5..531379d36823 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -228,10 +228,7 @@ static int __init root_nfs_parse(char *name, char *buf)
                                nfs_data.flags &= ~NFS_MOUNT_SOFT;
                                break;
                        case Opt_intr:
-                                nfs_data.flags |= NFS_MOUNT_INTR;
-                                break;
                        case Opt_nointr:
-                                nfs_data.flags &= ~NFS_MOUNT_INTR;
                                break;
                        case Opt_posix:
                                nfs_data.flags |= NFS_MOUNT_POSIX;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 345bb9b4765b..7f079209d70a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -58,7 +58,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
                   struct page *page,
                   unsigned int offset, unsigned int count)
 {
-        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_page         *req;
        for (;;) {
@@ -67,7 +66,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
                if (req != NULL)
                        break;
-                if (signalled() && (server->flags & NFS_MOUNT_INTR))
+                if (fatal_signal_pending(current))
                        return ERR_PTR(-ERESTARTSYS);
                yield();
        }
@@ -111,13 +110,14 @@ void nfs_unlock_request(struct nfs_page *req)
 * nfs_set_page_tag_locked - Tag a request as locked
 * @req:
 */
-static int nfs_set_page_tag_locked(struct nfs_page *req)
+int nfs_set_page_tag_locked(struct nfs_page *req)
 {
        struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
-        if (!nfs_lock_request(req))
+        if (!nfs_lock_request_dontget(req))
                return 0;
-        radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+        if (req->wb_page != NULL)
+                radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
        return 1;
 }
@@ -132,9 +132,10 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
        if (req->wb_page != NULL) {
                spin_lock(&inode->i_lock);
                radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+                nfs_unlock_request(req);
                spin_unlock(&inode->i_lock);
-        }
+        } else
-        nfs_unlock_request(req);
+                nfs_unlock_request(req);
 }
 /**
@@ -175,11 +176,11 @@ void nfs_release_request(struct nfs_page *req)
        kref_put(&req->wb_kref, nfs_free_request);
 }
-static int nfs_wait_bit_interruptible(void *word)
+static int nfs_wait_bit_killable(void *word)
 {
        int ret = 0;
-        if (signal_pending(current))
+        if (fatal_signal_pending(current))
                ret = -ERESTARTSYS;
        else
                schedule();
@@ -190,26 +191,18 @@ static int nfs_wait_bit_interruptible(void *word)
 * nfs_wait_on_request - Wait for a request to complete.
 * @req: request to wait upon.
 *
- * Interruptible by signals only if mounted with intr flag.
+ * Interruptible by fatal signals only.
 * The user is responsible for holding a count on the request.
 */
 int
 nfs_wait_on_request(struct nfs_page *req)
 {
-        struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->path.dentry->d_inode);
-        sigset_t oldmask;
        int ret = 0;
        if (!test_bit(PG_BUSY, &req->wb_flags))
                goto out;
-        /*
-         * Note: the call to rpc_clnt_sigmask() suffices to ensure that we
-         *       are not interrupted if intr flag is not set
-         */
-        rpc_clnt_sigmask(clnt, &oldmask);
        ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
-                        nfs_wait_bit_interruptible, TASK_INTERRUPTIBLE);
+                        nfs_wait_bit_killable, TASK_KILLABLE);
-        rpc_clnt_sigunmask(clnt, &oldmask);
 out:
        return ret;
 }
@@ -421,6 +414,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
                                goto out;
                        idx_start = req->wb_index + 1;
                        if (nfs_set_page_tag_locked(req)) {
+                                kref_get(&req->wb_kref);
                                nfs_list_remove_request(req);
                                radix_tree_tag_clear(&nfsi->nfs_page_tree,
                                                req->wb_index, tag);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4f80d88e9fee..5ccf7faee19c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -565,16 +565,9 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
        return 0;
 }
-static void nfs_proc_read_setup(struct nfs_read_data *data)
+static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-        struct rpc_message      msg = {
+        msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
-                .rpc_proc       = &nfs_procedures[NFSPROC_READ],
-                .rpc_argp       = &data->args,
-                .rpc_resp       = &data->res,
-                .rpc_cred       = data->cred,
-        };
-        rpc_call_setup(&data->task, &msg, 0);
 }
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -584,24 +577,15 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
-static void nfs_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-        struct rpc_message      msg = {
-                .rpc_proc       = &nfs_procedures[NFSPROC_WRITE],
-                .rpc_argp       = &data->args,
-                .rpc_resp       = &data->res,
-                .rpc_cred       = data->cred,
-        };
        /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
        data->args.stable = NFS_FILE_SYNC;
+        msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
-        /* Finalize the task. */
-        rpc_call_setup(&data->task, &msg, 0);
 }
 static void
-nfs_proc_commit_setup(struct nfs_write_data *data, int how)
+nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
        BUG();
 }
@@ -609,7 +593,9 @@ nfs_proc_commit_setup(struct nfs_write_data *data, int how)
 static int
 nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-        return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl);
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4587a86adaac..8fd6dfbe1bc3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -160,12 +160,26 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
                const struct rpc_call_ops *call_ops,
                unsigned int count, unsigned int offset)
 {
-        struct inode            *inode;
+        struct inode *inode = req->wb_context->path.dentry->d_inode;
-        int flags;
+        int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_argp = &data->args,
+                .rpc_resp = &data->res,
+                .rpc_cred = req->wb_context->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .task = &data->task,
+                .rpc_client = NFS_CLIENT(inode),
+                .rpc_message = &msg,
+                .callback_ops = call_ops,
+                .callback_data = data,
+                .flags = RPC_TASK_ASYNC | swap_flags,
+        };
        data->req         = req;
-        data->inode       = inode = req->wb_context->path.dentry->d_inode;
+        data->inode       = inode;
-        data->cred        = req->wb_context->cred;
+        data->cred        = msg.rpc_cred;
        data->args.fh     = NFS_FH(inode);
        data->args.offset = req_offset(req) + offset;
@@ -180,11 +194,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
        nfs_fattr_init(&data->fattr);
        /* Set up the initial task struct. */
-        flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
+        NFS_PROTO(inode)->read_setup(data, &msg);
-        rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
-        NFS_PROTO(inode)->read_setup(data);
-        data->task.tk_cookie = (unsigned long)inode;
        dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
                        data->task.tk_pid,
@@ -192,6 +202,10 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
                        (long long)NFS_FILEID(inode),
                        count,
                        (unsigned long long)data->args.offset);
+        task = rpc_run_task(&task_setup_data);
+        if (!IS_ERR(task))
+                rpc_put_task(task);
 }
 static void
@@ -208,19 +222,6 @@ nfs_async_read_error(struct list_head *head)
 }
 /*
- * Start an async read operation
- */
-static void nfs_execute_read(struct nfs_read_data *data)
-{
-        struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-        sigset_t oldset;
-        rpc_clnt_sigmask(clnt, &oldset);
-        rpc_execute(&data->task);
-        rpc_clnt_sigunmask(clnt, &oldset);
-}
-/*
 * Generate multiple requests to fill a single page.
 *
 * We optimize to reduce the number of read operations on the wire.  If we
@@ -274,7 +275,6 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
                                  rsize, offset);
                offset += rsize;
                nbytes -= rsize;
-                nfs_execute_read(data);
        } while (nbytes != 0);
        return 0;
@@ -312,8 +312,6 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
        req = nfs_list_entry(data->pages.next);
        nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
-        nfs_execute_read(data);
        return 0;
 out_bad:
        nfs_async_read_error(head);
@@ -338,7 +336,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
        nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);
        if (task->tk_status == -ESTALE) {
-                set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
+                set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags);
                nfs_mark_for_revalidate(data->inode);
        }
        return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fa517ae9207f..7f4505f6ac6f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -45,6 +45,8 @@
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
@@ -83,11 +85,11 @@ enum {
        Opt_actimeo,
        Opt_namelen,
        Opt_mountport,
-        Opt_mountprog, Opt_mountvers,
+        Opt_mountvers,
-        Opt_nfsprog, Opt_nfsvers,
+        Opt_nfsvers,
        /* Mount options that take string arguments */
-        Opt_sec, Opt_proto, Opt_mountproto,
+        Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
        Opt_addr, Opt_mountaddr, Opt_clientaddr,
        /* Mount options that are ignored */
@@ -137,9 +139,7 @@ static match_table_t nfs_mount_option_tokens = {
        { Opt_userspace, "retry=%u" },
        { Opt_namelen, "namlen=%u" },
        { Opt_mountport, "mountport=%u" },
-        { Opt_mountprog, "mountprog=%u" },
        { Opt_mountvers, "mountvers=%u" },
-        { Opt_nfsprog, "nfsprog=%u" },
        { Opt_nfsvers, "nfsvers=%u" },
        { Opt_nfsvers, "vers=%u" },
@@ -148,7 +148,7 @@ static match_table_t nfs_mount_option_tokens = {
        { Opt_mountproto, "mountproto=%s" },
        { Opt_addr, "addr=%s" },
        { Opt_clientaddr, "clientaddr=%s" },
-        { Opt_userspace, "mounthost=%s" },
+        { Opt_mounthost, "mounthost=%s" },
        { Opt_mountaddr, "mountaddr=%s" },
        { Opt_err, NULL }
@@ -202,6 +202,7 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
 static int nfs_xdev_get_sb(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs_kill_super(struct super_block *);
+static void nfs_put_super(struct super_block *);
 static struct file_system_type nfs_fs_type = {
        .owner          = THIS_MODULE,
@@ -223,6 +224,7 @@ static const struct super_operations nfs_sops = {
        .alloc_inode    = nfs_alloc_inode,
        .destroy_inode  = nfs_destroy_inode,
        .write_inode    = nfs_write_inode,
+        .put_super      = nfs_put_super,
        .statfs         = nfs_statfs,
        .clear_inode    = nfs_clear_inode,
        .umount_begin   = nfs_umount_begin,
@@ -325,6 +327,28 @@ void __exit unregister_nfs_fs(void)
        unregister_filesystem(&nfs_fs_type);
 }
+/* Count one more outstanding operation against this server. */
+void nfs_sb_active(struct nfs_server *server)
+{
+        atomic_t *active = &server->active;
+        atomic_inc(active);
+}
+/*
+ * Drop one outstanding-operation count on this server and, if that was
+ * the last one, wake anybody sleeping on server->active_wq.
+ */
+void nfs_sb_deactive(struct nfs_server *server)
+{
+        if (!atomic_dec_and_test(&server->active))
+                return;
+        wake_up(&server->active_wq);
+}
+static void nfs_put_super(struct super_block *sb)
+{
+        struct nfs_server *nfss = NFS_SB(sb);
+        /*
+         * Hold up the unmount until the server's count of outstanding
+         * operations has dropped to zero.
+         */
+        wait_event(nfss->active_wq, !atomic_read(&nfss->active));
+}
 /*
 * Deliver file system statistics to userspace
 */
@@ -424,7 +448,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
                const char *nostr;
        } nfs_info[] = {
                { NFS_MOUNT_SOFT, ",soft", ",hard" },
-                { NFS_MOUNT_INTR, ",intr", ",nointr" },
                { NFS_MOUNT_NOCTO, ",nocto", "" },
                { NFS_MOUNT_NOAC, ",noac", "" },
                { NFS_MOUNT_NONLM, ",nolock", "" },
@@ -455,8 +478,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        }
        seq_printf(m, ",proto=%s",
                   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
-        seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
+        seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
-        seq_printf(m, ",retrans=%u", clp->retrans_count);
+        seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
        seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
 }
@@ -469,8 +492,9 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
        nfs_show_mount_options(m, nfss, 0);
-        seq_printf(m, ",addr="NIPQUAD_FMT,
+        seq_printf(m, ",addr=%s",
-                NIPQUAD(nfss->nfs_client->cl_addr.sin_addr));
+                        rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
+                                                        RPC_DISPLAY_ADDR));
        return 0;
 }
@@ -507,7 +531,7 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
        seq_printf(m, ",namelen=%d", nfss->namelen);
 #ifdef CONFIG_NFS_V4
-        if (nfss->nfs_client->cl_nfsversion == 4) {
+        if (nfss->nfs_client->rpc_ops->version == 4) {
                seq_printf(m, "\n\tnfsv4:\t");
                seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
                seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
@@ -575,16 +599,40 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 }
 /*
- * Sanity-check a server address provided by the mount command
+ * Set the port number in an address.  Be agnostic about the address family.
+ */
+static void nfs_set_port(struct sockaddr *sap, unsigned short port)
+{
+        /* Only AF_INET and AF_INET6 are handled; any other family is left untouched. */
+        if (sap->sa_family == AF_INET)
+                ((struct sockaddr_in *)sap)->sin_port = htons(port);
+        else if (sap->sa_family == AF_INET6)
+                ((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
+}
+/*
+ * Sanity-check a server address provided by the mount command.
+ *
+ * Address family must be initialized, and address must not be
+ * the ANY address for that family.
 */
 static int nfs_verify_server_address(struct sockaddr *addr)
 {
        switch (addr->sa_family) {
        case AF_INET: {
-                struct sockaddr_in *sa = (struct sockaddr_in *) addr;
+                struct sockaddr_in *sa = (struct sockaddr_in *)addr;
-                if (sa->sin_addr.s_addr != INADDR_ANY)
+                return sa->sin_addr.s_addr != INADDR_ANY;
-                        return 1;
+        }
-                break;
+        case AF_INET6: {
+                struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr;
+                return !ipv6_addr_any(sa);
        }
        }
@@ -592,6 +640,40 @@ static int nfs_verify_server_address(struct sockaddr *addr)
 }
 /*
+ * Parse string addresses passed in via a mount option,
+ * and construct a sockaddr based on the result.
+ *
+ * If address parsing fails, set the sockaddr's address
+ * family to AF_UNSPEC to force nfs_verify_server_address()
+ * to punt the mount.
+ */
+static void nfs_parse_server_address(char *value,
+                                     struct sockaddr *sap,
+                                     size_t *len)
+{
+        int parsed;
+        /* A ':' anywhere in the string selects the IPv6 parser. */
+        if (strchr(value, ':') != NULL) {
+                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+                u8 *dst = (u8 *)&sin6->sin6_addr.in6_u;
+                sin6->sin6_family = AF_INET6;
+                *len = sizeof(*sin6);
+                parsed = in6_pton(value, -1, dst, '\0', NULL);
+        } else {
+                struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+                u8 *dst = (u8 *)&sin->sin_addr.s_addr;
+                sin->sin_family = AF_INET;
+                *len = sizeof(*sin);
+                parsed = in4_pton(value, -1, dst, '\0', NULL);
+        }
+        if (parsed)
+                return;
+        /* Parse failure: mark the address unusable and report zero length. */
+        sap->sa_family = AF_UNSPEC;
+        *len = 0;
+}
+/*
 * Error-check and convert a string of mount options from user space into
 * a data structure
 */
@@ -599,6 +681,7 @@ static int nfs_parse_mount_options(char *raw,
                                   struct nfs_parsed_mount_data *mnt)
 {
        char *p, *string;
+        unsigned short port = 0;
        if (!raw) {
                dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -624,10 +707,7 @@ static int nfs_parse_mount_options(char *raw,
                        mnt->flags &= ~NFS_MOUNT_SOFT;
                        break;
                case Opt_intr:
-                        mnt->flags |= NFS_MOUNT_INTR;
-                        break;
                case Opt_nointr:
-                        mnt->flags &= ~NFS_MOUNT_INTR;
                        break;
                case Opt_posix:
                        mnt->flags |= NFS_MOUNT_POSIX;
@@ -701,7 +781,7 @@ static int nfs_parse_mount_options(char *raw,
                                return 0;
                        if (option < 0 || option > 65535)
                                return 0;
-                        mnt->nfs_server.address.sin_port = htons(option);
+                        port = option;
                        break;
                case Opt_rsize:
                        if (match_int(args, &mnt->rsize))
@@ -763,13 +843,6 @@ static int nfs_parse_mount_options(char *raw,
                                return 0;
                        mnt->mount_server.port = option;
                        break;
-                case Opt_mountprog:
-                        if (match_int(args, &option))
-                                return 0;
-                        if (option < 0)
-                                return 0;
-                        mnt->mount_server.program = option;
-                        break;
                case Opt_mountvers:
                        if (match_int(args, &option))
                                return 0;
@@ -777,13 +850,6 @@ static int nfs_parse_mount_options(char *raw,
                                return 0;
                        mnt->mount_server.version = option;
                        break;
-                case Opt_nfsprog:
-                        if (match_int(args, &option))
-                                return 0;
-                        if (option < 0)
-                                return 0;
-                        mnt->nfs_server.program = option;
-                        break;
                case Opt_nfsvers:
                        if (match_int(args, &option))
                                return 0;
@@ -927,24 +993,32 @@ static int nfs_parse_mount_options(char *raw,
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
-                        mnt->nfs_server.address.sin_family = AF_INET;
+                        nfs_parse_server_address(string, (struct sockaddr *)
-                        mnt->nfs_server.address.sin_addr.s_addr =
+                                                 &mnt->nfs_server.address,
-                                                        in_aton(string);
+                                                 &mnt->nfs_server.addrlen);
                        kfree(string);
                        break;
                case Opt_clientaddr:
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
+                        kfree(mnt->client_address);
                        mnt->client_address = string;
                        break;
+                case Opt_mounthost:
+                        string = match_strdup(args);
+                        if (string == NULL)
+                                goto out_nomem;
+                        kfree(mnt->mount_server.hostname);
+                        mnt->mount_server.hostname = string;
+                        break;
                case Opt_mountaddr:
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
-                        mnt->mount_server.address.sin_family = AF_INET;
+                        nfs_parse_server_address(string, (struct sockaddr *)
-                        mnt->mount_server.address.sin_addr.s_addr =
+                                                 &mnt->mount_server.address,
-                                                        in_aton(string);
+                                                 &mnt->mount_server.addrlen);
                        kfree(string);
                        break;
@@ -957,6 +1031,8 @@ static int nfs_parse_mount_options(char *raw,
                }
        }
+        nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port);
        return 1;
 out_nomem:
@@ -987,7 +1063,8 @@ out_unknown:
 static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                         struct nfs_fh *root_fh)
 {
-        struct sockaddr_in sin;
+        struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address;
+        char *hostname;
        int status;
        if (args->mount_server.version == 0) {
@@ -997,25 +1074,32 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                        args->mount_server.version = NFS_MNT_VERSION;
        }
+        if (args->mount_server.hostname)
+                hostname = args->mount_server.hostname;
+        else
+                hostname = args->nfs_server.hostname;
        /*
         * Construct the mount server's address.
         */
-        if (args->mount_server.address.sin_addr.s_addr != INADDR_ANY)
+        if (args->mount_server.address.ss_family == AF_UNSPEC) {
-                sin = args->mount_server.address;
+                memcpy(sap, &args->nfs_server.address,
-        else
+                       args->nfs_server.addrlen);
-                sin = args->nfs_server.address;
+                args->mount_server.addrlen = args->nfs_server.addrlen;
+        }
        /*
         * autobind will be used if mount_server.port == 0
         */
-        sin.sin_port = htons(args->mount_server.port);
+        nfs_set_port(sap, args->mount_server.port);
        /*
         * Now ask the mount server to map our export path
         * to a file handle.
         */
-        status = nfs_mount((struct sockaddr *) &sin,
+        status = nfs_mount(sap,
-                           sizeof(sin),
+                           args->mount_server.addrlen,
-                           args->nfs_server.hostname,
+                           hostname,
                           args->nfs_server.export_path,
                           args->mount_server.version,
                           args->mount_server.protocol,
@@ -1023,8 +1107,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
        if (status == 0)
                return 0;
-        dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT
+        /* Report the failure by hostname; end the line so later debug output starts fresh. */
+        dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
-                        ", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status);
+                        hostname, status);
        return status;
 }
@@ -1043,9 +1127,6 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 *
 * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
 *   mountproto=tcp after mountproto=udp, and so on
- *
- * XXX: as far as I can tell, changing the NFS program number is not
- *      supported in the NFS client.
 */
 static int nfs_validate_mount_data(void *options,
                                   struct nfs_parsed_mount_data *args,
@@ -1054,10 +1135,11 @@ static int nfs_validate_mount_data(void *options,
 {
        struct nfs_mount_data *data = (struct nfs_mount_data *)options;
+        memset(args, 0, sizeof(*args));
        if (data == NULL)
                goto out_no_data;
-        memset(args, 0, sizeof(*args));
        args->flags             = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
        args->rsize             = NFS_MAX_FILE_IO_SIZE;
        args->wsize             = NFS_MAX_FILE_IO_SIZE;
@@ -1068,9 +1150,7 @@ static int nfs_validate_mount_data(void *options,
        args->acdirmin          = 30;
        args->acdirmax          = 60;
        args->mount_server.protocol = XPRT_TRANSPORT_UDP;
-        args->mount_server.program = NFS_MNT_PROGRAM;
        args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-        args->nfs_server.program = NFS_PROGRAM;
        switch (data->version) {
        case 1:
@@ -1101,9 +1181,6 @@ static int nfs_validate_mount_data(void *options,
                        memset(mntfh->data + mntfh->size, 0,
                               sizeof(mntfh->data) - mntfh->size);
-                if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
-                        goto out_no_address;
                /*
                 * Translate to nfs_parsed_mount_data, which nfs_fill_super
                 * can deal with.
@@ -1118,7 +1195,14 @@ static int nfs_validate_mount_data(void *options,
                args->acregmax          = data->acregmax;
                args->acdirmin          = data->acdirmin;
                args->acdirmax          = data->acdirmax;
-                args->nfs_server.address = data->addr;
+                memcpy(&args->nfs_server.address, &data->addr,
+                       sizeof(data->addr));
+                args->nfs_server.addrlen = sizeof(data->addr);
+                if (!nfs_verify_server_address((struct sockaddr *)
+                                                &args->nfs_server.address))
+                        goto out_no_address;
                if (!(data->flags & NFS_MOUNT_TCP))
                        args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
                /* N.B. caller will free nfs_server.hostname in all cases */
@@ -1321,15 +1405,50 @@ static int nfs_set_super(struct super_block *s, void *data)
        return ret;
 }
+/*
+ * Compare the client addresses behind two nfs_server structures.
+ * Returns 1 when the address family, address and port all match,
+ * and 0 otherwise (including for any family other than IPv4/IPv6).
+ */
+static int nfs_compare_super_address(struct nfs_server *server1,
+                                     struct nfs_server *server2)
+{
+        struct sockaddr *a = (struct sockaddr *)&server1->nfs_client->cl_addr;
+        struct sockaddr *b = (struct sockaddr *)&server2->nfs_client->cl_addr;
+        if (a->sa_family != b->sa_family)
+                return 0;
+        switch (a->sa_family) {
+        case AF_INET: {
+                struct sockaddr_in *in_a = (struct sockaddr_in *)a;
+                struct sockaddr_in *in_b = (struct sockaddr_in *)b;
+                return in_a->sin_addr.s_addr == in_b->sin_addr.s_addr &&
+                       in_a->sin_port == in_b->sin_port;
+        }
+        case AF_INET6: {
+                struct sockaddr_in6 *in6_a = (struct sockaddr_in6 *)a;
+                struct sockaddr_in6 *in6_b = (struct sockaddr_in6 *)b;
+                return ipv6_addr_equal(&in6_a->sin6_addr, &in6_b->sin6_addr) &&
+                       in6_a->sin6_port == in6_b->sin6_port;
+        }
+        }
+        return 0;
+}
 static int nfs_compare_super(struct super_block *sb, void *data)
 {
        struct nfs_sb_mountdata *sb_mntdata = data;
        struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
        int mntflags = sb_mntdata->mntflags;
-        if (memcmp(&old->nfs_client->cl_addr,
+        if (!nfs_compare_super_address(old, server))
-                                &server->nfs_client->cl_addr,
-                                sizeof(old->nfs_client->cl_addr)) != 0)
                return 0;
        /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
        if (old->flags & NFS_MOUNT_UNSHARED)
@@ -1399,6 +1518,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 out:
        kfree(data.nfs_server.hostname);
+        kfree(data.mount_server.hostname);
        return error;
 out_err_nosb:
@@ -1474,6 +1594,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
                error = PTR_ERR(mntroot);
                goto error_splat_super;
        }
+        if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
+                dput(mntroot);
+                error = -ESTALE;
+                goto error_splat_super;
+        }
        s->s_flags |= MS_ACTIVE;
        mnt->mnt_sb = s;
@@ -1522,19 +1647,43 @@ static void nfs4_fill_super(struct super_block *sb)
 }
 /*
+ * If the user didn't specify a port, set the port number to
+ * the NFS version 4 default port.
+ */
+static void nfs4_default_port(struct sockaddr *sap)
+{
+        /* A zero port is replaced with NFS_PORT; a non-zero port is kept as is. */
+        if (sap->sa_family == AF_INET) {
+                struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+                if (!sin->sin_port)
+                        sin->sin_port = htons(NFS_PORT);
+        } else if (sap->sa_family == AF_INET6) {
+                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+                if (!sin6->sin6_port)
+                        sin6->sin6_port = htons(NFS_PORT);
+        }
+}
+/*
 * Validate NFSv4 mount options
 */
 static int nfs4_validate_mount_data(void *options,
                                    struct nfs_parsed_mount_data *args,
                                    const char *dev_name)
 {
+        struct sockaddr_in *ap;
        struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
        char *c;
+        memset(args, 0, sizeof(*args));
        if (data == NULL)
                goto out_no_data;
-        memset(args, 0, sizeof(*args));
        args->rsize             = NFS_MAX_FILE_IO_SIZE;
        args->wsize             = NFS_MAX_FILE_IO_SIZE;
        args->timeo             = 600;
@@ -1547,18 +1696,21 @@ static int nfs4_validate_mount_data(void *options,
        switch (data->version) {
        case 1:
-                if (data->host_addrlen != sizeof(args->nfs_server.address))
+                ap = (struct sockaddr_in *)&args->nfs_server.address;
+                if (data->host_addrlen > sizeof(args->nfs_server.address))
                        goto out_no_address;
-                if (copy_from_user(&args->nfs_server.address,
+                if (data->host_addrlen == 0)
-                                   data->host_addr,
+                        goto out_no_address;
-                                   sizeof(args->nfs_server.address)))
+                args->nfs_server.addrlen = data->host_addrlen;
+                if (copy_from_user(ap, data->host_addr, data->host_addrlen))
                        return -EFAULT;
-                if (args->nfs_server.address.sin_port == 0)
-                        args->nfs_server.address.sin_port = htons(NFS_PORT);
                if (!nfs_verify_server_address((struct sockaddr *)
                                                &args->nfs_server.address))
                        goto out_no_address;
+                nfs4_default_port((struct sockaddr *)
+                                  &args->nfs_server.address);
                switch (data->auth_flavourlen) {
                case 0:
                        args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1616,6 +1768,9 @@ static int nfs4_validate_mount_data(void *options,
                                                &args->nfs_server.address))
                        return -EINVAL;
+                nfs4_default_port((struct sockaddr *)
+                                  &args->nfs_server.address);
                switch (args->auth_flavor_len) {
                case 0:
                        args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1636,21 +1791,16 @@ static int nfs4_validate_mount_data(void *options,
                len = c - dev_name;
                if (len > NFS4_MAXNAMLEN)
                        return -ENAMETOOLONG;
-                args->nfs_server.hostname = kzalloc(len, GFP_KERNEL);
+                /* N.B. caller will free nfs_server.hostname in all cases */
-                if (args->nfs_server.hostname == NULL)
+                args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
-                        return -ENOMEM;
-                strncpy(args->nfs_server.hostname, dev_name, len - 1);
                c++;                    /* step over the ':' */
                len = strlen(c);
                if (len > NFS4_MAXPATHLEN)
                        return -ENAMETOOLONG;
-                args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL);
+                args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
-                if (args->nfs_server.export_path == NULL)
-                        return -ENOMEM;
-                strncpy(args->nfs_server.export_path, c, len);
-                dprintk("MNTPATH: %s\n", args->nfs_server.export_path);
+                dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
                if (args->client_address == NULL)
                        goto out_no_client_address;
@@ -1819,6 +1969,11 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
                error = PTR_ERR(mntroot);
                goto error_splat_super;
        }
+        if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
+                dput(mntroot);
+                error = -ESTALE;
+                goto error_splat_super;
+        }
        s->s_flags |= MS_ACTIVE;
        mnt->mnt_sb = s;
@@ -1893,6 +2048,11 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
                error = PTR_ERR(mntroot);
                goto error_splat_super;
        }
+        if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
+                dput(mntroot);
+                error = -ESTALE;
+                goto error_splat_super;
+        }
        s->s_flags |= MS_ACTIVE;
        mnt->mnt_sb = s;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 233ad38161f9..757415363422 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,8 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include "internal.h"
 struct nfs_unlinkdata {
        struct hlist_node list;
        struct nfs_removeargs args;
@@ -69,24 +71,6 @@ static void nfs_dec_sillycount(struct inode *dir)
 }
 /**
- * nfs_async_unlink_init - Initialize the RPC info
- * task: rpc_task of the sillydelete
- */
-static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
-{
-        struct nfs_unlinkdata *data = calldata;
-        struct inode *dir = data->dir;
-        struct rpc_message msg = {
-                .rpc_argp = &data->args,
-                .rpc_resp = &data->res,
-                .rpc_cred = data->cred,
-        };
-        NFS_PROTO(dir)->unlink_setup(&msg, dir);
-        rpc_call_setup(task, &msg, 0);
-}
-/**
 * nfs_async_unlink_done - Sillydelete post-processing
 * @task: rpc_task of the sillydelete
 *
@@ -113,32 +97,45 @@ static void nfs_async_unlink_release(void *calldata)
        struct nfs_unlinkdata   *data = calldata;
        nfs_dec_sillycount(data->dir);
+        nfs_sb_deactive(NFS_SERVER(data->dir));
        nfs_free_unlinkdata(data);
 }
 static const struct rpc_call_ops nfs_unlink_ops = {
-        .rpc_call_prepare = nfs_async_unlink_init,
        .rpc_call_done = nfs_async_unlink_done,
        .rpc_release = nfs_async_unlink_release,
 };
 static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
 {
+        struct rpc_message msg = {
+                .rpc_argp = &data->args,
+                .rpc_resp = &data->res,
+                .rpc_cred = data->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_message = &msg,
+                .callback_ops = &nfs_unlink_ops,
+                .callback_data = data,
+                .flags = RPC_TASK_ASYNC,
+        };
        struct rpc_task *task;
        struct dentry *alias;
        alias = d_lookup(parent, &data->args.name);
        if (alias != NULL) {
                int ret = 0;
                /*
                 * Hey, we raced with lookup... See if we need to transfer
                 * the sillyrename information to the aliased dentry.
                 */
                nfs_free_dname(data);
                spin_lock(&alias->d_lock);
-                if (!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+                if (alias->d_inode != NULL &&
+                    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
                        alias->d_fsdata = data;
-                        alias->d_flags ^= DCACHE_NFSFS_RENAMED;
+                        alias->d_flags |= DCACHE_NFSFS_RENAMED;
                        ret = 1;
                }
                spin_unlock(&alias->d_lock);
@@ -151,10 +148,14 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
                nfs_dec_sillycount(dir);
                return 0;
        }
+        nfs_sb_active(NFS_SERVER(dir));
        data->args.fh = NFS_FH(dir);
        nfs_fattr_init(&data->res.dir_attr);
-        task = rpc_run_task(NFS_CLIENT(dir), RPC_TASK_ASYNC, &nfs_unlink_ops, data);
+        NFS_PROTO(dir)->unlink_setup(&msg, dir);
+        task_setup_data.rpc_client = NFS_CLIENT(dir);
+        task = rpc_run_task(&task_setup_data);
        if (!IS_ERR(task))
                rpc_put_task(task);
        return 1;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 89527a487ed7..522efff3e2c5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -196,7 +196,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
        }
        /* Update file length */
        nfs_grow_file(page, offset, count);
-        nfs_unlock_request(req);
+        nfs_clear_page_tag_locked(req);
        return 0;
 }
@@ -252,7 +252,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                                struct page *page)
 {
        struct inode *inode = page->mapping->host;
-        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_page *req;
        int ret;
@@ -263,10 +262,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                        spin_unlock(&inode->i_lock);
                        return 0;
                }
-                if (nfs_lock_request_dontget(req))
+                if (nfs_set_page_tag_locked(req))
                        break;
                /* Note: If we hold the page lock, as is the case in nfs_writepage,
-                 *       then the call to nfs_lock_request_dontget() will always
+                 *       then the call to nfs_set_page_tag_locked() will always
                 *       succeed provided that someone hasn't already marked the
                 *       request as dirty (in which case we don't care).
                 */
@@ -280,7 +279,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
        if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
                /* This request is marked for commit */
                spin_unlock(&inode->i_lock);
-                nfs_unlock_request(req);
+                nfs_clear_page_tag_locked(req);
                nfs_pageio_complete(pgio);
                return 0;
        }
@@ -288,8 +287,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                spin_unlock(&inode->i_lock);
                BUG();
        }
-        radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
-                        NFS_PAGE_TAG_LOCKED);
        spin_unlock(&inode->i_lock);
        nfs_pageio_add_request(pgio, req);
        return 0;
@@ -381,6 +378,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
        set_page_private(req->wb_page, (unsigned long)req);
        nfsi->npages++;
        kref_get(&req->wb_kref);
+        radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
        return 0;
 }
@@ -490,7 +488,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
 /*
 * Wait for a request to complete.
 *
- * Interruptible by signals only if mounted with intr flag.
+ * Interruptible by fatal signals only.
 */
 static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
 {
@@ -596,7 +594,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
                spin_lock(&inode->i_lock);
                req = nfs_page_find_request_locked(page);
                if (req) {
-                        if (!nfs_lock_request_dontget(req)) {
+                        if (!nfs_set_page_tag_locked(req)) {
                                int error;
                                spin_unlock(&inode->i_lock);
@@ -646,7 +644,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
            || req->wb_page != page
            || !nfs_dirty_request(req)
            || offset > rqend || end < req->wb_offset) {
-                nfs_unlock_request(req);
+                nfs_clear_page_tag_locked(req);
                return ERR_PTR(-EBUSY);
        }
@@ -755,7 +753,7 @@ static void nfs_writepage_release(struct nfs_page *req)
        nfs_clear_page_tag_locked(req);
 }
-static inline int flush_task_priority(int how)
+static int flush_task_priority(int how)
 {
        switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
                case FLUSH_HIGHPRI:
@@ -775,15 +773,31 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
                unsigned int count, unsigned int offset,
                int how)
 {
-        struct inode            *inode;
+        struct inode *inode = req->wb_context->path.dentry->d_inode;
-        int flags;
+        int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+        int priority = flush_task_priority(how);
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_argp = &data->args,
+                .rpc_resp = &data->res,
+                .rpc_cred = req->wb_context->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = NFS_CLIENT(inode),
+                .task = &data->task,
+                .rpc_message = &msg,
+                .callback_ops = call_ops,
+                .callback_data = data,
+                .flags = flags,
+                .priority = priority,
+        };
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
        data->req = req;
        data->inode = inode = req->wb_context->path.dentry->d_inode;
-        data->cred = req->wb_context->cred;
+        data->cred = msg.rpc_cred;
        data->args.fh     = NFS_FH(inode);
        data->args.offset = req_offset(req) + offset;
@@ -791,6 +805,12 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
        data->args.pages  = data->pagevec;
        data->args.count  = count;
        data->args.context = req->wb_context;
+        data->args.stable  = NFS_UNSTABLE;
+        if (how & FLUSH_STABLE) {
+                data->args.stable = NFS_DATA_SYNC;
+                if (!NFS_I(inode)->ncommit)
+                        data->args.stable = NFS_FILE_SYNC;
+        }
        data->res.fattr   = &data->fattr;
        data->res.count   = count;
@@ -798,12 +818,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
        nfs_fattr_init(&data->fattr);
        /* Set up the initial task struct.  */
-        flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+        NFS_PROTO(inode)->write_setup(data, &msg);
-        rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
-        NFS_PROTO(inode)->write_setup(data, how);
-        data->task.tk_priority = flush_task_priority(how);
-        data->task.tk_cookie = (unsigned long)inode;
        dprintk("NFS: %5u initiated write call "
                "(req %s/%Ld, %u bytes @ offset %Lu)\n",
@@ -812,16 +827,10 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
                (long long)NFS_FILEID(inode),
                count,
                (unsigned long long)data->args.offset);
-}
-static void nfs_execute_write(struct nfs_write_data *data)
-{
-        struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-        sigset_t oldset;
-        rpc_clnt_sigmask(clnt, &oldset);
+        task = rpc_run_task(&task_setup_data);
-        rpc_execute(&data->task);
+        if (!IS_ERR(task))
-        rpc_clnt_sigunmask(clnt, &oldset);
+                rpc_put_task(task);
 }
 /*
@@ -868,7 +877,6 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
                                   wsize, offset, how);
                offset += wsize;
                nbytes -= wsize;
-                nfs_execute_write(data);
        } while (nbytes != 0);
        return 0;
@@ -916,7 +924,6 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
        /* Set up the argument struct */
        nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
-        nfs_execute_write(data);
        return 0;
 out_bad:
        while (!list_empty(head)) {
@@ -932,7 +939,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
                                  struct inode *inode, int ioflags)
 {
-        int wsize = NFS_SERVER(inode)->wsize;
+        size_t wsize = NFS_SERVER(inode)->wsize;
        if (wsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
@@ -1146,19 +1153,33 @@ static void nfs_commit_rpcsetup(struct list_head *head,
                struct nfs_write_data *data,
                int how)
 {
-        struct nfs_page         *first;
+        struct nfs_page *first = nfs_list_entry(head->next);
-        struct inode            *inode;
+        struct inode *inode = first->wb_context->path.dentry->d_inode;
-        int flags;
+        int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+        int priority = flush_task_priority(how);
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_argp = &data->args,
+                .rpc_resp = &data->res,
+                .rpc_cred = first->wb_context->cred,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .task = &data->task,
+                .rpc_client = NFS_CLIENT(inode),
+                .rpc_message = &msg,
+                .callback_ops = &nfs_commit_ops,
+                .callback_data = data,
+                .flags = flags,
+                .priority = priority,
+        };
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
        list_splice_init(head, &data->pages);
-        first = nfs_list_entry(data->pages.next);
-        inode = first->wb_context->path.dentry->d_inode;
        data->inode       = inode;
-        data->cred        = first->wb_context->cred;
+        data->cred        = msg.rpc_cred;
        data->args.fh     = NFS_FH(data->inode);
        /* Note: we always request a commit of the entire inode */
@@ -1170,14 +1191,13 @@ static void nfs_commit_rpcsetup(struct list_head *head,
        nfs_fattr_init(&data->fattr);
        /* Set up the initial task struct.  */
-        flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+        NFS_PROTO(inode)->commit_setup(data, &msg);
-        rpc_init_task(&data->task, NFS_CLIENT(inode), flags, &nfs_commit_ops, data);
-        NFS_PROTO(inode)->commit_setup(data, how);
-        data->task.tk_priority = flush_task_priority(how);
-        data->task.tk_cookie = (unsigned long)inode;
-        
        dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+        task = rpc_run_task(&task_setup_data);
+        if (!IS_ERR(task))
+                rpc_put_task(task);
 }
 /*
@@ -1197,7 +1217,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
        /* Set up the argument struct */
        nfs_commit_rpcsetup(head, data, how);
-        nfs_execute_write(data);
        return 0;
 out_bad:
        while (!list_empty(head)) {
@@ -1436,7 +1455,8 @@ out:
        return ret;
 }
-int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
+static int nfs_wb_page_priority(struct inode *inode, struct page *page,
+                                int how)
 {
        loff_t range_start = page_offset(page);
        loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
new file mode 100644
index 000000000000..78b3c0e93822
--- /dev/null
+++ b/fs/nfsd/auth.h
@@ -0,0 +1,22 @@
+/*
+ * nfsd-specific authentication stuff.
+ * uid/gid mapping not yet implemented.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef LINUX_NFSD_AUTH_H
+#define LINUX_NFSD_AUTH_H
+#define nfsd_luid(rq, uid)      ((u32)(uid))
+#define nfsd_lgid(rq, gid)      ((u32)(gid))
+#define nfsd_ruid(rq, uid)      ((u32)(uid))
+#define nfsd_rgid(rq, gid)      ((u32)(gid))
+/*
+ * Set the current process's fsuid/fsgid etc to those of the NFS
+ * client user
+ */
+int nfsd_setuser(struct svc_rqst *, struct svc_export *);
+#endif /* LINUX_NFSD_AUTH_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 66d0aeb32a47..79b4bf812960 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1357,8 +1357,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
        mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
        exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
-        if (PTR_ERR(exp) == -ENOENT)
-                return nfserr_perm;
        if (IS_ERR(exp))
                return nfserrno(PTR_ERR(exp));
        rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
@@ -1637,13 +1635,19 @@ exp_verify_string(char *cp, int max)
 /*
 * Initialize the exports module.
 */
-void
+int
 nfsd_export_init(void)
 {
+        int rv;
        dprintk("nfsd: initializing export module.\n");
-        cache_register(&svc_export_cache);
+        rv = cache_register(&svc_export_cache);
-        cache_register(&svc_expkey_cache);
+        if (rv)
+                return rv;
+        rv = cache_register(&svc_expkey_cache);
+        if (rv)
+                cache_unregister(&svc_export_cache);
+        return rv;
 }
@@ -1670,10 +1674,8 @@ nfsd_export_shutdown(void)
        exp_writelock();
-        if (cache_unregister(&svc_expkey_cache))
+        cache_unregister(&svc_expkey_cache);
-                printk(KERN_ERR "nfsd: failed to unregister expkey cache\n");
+        cache_unregister(&svc_export_cache);
-        if (cache_unregister(&svc_export_cache))
-                printk(KERN_ERR "nfsd: failed to unregister export cache\n");
        svcauth_unix_purge();
        exp_writeunlock();
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b61742885011..1c3b7654e966 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -41,7 +41,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
        fh = fh_copy(&resp->fh, &argp->fh);
        if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
-                RETURN_STATUS(nfserr_inval);
+                RETURN_STATUS(nfserr);
        if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
                RETURN_STATUS(nfserr_inval);
@@ -221,12 +221,17 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
                struct nfsd3_getaclres *resp)
 {
        struct dentry *dentry = resp->fh.fh_dentry;
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        struct kvec *head = rqstp->rq_res.head;
        unsigned int base;
        int n;
        int w;
+        /*
+         * Since this is version 2, the check for nfserr in
+         * nfsd_dispatch actually ensures the following cannot happen.
+         * However, it seems fragile to depend on that.
+         */
        if (dentry == NULL || dentry->d_inode == NULL)
                return 0;
        inode = dentry->d_inode;
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 3e3f2de82c36..b647f2f872dc 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -37,7 +37,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
        fh = fh_copy(&resp->fh, &argp->fh);
        if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
-                RETURN_STATUS(nfserr_inval);
+                RETURN_STATUS(nfserr);
        if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
                RETURN_STATUS(nfserr_inval);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2d116d2298f8..d7647f70e02b 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -21,6 +21,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/xdr3.h>
+#include "auth.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
@@ -88,10 +89,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
 * no slashes or null bytes.
 */
 static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 {
        char            *name;
-        int             i;
+        unsigned int    i;
        if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
                for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -388,8 +389,11 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
         * Round the length of the data which was specified up to
         * the next multiple of XDR units and then compare that
         * against the length which was actually received.
+         * Note that when RPCSEC/GSS (for example) is used, the
+         * data buffer can be padded so dlen might be larger
+         * than required.  It must never be smaller.
         */
-        if (dlen != XDR_QUADLEN(len)*4)
+        if (dlen < XDR_QUADLEN(len)*4)
                return 0;
        if (args->count > max_blocksize) {
@@ -449,8 +453,7 @@ int
 nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd3_symlinkargs *args)
 {
-        unsigned int len;
+        unsigned int len, avail;
-        int avail;
        char *old, *new;
        struct kvec *vec;
@@ -483,7 +486,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
        /* now copy next page if there is one */
        if (len && !avail && rqstp->rq_arg.page_len) {
                avail = rqstp->rq_arg.page_len;
-                if (avail > PAGE_SIZE) avail = PAGE_SIZE;
+                if (avail > PAGE_SIZE)
+                        avail = PAGE_SIZE;
                old = page_address(rqstp->rq_arg.pages[0]);
        }
        while (len && avail && *old) {
@@ -813,11 +817,11 @@ static __be32 *
 encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
                struct svc_fh *fhp)
 {
-                p = encode_post_op_attr(cd->rqstp, p, fhp);
+        p = encode_post_op_attr(cd->rqstp, p, fhp);
-                *p++ = xdr_one;                 /* yes, a file handle follows */
+        *p++ = xdr_one;                 /* yes, a file handle follows */
-                p = encode_fh(p, fhp);
+        p = encode_fh(p, fhp);
-                fh_put(fhp);
+        fh_put(fhp);
-                return p;
+        return p;
 }
 static int
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 9d536a8cb379..aae2b29ae2c9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -350,30 +350,6 @@ static struct rpc_version *	nfs_cb_version[] = {
 static int do_probe_callback(void *data)
 {
        struct nfs4_client *clp = data;
-        struct nfs4_callback *cb = &clp->cl_callback;
-        struct rpc_message msg = {
-                .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-                .rpc_argp       = clp,
-        };
-        int status;
-        status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT);
-        if (status) {
-                rpc_shutdown_client(cb->cb_client);
-                cb->cb_client = NULL;
-        } else
-                atomic_set(&cb->cb_set, 1);
-        put_nfs4_client(clp);
-        return 0;
-}
-/*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
- */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
-{
        struct sockaddr_in      addr;
        struct nfs4_callback    *cb = &clp->cl_callback;
        struct rpc_timeout      timeparms = {
@@ -390,13 +366,15 @@ nfsd4_probe_callback(struct nfs4_client *clp)
                .timeout        = &timeparms,
                .program        = program,
                .version        = nfs_cb_version[1]->number,
-                .authflavor     = RPC_AUTH_UNIX,        /* XXX: need AUTH_GSS... */
+                .authflavor     = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
                .flags          = (RPC_CLNT_CREATE_NOPING),
        };
-        struct task_struct *t;
+        struct rpc_message msg = {
+                .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-        if (atomic_read(&cb->cb_set))
+                .rpc_argp       = clp,
-                return;
+        };
+        struct rpc_clnt *client;
+        int status;
        /* Initialize address */
        memset(&addr, 0, sizeof(addr));
@@ -416,29 +394,50 @@ nfsd4_probe_callback(struct nfs4_client *clp)
        program->stats->program = program;
        /* Create RPC client */
-        cb->cb_client = rpc_create(&args);
+        client = rpc_create(&args);
-        if (IS_ERR(cb->cb_client)) {
+        if (IS_ERR(client)) {
                dprintk("NFSD: couldn't create callback client\n");
+                status = PTR_ERR(client);
                goto out_err;
        }
+        status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
+        if (status)
+                goto out_release_client;
+        cb->cb_client = client;
+        atomic_set(&cb->cb_set, 1);
+        put_nfs4_client(clp);
+        return 0;
+out_release_client:
+        rpc_shutdown_client(client);
+out_err:
+        put_nfs4_client(clp);
+        dprintk("NFSD: warning: no callback path to client %.*s\n",
+                (int)clp->cl_name.len, clp->cl_name.data);
+        return status;
+}
+/*
+ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ */
+void
+nfsd4_probe_callback(struct nfs4_client *clp)
+{
+        struct task_struct *t;
+        BUG_ON(atomic_read(&clp->cl_callback.cb_set));
        /* the task holds a reference to the nfs4_client struct */
        atomic_inc(&clp->cl_count);
        t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe");
        if (IS_ERR(t))
-                goto out_release_clp;
+                atomic_dec(&clp->cl_count);
        return;
-out_release_clp:
-        atomic_dec(&clp->cl_count);
-        rpc_shutdown_client(cb->cb_client);
-out_err:
-        cb->cb_client = NULL;
-        dprintk("NFSD: warning: no callback path to client %.*s\n",
-                (int)clp->cl_name.len, clp->cl_name.data);
 }
 /*
@@ -458,9 +457,6 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
        int retries = 1;
        int status = 0;
-        if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt)
-                return;
        cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
        cbr->cbr_dp = dp;
@@ -469,6 +465,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
                switch (status) {
                        case -EIO:
                                /* Network partition? */
+                                atomic_set(&clp->cl_callback.cb_set, 0);
                        case -EBADHANDLE:
                        case -NFS4ERR_BAD_STATEID:
                                /* Race: client probably got cb_recall
@@ -481,11 +478,10 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
                status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
        }
 out_put_cred:
-        if (status == -EIO)
+        /*
-                atomic_set(&clp->cl_callback.cb_set, 0);
+         * Success or failure, now we're either waiting for lease expiration
-        /* Success or failure, now we're either waiting for lease expiration
+         * or deleg_return.
-         * or deleg_return. */
+         */
-        dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count));
        put_nfs4_client(clp);
        nfs4_put_delegation(dp);
        return;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4c0c683ce07a..996bd88b75ba 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -255,13 +255,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
                goto out;
        if (len == 0)
                set_bit(CACHE_NEGATIVE, &ent.h.flags);
-        else {
+        else if (len >= IDMAP_NAMESZ)
-                if (error >= IDMAP_NAMESZ) {
+                goto out;
-                        error = -EINVAL;
+        else
-                        goto out;
-                }
                memcpy(ent.name, buf1, sizeof(ent.name));
-        }
        error = -ENOMEM;
        res = idtoname_update(&ent, res);
        if (res == NULL)
@@ -467,20 +464,25 @@ nametoid_update(struct ent *new, struct ent *old)
 * Exported API
 */
-void
+int
 nfsd_idmap_init(void)
 {
-        cache_register(&idtoname_cache);
+        int rv;
-        cache_register(&nametoid_cache);
+        rv = cache_register(&idtoname_cache);
+        if (rv)
+                return rv;
+        rv = cache_register(&nametoid_cache);
+        if (rv)
+                cache_unregister(&idtoname_cache);
+        return rv;
 }
 void
 nfsd_idmap_shutdown(void)
 {
-        if (cache_unregister(&idtoname_cache))
+        cache_unregister(&idtoname_cache);
-                printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
+        cache_unregister(&nametoid_cache);
-        if (cache_unregister(&nametoid_cache))
-                printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
 }
 /*
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 18ead1790bb3..c593db047d8b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -750,7 +750,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                                    cstate->current_fh.fh_export,
                                    cstate->current_fh.fh_dentry, buf,
                                    &count, verify->ve_bmval,
-                                    rqstp);
+                                    rqstp, 0);
        /* this means that nfsd4_encode_fattr() ran out of space */
        if (status == nfserr_resource && count == 0)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 6f03918018a3..1602cd00dd45 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -43,7 +43,7 @@
 #include <linux/file.h>
 #include <linux/namei.h>
 #include <asm/uaccess.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 31673cd251c3..f6744bc03dae 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@ static time_t lease_time = 90;     /* default lease time */
 static time_t user_lease_time = 90;
 static time_t boot_time;
 static int in_grace = 1;
-static u32 current_clientid = 1;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
@@ -340,21 +339,20 @@ STALE_CLIENTID(clientid_t *clid)
 * This type of memory management is somewhat inefficient, but we use it
 * anyway since SETCLIENTID is not a common operation.
 */
-static inline struct nfs4_client *
+static struct nfs4_client *alloc_client(struct xdr_netobj name)
-alloc_client(struct xdr_netobj name)
 {
        struct nfs4_client *clp;
-        if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
+        clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
-                if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) {
+        if (clp == NULL)
-                        memcpy(clp->cl_name.data, name.data, name.len);
+                return NULL;
-                        clp->cl_name.len = name.len;
+        clp->cl_name.data = kmalloc(name.len, GFP_KERNEL);
-                }
+        if (clp->cl_name.data == NULL) {
-                else {
+                kfree(clp);
-                        kfree(clp);
+                return NULL;
-                        clp = NULL;
-                }
        }
+        memcpy(clp->cl_name.data, name.data, name.len);
+        clp->cl_name.len = name.len;
        return clp;
 }
@@ -363,8 +361,11 @@ shutdown_callback_client(struct nfs4_client *clp)
 {
        struct rpc_clnt *clnt = clp->cl_callback.cb_client;
-        /* shutdown rpc client, ending any outstanding recall rpcs */
        if (clnt) {
+                /*
+                 * Callback threads take a reference on the client, so there
+                 * should be no outstanding callbacks at this point.
+                 */
                clp->cl_callback.cb_client = NULL;
                rpc_shutdown_client(clnt);
        }
@@ -422,12 +423,13 @@ expire_client(struct nfs4_client *clp)
        put_nfs4_client(clp);
 }
-static struct nfs4_client *
+static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
-create_client(struct xdr_netobj name, char *recdir) {
+{
        struct nfs4_client *clp;
-        if (!(clp = alloc_client(name)))
+        clp = alloc_client(name);
-                goto out;
+        if (clp == NULL)
+                return NULL;
        memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
        atomic_set(&clp->cl_count, 1);
        atomic_set(&clp->cl_callback.cb_set, 0);
@@ -436,32 +438,30 @@ create_client(struct xdr_netobj name, char *recdir) {
        INIT_LIST_HEAD(&clp->cl_openowners);
        INIT_LIST_HEAD(&clp->cl_delegations);
        INIT_LIST_HEAD(&clp->cl_lru);
-out:
        return clp;
 }
-static void
+static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
-copy_verf(struct nfs4_client *target, nfs4_verifier *source) {
+{
-        memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data));
+        memcpy(target->cl_verifier.data, source->data,
+                        sizeof(target->cl_verifier.data));
 }
-static void
+static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
-copy_clid(struct nfs4_client *target, struct nfs4_client *source) {
+{
        target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; 
        target->cl_clientid.cl_id = source->cl_clientid.cl_id; 
 }
-static void
+static void copy_cred(struct svc_cred *target, struct svc_cred *source)
-copy_cred(struct svc_cred *target, struct svc_cred *source) {
+{
        target->cr_uid = source->cr_uid;
        target->cr_gid = source->cr_gid;
        target->cr_group_info = source->cr_group_info;
        get_group_info(target->cr_group_info);
 }
-static inline int
+static int same_name(const char *n1, const char *n2)
-same_name(const char *n1, const char *n2)
 {
        return 0 == memcmp(n1, n2, HEXDIR_LEN);
 }
@@ -485,26 +485,26 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
        return cr1->cr_uid == cr2->cr_uid;
 }
-static void
+static void gen_clid(struct nfs4_client *clp)
-gen_clid(struct nfs4_client *clp) {
+{
+        static u32 current_clientid = 1;
        clp->cl_clientid.cl_boot = boot_time;
        clp->cl_clientid.cl_id = current_clientid++; 
 }
-static void
+static void gen_confirm(struct nfs4_client *clp)
-gen_confirm(struct nfs4_client *clp) {
+{
-        struct timespec         tv;
+        static u32 i;
-        u32 *                   p;
+        u32 *p;
-        tv = CURRENT_TIME;
        p = (u32 *)clp->cl_confirm.data;
-        *p++ = tv.tv_sec;
+        *p++ = get_seconds();
-        *p++ = tv.tv_nsec;
+        *p++ = i++;
 }
-static int
+static int check_name(struct xdr_netobj name)
-check_name(struct xdr_netobj name) {
+{
        if (name.len == 0) 
                return 0;
        if (name.len > NFS4_OPAQUE_LIMIT) {
@@ -683,39 +683,6 @@ out_err:
        return;
 }
-/*
- * RFC 3010 has a complex implmentation description of processing a 
- * SETCLIENTID request consisting of 5 bullets, labeled as 
- * CASE0 - CASE4 below.
- *
- * NOTES:
- *      callback information will be processed in a future patch
- *
- *      an unconfirmed record is added when:
- *      NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record.
- *      CASE 1: confirmed record found with matching name, principal,
- *              verifier, and clientid.
- *      CASE 2: confirmed record found with matching name, principal,
- *              and there is no unconfirmed record with matching
- *              name and principal
- *
- *      an unconfirmed record is replaced when:
- *      CASE 3: confirmed record found with matching name, principal,
- *              and an unconfirmed record is found with matching 
- *              name, principal, and with clientid and
- *              confirm that does not match the confirmed record.
- *      CASE 4: there is no confirmed record with matching name and 
- *              principal. there is an unconfirmed record with 
- *              matching name, principal.
- *
- *      an unconfirmed record is deleted when:
- *      CASE 1: an unconfirmed record that matches input name, verifier,
- *              and confirmed clientid.
- *      CASE 4: any unconfirmed records with matching name and principal
- *              that exist after an unconfirmed record has been replaced
- *              as described above.
- *
- */
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                  struct nfsd4_setclientid *setclid)
@@ -748,11 +715,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        nfs4_lock_state();
        conf = find_confirmed_client_by_str(dname, strhashval);
        if (conf) {
-                /* 
+                /* RFC 3530 14.2.33 CASE 0: */
-                 * CASE 0:
-                 * clname match, confirmed, different principal
-                 * or different ip_address
-                 */
                status = nfserr_clid_inuse;
                if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
                                || conf->cl_addr != sin->sin_addr.s_addr) {
@@ -761,12 +724,17 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                }
        }
+        /*
+         * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
+         * has a description of SETCLIENTID request processing consisting
+         * of 5 bullet points, labeled as CASE0 - CASE4 below.
+         */
        unconf = find_unconfirmed_client_by_str(dname, strhashval);
        status = nfserr_resource;
        if (!conf) {
-                /* 
+                /*
-                 * CASE 4:
+                 * RFC 3530 14.2.33 CASE 4:
-                 * placed first, because it is the normal case.
+                 * placed first, because it is the normal case
                 */
                if (unconf)
                        expire_client(unconf);
@@ -776,17 +744,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                gen_clid(new);
        } else if (same_verf(&conf->cl_verifier, &clverifier)) {
                /*
-                 * CASE 1:
+                 * RFC 3530 14.2.33 CASE 1:
-                 * cl_name match, confirmed, principal match
+                 * probable callback update
-                 * verifier match: probable callback update
-                 *
-                 * remove any unconfirmed nfs4_client with 
-                 * matching cl_name, cl_verifier, and cl_clientid
-                 *
-                 * create and insert an unconfirmed nfs4_client with same 
-                 * cl_name, cl_verifier, and cl_clientid as existing 
-                 * nfs4_client,  but with the new callback info and a 
-                 * new cl_confirm
                 */
                if (unconf) {
                        /* Note this is removing unconfirmed {*x***},
@@ -802,43 +761,25 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                copy_clid(new, conf);
        } else if (!unconf) {
                /*
-                 * CASE 2:
+                 * RFC 3530 14.2.33 CASE 2:
-                 * clname match, confirmed, principal match
+                 * probable client reboot; state will be removed if
-                 * verfier does not match
+                 * confirmed.
-                 * no unconfirmed. create a new unconfirmed nfs4_client
-                 * using input clverifier, clname, and callback info
-                 * and generate a new cl_clientid and cl_confirm.
                 */
                new = create_client(clname, dname);
                if (new == NULL)
                        goto out;
                gen_clid(new);
-        } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
+        } else {
-                /*      
+                /*
-                 * CASE3:
+                 * RFC 3530 14.2.33 CASE 3:
-                 * confirmed found (name, principal match)
+                 * probable client reboot; state will be removed if
-                 * confirmed verifier does not match input clverifier
+                 * confirmed.
-                 *
-                 * unconfirmed found (name match)
-                 * confirmed->cl_confirm != unconfirmed->cl_confirm
-                 *
-                 * remove unconfirmed.
-                 *
-                 * create an unconfirmed nfs4_client 
-                 * with same cl_name as existing confirmed nfs4_client, 
-                 * but with new callback info, new cl_clientid,
-                 * new cl_verifier and a new cl_confirm
                 */
                expire_client(unconf);
                new = create_client(clname, dname);
                if (new == NULL)
                        goto out;
                gen_clid(new);
-        } else {
-                /* No cases hit !!! */
-                status = nfserr_inval;
-                goto out;
        }
        copy_verf(new, &clverifier);
        new->cl_addr = sin->sin_addr.s_addr;
@@ -857,11 +798,9 @@ out:
 /*
- * RFC 3010 has a complex implmentation description of processing a 
+ * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
- * SETCLIENTID_CONFIRM request consisting of 4 bullets describing
+ * a description of SETCLIENTID_CONFIRM request processing consisting of 4
- * processing on a DRC miss, labeled as CASE1 - CASE4 below.
+ * bullets, labeled as CASE1 - CASE4 below.
- *
- * NOTE: callback information will be processed here in a future patch
 */
 __be32
 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
@@ -892,16 +831,16 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
        if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
                goto out;
-        if ((conf && unconf) && 
+        /*
-            (same_verf(&unconf->cl_confirm, &confirm)) &&
+         * section 14.2.34 of RFC 3530 has a description of
-            (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
+         * SETCLIENTID_CONFIRM request processing consisting
-            (same_name(conf->cl_recdir,unconf->cl_recdir))  &&
+         * of 4 bullet points, labeled as CASE1 - CASE4 below.
-            (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
+         */
-                /* CASE 1:
+        if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
-                * unconf record that matches input clientid and input confirm.
+                /*
-                * conf record that matches input clientid.
+                 * RFC 3530 14.2.34 CASE 1:
-                * conf and unconf records match names, verifiers
+                 * callback update
-                */
+                 */
                if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
                        status = nfserr_clid_inuse;
                else {
@@ -914,15 +853,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        status = nfs_ok;
                }
-        } else if ((conf && !unconf) ||
+        } else if (conf && !unconf) {
-            ((conf && unconf) && 
+                /*
-             (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) ||
+                 * RFC 3530 14.2.34 CASE 2:
-              !same_name(conf->cl_recdir, unconf->cl_recdir)))) {
+                 * probable retransmitted request; play it safe and
-                /* CASE 2:
+                 * do nothing.
-                 * conf record that matches input clientid.
-                 * if unconf record matches input clientid, then
-                 * unconf->cl_name or unconf->cl_verifier don't match the
-                 * conf record.
                 */
                if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
                        status = nfserr_clid_inuse;
@@ -930,10 +865,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        status = nfs_ok;
        } else if (!conf && unconf
                        && same_verf(&unconf->cl_confirm, &confirm)) {
-                /* CASE 3:
+                /*
-                 * conf record not found.
+                 * RFC 3530 14.2.34 CASE 3:
-                 * unconf record found.
+                 * Normal case; new or rebooted client:
-                 * unconf->cl_confirm matches input confirm
                 */
                if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
                        status = nfserr_clid_inuse;
@@ -948,16 +882,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        }
                        move_to_confirmed(unconf);
                        conf = unconf;
+                        nfsd4_probe_callback(conf);
                        status = nfs_ok;
                }
        } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
            && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
                                                                &confirm)))) {
-                /* CASE 4:
+                /*
-                 * conf record not found, or if conf, conf->cl_confirm does not
+                 * RFC 3530 14.2.34 CASE 4:
-                 * match input confirm.
+                 * Client probably hasn't noticed that we rebooted yet.
-                 * unconf record not found, or if unconf, unconf->cl_confirm
-                 * does not match input confirm.
                 */
                status = nfserr_stale_clientid;
        } else {
@@ -965,8 +898,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                status = nfserr_clid_inuse;
        }
 out:
-        if (!status)
-                nfsd4_probe_callback(conf);
        nfs4_unlock_state();
        return status;
 }
@@ -1226,14 +1157,19 @@ find_file(struct inode *ino)
        return NULL;
 }
-static int access_valid(u32 x)
+static inline int access_valid(u32 x)
 {
-        return (x > 0 && x < 4);
+        if (x < NFS4_SHARE_ACCESS_READ)
+                return 0;
+        if (x > NFS4_SHARE_ACCESS_BOTH)
+                return 0;
+        return 1;
 }
-static int deny_valid(u32 x)
+static inline int deny_valid(u32 x)
 {
-        return (x >= 0 && x < 5);
+        /* Note: unlike access bits, deny bits may be zero. */
+        return x <= NFS4_SHARE_DENY_BOTH;
 }
 static void
@@ -2162,8 +2098,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
                goto check_replay;
        }
+        *stpp = stp;
+        *sopp = sop = stp->st_stateowner;
        if (lock) {
-                struct nfs4_stateowner *sop = stp->st_stateowner;
                clientid_t *lockclid = &lock->v.new.clientid;
                struct nfs4_client *clp = sop->so_client;
                int lkflg = 0;
@@ -2193,9 +2131,6 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
                return nfserr_bad_stateid;
        }
-        *stpp = stp;
-        *sopp = sop = stp->st_stateowner;
        /*
        *  We now validate the seqid and stateid generation numbers.
        *  For the moment, we ignore the possibility of 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 57333944af7f..b0592e7c378d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -148,12 +148,12 @@ xdr_error:					\
        }                                       \
 } while (0)
-static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 {
        /* We want more bytes than seem to be available.
         * Maybe we need a new page, maybe we have just run out
         */
-        int avail = (char*)argp->end - (char*)argp->p;
+        unsigned int avail = (char *)argp->end - (char *)argp->p;
        __be32 *p;
        if (avail + argp->pagelen < nbytes)
                return NULL;
@@ -169,6 +169,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
                        return NULL;
                
        }
+        /*
+         * The following memcpy is safe because read_buf is always
+         * called with nbytes > avail, and the two cases above both
+         * guarantee p points to at least nbytes bytes.
+         */
        memcpy(p, argp->p, avail);
        /* step to next page */
        argp->p = page_address(argp->pagelist[0]);
@@ -1448,7 +1453,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
 __be32
 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
-                struct svc_rqst *rqstp)
+                struct svc_rqst *rqstp, int ignore_crossmnt)
 {
        u32 bmval0 = bmval[0];
        u32 bmval1 = bmval[1];
@@ -1828,7 +1833,12 @@ out_acl:
        if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
                if ((buflen -= 8) < 0)
                        goto out_resource;
-                if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
+                /*
+                 * Get parent's attributes if not ignoring crossmount
+                 * and this is the root of a cross-mounted filesystem.
+                 */
+                if (ignore_crossmnt == 0 &&
+                    exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
                        err = vfs_getattr(exp->ex_mnt->mnt_parent,
                                exp->ex_mnt->mnt_mountpoint, &stat);
                        if (err)
@@ -1864,13 +1874,25 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
        struct svc_export *exp = cd->rd_fhp->fh_export;
        struct dentry *dentry;
        __be32 nfserr;
+        int ignore_crossmnt = 0;
        dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
        if (IS_ERR(dentry))
                return nfserrno(PTR_ERR(dentry));
        exp_get(exp);
-        if (d_mountpoint(dentry)) {
+        /*
+         * In the case of a mountpoint, the client may be asking for
+         * attributes that are only properties of the underlying filesystem
+         * as opposed to the cross-mounted file system. In such a case,
+         * we will not follow the cross mount and will fill the attributes
+         * directly from the mountpoint dentry.
+         */
+        if (d_mountpoint(dentry) &&
+            (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 &&
+            (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0)
+                ignore_crossmnt = 1;
+        else if (d_mountpoint(dentry)) {
                int err;
                /*
@@ -1889,7 +1911,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
        }
        nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
-                                        cd->rd_rqstp);
+                                        cd->rd_rqstp, ignore_crossmnt);
 out_put:
        dput(dentry);
        exp_put(exp);
@@ -2043,7 +2065,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
        buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
        nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
                                    resp->p, &buflen, getattr->ga_bmval,
-                                    resp->rqstp);
+                                    resp->rqstp, 0);
        if (!nfserr)
                resp->p += buflen;
        return nfserr;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 578f2c9d56be..5bfc2ac60d54 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -44,17 +44,17 @@ static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 */
 static DEFINE_SPINLOCK(cache_lock);
-void
+int nfsd_reply_cache_init(void)
-nfsd_cache_init(void)
 {
        struct svc_cacherep     *rp;
        int                     i;
        INIT_LIST_HEAD(&lru_head);
        i = CACHESIZE;
-        while(i) {
+        while (i) {
                rp = kmalloc(sizeof(*rp), GFP_KERNEL);
-                if (!rp) break;
+                if (!rp)
+                        goto out_nomem;
                list_add(&rp->c_lru, &lru_head);
                rp->c_state = RC_UNUSED;
                rp->c_type = RC_NOCACHE;
@@ -62,23 +62,19 @@ nfsd_cache_init(void)
                i--;
        }
-        if (i)
-                printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n",
-                        CACHESIZE, CACHESIZE-i);
        hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
-        if (!hash_list) {
+        if (!hash_list)
-                nfsd_cache_shutdown();
+                goto out_nomem;
-                printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n",
-                        HASHSIZE * sizeof(struct hlist_head));
-                return;
-        }
        cache_disabled = 0;
+        return 0;
+out_nomem:
+        printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
+        nfsd_reply_cache_shutdown();
+        return -ENOMEM;
 }
-void
+void nfsd_reply_cache_shutdown(void)
-nfsd_cache_shutdown(void)
 {
        struct svc_cacherep     *rp;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77dc9893b7ba..8516137cdbb0 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -304,6 +304,9 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
        struct auth_domain *dom;
        struct knfsd_fh fh;
+        if (size == 0)
+                return -EINVAL;
        if (buf[size-1] != '\n')
                return -EINVAL;
        buf[size-1] = 0;
@@ -503,7 +506,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                int len = 0;
                lock_kernel();
                if (nfsd_serv)
-                        len = svc_sock_names(buf, nfsd_serv, NULL);
+                        len = svc_xprt_names(nfsd_serv, buf, 0);
                unlock_kernel();
                return len;
        }
@@ -540,7 +543,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                }
                return err < 0 ? err : 0;
        }
-        if (buf[0] == '-') {
+        if (buf[0] == '-' && isdigit(buf[1])) {
                char *toclose = kstrdup(buf+1, GFP_KERNEL);
                int len = 0;
                if (!toclose)
@@ -554,6 +557,53 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                kfree(toclose);
                return len;
        }
+        /*
+         * Add a transport listener by writing its transport name
+         */
+        if (isalpha(buf[0])) {
+                int err;
+                char transport[16];
+                int port;
+                if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+                        err = nfsd_create_serv();
+                        if (!err) {
+                                err = svc_create_xprt(nfsd_serv,
+                                                      transport, port,
+                                                      SVC_SOCK_ANONYMOUS);
+                                if (err == -ENOENT)
+                                        /* Give a reasonable perror msg for
+                                         * bad transport string */
+                                        err = -EPROTONOSUPPORT;
+                        }
+                        return err < 0 ? err : 0;
+                }
+        }
+        /*
+         * Remove a transport by writing its transport name and port number
+         */
+        if (buf[0] == '-' && isalpha(buf[1])) {
+                struct svc_xprt *xprt;
+                int err = -EINVAL;
+                char transport[16];
+                int port;
+                if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
+                        if (port == 0)
+                                return -EINVAL;
+                        lock_kernel();
+                        if (nfsd_serv) {
+                                xprt = svc_find_xprt(nfsd_serv, transport,
+                                                     AF_UNSPEC, port);
+                                if (xprt) {
+                                        svc_close_xprt(xprt);
+                                        svc_xprt_put(xprt);
+                                        err = 0;
+                                } else
+                                        err = -ENOTCONN;
+                        }
+                        unlock_kernel();
+                        return err < 0 ? err : 0;
+                }
+        }
        return -EINVAL;
 }
@@ -616,7 +666,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
        char *recdir;
        int len, status;
-        if (size > PATH_MAX || buf[size-1] != '\n')
+        if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
                return -EINVAL;
        buf[size-1] = 0;
@@ -674,6 +724,27 @@ static struct file_system_type nfsd_fs_type = {
        .kill_sb        = kill_litter_super,
 };
+#ifdef CONFIG_PROC_FS
+static int create_proc_exports_entry(void)
+{
+        struct proc_dir_entry *entry;
+        entry = proc_mkdir("fs/nfs", NULL);
+        if (!entry)
+                return -ENOMEM;
+        entry = create_proc_entry("fs/nfs/exports", 0, NULL);
+        if (!entry)
+                return -ENOMEM;
+        entry->proc_fops =  &exports_operations;
+        return 0;
+}
+#else /* CONFIG_PROC_FS */
+static int create_proc_exports_entry(void)
+{
+        return 0;
+}
+#endif
 static int __init init_nfsd(void)
 {
        int retval;
@@ -683,32 +754,43 @@ static int __init init_nfsd(void)
        if (retval)
                return retval;
        nfsd_stat_init();       /* Statistics */
-        nfsd_cache_init();      /* RPC reply cache */
+        retval = nfsd_reply_cache_init();
-        nfsd_export_init();     /* Exports table */
+        if (retval)
+                goto out_free_stat;
+        retval = nfsd_export_init();
+        if (retval)
+                goto out_free_cache;
        nfsd_lockd_init();      /* lockd->nfsd callbacks */
-        nfsd_idmap_init();      /* Name to ID mapping */
+        retval = nfsd_idmap_init();
-        if (proc_mkdir("fs/nfs", NULL)) {
+        if (retval)
-                struct proc_dir_entry *entry;
+                goto out_free_lockd;
-                entry = create_proc_entry("fs/nfs/exports", 0, NULL);
+        retval = create_proc_exports_entry();
-                if (entry)
+        if (retval)
-                        entry->proc_fops =  &exports_operations;
+                goto out_free_idmap;
-        }
        retval = register_filesystem(&nfsd_fs_type);
-        if (retval) {
+        if (retval)
-                nfsd_export_shutdown();
+                goto out_free_all;
-                nfsd_cache_shutdown();
+        return 0;
-                remove_proc_entry("fs/nfs/exports", NULL);
+out_free_all:
-                remove_proc_entry("fs/nfs", NULL);
+        remove_proc_entry("fs/nfs/exports", NULL);
-                nfsd_stat_shutdown();
+        remove_proc_entry("fs/nfs", NULL);
-                nfsd_lockd_shutdown();
+out_free_idmap:
-        }
+        nfsd_idmap_shutdown();
+out_free_lockd:
+        nfsd_lockd_shutdown();
+        nfsd_export_shutdown();
+out_free_cache:
+        nfsd_reply_cache_shutdown();
+out_free_stat:
+        nfsd_stat_shutdown();
+        nfsd4_free_slabs();
        return retval;
 }
 static void __exit exit_nfsd(void)
 {
        nfsd_export_shutdown();
-        nfsd_cache_shutdown();
+        nfsd_reply_cache_shutdown();
        remove_proc_entry("fs/nfs/exports", NULL);
        remove_proc_entry("fs/nfs", NULL);
        nfsd_stat_shutdown();
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 4f712e970584..8fbd2dc08a92 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -22,6 +22,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/nfsd/nfsd.h>
+#include "auth.h"
 #define NFSDDBG_FACILITY                NFSDDBG_FH
@@ -95,6 +96,22 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
        return 0;
 }
+static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
+                                          struct svc_export *exp)
+{
+        /* Check if the request originated from a secure port. */
+        if (!rqstp->rq_secure && EX_SECURE(exp)) {
+                char buf[RPC_MAX_ADDRBUFLEN];
+                dprintk(KERN_WARNING
+                       "nfsd: request from insecure port %s!\n",
+                       svc_print_addr(rqstp, buf, sizeof(buf)));
+                return nfserr_perm;
+        }
+        /* Set user creds for this exportpoint */
+        return nfserrno(nfsd_setuser(rqstp, exp));
+}
 /*
 * Perform sanity checks on the dentry in a client's file handle.
 *
@@ -167,18 +184,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
                        goto out;
                }
-                /* Check if the request originated from a secure port. */
+                error = nfsd_setuser_and_check_port(rqstp, exp);
-                error = nfserr_perm;
-                if (!rqstp->rq_secure && EX_SECURE(exp)) {
-                        char buf[RPC_MAX_ADDRBUFLEN];
-                        printk(KERN_WARNING
-                               "nfsd: request from insecure port %s!\n",
-                               svc_print_addr(rqstp, buf, sizeof(buf)));
-                        goto out;
-                }
-                /* Set user creds for this exportpoint */
-                error = nfserrno(nfsd_setuser(rqstp, exp));
                if (error)
                        goto out;
@@ -227,18 +233,22 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
                fhp->fh_export = exp;
                nfsd_nr_verified++;
        } else {
-                /* just rechecking permissions
+                /*
-                 * (e.g. nfsproc_create calls fh_verify, then nfsd_create does as well)
+                 * just rechecking permissions
+                 * (e.g. nfsproc_create calls fh_verify, then nfsd_create
+                 * does as well)
                 */
                dprintk("nfsd: fh_verify - just checking\n");
                dentry = fhp->fh_dentry;
                exp = fhp->fh_export;
-                /* Set user creds for this exportpoint; necessary even
+                /*
+                 * Set user creds for this exportpoint; necessary even
                 * in the "just checking" case because this may be a
                 * filehandle that was created by fh_compose, and that
                 * is about to be used in another nfsv4 compound
-                 * operation */
+                 * operation.
-                error = nfserrno(nfsd_setuser(rqstp, exp));
+                 */
+                error = nfsd_setuser_and_check_port(rqstp, exp);
                if (error)
                        goto out;
        }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1190aeaa92be..9647b0f7bc0c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -155,8 +155,8 @@ static int killsig;	/* signal that was used to kill last nfsd */
 static void nfsd_last_thread(struct svc_serv *serv)
 {
        /* When last nfsd thread exits we need to do some clean-up */
-        struct svc_sock *svsk;
+        struct svc_xprt *xprt;
-        list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
+        list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
                lockd_down();
        nfsd_serv = NULL;
        nfsd_racache_shutdown();
@@ -236,7 +236,7 @@ static int nfsd_init_socks(int port)
        error = lockd_up(IPPROTO_UDP);
        if (error >= 0) {
-                error = svc_makesock(nfsd_serv, IPPROTO_UDP, port,
+                error = svc_create_xprt(nfsd_serv, "udp", port,
                                        SVC_SOCK_DEFAULTS);
                if (error < 0)
                        lockd_down();
@@ -247,7 +247,7 @@ static int nfsd_init_socks(int port)
 #ifdef CONFIG_NFSD_TCP
        error = lockd_up(IPPROTO_TCP);
        if (error >= 0) {
-                error = svc_makesock(nfsd_serv, IPPROTO_TCP, port,
+                error = svc_create_xprt(nfsd_serv, "tcp", port,
                                        SVC_SOCK_DEFAULTS);
                if (error < 0)
                        lockd_down();
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 986f9b32083c..61ad61743d94 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -15,6 +15,7 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/xdr.h>
 #include <linux/mm.h>
+#include "auth.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
@@ -62,10 +63,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
 * no slashes or null bytes.
 */
 static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 {
        char            *name;
-        int             i;
+        unsigned int    i;
        if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
                for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -78,10 +79,10 @@ decode_filename(__be32 *p, char **namp, int *lenp)
 }
 static __be32 *
-decode_pathname(__be32 *p, char **namp, int *lenp)
+decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
 {
        char            *name;
-        int             i;
+        unsigned int    i;
        if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
                for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -313,8 +314,11 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
         * Round the length of the data which was specified up to
         * the next multiple of XDR units and then compare that
         * against the length which was actually received.
+         * Note that when RPCSEC/GSS (for example) is used, the
+         * data buffer can be padded so dlen might be larger
+         * than required.  It must never be smaller.
         */
-        if (dlen != XDR_QUADLEN(len)*4)
+        if (dlen < XDR_QUADLEN(len)*4)
                return 0;
        rqstp->rq_vec[0].iov_base = (void*)p;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d0199189924c..cc75e4fcd02b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -132,7 +132,7 @@ out:
 __be32
 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
-                   const char *name, int len,
+                   const char *name, unsigned int len,
                   struct svc_export **exp_ret, struct dentry **dentry_ret)
 {
        struct svc_export       *exp;
@@ -226,7 +226,7 @@ out_nfserr:
 */
 __be32
 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
-                                        int len, struct svc_fh *resfh)
+                                unsigned int len, struct svc_fh *resfh)
 {
        struct svc_export       *exp;
        struct dentry           *dentry;
@@ -1151,6 +1151,26 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 }
 #endif /* CONFIG_NFSD_V3 */
+__be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+                        struct iattr *iap)
+{
+        /*
+         * Mode has already been set earlier in create:
+         */
+        iap->ia_valid &= ~ATTR_MODE;
+        /*
+         * Setting uid/gid works only for root.  Irix appears to
+         * send along the gid on create when it tries to implement
+         * setgid directories via NFS:
+         */
+        if (current->fsuid != 0)
+                iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+        if (iap->ia_valid)
+                return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+        return 0;
+}
 /*
 * Create a file (regular, directory, device, fifo); UNIX sockets 
 * not yet implemented.
@@ -1167,6 +1187,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        struct dentry   *dentry, *dchild = NULL;
        struct inode    *dirp;
        __be32          err;
+        __be32          err2;
        int             host_err;
        err = nfserr_perm;
@@ -1257,16 +1278,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        }
-        /* Set file attributes. Mode has already been set and
+        err2 = nfsd_create_setattr(rqstp, resfhp, iap);
-         * setting uid/gid works only for root. Irix appears to
+        if (err2)
-         * send along the gid when it tries to implement setgid
+                err = err2;
-         * directories via NFS.
-         */
-        if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
-                __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
-                if (err2)
-                        err = err2;
-        }
        /*
         * Update the file handle to get the new inode info.
         */
@@ -1295,6 +1309,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        struct dentry   *dentry, *dchild = NULL;
        struct inode    *dirp;
        __be32          err;
+        __be32          err2;
        int             host_err;
        __u32           v_mtime=0, v_atime=0;
@@ -1399,16 +1414,10 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                iap->ia_atime.tv_nsec = 0;
        }
-        /* Set file attributes.
-         * Irix appears to send along the gid when it tries to
-         * implement setgid directories via NFS. Clear out all that cruft.
-         */
 set_attr:
-        if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
+        err2 = nfsd_create_setattr(rqstp, resfhp, iap);
-                __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+        if (err2)
-                if (err2)
+                err = err2;
-                        err = err2;
-        }
        /*
         * Update the filehandle to get the new inode info.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfdc7900d271..ad87cb01299b 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -405,6 +405,15 @@ static int ntfs_readpage(struct file *file, struct page *page)
 retry_readpage:
        BUG_ON(!PageLocked(page));
+        vi = page->mapping->host;
+        i_size = i_size_read(vi);
+        /* Is the page fully outside i_size? (truncate in progress) */
+        if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
+                        PAGE_CACHE_SHIFT)) {
+                zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+                ntfs_debug("Read outside i_size - truncated?");
+                goto done;
+        }
        /*
         * This can potentially happen because we clear PageUptodate() during
         * ntfs_writepage() of MstProtected() attributes.
@@ -413,7 +422,6 @@ retry_readpage:
                unlock_page(page);
                return 0;
        }
-        vi = page->mapping->host;
        ni = NTFS_I(vi);
        /*
         * Only $DATA attributes can be encrypted and only unnamed $DATA
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 92dabdcf2b80..50d3b0c258e3 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -179,10 +179,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
         * ntfs_mapping_pairs_decompress() fails.
         */
        end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-        if (!a->data.non_resident.lowest_vcn && end_vcn == 1)
+        if (unlikely(vcn && vcn >= end_vcn)) {
-                end_vcn = sle64_to_cpu(a->data.non_resident.allocated_size) >>
-                                ni->vol->cluster_size_bits;
-        if (unlikely(vcn >= end_vcn)) {
                err = -ENOENT;
                goto err_out;
        }
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index d98daf59e0b6..d1619d05eb23 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -561,6 +561,16 @@ int ntfs_read_compressed_block(struct page *page)
        read_unlock_irqrestore(&ni->size_lock, flags);
        max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
                        offset;
+        /* Is the page fully outside i_size? (truncate in progress) */
+        if (xpage >= max_page) {
+                kfree(bhs);
+                kfree(pages);
+                zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+                ntfs_debug("Compressed read outside i_size - truncated?");
+                SetPageUptodate(page);
+                unlock_page(page);
+                return 0;
+        }
        if (nr_pages < max_page)
                max_page = nr_pages;
        for (i = 0; i < max_page; i++, offset++) {
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b0..4d4ce48bb42c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
        ioctl.o                 \
        journal.o               \
        localalloc.o            \
+        locks.o                 \
        mmap.o                  \
        namei.o                 \
+        resize.o                \
        slot_map.o              \
        suballoc.o              \
        super.o                 \
        symlink.o               \
        sysfile.o               \
        uptodate.o              \
-        ver.o                   \
+        ver.o
-        vote.o
 obj-$(CONFIG_OCFS2_FS) += cluster/
 obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4ba7f0bdc248..e6df06ac6405 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2389,6 +2389,18 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                        goto out;
                }
+                /*
+                 * Caller might still want to make changes to the
+                 * tree root, so re-add it to the journal here.
+                 */
+                ret = ocfs2_journal_access(handle, inode,
+                                           path_root_bh(left_path),
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
                ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
                                                right_path, subtree_root,
                                                dealloc, &deleted);
@@ -3289,16 +3301,6 @@ static int ocfs2_insert_path(struct inode *inode,
        int ret, subtree_index;
        struct buffer_head *leaf_bh = path_leaf_bh(right_path);
-        /*
-         * Pass both paths to the journal. The majority of inserts
-         * will be touching all components anyway.
-         */
-        ret = ocfs2_journal_access_path(inode, handle, right_path);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out;
-        }
        if (left_path) {
                int credits = handle->h_buffer_credits;
@@ -3323,6 +3325,16 @@ static int ocfs2_insert_path(struct inode *inode,
                }
        }
+        /*
+         * Pass both paths to the journal. The majority of inserts
+         * will be touching all components anyway.
+         */
+        ret = ocfs2_journal_access_path(inode, handle, right_path);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
        if (insert->ins_split != SPLIT_NONE) {
                /*
                 * We could call ocfs2_insert_at_leaf() for some types
@@ -3331,6 +3343,17 @@ static int ocfs2_insert_path(struct inode *inode,
                 */
                ocfs2_split_record(inode, left_path, right_path,
                                   insert_rec, insert->ins_split);
+                /*
+                 * Split might have modified either leaf and we don't
+                 * have a guarantee that the later edge insert will
+                 * dirty this for us.
+                 */
+                if (left_path)
+                        ret = ocfs2_journal_dirty(handle,
+                                                  path_leaf_bh(left_path));
+                        if (ret)
+                                mlog_errno(ret);
        } else
                ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
                                     insert, inode);
@@ -3430,6 +3453,17 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                        mlog_errno(ret);
                        goto out;
                }
+                /*
+                 * ocfs2_rotate_tree_right() might have extended the
+                 * transaction without re-journaling our tree root.
+                 */
+                ret = ocfs2_journal_access(handle, inode, di_bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
        } else if (type->ins_appending == APPEND_TAIL
                   && type->ins_contig != CONTIG_LEFT) {
                ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
@@ -3941,12 +3975,12 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 {
        int ret = 0;
        struct ocfs2_extent_list *el = path_leaf_el(path);
-        struct buffer_head *eb_bh, *last_eb_bh = NULL;
+        struct buffer_head *last_eb_bh = NULL;
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
        struct ocfs2_merge_ctxt ctxt;
        struct ocfs2_extent_list *rightmost_el;
-        if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
+        if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
                ret = -EIO;
                mlog_errno(ret);
                goto out;
@@ -3960,14 +3994,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
                goto out;
        }
-        eb_bh = path_leaf_bh(path);
-        ret = ocfs2_journal_access(handle, inode, eb_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
        ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
                                                            split_index,
                                                            split_rec);
@@ -4029,8 +4055,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
                        mlog_errno(ret);
        }
-        ocfs2_journal_dirty(handle, eb_bh);
 out:
        brelse(last_eb_bh);
        return ret;
@@ -4707,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
        mutex_lock(&data_alloc_inode->i_mutex);
-        status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
+        status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto out_mutex;
@@ -4729,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 out_unlock:
        brelse(data_alloc_bh);
-        ocfs2_meta_unlock(data_alloc_inode, 1);
+        ocfs2_inode_unlock(data_alloc_inode, 1);
 out_mutex:
        mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5053,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
        mutex_lock(&inode->i_mutex);
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret) {
                mlog_errno(ret);
                goto out_mutex;
@@ -5094,7 +5118,7 @@ out_journal:
        ocfs2_commit_trans(osb, handle);
 out_unlock:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
 out_mutex:
        mutex_unlock(&inode->i_mutex);
@@ -6093,8 +6117,6 @@ start:
        mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
             clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-        BUG_ON(clusters_to_del == 0);
        mutex_lock(&tl_inode->i_mutex);
        tl_sem = 1;
        /* ocfs2_truncate_log_needs_flush guarantees us at least one
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c69c1b300155..bc7b4cbbe8ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
 #include <asm/byteorder.h>
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/mpage.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 {
        int err = 0;
        unsigned int ext_flags;
-        u64 p_blkno, past_eof;
+        u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
+        u64 p_blkno, count, past_eof;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+        err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
                                          &ext_flags);
        if (err) {
                mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
+        if (max_blocks < count)
+                count = max_blocks;
        /*
         * ocfs2 never allocates in this function - the only time we
         * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
        if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
                map_bh(bh_result, inode->i_sb, p_blkno);
+        bh_result->b_size = count << inode->i_blkbits;
        if (!ocfs2_sparse_alloc(osb)) {
                if (p_blkno == 0) {
                        err = -EIO;
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
                           struct buffer_head *di_bh)
 {
        void *kaddr;
-        unsigned int size;
+        loff_t size;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
        if (size > PAGE_CACHE_SIZE ||
            size > ocfs2_max_inline_data(inode->i_sb)) {
                ocfs2_error(inode->i_sb,
-                            "Inode %llu has with inline data has bad size: %u",
+                            "Inode %llu has with inline data has bad size: %Lu",
-                            (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
+                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                            (unsigned long long)size);
                return -EROFS;
        }
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
        mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
-        ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
+        ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
        if (ret != 0) {
                if (ret == AOP_TRUNCATED_PAGE)
                        unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
        if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
                ret = AOP_TRUNCATED_PAGE;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        /*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
                goto out_alloc;
        }
-        ret = ocfs2_data_lock_with_page(inode, 0, page);
-        if (ret != 0) {
-                if (ret == AOP_TRUNCATED_PAGE)
-                        unlock = 0;
-                mlog_errno(ret);
-                goto out_alloc;
-        }
        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                ret = ocfs2_readpage_inline(inode, page);
        else
                ret = block_read_full_page(page, ocfs2_get_block);
        unlock = 0;
-        ocfs2_data_unlock(inode, 0);
 out_alloc:
        up_read(&OCFS2_I(inode)->ip_alloc_sem);
-out_meta_unlock:
+out_inode_unlock:
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
 out:
        if (unlock)
                unlock_page(page);
@@ -331,6 +330,62 @@ out:
        return ret;
 }
+/*
+ * This is used only for read-ahead. Failures or difficult to handle
+ * situations are safe to ignore.
+ *
+ * Right now, we don't bother with BH_Boundary - in-inode extent lists
+ * are quite large (243 extents on 4k blocks), so most inodes don't
+ * grow out to a tree. If need be, detecting boundary extents could
+ * trivially be added in a future version of ocfs2_get_block().
+ *
+ * Returns -EIO without issuing any read when ocfs2_inode_lock_full()
+ * (called with OCFS2_LOCK_NONBLOCK) fails, when ip_alloc_sem cannot be
+ * taken by down_read_trylock(), when the inode has OCFS2_INLINE_DATA_FL
+ * set, or when the checked page starts at or beyond i_size.  Otherwise
+ * returns the result of mpage_readpages().
+ */
+static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
+                           struct list_head *pages, unsigned nr_pages)
+{
+        int ret, err = -EIO;
+        struct inode *inode = mapping->host;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
+        loff_t start;
+        struct page *last;
+        /*
+         * Use the nonblocking flag for the dlm code to avoid page
+         * lock inversion, but don't bother with retrying.
+         */
+        ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
+        if (ret)
+                return err;     /* err is still -EIO; ret itself is not propagated */
+        if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+                ocfs2_inode_unlock(inode, 0);
+                return err;
+        }
+        /*
+         * Don't bother with inline-data. There isn't anything
+         * to read-ahead in that case anyway...
+         */
+        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                goto out_unlock;
+        /*
+         * Check whether a remote node truncated this file - we just
+         * drop out in that case as it's not worth handling here.
+         *
+         * Only the page at the tail of the list (pages->prev) is
+         * compared against i_size.
+         * NOTE(review): presumably that is the lowest-index page of the
+         * batch - confirm against how the read-ahead caller orders the
+         * list.
+         */
+        last = list_entry(pages->prev, struct page, lru);
+        start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+        if (start >= i_size_read(inode))
+                goto out_unlock;
+        err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
+out_unlock:
+        up_read(&oi->ip_alloc_sem);
+        ocfs2_inode_unlock(inode, 0);
+        return err;
+}
 /* Note: Because we don't support holes, our allocation has
 * already happened (allocation writes zeros to the file data)
 * so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
         * accessed concurrently from multiple nodes.
         */
        if (!INODE_JOURNAL(inode)) {
-                err = ocfs2_meta_lock(inode, NULL, 0);
+                err = ocfs2_inode_lock(inode, NULL, 0);
                if (err) {
                        if (err != -ENOENT)
                                mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
        if (!INODE_JOURNAL(inode)) {
                up_read(&OCFS2_I(inode)->ip_alloc_sem);
-                ocfs2_meta_unlock(inode, 0);
+                ocfs2_inode_unlock(inode, 0);
        }
        if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;
-        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-                /*
-                 * We get PR data locks even for O_DIRECT.  This
-                 * allows concurrent O_DIRECT I/O but doesn't let
-                 * O_DIRECT with extending and buffered zeroing writes
-                 * race.  If they did race then the buffered zeroing
-                 * could be written back after the O_DIRECT I/O.  It's
-                 * one thing to tell people not to mix buffered and
-                 * O_DIRECT writes, but expecting them to understand
-                 * that file extension is also an implicit buffered
-                 * write is too much.  By getting the PR we force
-                 * writeback of the buffered zeroing before
-                 * proceeding.
-                 */
-                ret = ocfs2_data_lock(inode, 0);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-                ocfs2_data_unlock(inode, 0);
-        }
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                            inode->i_sb->s_bdev, iov, offset,
                                            nr_segs, 
                                            ocfs2_direct_IO_get_blocks,
                                            ocfs2_dio_end_io);
-out:
        mlog_exit(ret);
        return ret;
 }
@@ -729,6 +762,27 @@ static void ocfs2_clear_page_regions(struct page *page,
 }
 /*
+ * Nonsparse file systems fully allocate before we get to the write
+ * code. This prevents ocfs2_write() from tagging the write as an
+ * allocating one, which means ocfs2_map_page_blocks() might try to
+ * read-in the blocks at the tail of our file. Avoid reading them by
+ * testing i_size against each block offset.
+ *
+ * block_start is added to page_offset(page) to form the byte offset
+ * that is tested.  Returns 1 if ocfs2_sparse_alloc() is true for the
+ * super block, or if i_size is strictly greater than that offset;
+ * returns 0 otherwise.
+ */
+static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+                                 unsigned int block_start)
+{
+        u64 offset = page_offset(page) + block_start;
+        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+                return 1;
+        if (i_size_read(inode) > offset)
+                return 1;
+        return 0;       /* offset is at or past i_size */
+}
+/*
 * Some of this taken from block_prepare_write(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
@@ -781,6 +835,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
                                set_buffer_uptodate(bh);
                } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                           !buffer_new(bh) &&
+                           ocfs2_should_read_blk(inode, page, block_start) &&
                           (block_start < from || block_end > to)) {
                        ll_rw_block(READ, 1, &bh);
                        *wait_bh++=bh;
@@ -1492,7 +1547,7 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
 {
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-        if (new_size < le16_to_cpu(di->id2.i_data.id_count))
+        if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
                return 1;
        return 0;
 }
@@ -1732,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
        struct buffer_head *di_bh = NULL;
        struct inode *inode = mapping->host;
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret) {
                mlog_errno(ret);
                return ret;
@@ -1747,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
-        ret = ocfs2_data_lock(inode, 1);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_fail;
-        }
        ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
                                       fsdata, di_bh, NULL);
        if (ret) {
                mlog_errno(ret);
-                goto out_fail_data;
+                goto out_fail;
        }
        brelse(di_bh);
        return 0;
-out_fail_data:
-        ocfs2_data_unlock(inode, 1);
 out_fail:
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        brelse(di_bh);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        return ret;
 }
@@ -1886,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
        ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
-        ocfs2_data_unlock(inode, 1);
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        return ret;
 }
 const struct address_space_operations ocfs2_aops = {
        .readpage       = ocfs2_readpage,
+        .readpages      = ocfs2_readpages,
        .writepage      = ocfs2_writepage,
        .write_begin    = ocfs2_write_begin,
        .write_end      = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f6..f136639f5b41 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                 * information for this bh as it's not marked locally
                 * uptodate. */
                ret = -EIO;
-                brelse(bh);
+                put_bh(bh);
        }
        mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                 * for this bh as it's not marked locally
                                 * uptodate. */
                                status = -EIO;
-                                brelse(bh);
+                                put_bh(bh);
                                bhs[i] = NULL;
                                continue;
                        }
@@ -280,3 +280,64 @@ bail:
        mlog_exit(status);
        return status;
 }
+/*
+ * Check whether the blkno is the super block or one of the backups.
+ *
+ * Returns normally when blkno equals OCFS2_SUPER_BLOCK_BLKNO or any of
+ * the OCFS2_MAX_BACKUP_SUPERBLOCKS values produced by
+ * ocfs2_backup_super_blkno(); calls BUG() for any other block number.
+ */
+static void ocfs2_check_super_or_backup(struct super_block *sb,
+                                        sector_t blkno)
+{
+        int i;
+        u64 backup_blkno;
+        if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
+                return;
+        for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+                backup_blkno = ocfs2_backup_super_blkno(sb, i);
+                if (backup_blkno == blkno)
+                        return;
+        }
+        BUG();  /* blkno matched neither the super block nor any backup */
+}
+/*
+ * Write super block and backups doesn't need to collaborate with journal,
+ * so we don't need to lock ip_io_mutex and inode doesn't need to be passed
+ * into this function.
+ *
+ * The buffer must not satisfy buffer_jbd(), and its block number must
+ * pass ocfs2_check_super_or_backup(); both conditions are enforced with
+ * BUG.  The write is submitted and then waited for with wait_on_buffer().
+ *
+ * Returns 0 on success, -EROFS if ocfs2_is_hard_readonly() or
+ * ocfs2_is_soft_readonly() is true for osb (nothing is submitted in that
+ * case), or -EIO if the buffer is not uptodate after the wait.
+ *
+ * NOTE(review): the get_bh() below is annotated as being taken for
+ * end_buffer_write_sync().  If that completion handler already drops
+ * that reference, the put_bh() on the -EIO path releases a reference
+ * still owned by the caller - confirm against end_buffer_write_sync().
+ */
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+                                struct buffer_head *bh)
+{
+        int ret = 0;
+        mlog_entry_void();
+        BUG_ON(buffer_jbd(bh));
+        ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
+        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+                ret = -EROFS;
+                goto out;
+        }
+        lock_buffer(bh);
+        set_buffer_uptodate(bh);
+        /* remove from dirty list before I/O. */
+        clear_buffer_dirty(bh);
+        get_bh(bh); /* for end_buffer_write_sync() */
+        bh->b_end_io = end_buffer_write_sync;
+        submit_bh(WRITE, bh);
+        wait_on_buffer(bh);
+        if (!buffer_uptodate(bh)) {
+                ret = -EIO;
+                put_bh(bh);
+        }
+out:
+        mlog_exit(ret);
+        return ret;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac3..c2e78614c3e5 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super          *osb,
                      int                  flags,
                      struct inode        *inode);
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+                                struct buffer_head *bh);
 #define OCFS2_BH_CACHED            1
 #define OCFS2_BH_READAHEAD         8
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9cc7c0418b70..f02ccb34604d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -267,7 +267,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
                current_page = cs / spp;
                page = reg->hr_slot_data[current_page];
-                vec_len = min(PAGE_CACHE_SIZE,
+                vec_len = min(PAGE_CACHE_SIZE - vec_start,
                              (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
                mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecdb..e511339886b3 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
 #define O2HB_LIVE_THRESHOLD        2
 /* number of equal samples to be seen as dead */
 extern unsigned int o2hb_dead_threshold;
-#define O2HB_DEFAULT_DEAD_THRESHOLD        7
+#define O2HB_DEFAULT_DEAD_THRESHOLD        31
 /* Otherwise MAX_WRITE_TIMEOUT will be zero... */
 #define O2HB_MIN_DEAD_THRESHOLD   2
 #define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a4882c8df945..23c732f27529 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -146,7 +146,7 @@ static struct kset mlog_kset = {
        .kobj   = {.ktype = &mlog_ktype},
 };
-int mlog_sys_init(struct kset *o2cb_subsys)
+int mlog_sys_init(struct kset *o2cb_kset)
 {
        int i = 0;
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
        mlog_attr_ptrs[i] = NULL;
        kobject_set_name(&mlog_kset.kobj, "logmask");
-        kobj_set_kset_s(&mlog_kset, *o2cb_subsys);
+        mlog_kset.kobj.kset = o2cb_kset;
        return kset_register(&mlog_kset);
 }
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index cd046060114e..597e064bb94f 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -212,7 +212,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog_errno(st) do {                                             \
        int _st = (st);                                                 \
        if (_st != -ERESTARTSYS && _st != -EINTR &&                     \
-            _st != AOP_TRUNCATED_PAGE)                                  \
+            _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)                \
                mlog(ML_ERROR, "status = %lld\n", (long long)_st);      \
 } while (0)
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 64f6f378fd09..0c095ce7723d 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,96 +28,55 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+#include <linux/fs.h>
 #include "ocfs2_nodemanager.h"
 #include "masklog.h"
 #include "sys.h"
-struct o2cb_attribute {
-        struct attribute        attr;
-        ssize_t (*show)(char *buf);
-        ssize_t (*store)(const char *buf, size_t count);
-};
-#define O2CB_ATTR(_name, _mode, _show, _store)  \
-struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
-#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
-static ssize_t o2cb_interface_revision_show(char *buf)
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+                            char *buf)
 {
        return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
 }
+static struct kobj_attribute attr_version =
-static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+        __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
 static struct attribute *o2cb_attrs[] = {
-        &o2cb_attr_interface_revision.attr,
+        &attr_version.attr,
        NULL,
 };
-static ssize_t
+static struct attribute_group o2cb_attr_group = {
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
+        .attrs = o2cb_attrs,
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
-           const char * buffer, size_t count);
-static struct sysfs_ops o2cb_sysfs_ops = {
-        .show   = o2cb_show,
-        .store  = o2cb_store,
 };
-static struct kobj_type o2cb_subsys_type = {
+static struct kset *o2cb_kset;
-        .default_attrs  = o2cb_attrs,
-        .sysfs_ops      = &o2cb_sysfs_ops,
-};
-/* gives us o2cb_subsys */
-static decl_subsys(o2cb, NULL, NULL);
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
-{
-        struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-        struct kset *sbs = to_kset(kobj);
-        BUG_ON(sbs != &o2cb_subsys);
-        if (o2cb_attr->show)
-                return o2cb_attr->show(buffer);
-        return -EIO;
-}
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
-             const char * buffer, size_t count)
-{
-        struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-        struct kset *sbs = to_kset(kobj);
-        BUG_ON(sbs != &o2cb_subsys);
-        if (o2cb_attr->store)
-                return o2cb_attr->store(buffer, count);
-        return -EIO;
-}
 void o2cb_sys_shutdown(void)
 {
        mlog_sys_shutdown();
-        subsystem_unregister(&o2cb_subsys);
+        kset_unregister(o2cb_kset);
 }
 int o2cb_sys_init(void)
 {
        int ret;
-        o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
+        o2cb_kset = kset_create_and_add("o2cb", NULL, NULL);
-        ret = subsystem_register(&o2cb_subsys);
+        if (!o2cb_kset)
+                return -ENOMEM;
+        ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
        if (ret)
-                return ret;
+                goto error;
-        ret = mlog_sys_init(&o2cb_subsys);
+        ret = mlog_sys_init(o2cb_kset);
        if (ret)
-                subsystem_unregister(&o2cb_subsys);
+                goto error;
+        return 0;
+error:
+        kset_unregister(o2cb_kset);
        return ret;
 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 685c18065c82..ee50c9610e7f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -58,6 +58,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/kref.h>
+#include <linux/net.h>
 #include <net/tcp.h>
 #include <asm/uaccess.h>
@@ -71,14 +72,6 @@
 #include "tcp_internal.h"
-/* 
- * The linux network stack isn't sparse endian clean.. It has macros like
- * ntohs() which perform the endian checks and structs like sockaddr_in
- * which aren't annotated.  So __force is found here to get the build
- * clean.  When they emerge from the dark ages and annotate the code
- * we can remove these.
- */
 #define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
 #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,    \
                          NIPQUAD(sc->sc_node->nd_ipv4_address),        \
@@ -616,8 +609,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
                del_timer_sync(&sc->sc_idle_timeout);
                o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
                sc_put(sc);
-                sc->sc_sock->ops->shutdown(sc->sc_sock,
+                kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
-                                           RCV_SHUTDOWN|SEND_SHUTDOWN);
        }
        /* not fatal so failed connects before the other guy has our
@@ -1500,7 +1492,7 @@ static void o2net_start_connect(struct work_struct *work)
        myaddr.sin_family = AF_INET;
        myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
-        myaddr.sin_port = (__force u16)htons(0); /* any port */
+        myaddr.sin_port = htons(0); /* any port */
        ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
                              sizeof(myaddr));
@@ -1701,11 +1693,11 @@ static int o2net_accept_one(struct socket *sock)
        if (ret < 0)
                goto out;
-        node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+        node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
        if (node == NULL) {
                mlog(ML_NOTICE, "attempt to connect from unknown node at "
                     "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port));
+                     ntohs(sin.sin_port));
                ret = -EINVAL;
                goto out;
        }
@@ -1714,7 +1706,7 @@ static int o2net_accept_one(struct socket *sock)
                mlog(ML_NOTICE, "unexpected connect attempted from a lower "
                     "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
                     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port), node->nd_num);
+                     ntohs(sin.sin_port), node->nd_num);
                ret = -EINVAL;
                goto out;
        }
@@ -1725,7 +1717,7 @@ static int o2net_accept_one(struct socket *sock)
                mlog(ML_CONN, "attempt to connect from node '%s' at "
                     "%u.%u.%u.%u:%d but it isn't heartbeating\n",
                     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port));
+                     ntohs(sin.sin_port));
                ret = -EINVAL;
                goto out;
        }
@@ -1742,7 +1734,7 @@ static int o2net_accept_one(struct socket *sock)
                mlog(ML_NOTICE, "attempt to connect from node '%s' at "
                     "%u.%u.%u.%u:%d but it already has an open connection\n",
                     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port));
+                     ntohs(sin.sin_port));
                goto out;
        }
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f0..f36f66aab3dd 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
 /* same as hb delay, we're waiting for another node to recognize our hb */
 #define O2NET_RECONNECT_DELAY_MS_DEFAULT        2000
-#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT        5000
+#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT        2000
-#define O2NET_IDLE_TIMEOUT_MS_DEFAULT           10000
+#define O2NET_IDLE_TIMEOUT_MS_DEFAULT           30000
 /* TODO: figure this out.... */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89d..b2e832aca567 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
 * locking semantics of the file system using the protocol.  It should 
 * be somewhere else, I'm sure, but right now it isn't.
 *
+ * New in version 10:
+ *      - Meta/data locks combined
+ *
+ * New in version 9:
+ *      - All votes removed
+ *
 * New in version 8:
 *      - Replace delete inode votes with a cluster lock
 *
@@ -60,7 +66,7 @@
 *      - full 64 bit i_size in the metadata lock lvbs
 *      - introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 8ULL
+#define O2NET_PROTOCOL_VERSION 10ULL
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30d..a56eee6abad3 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
 #include "ver.h"
-#define CLUSTER_BUILD_VERSION "1.3.3"
+#define CLUSTER_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 3094ddb7a254..b1cc7c381e88 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
 /*
 * Walk the inode alias list, and find a dentry which has a given
 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
- * is looking for a dentry_lock reference. The vote thread is looking
+ * is looking for a dentry_lock reference. The downconvert thread is
- * to unhash aliases, so we allow it to skip any that already have
+ * looking to unhash aliases, so we allow it to skip any that already
- * that property.
+ * have that property.
 */
 struct dentry *ocfs2_find_local_alias(struct inode *inode,
                                      u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
        dl->dl_count = 0;
        /*
         * Does this have to happen below, for all attaches, in case
-         * the struct inode gets blown away by votes?
+         * the struct inode gets blown away by the downconvert thread?
         */
        dl->dl_inode = igrab(inode);
        dl->dl_parent_blkno = parent_blkno;
@@ -318,9 +318,9 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
                                   struct ocfs2_dentry_lock *dl)
 {
+        iput(dl->dl_inode);
        ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
        ocfs2_lock_res_free(&dl->dl_lockres);
-        iput(dl->dl_inode);
        kfree(dl);
 }
@@ -344,12 +344,24 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
-        mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED),
+        if (!dl) {
-                        "dentry: %.*s\n", dentry->d_name.len,
+                /*
-                        dentry->d_name.name);
+                 * No dentry lock is ok if we're disconnected or
+                 * unhashed.
+                 */
+                if (!(dentry->d_flags & DCACHE_DISCONNECTED) &&
+                    !d_unhashed(dentry)) {
+                        unsigned long long ino = 0ULL;
+                        if (inode)
+                                ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
+                        mlog(ML_ERROR, "Dentry is missing cluster lock. "
+                             "inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
+                             ino, dentry->d_flags, dentry->d_name.len,
+                             dentry->d_name.name);
+                }
-        if (!dl)
                goto out;
+        }
        mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
                        dentry->d_name.len, dentry->d_name.name,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 6a2f143e269c..6b0107f21344 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -208,9 +208,9 @@ out:
        return NULL;
 }
-struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
+static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
-                                        struct inode *dir,
+                                               struct inode *dir,
-                                        struct ocfs2_dir_entry **res_dir)
+                                               struct ocfs2_dir_entry **res_dir)
 {
        struct super_block *sb;
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
        mlog_entry("dirino=%llu\n",
                   (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+        error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
        if (lock_level && error >= 0) {
                /* We release EX lock which used to update atime
                 * and get PR lock again to reduce contention
                 * on commonly accessed directories. */
-                ocfs2_meta_unlock(inode, 1);
+                ocfs2_inode_unlock(inode, 1);
                lock_level = 0;
-                error = ocfs2_meta_lock(inode, NULL, 0);
+                error = ocfs2_inode_lock(inode, NULL, 0);
        }
        if (error < 0) {
                if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
        error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
                                      dirent, filldir, NULL);
-        ocfs2_meta_unlock(inode, lock_level);
+        ocfs2_inode_unlock(inode, lock_level);
 bail_nolock:
        mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f9..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
 #include "dlmfsver.h"
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 62e4a7daa286..a54d33d95ada 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -908,7 +908,7 @@ lookup:
                 * but they might own this lockres.  wait on them. */
                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit < O2NM_MAX_NODES) {
-                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
                             "recover before lock mastery can begin\n",
                             dlm->name, namelen, (char *)lockid, bit);
                        wait_on_recovery = 1;
@@ -962,7 +962,7 @@ redo_request:
                spin_lock(&dlm->spinlock);
                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit < O2NM_MAX_NODES) {
-                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
                             "recover before lock mastery can begin\n",
                             dlm->name, namelen, (char *)lockid, bit);
                        wait_on_recovery = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf91434..91f747b8a538 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
                }
        }
+        /* Clean up join state on node death. */
+        if (dlm->joining_node == idx) {
+                mlog(0, "Clearing join state for node %u\n", idx);
+                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+        }
        /* check to see if the node is already considered dead */
        if (!test_bit(idx, dlm->live_nodes_map)) {
                mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
        clear_bit(idx, dlm->live_nodes_map);
-        /* Clean up join state on node death. */
-        if (dlm->joining_node == idx) {
-                mlog(0, "Clearing join state for node %u\n", idx);
-                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
-        }
        /* make sure local cleanup occurs before the heartbeat events */
        if (!test_bit(idx, dlm->recovery_map))
                dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
        if (!dlm_grab(dlm))
                return;
+        /*
+         * This will notify any dlm users that a node in our domain
+         * went away without notifying us first.
+         */
+        if (test_bit(idx, dlm->domain_map))
+                dlm_fire_domain_eviction_callbacks(dlm, idx);
        spin_lock(&dlm->spinlock);
        __dlm_hb_node_down(dlm, idx);
        spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f41..dfc0da4d158d 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
 #include "dlmver.h"
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 41c76ff2fcfb..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
 /*
 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
        struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
        /*
-         * Optionally called in the downconvert (or "vote") thread
+         * Optionally called in the downconvert thread after a
-         * after a successful downconvert. The lockres will not be
+         * successful downconvert. The lockres will not be referenced
-         * referenced after this callback is called, so it is safe to
+         * after this callback is called, so it is safe to free
-         * free memory, etc.
+         * memory, etc.
         *
         * The exact semantics of when this is called are controlled
         * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
        .flags          = 0,
 };
-static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
        .get_osb        = ocfs2_get_inode_osb,
        .check_downconvert = ocfs2_check_meta_downconvert,
        .set_lvb        = ocfs2_set_meta_lvb,
-        .flags          = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
-};
-static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
-        .get_osb        = ocfs2_get_inode_osb,
        .downconvert_worker = ocfs2_data_convert_worker,
-        .flags          = 0,
+        .flags          = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
 };
 static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
        .flags          = 0,
 };
+/*
+ * Lock resource operations for flock lock resources
+ * (OCFS2_LOCK_TYPE_FLOCK; installed by ocfs2_file_lock_res_init()).
+ * Only ->get_osb is supplied and no LOCK_TYPE_* flags are set.
+ */
+static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+        .get_osb        = ocfs2_get_file_osb,
+        .flags          = 0,
+};
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
-                lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
                lockres->l_type == OCFS2_LOCK_TYPE_RW ||
                lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
                "resource %s: %s\n", dlm_errname(_stat), _func, \
                _lockres->l_name, dlm_errmsg(_stat));           \
 } while (0)
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+static int ocfs2_downconvert_thread(void *arg);
-                                 struct ocfs2_lock_res *lockres);
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
-static int ocfs2_meta_lock_update(struct inode *inode,
+                                        struct ocfs2_lock_res *lockres);
+static int ocfs2_inode_lock_update(struct inode *inode,
                                  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+                                      int new_level);
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+                                  struct ocfs2_lock_res *lockres,
+                                  int new_level,
+                                  int lvb);
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+                                        struct ocfs2_lock_res *lockres);
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+                                struct ocfs2_lock_res *lockres);
 static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
                                  u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                        ops = &ocfs2_inode_rw_lops;
                        break;
                case OCFS2_LOCK_TYPE_META:
-                        ops = &ocfs2_inode_meta_lops;
+                        ops = &ocfs2_inode_inode_lops;
-                        break;
-                case OCFS2_LOCK_TYPE_DATA:
-                        ops = &ocfs2_inode_data_lops;
                        break;
                case OCFS2_LOCK_TYPE_OPEN:
                        ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
        return OCFS2_SB(inode->i_sb);
 }
+/*
+ * ->get_osb callback used by ocfs2_flock_lops.
+ *
+ * Treats lockres->l_priv as the struct ocfs2_file_private that owns the
+ * lock (ocfs2_file_lock_res_init() hands that pointer to
+ * ocfs2_lock_res_init_common(); presumably that is where l_priv is set -
+ * confirm there) and returns the ocfs2_super of the filesystem holding
+ * the inode behind that open file.
+ */
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
+{
+        struct ocfs2_file_private *fp = lockres->l_priv;
+        /* file -> address_space -> host inode -> super_block -> osb */
+        return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
+}
 static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
 {
        __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
                                   &ocfs2_rename_lops, osb);
 }
+/*
+ * Initialise @lockres as the flock lock resource for the open file
+ * described by @fp.
+ *
+ * The lock name is built from OCFS2_LOCK_TYPE_FLOCK plus the block number
+ * (ip_blkno) and i_generation of the inode backing fp->fp_file.  The
+ * lockres is bound to ocfs2_flock_lops with @fp as its private pointer.
+ */
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+                              struct ocfs2_file_private *fp)
+{
+        struct inode *inode = fp->fp_file->f_mapping->host;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
+        ocfs2_lock_res_init_once(lockres);
+        ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
+                              inode->i_generation, lockres->l_name);
+        ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+                                   OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
+                                   fp);
+        /*
+         * Mark the lock as not cached: ocfs2_blocking_ast() returns early
+         * for lock resources carrying OCFS2_LOCK_NOCACHE.
+         */
+        lockres->l_flags |= OCFS2_LOCK_NOCACHE;
+}
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
        mlog_entry_void();
@@ -670,7 +700,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
 {
        mlog_entry_void();
-        BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
+        BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
        BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
        if (lockres->l_requested > LKM_NLMODE &&
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
             lockres->l_name, level, lockres->l_level,
             ocfs2_lock_type_string(lockres->l_type));
+        /*
+         * We can skip the bast for locks which don't enable caching -
+         * they'll be dropped at the earliest possible time anyway.
+         */
+        if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
+                return;
        spin_lock_irqsave(&lockres->l_lock, flags);
        needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
        if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
        wake_up(&lockres->l_event);
-        ocfs2_kick_vote_thread(osb);
+        ocfs2_wake_downconvert_thread(osb);
 }
 static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
 }
+/*
+ * Interruptible wait for mask waiter @mw to complete.
+ *
+ * Returns mw->mw_status when the completion was signalled.  If
+ * wait_for_completion_interruptible() returns non-zero instead, @mw is
+ * removed from @lockres and that non-zero value is returned.  In both
+ * cases the completion is re-initialised before returning so that @mw can
+ * be waited on again.
+ */
+static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
+                                             struct ocfs2_lock_res *lockres)
+{
+        int ret;
+        ret = wait_for_completion_interruptible(&mw->mw_complete);
+        if (ret)
+                /* The wait did not finish normally: take mw off lockres. */
+                lockres_remove_mask_waiter(lockres, mw);
+        else
+                /* Completed: report the status recorded in the waiter. */
+                ret = mw->mw_status;
+        /* Re-arm the completion in case we want to wait on it again */
+        INIT_COMPLETION(mw->mw_complete);
+        return ret;
+}
 static int ocfs2_cluster_lock(struct ocfs2_super *osb,
                              struct ocfs2_lock_res *lockres,
                              int level,
@@ -980,18 +1032,6 @@ again:
                goto unlock;
        }
-        if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
-                /* lock has not been created yet. */
-                spin_unlock_irqrestore(&lockres->l_lock, flags);
-                ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-                goto again;
-        }
        if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
            !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
                /* is the lock is currently blocked on behalf of
@@ -1006,7 +1046,14 @@ again:
                        mlog(ML_ERROR, "lockres %s has action %u pending\n",
                             lockres->l_name, lockres->l_action);
-                lockres->l_action = OCFS2_AST_CONVERT;
+                if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+                        lockres->l_action = OCFS2_AST_ATTACH;
+                        lkm_flags &= ~LKM_CONVERT;
+                } else {
+                        lockres->l_action = OCFS2_AST_CONVERT;
+                        lkm_flags |= LKM_CONVERT;
+                }
                lockres->l_requested = level;
                lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -1021,7 +1068,7 @@ again:
                status = dlmlock(osb->dlm,
                                 level,
                                 &lockres->l_lksb,
-                                 lkm_flags|LKM_CONVERT,
+                                 lkm_flags,
                                 lockres->l_name,
                                 OCFS2_LOCK_ID_MAX_LEN - 1,
                                 ocfs2_locking_ast,
@@ -1094,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
        mlog_entry_void();
        spin_lock_irqsave(&lockres->l_lock, flags);
        ocfs2_dec_holders(lockres, level);
-        ocfs2_vote_on_unlock(osb, lockres);
+        ocfs2_downconvert_on_unlock(osb, lockres);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
        mlog_exit_void();
 }
@@ -1152,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
         * We don't want to use LKM_LOCAL on a meta data lock as they
         * don't use a generation in their lock names.
         */
-        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
+        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
-        if (ret) {
-                mlog_errno(ret);
-                goto bail;
-        }
-        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
        if (ret) {
                mlog_errno(ret);
                goto bail;
@@ -1316,76 +1357,221 @@ out:
        mlog_exit_void();
 }
-int ocfs2_data_lock_full(struct inode *inode,
+static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
-                         int write,
+                                     int level)
-                         int arg_flags)
 {
-        int status = 0, level;
+        int ret;
-        struct ocfs2_lock_res *lockres;
+        struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        unsigned long flags;
+        struct ocfs2_mask_waiter mw;
-        BUG_ON(!inode);
+        ocfs2_init_mask_waiter(&mw);
-        mlog_entry_void();
+retry_cancel:
+        spin_lock_irqsave(&lockres->l_lock, flags);
+        if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+                ret = ocfs2_prepare_cancel_convert(osb, lockres);
+                if (ret) {
+                        spin_unlock_irqrestore(&lockres->l_lock, flags);
+                        ret = ocfs2_cancel_convert(osb, lockres);
+                        if (ret < 0) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        goto retry_cancel;
+                }
+                lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+                spin_unlock_irqrestore(&lockres->l_lock, flags);
-        mlog(0, "inode %llu take %s DATA lock\n",
+                ocfs2_wait_for_mask(&mw);
-             (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                goto retry_cancel;
-             write ? "EXMODE" : "PRMODE");
+        }
-        /* We'll allow faking a readonly data lock for
+        ret = -ERESTARTSYS;
-         * rodevices. */
+        /*
-        if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
+         * We may still have gotten the lock, in which case there's no
-                if (write) {
+         * point to restarting the syscall.
-                        status = -EROFS;
+         */
-                        mlog_errno(status);
+        if (lockres->l_level == level)
+                ret = 0;
+        mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
+             lockres->l_flags, lockres->l_level, lockres->l_action);
+        spin_unlock_irqrestore(&lockres->l_lock, flags);
+out:
+        return ret;
+}
+/*
+ * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
+ * flock() calls. The locking approach this requires is sufficiently
+ * different from all other cluster lock types that we implement a
+ * separate path to the "low-level" dlm calls. In particular:
+ *
+ * - No optimization of lock levels is done - we take at exactly
+ *   what's been requested.
+ *
+ * - No lock caching is employed. We immediately downconvert to
+ *   no-lock at unlock time. This also means flock locks never go on
+ *   the blocking list.
+ *
+ * - Since userspace can trivially deadlock itself with flock, we make
+ *   sure to allow cancellation of a misbehaving application's flock()
+ *   request.
+ *
+ * - Access to any flock lockres doesn't require concurrency, so we
+ *   can simplify the code by requiring the caller to guarantee
+ *   serialization of dlmglue flock calls.
+ */
+int ocfs2_file_lock(struct file *file, int ex, int trylock)
+{
+        int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
+        unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+        unsigned long flags;
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_lock_res *lockres = &fp->fp_flock;
+        struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+        struct ocfs2_mask_waiter mw;
+        ocfs2_init_mask_waiter(&mw);
+        if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
+            (lockres->l_level > LKM_NLMODE)) {
+                mlog(ML_ERROR,
+                     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
+                     "level: %u\n", lockres->l_name, lockres->l_flags,
+                     lockres->l_level);
+                return -EINVAL;
+        }
+        spin_lock_irqsave(&lockres->l_lock, flags);
+        if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+                lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+                spin_unlock_irqrestore(&lockres->l_lock, flags);
+                /*
+                 * Get the lock at NLMODE to start - that way we
+                 * can cancel the upconvert request if need be.
+                 */
+                ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
                }
-                goto out;
+                ret = ocfs2_wait_for_mask(&mw);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                spin_lock_irqsave(&lockres->l_lock, flags);
        }
-        if (ocfs2_mount_local(osb))
+        lockres->l_action = OCFS2_AST_CONVERT;
-                goto out;
+        lkm_flags |= LKM_CONVERT;
+        lockres->l_requested = level;
+        lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
-        lockres = &OCFS2_I(inode)->ip_data_lockres;
+        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+        spin_unlock_irqrestore(&lockres->l_lock, flags);
-        level = write ? LKM_EXMODE : LKM_PRMODE;
+        ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+                      lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+                      ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
+        if (ret != DLM_NORMAL) {
+                if (trylock && ret == DLM_NOTQUEUED)
+                        ret = -EAGAIN;
+                else {
+                        ocfs2_log_dlm_error("dlmlock", ret, lockres);
+                        ret = -EINVAL;
+                }
-        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
+                ocfs2_recover_from_dlm_error(lockres, 1);
-                                    0, arg_flags);
+                lockres_remove_mask_waiter(lockres, &mw);
-        if (status < 0 && status != -EAGAIN)
+                goto out;
-                mlog_errno(status);
+        }
+        ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
+        if (ret == -ERESTARTSYS) {
+                /*
+                 * Userspace can cause deadlock itself with
+                 * flock(). Current behavior locally is to allow the
+                 * deadlock, but abort the system call if a signal is
+                 * received. We follow this example, otherwise a
+                 * poorly written program could sit in kernel until
+                 * reboot.
+                 *
+                 * Handling this is a bit more complicated for Ocfs2
+                 * though. We can't exit this function with an
+                 * outstanding lock request, so a cancel convert is
+                 * required. We intentionally overwrite 'ret' - if the
+                 * cancel fails and the lock was granted, it's easier
+                 * to just bubble success back up to the user.
+                 */
+                ret = ocfs2_flock_handle_signal(lockres, level);
+        }
 out:
-        mlog_exit(status);
-        return status;
+        mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
+             lockres->l_name, ex, trylock, ret);
+        return ret;
 }
-/* see ocfs2_meta_lock_with_page() */
+void ocfs2_file_unlock(struct file *file)
-int ocfs2_data_lock_with_page(struct inode *inode,
-                              int write,
-                              struct page *page)
 {
        int ret;
+        unsigned long flags;
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_lock_res *lockres = &fp->fp_flock;
+        struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+        struct ocfs2_mask_waiter mw;
-        ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
+        ocfs2_init_mask_waiter(&mw);
-        if (ret == -EAGAIN) {
-                unlock_page(page);
+        if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
-                if (ocfs2_data_lock(inode, write) == 0)
+                return;
-                        ocfs2_data_unlock(inode, write);
-                ret = AOP_TRUNCATED_PAGE;
+        if (lockres->l_level == LKM_NLMODE)
+                return;
+        mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
+             lockres->l_name, lockres->l_flags, lockres->l_level,
+             lockres->l_action);
+        spin_lock_irqsave(&lockres->l_lock, flags);
+        /*
+         * Fake a blocking ast for the downconvert code.
+         */
+        lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+        lockres->l_blocking = LKM_EXMODE;
+        ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+        spin_unlock_irqrestore(&lockres->l_lock, flags);
+        ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+        if (ret) {
+                mlog_errno(ret);
+                return;
        }
-        return ret;
+        ret = ocfs2_wait_for_mask(&mw);
+        if (ret)
+                mlog_errno(ret);
 }
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
-                                 struct ocfs2_lock_res *lockres)
+                                        struct ocfs2_lock_res *lockres)
 {
        int kick = 0;
        mlog_entry_void();
        /* If we know that another node is waiting on our lock, kick
-         * the vote thread * pre-emptively when we reach a release
+         * the downconvert thread pre-emptively when we reach a release
         * condition. */
        if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
                switch(lockres->l_blocking) {
@@ -1403,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
        }
        if (kick)
-                ocfs2_kick_vote_thread(osb);
+                ocfs2_wake_downconvert_thread(osb);
-        mlog_exit_void();
-}
-void ocfs2_data_unlock(struct inode *inode,
-                       int write)
-{
-        int level = write ? LKM_EXMODE : LKM_PRMODE;
-        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        mlog_entry_void();
-        mlog(0, "inode %llu drop %s DATA lock\n",
-             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-             write ? "EXMODE" : "PRMODE");
-        if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
-            !ocfs2_mount_local(osb))
-                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
        mlog_exit_void();
 }
@@ -1447,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
 /* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
- * values is supposed to be blocked in ocfs2_meta_lock right now. */
+ * values is supposed to be blocked in ocfs2_inode_lock right now. */
 static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
        struct ocfs2_meta_lvb *lvb;
        mlog_entry_void();
@@ -1501,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
 static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
        struct ocfs2_meta_lvb *lvb;
        mlog_entry_void();
@@ -1609,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
 }
 /* may or may not return a bh if it went to disk. */
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_inode_lock_update(struct inode *inode,
                                  struct buffer_head **bh)
 {
        int status = 0;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
        struct ocfs2_dinode *fe;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1726,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
                         struct buffer_head **ret_bh,
                         int ex,
                         int arg_flags)
@@ -1761,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
                wait_event(osb->recovery_event,
                           ocfs2_node_map_is_empty(osb, &osb->recovery_map));
-        lockres = &OCFS2_I(inode)->ip_meta_lockres;
+        lockres = &OCFS2_I(inode)->ip_inode_lockres;
        level = ex ? LKM_EXMODE : LKM_PRMODE;
        dlm_flags = 0;
        if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1800,11 +1966,11 @@ local:
        }
        /* This is fun. The caller may want a bh back, or it may
-         * not. ocfs2_meta_lock_update definitely wants one in, but
+         * not. ocfs2_inode_lock_update definitely wants one in, but
         * may or may not read one, depending on what's in the
         * LVB. The result of all of this is that we've *only* gone to
         * disk if we have to, so the complexity is worthwhile. */
-        status = ocfs2_meta_lock_update(inode, &local_bh);
+        status = ocfs2_inode_lock_update(inode, &local_bh);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1826,7 +1992,7 @@ bail:
                        *ret_bh = NULL;
                }
                if (acquired)
-                        ocfs2_meta_unlock(inode, ex);
+                        ocfs2_inode_unlock(inode, ex);
        }
        if (local_bh)
@@ -1837,19 +2003,20 @@ bail:
 }
 /*
- * This is working around a lock inversion between tasks acquiring DLM locks
+ * This is working around a lock inversion between tasks acquiring DLM
- * while holding a page lock and the vote thread which blocks dlm lock acquiry
+ * locks while holding a page lock and the downconvert thread which
- * while acquiring page locks.
+ * blocks dlm lock acquisition while acquiring page locks.
 *
 * ** These _with_page variantes are only intended to be called from aop
 * methods that hold page locks and return a very specific *positive* error
 * code that aop methods pass up to the VFS -- test for errors with != 0. **
 *
- * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * The DLM is called such that it returns -EAGAIN if it would have
- * waiting for the vote thread.  In that case we unlock our page so the vote
+ * blocked waiting for the downconvert thread.  In that case we unlock
- * thread can make progress.  Once we've done this we have to return
+ * our page so the downconvert thread can make progress.  Once we've
- * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
+ * done this we have to return AOP_TRUNCATED_PAGE so the aop method
- * into the VFS who will then immediately retry the aop call.
+ * that called us can bubble that back up into the VFS who will then
+ * immediately retry the aop call.
 *
 * We do a blocking lock and immediate unlock before returning, though, so that
 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1857,32 +2024,32 @@ bail:
 * ping locks back and forth, but that's a risk we're willing to take to avoid
 * the lock inversion simply.
 */
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
                              struct buffer_head **ret_bh,
                              int ex,
                              struct page *page)
 {
        int ret;
-        ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
+        ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
        if (ret == -EAGAIN) {
                unlock_page(page);
-                if (ocfs2_meta_lock(inode, ret_bh, ex) == 0)
+                if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
-                        ocfs2_meta_unlock(inode, ex);
+                        ocfs2_inode_unlock(inode, ex);
                ret = AOP_TRUNCATED_PAGE;
        }
        return ret;
 }
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level)
 {
        int ret;
        mlog_entry_void();
-        ret = ocfs2_meta_lock(inode, NULL, 0);
+        ret = ocfs2_inode_lock(inode, NULL, 0);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
@@ -1895,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
        if (ocfs2_should_update_atime(inode, vfsmnt)) {
                struct buffer_head *bh = NULL;
-                ocfs2_meta_unlock(inode, 0);
+                ocfs2_inode_unlock(inode, 0);
-                ret = ocfs2_meta_lock(inode, &bh, 1);
+                ret = ocfs2_inode_lock(inode, &bh, 1);
                if (ret < 0) {
                        mlog_errno(ret);
                        return ret;
@@ -1913,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
        return ret;
 }
-void ocfs2_meta_unlock(struct inode *inode,
+void ocfs2_inode_unlock(struct inode *inode,
                       int ex)
 {
        int level = ex ? LKM_EXMODE : LKM_PRMODE;
-        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry_void();
@@ -2325,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
                goto bail;
        }
-        /* launch vote thread */
+        /* launch downconvert thread */
-        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
+        osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
-        if (IS_ERR(osb->vote_task)) {
+        if (IS_ERR(osb->dc_task)) {
-                status = PTR_ERR(osb->vote_task);
+                status = PTR_ERR(osb->dc_task);
-                osb->vote_task = NULL;
+                osb->dc_task = NULL;
                mlog_errno(status);
                goto bail;
        }
@@ -2358,8 +2525,8 @@ local:
 bail:
        if (status < 0) {
                ocfs2_dlm_shutdown_debug(osb);
-                if (osb->vote_task)
+                if (osb->dc_task)
-                        kthread_stop(osb->vote_task);
+                        kthread_stop(osb->dc_task);
        }
        mlog_exit(status);
@@ -2374,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
        ocfs2_drop_osb_locks(osb);
-        if (osb->vote_task) {
+        if (osb->dc_task) {
-                kthread_stop(osb->vote_task);
+                kthread_stop(osb->dc_task);
-                osb->vote_task = NULL;
+                osb->dc_task = NULL;
        }
        ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2532,7 +2699,7 @@ out:
 /* Mark the lockres as being dropped. It will no longer be
 * queued if blocking, but we still may have to wait on it
- * being dequeued from the vote thread before we can consider
+ * being dequeued from the downconvert thread before we can consider
 * it safe to drop. 
 *
 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2595,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
        status = err;
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-                              &OCFS2_I(inode)->ip_data_lockres);
+                              &OCFS2_I(inode)->ip_inode_lockres);
-        if (err < 0)
-                mlog_errno(err);
-        if (err < 0 && !status)
-                status = err;
-        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-                              &OCFS2_I(inode)->ip_meta_lockres);
        if (err < 0)
                mlog_errno(err);
        if (err < 0 && !status)
@@ -2855,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
        inode = ocfs2_lock_res_inode(lockres);
        mapping = inode->i_mapping;
+        if (S_ISREG(inode->i_mode))
+                goto out;
        /*
         * We need this before the filemap_fdatawrite() so that it can
         * transfer the dirty bit from the PTE to the
@@ -2880,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
                filemap_fdatawait(mapping);
        }
+out:
        return UNBLOCK_CONTINUE;
 }
@@ -2908,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
 /*
 * Does the final reference drop on our dentry lock. Right now this
- * happens in the vote thread, but we could choose to simplify the
+ * happens in the downconvert thread, but we could choose to simplify the
 * dlmglue API and push these off to the ocfs2_wq in the future.
 */
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3047,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
        mlog(0, "lockres %s blocked.\n", lockres->l_name);
        /* Detect whether a lock has been marked as going away while
-         * the vote thread was processing other things. A lock can
+         * the downconvert thread was processing other things. A lock can
         * still be marked with OCFS2_LOCK_FREEING after this check,
         * but short circuiting here will still save us some
         * performance. */
@@ -3096,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
        lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
-        spin_lock(&osb->vote_task_lock);
+        spin_lock(&osb->dc_task_lock);
        if (list_empty(&lockres->l_blocked_list)) {
                list_add_tail(&lockres->l_blocked_list,
                              &osb->blocked_lock_list);
                osb->blocked_lock_count++;
        }
-        spin_unlock(&osb->vote_task_lock);
+        spin_unlock(&osb->dc_task_lock);
        mlog_exit_void();
 }
+static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
+{
+        unsigned long processed;
+        struct ocfs2_lock_res *lockres;
+        mlog_entry_void();
+        spin_lock(&osb->dc_task_lock);
+        /* grab this early so we know to try again if a state change and
+         * wake happens part-way through our work  */
+        osb->dc_work_sequence = osb->dc_wake_sequence;
+        processed = osb->blocked_lock_count;
+        while (processed) {
+                BUG_ON(list_empty(&osb->blocked_lock_list));
+                lockres = list_entry(osb->blocked_lock_list.next,
+                                     struct ocfs2_lock_res, l_blocked_list);
+                list_del_init(&lockres->l_blocked_list);
+                osb->blocked_lock_count--;
+                spin_unlock(&osb->dc_task_lock);
+                BUG_ON(!processed);
+                processed--;
+                ocfs2_process_blocked_lock(osb, lockres);
+                spin_lock(&osb->dc_task_lock);
+        }
+        spin_unlock(&osb->dc_task_lock);
+        mlog_exit_void();
+}
+static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
+{
+        int empty = 0;
+        spin_lock(&osb->dc_task_lock);
+        if (list_empty(&osb->blocked_lock_list))
+                empty = 1;
+        spin_unlock(&osb->dc_task_lock);
+        return empty;
+}
+static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
+{
+        int should_wake = 0;
+        spin_lock(&osb->dc_task_lock);
+        if (osb->dc_work_sequence != osb->dc_wake_sequence)
+                should_wake = 1;
+        spin_unlock(&osb->dc_task_lock);
+        return should_wake;
+}
+int ocfs2_downconvert_thread(void *arg)
+{
+        int status = 0;
+        struct ocfs2_super *osb = arg;
+        /* only quit once we've been asked to stop and there is no more
+         * work available */
+        while (!(kthread_should_stop() &&
+                ocfs2_downconvert_thread_lists_empty(osb))) {
+                wait_event_interruptible(osb->dc_event,
+                                         ocfs2_downconvert_thread_should_wake(osb) ||
+                                         kthread_should_stop());
+                mlog(0, "downconvert_thread: awoken\n");
+                ocfs2_downconvert_thread_do_work(osb);
+        }
+        osb->dc_task = NULL;
+        return status;
+}
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
+{
+        spin_lock(&osb->dc_task_lock);
+        /* make sure the downconvert thread gets a swipe at whatever changes
+         * the caller may have made to the lock state */
+        osb->dc_wake_sequence++;
+        spin_unlock(&osb->dc_task_lock);
+        wake_up(&osb->dc_event);
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e41205..5f17243ba501 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
        __be32       lvb_reserved2;
 };
-/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY        (0x01)
 /* Instruct the dlm not to queue ourselves on the other node. */
 #define OCFS2_META_LOCK_NOQUEUE         (0x02)
-/* don't block waiting for the vote thread, instead return -EAGAIN */
+/* don't block waiting for the downconvert thread, instead return -EAGAIN */
 #define OCFS2_LOCK_NONBLOCK             (0x04)
 int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                               struct inode *inode);
 void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
                                u64 parent, struct inode *inode);
+struct ocfs2_file_private;
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+                              struct ocfs2_file_private *fp);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
-int ocfs2_data_lock_full(struct inode *inode,
-                         int write,
-                         int arg_flags);
-#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
-int ocfs2_data_lock_with_page(struct inode *inode,
-                              int write,
-                              struct page *page);
-void ocfs2_data_unlock(struct inode *inode,
-                       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
 int ocfs2_open_lock(struct inode *inode);
 int ocfs2_try_open_lock(struct inode *inode, int write);
 void ocfs2_open_unlock(struct inode *inode);
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level);
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
                         struct buffer_head **ret_bh,
                         int ex,
                         int arg_flags);
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
                              struct buffer_head **ret_bh,
                              int ex,
                              struct page *page);
 /* 99% of the time we don't want to supply any additional flags --
 * those are for very specific cases only. */
-#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0)
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
-void ocfs2_meta_unlock(struct inode *inode,
+void ocfs2_inode_unlock(struct inode *inode,
                       int ex);
 int ocfs2_super_lock(struct ocfs2_super *osb,
                     int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
+int ocfs2_file_lock(struct file *file, int ex, int trylock);
+void ocfs2_file_unlock(struct file *file);
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
                               struct ocfs2_lock_res *lockres);
-/* for the vote thread */
+/* for the downconvert thread */
 void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
                                struct ocfs2_lock_res *lockres);
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af16..1942e09f6ee5 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
        *var = cpu_to_le64(le64_to_cpu(*var) + val);
 }
-static inline void le32_and_cpu(__le32 *var, u32 val)
-{
-        *var = cpu_to_le32(le32_to_cpu(*var) & val);
-}
 static inline void be32_add_cpu(__be32 *var, u32 val)
 {
        *var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a4..67527cebf214 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
                return ERR_PTR(-ESTALE);
        }
-        inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
+        inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
        if (IS_ERR(inode))
                return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
        mlog(0, "find parent of directory %llu\n",
             (unsigned long long)OCFS2_I(dir)->ip_blkno);
-        status = ocfs2_meta_lock(dir, NULL, 0);
+        status = ocfs2_inode_lock(dir, NULL, 0);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
                goto bail_unlock;
        }
-        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
        if (IS_ERR(inode)) {
                mlog(ML_ERROR, "Unable to create inode %llu\n",
                     (unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
        parent->d_op = &ocfs2_dentry_ops;
 bail_unlock:
-        ocfs2_meta_unlock(dir, 0);
+        ocfs2_inode_unlock(dir, 0);
 bail:
        mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f92fe91ff260..ed5d5232e85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
 #include "inode.h"
 #include "ioctl.h"
 #include "journal.h"
+#include "locks.h"
 #include "mmap.h"
 #include "suballoc.h"
 #include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
        return sync_mapping_buffers(inode->i_mapping);
 }
+static int ocfs2_init_file_private(struct inode *inode, struct file *file)
+{
+        struct ocfs2_file_private *fp;
+        fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
+        if (!fp)
+                return -ENOMEM;
+        fp->fp_file = file;
+        mutex_init(&fp->fp_mutex);
+        ocfs2_file_lock_res_init(&fp->fp_flock, fp);
+        file->private_data = fp;
+        return 0;
+}
+static void ocfs2_free_file_private(struct inode *inode, struct file *file)
+{
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        if (fp) {
+                ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
+                ocfs2_lock_res_free(&fp->fp_flock);
+                kfree(fp);
+                file->private_data = NULL;
+        }
+}
 static int ocfs2_file_open(struct inode *inode, struct file *file)
 {
        int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
        oi->ip_open_count++;
        spin_unlock(&oi->ip_lock);
-        status = 0;
+        status = ocfs2_init_file_private(inode, file);
+        if (status) {
+                /*
+                 * We want to set open count back if we're failing the
+                 * open.
+                 */
+                spin_lock(&oi->ip_lock);
+                oi->ip_open_count--;
+                spin_unlock(&oi->ip_lock);
+        }
 leave:
        mlog_exit(status);
        return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
        spin_unlock(&oi->ip_lock);
+        ocfs2_free_file_private(inode, file);
        mlog_exit(0);
        return 0;
 }
+static int ocfs2_dir_open(struct inode *inode, struct file *file)
+{
+        return ocfs2_init_file_private(inode, file);
+}
+static int ocfs2_dir_release(struct inode *inode, struct file *file)
+{
+        ocfs2_free_file_private(inode, file);
+        return 0;
+}
 static int ocfs2_sync_file(struct file *file,
                           struct dentry *dentry,
                           int datasync)
@@ -382,28 +436,23 @@ static int ocfs2_truncate_file(struct inode *inode,
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
-        /* This forces other nodes to sync and drop their pages. Do
+        /*
-         * this even if we have a truncate without allocation change -
+         * The inode lock forced other nodes to sync and drop their
-         * ocfs2 cluster sizes can be much greater than page size, so
+         * pages, which (correctly) happens even if we have a truncate
-         * we have to truncate them anyway.  */
+         * without allocation change - ocfs2 cluster sizes can be much
-        status = ocfs2_data_lock(inode, 1);
+         * greater than page size, so we have to truncate them
-        if (status < 0) {
+         * anyway.
-                up_write(&OCFS2_I(inode)->ip_alloc_sem);
+         */
-                mlog_errno(status);
-                goto bail;
-        }
        unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(inode->i_mapping, new_i_size);
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
-                                               i_size_read(inode), 0);
+                                               i_size_read(inode), 1);
                if (status)
                        mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        /* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
        if (status < 0) {
                mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
        if (status < 0) {
                mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
        if (status < 0) {
                mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        /* TODO: orphan dir cleanup here. */
-bail_unlock_data:
+bail_unlock_sem:
-        ocfs2_data_unlock(inode, 1);
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
        mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
             "clusters_to_add = %u, extents_to_split = %u\n",
-             (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+             (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
             le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
        num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
             le32_to_cpu(fe->i_clusters),
             (unsigned long long)le64_to_cpu(fe->i_size));
        mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
-             OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+             OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 leave:
        if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
                             struct buffer_head *di_bh,
                             u64 new_i_size)
 {
-        int ret = 0, data_locked = 0;
+        int ret = 0;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
            && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
                goto out_update_size;
-        /* 
-         * protect the pages that ocfs2_zero_extend is going to be
-         * pulling into the page cache.. we do this before the
-         * metadata extend so that we don't get into the situation
-         * where we've extended the metadata but can't get the data
-         * lock to zero.
-         */
-        ret = ocfs2_data_lock(inode, 1);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out;
-        }
-        data_locked = 1;
        /*
         * The alloc sem blocks people in read/write from reading our
         * allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
                        up_write(&oi->ip_alloc_sem);
                        mlog_errno(ret);
-                        goto out_unlock;
+                        goto out;
                }
        }
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
        if (ret < 0) {
                mlog_errno(ret);
-                goto out_unlock;
+                goto out;
        }
 out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
        if (ret < 0)
                mlog_errno(ret);
-out_unlock:
-        if (data_locked)
-                ocfs2_data_unlock(inode, 1);
 out:
        return ret;
 }
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                }
        }
-        status = ocfs2_meta_lock(inode, &bh, 1);
+        status = ocfs2_inode_lock(inode, &bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
        if (size_change)
                ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
        mlog_entry_void();
-        ret = ocfs2_meta_lock(inode, NULL, 0);
+        ret = ocfs2_inode_lock(inode, NULL, 0);
        if (ret) {
                if (ret != -ENOENT)
                        mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
        ret = generic_permission(inode, mask, NULL);
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
 out:
        mlog_exit(ret);
        return ret;
@@ -1521,6 +1550,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_cached_dealloc_ctxt dealloc;
+        struct address_space *mapping = inode->i_mapping;
        ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1529,10 +1559,20 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
-                                            byte_start + byte_len, 1);
+                                            byte_start + byte_len, 0);
-                if (ret)
+                if (ret) {
                        mlog_errno(ret);
-                return ret;
+                        goto out;
+                }
+                /*
+                 * There's no need to get fancy with the page cache
+                 * truncate of an inline-data inode. We're talking
+                 * about less than a page here, which will be cached
+                 * in the dinode buffer anyway.
+                 */
+                unmap_mapping_range(mapping, 0, 0, 0);
+                truncate_inode_pages(mapping, 0);
+                goto out;
        }
        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
@@ -1619,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
                goto out;
        }
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret) {
                mlog_errno(ret);
                goto out_rw_unlock;
@@ -1627,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
                ret = -EPERM;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        switch (sr->l_whence) {
@@ -1641,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
                break;
        default:
                ret = -EINVAL;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        sr->l_whence = 0;
@@ -1652,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
            || (sr->l_start + llen) < 0
            || (sr->l_start + llen) > max_off) {
                ret = -EINVAL;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        size = sr->l_start + sr->l_len;
        if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
                if (sr->l_len <= 0) {
                        ret = -EINVAL;
-                        goto out_meta_unlock;
+                        goto out_inode_unlock;
                }
        }
@@ -1667,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
                ret = __ocfs2_write_remove_suid(inode, di_bh);
                if (ret) {
                        mlog_errno(ret);
-                        goto out_meta_unlock;
+                        goto out_inode_unlock;
                }
        }
@@ -1693,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        if (ret) {
                mlog_errno(ret);
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        /*
@@ -1703,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        if (change_size && i_size_read(inode) < size)
@@ -1716,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        ocfs2_commit_trans(osb, handle);
-out_meta_unlock:
+out_inode_unlock:
        brelse(di_bh);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 out_rw_unlock:
        ocfs2_rw_unlock(inode, 1);
@@ -1788,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
         * if we need to make modifications here.
         */
        for(;;) {
-                ret = ocfs2_meta_lock(inode, NULL, meta_level);
+                ret = ocfs2_inode_lock(inode, NULL, meta_level);
                if (ret < 0) {
                        meta_level = -1;
                        mlog_errno(ret);
@@ -1806,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                 * set inode->i_size at the end of a write. */
                if (should_remove_suid(dentry)) {
                        if (meta_level == 0) {
-                                ocfs2_meta_unlock(inode, meta_level);
+                                ocfs2_inode_unlock(inode, meta_level);
                                meta_level = 1;
                                continue;
                        }
@@ -1875,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                *ppos = saved_pos;
 out_unlock:
-        ocfs2_meta_unlock(inode, meta_level);
+        ocfs2_inode_unlock(inode, meta_level);
 out:
        return ret;
@@ -1891,9 +1931,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        ssize_t written = 0;
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
-        loff_t *ppos = &iocb->ki_pos;
+        loff_t old_size, *ppos = &iocb->ki_pos;
+        u32 old_clusters;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_path.dentry->d_inode;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry("(0x%p, %u, '%.*s')\n", file,
                   (unsigned int)nr_segs,
@@ -1949,6 +1991,13 @@ relock:
                goto relock;
        }
+        /*
+         * To later detect whether a journal commit for sync writes is
+         * necessary, we sample i_size, and cluster count here.
+         */
+        old_size = i_size_read(inode);
+        old_clusters = OCFS2_I(inode)->ip_clusters;
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb, rw_level);
@@ -1978,6 +2027,21 @@ out_dio:
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
+        if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
+                /*
+                 * The generic write paths have handled getting data
+                 * to disk, but since we don't make use of the dirty
+                 * inode list, a manual journal commit is necessary
+                 * here.
+                 */
+                if (old_size != i_size_read(inode) ||
+                    old_clusters != OCFS2_I(inode)->ip_clusters) {
+                        ret = journal_force_commit(osb->journal->j_journal);
+                        if (ret < 0)
+                                written = ret;
+                }
+        }
        /* 
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
         * function pointer which is called when o_direct io completes so that
@@ -2064,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
        /*
         * See the comment in ocfs2_file_aio_read()
         */
-        ret = ocfs2_meta_lock(inode, NULL, 0);
+        ret = ocfs2_inode_lock(inode, NULL, 0);
        if (ret < 0) {
                mlog_errno(ret);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
        ret = generic_file_splice_read(in, ppos, pipe, len, flags);
@@ -2125,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
         * like i_size. This allows the checks down below
         * generic_file_aio_read() a chance of actually working. 
         */
-        ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+        ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, lock_level);
+        ocfs2_inode_unlock(inode, lock_level);
        ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
        if (ret == -EINVAL)
@@ -2169,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
 };
 const struct file_operations ocfs2_fops = {
+        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
@@ -2181,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
 };
 const struct file_operations ocfs2_dops = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = ocfs2_readdir,
        .fsync          = ocfs2_sync_file,
+        .release        = ocfs2_dir_release,
+        .open           = ocfs2_dir_open,
        .ioctl          = ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+        .flock          = ocfs2_flock,
 };
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a8..048ddcaf5c80 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
+struct ocfs2_file_private {
+        struct file             *fp_file;
+        struct mutex            fp_mutex;
+        struct ocfs2_lock_res   fp_flock;
+};
 enum ocfs2_alloc_restarted {
        RESTART_NONE = 0,
        RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240d..c0efd9489fe8 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
 #include <linux/highmem.h>
 #include <linux/kmod.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
 #include <dlm/dlmapi.h>
 #define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
-#include "vote.h"
 #include "buffer_head_io.h"
-#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
-#define OCFS2_HB_NODE_UP_PRI       OCFS2_HB_NODE_DOWN_PRI
 static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
                                            int bit);
 static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
 void ocfs2_init_node_maps(struct ocfs2_super *osb)
 {
        spin_lock_init(&osb->node_map_lock);
-        ocfs2_node_map_init(&osb->mounted_map);
        ocfs2_node_map_init(&osb->recovery_map);
-        ocfs2_node_map_init(&osb->umount_map);
        ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
                return;
        }
-        if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
-                /* If a node is in the umount map, then we've been
-                 * expecting him to go down and we know ahead of time
-                 * that recovery is not necessary. */
-                ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-                return;
-        }
        ocfs2_recovery_thread(osb, node_num);
-        ocfs2_remove_node_from_vote_queues(osb, node_num);
-}
-static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
-                                  int node_num,
-                                  void *data)
-{
-        ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
 }
 /* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
        ocfs2_do_node_down(node_num, osb);
 }
-static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
-                                int node_num,
-                                void *data)
-{
-        struct ocfs2_super *osb = data;
-        BUG_ON(osb->node_num == node_num);
-        mlog(0, "node up event for %d\n", node_num);
-        ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
 void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
 {
-        o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
-                            ocfs2_hb_node_down_cb, osb,
-                            OCFS2_HB_NODE_DOWN_PRI);
-        o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
-                            ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
        /* Not exactly a heartbeat callback, but leads to essentially
         * the same path so we set it up here. */
        dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
                              osb);
 }
-/* Most functions here are just stubs for now... */
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
-{
-        int status;
-        if (ocfs2_mount_local(osb))
-                return 0;
-        status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
-        if (status < 0) {
-                mlog_errno(status);
-                o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
-        }
-bail:
-        return status;
-}
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
-{
-        if (ocfs2_mount_local(osb))
-                return;
-        o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
-        o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
-}
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 {
        int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
        spin_lock(&osb->node_map_lock);
-        __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
        if (!test_bit(num, osb->recovery_map.map)) {
            __ocfs2_node_map_set_bit(&osb->recovery_map, num);
            set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e4..56859211888a 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
 void ocfs2_init_node_maps(struct ocfs2_super *osb);
 void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
 /* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 1d5e0cb0fda1..7e9e4c79aec7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
        u64             fi_blkno;
        unsigned long   fi_ino;
        unsigned int    fi_flags;
+        unsigned int    fi_sysfile_type;
 };
+static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
 static int ocfs2_read_locked_inode(struct inode *inode,
                                   struct ocfs2_find_inode_args *args);
 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
                oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
+                         int sysfile_type)
 {
        struct inode *inode = NULL;
        struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
        args.fi_blkno = blkno;
        args.fi_flags = flags;
        args.fi_ino = ino_from_blkno(sb, blkno);
+        args.fi_sysfile_type = sysfile_type;
        inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
                             ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
        inode->i_ino = args->fi_ino;
        OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+        if (args->fi_sysfile_type != 0)
+                lockdep_set_class(&inode->i_mutex,
+                        &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
        mlog_exit(0);
        return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                 */
                BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
-                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
                                          OCFS2_LOCK_TYPE_META, 0, inode);
                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                                  OCFS2_LOCK_TYPE_RW, inode->i_generation,
                                  inode);
-        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
-                                  OCFS2_LOCK_TYPE_DATA, inode->i_generation,
-                                  inode);
        ocfs2_set_inode_flags(inode);
        status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
                generation = osb->fs_generation;
-        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
                                  OCFS2_LOCK_TYPE_META,
                                  generation, inode);
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                        mlog_errno(status);
                        return status;
                }
-                status = ocfs2_meta_lock(inode, NULL, 0);
+                status = ocfs2_inode_lock(inode, NULL, 0);
                if (status) {
                        make_bad_inode(inode);
                        mlog_errno(status);
@@ -455,8 +458,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        status = -EINVAL;
        fe = (struct ocfs2_dinode *) bh->b_data;
        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
+                mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+                     (unsigned long long)args->fi_blkno, 7,
                     fe->i_signature);
                goto bail;
        }
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 bail:
        if (can_lock)
-                ocfs2_meta_unlock(inode, 0);
+                ocfs2_inode_unlock(inode, 0);
        if (status < 0)
                make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        mutex_lock(&inode_alloc_inode->i_mutex);
-        status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1);
+        status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
        if (status < 0) {
                mutex_unlock(&inode_alloc_inode->i_mutex);
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-        le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+        di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
        status = ocfs2_journal_dirty(handle, di_bh);
        if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-        ocfs2_meta_unlock(inode_alloc_inode, 1);
+        ocfs2_inode_unlock(inode_alloc_inode, 1);
        mutex_unlock(&inode_alloc_inode->i_mutex);
        brelse(inode_alloc_bh);
 bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
         * delete_inode operation. We do this now to avoid races with
         * recovery completion on other nodes. */
        mutex_lock(&orphan_dir_inode->i_mutex);
-        status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+        status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
        if (status < 0) {
                mutex_unlock(&orphan_dir_inode->i_mutex);
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
        }
        /* we do this while holding the orphan dir lock because we
-         * don't want recovery being run from another node to vote for
+         * don't want recovery being run from another node to try an
-         * an inode delete on us -- this will result in two nodes
+         * inode delete underneath us -- this will result in two nodes
         * truncating the same file! */
        status = ocfs2_truncate_for_delete(osb, inode, di_bh);
        if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
                mlog_errno(status);
 bail_unlock_dir:
-        ocfs2_meta_unlock(orphan_dir_inode, 1);
+        ocfs2_inode_unlock(orphan_dir_inode, 1);
        mutex_unlock(&orphan_dir_inode->i_mutex);
        brelse(orphan_dir_bh);
 bail:
@@ -744,7 +747,7 @@ bail:
 }
 /* There is a series of simple checks that should be done before a
- * vote is even considered. Encapsulate those in this function. */
+ * trylock is even considered. Encapsulate those in this function. */
 static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 {
        int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
                goto bail;
        }
-        /* If we're coming from process_vote we can't go into our own
+        /* If we're coming from downconvert_thread we can't go into our own
         * voting [hello, deadlock city!], so unforuntately we just
         * have to skip deleting this guy. That's OK though because
         * the node who's doing the actual deleting should handle it
         * anyway. */
-        if (current == osb->vote_task) {
+        if (current == osb->dc_task) {
                mlog(0, "Skipping delete of %lu because we're currently "
-                     "in process_vote\n", inode->i_ino);
+                     "in downconvert\n", inode->i_ino);
                goto bail;
        }
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
                goto bail_unlock;
        }
-        /* If we have voted "yes" on the wipe of this inode for
+        /* If we have allowed wipe of this inode for another node, it
-         * another node, it will be marked here so we can safely skip
+         * will be marked here so we can safely skip it. Recovery will
-         * it. Recovery will cleanup any inodes we might inadvertantly
+         * cleanup any inodes we might inadvertently skip here. */
-         * skip here. */
        if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
                mlog(0, "Skipping delete of %lu because another node "
                     "has done this for us.\n", inode->i_ino);
@@ -863,7 +865,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
        status = ocfs2_try_open_lock(inode, 1);
        if (status == -EAGAIN) {
                status = 0;
-                mlog(0, "Skipping delete of %llu because it is in use on"
+                mlog(0, "Skipping delete of %llu because it is in use on "
                     "other nodes\n", (unsigned long long)oi->ip_blkno);
                goto bail;
        }
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
        /* Lock down the inode. This gives us an up to date view of
         * it's metadata (for verification), and allows us to
-         * serialize delete_inode votes. 
+         * serialize delete_inode on multiple nodes.
         *
         * Even though we might be doing a truncate, we don't take the
         * allocation lock here as it won't be needed - nobody will
         * have the file open.
         */
-        status = ocfs2_meta_lock(inode, &di_bh, 1);
+        status = ocfs2_inode_lock(inode, &di_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
         * before we go ahead and wipe the inode. */
        status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
        if (!wipe || status < 0) {
-                /* Error and inode busy vote both mean we won't be
+                /* Error and remote inode busy both mean we won't be
                 * removing the inode, so they take almost the same
                 * path. */
                if (status < 0)
                        mlog_errno(status);
-                /* Someone in the cluster has voted to not wipe this
+                /* Someone in the cluster has disallowed a wipe of
-                 * inode, or it was never completely orphaned. Write
+                 * this inode, or it was never completely
-                 * out the pages and exit now. */
+                 * orphaned. Write out the pages and exit now. */
                ocfs2_cleanup_delete_inode(inode, 1);
                goto bail_unlock_inode;
        }
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
        OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
 bail_unlock_inode:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
 bail_unblock:
        status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
        mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
                        "Inode=%lu\n", inode->i_ino);
-        /* For remove delete_inode vote, we hold open lock before,
+        /* To prevent remote deletes we hold open lock before, now it
-         * now it is time to unlock PR and EX open locks. */
+         * is time to unlock PR and EX open locks. */
        ocfs2_open_unlock(inode);
        /* Do these before all the other work so that we don't bounce
-         * the vote thread while waiting to destroy the locks. */
+         * the downconvert thread while waiting to destroy the locks. */
        ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
-        ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
+        ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
-        ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
        /* We very well may get a clear_inode before all an inodes
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
                mlog_errno(status);
        ocfs2_lock_res_free(&oi->ip_rw_lockres);
-        ocfs2_lock_res_free(&oi->ip_meta_lockres);
+        ocfs2_lock_res_free(&oi->ip_inode_lockres);
-        ocfs2_lock_res_free(&oi->ip_data_lockres);
        ocfs2_lock_res_free(&oi->ip_open_lockres);
        ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
        }
        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        /* Let ocfs2_meta_lock do the work of updating our struct
+        /* Let ocfs2_inode_lock do the work of updating our struct
         * inode for us. */
-        status = ocfs2_meta_lock(inode, NULL, 0);
+        status = ocfs2_inode_lock(inode, NULL, 0);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
 bail:
        mlog_exit(status);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c55536..390a85596aa0 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
        u64                     ip_blkno;
        struct ocfs2_lock_res           ip_rw_lockres;
-        struct ocfs2_lock_res           ip_meta_lockres;
+        struct ocfs2_lock_res           ip_inode_lockres;
-        struct ocfs2_lock_res           ip_data_lockres;
        struct ocfs2_lock_res           ip_open_lockres;
        /* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_SYSFILE           0x4
+#define OCFS2_FI_FLAG_SYSFILE           0x1
-#define OCFS2_FI_FLAG_ORPHAN_RECOVERY   0x8
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY   0x2
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
+                         int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
 int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b5..5177fba5162b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
 #include "ocfs2_fs.h"
 #include "ioctl.h"
+#include "resize.h"
 #include <linux/ext2_fs.h>
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
        int status;
-        status = ocfs2_meta_lock(inode, NULL, 0);
+        status = ocfs2_inode_lock(inode, NULL, 0);
        if (status < 0) {
                mlog_errno(status);
                return status;
        }
        ocfs2_get_inode_flags(OCFS2_I(inode));
        *flags = OCFS2_I(inode)->ip_attr;
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
        mlog_exit(status);
        return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
        mutex_lock(&inode->i_mutex);
-        status = ocfs2_meta_lock(inode, &bh, 1);
+        status = ocfs2_inode_lock(inode, &bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 bail:
        mutex_unlock(&inode->i_mutex);
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
        unsigned int cmd, unsigned long arg)
 {
        unsigned int flags;
+        int new_clusters;
        int status;
        struct ocfs2_space_resv sr;
+        struct ocfs2_new_group_input input;
        switch (cmd) {
        case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
                        return -EFAULT;
                return ocfs2_change_file_space(filp, cmd, &sr);
+        case OCFS2_IOC_GROUP_EXTEND:
+                if (!capable(CAP_SYS_RESOURCE))
+                        return -EPERM;
+                if (get_user(new_clusters, (int __user *)arg))
+                        return -EFAULT;
+                return ocfs2_group_extend(inode, new_clusters);
+        case OCFS2_IOC_GROUP_ADD:
+        case OCFS2_IOC_GROUP_ADD64:
+                if (!capable(CAP_SYS_RESOURCE))
+                        return -EPERM;
+                if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
+                        return -EFAULT;
+                return ocfs2_group_add(inode, &input);
        default:
                return -ENOTTY;
        }
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case OCFS2_IOC_RESVSP64:
        case OCFS2_IOC_UNRESVSP:
        case OCFS2_IOC_UNRESVSP64:
+        case OCFS2_IOC_GROUP_EXTEND:
+        case OCFS2_IOC_GROUP_ADD:
+        case OCFS2_IOC_GROUP_ADD64:
                break;
        default:
                return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f9d01e25298d..f31c7e8c19c3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
 #include "localalloc.h"
 #include "slot_map.h"
 #include "super.h"
-#include "vote.h"
 #include "sysfile.h"
 #include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
        mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
             journal->j_trans_id, flushed);
-        ocfs2_kick_vote_thread(osb);
+        ocfs2_wake_downconvert_thread(osb);
        wake_up(&journal->j_checkpointed);
 finally:
        mlog_exit(status);
@@ -174,6 +173,12 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 * transaction. extend_trans will either extend the current handle by
 * nblocks, or commit it and start a new one with nblocks credits.
 *
+ * This might call journal_restart() which will commit dirty buffers
+ * and then restart the transaction. Before calling
+ * ocfs2_extend_trans(), any changed blocks should have been
+ * dirtied. After calling it, all blocks which need to be changed must
+ * go through another set of journal_access/journal_dirty calls.
+ *
 * WARNING: This will not release any semaphores or disk locks taken
 * during the transaction, so make sure they were taken *before*
 * start_trans or we'll have ordering deadlocks.
@@ -193,11 +198,15 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
        mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+#ifdef OCFS2_DEBUG_FS
+        status = 1;
+#else
        status = journal_extend(handle, nblocks);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
+#endif
        if (status > 0) {
                mlog(0, "journal_extend failed, trying journal_restart\n");
@@ -304,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
        return err;
 }
-#define OCFS2_DEFAULT_COMMIT_INTERVAL   (HZ * 5)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL   (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
 {
        journal_t *journal = osb->journal->j_journal;
+        unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+        if (osb->osb_commit_interval)
+                commit_interval = osb->osb_commit_interval;
        spin_lock(&journal->j_state_lock);
-        journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+        journal->j_commit_interval = commit_interval;
        if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
                journal->j_flags |= JFS_BARRIER;
        else
@@ -327,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
        struct ocfs2_dinode *di = NULL;
        struct buffer_head *bh = NULL;
        struct ocfs2_super *osb;
-        int meta_lock = 0;
+        int inode_lock = 0;
        mlog_entry_void();
@@ -357,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
        /* Skip recovery waits here - journal inode metadata never
         * changes in a live cluster so it can be considered an
         * exception to the rule. */
-        status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
                if (status != -ERESTARTSYS)
                        mlog(ML_ERROR, "Could not get lock on journal!\n");
                goto done;
        }
-        meta_lock = 1;
+        inode_lock = 1;
        di = (struct ocfs2_dinode *)bh->b_data;
        if (inode->i_size <  OCFS2_MIN_JOURNAL_SIZE) {
@@ -404,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
        status = 0;
 done:
        if (status < 0) {
-                if (meta_lock)
+                if (inode_lock)
-                        ocfs2_meta_unlock(inode, 1);
+                        ocfs2_inode_unlock(inode, 1);
                if (bh != NULL)
                        brelse(bh);
                if (inode) {
@@ -534,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
        OCFS2_I(inode)->ip_open_count--;
        /* unlock our journal */
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        brelse(journal->j_bh);
        journal->j_bh = NULL;
@@ -873,8 +886,8 @@ restart:
        ocfs2_super_unlock(osb, 1);
        /* We always run recovery on our own orphan dir - the dead
-         * node(s) may have voted "no" on an inode delete earlier. A
+         * node(s) may have disallowed a previous inode delete. Re-processing
-         * revote is therefore required. */
+         * is therefore required. */
        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
                                        NULL);
@@ -963,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        }
        SET_INODE_JOURNAL(inode);
-        status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
-                mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+                mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
                if (status != -ERESTARTSYS)
                        mlog(ML_ERROR, "Could not lock journal!\n");
                goto done;
@@ -1037,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 done:
        /* drop the lock on this nodes journal */
        if (got_lock)
-                ocfs2_meta_unlock(inode, 1);
+                ocfs2_inode_unlock(inode, 1);
        if (inode)
                iput(inode);
@@ -1152,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
        SET_INODE_JOURNAL(inode);
        flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
-        status = ocfs2_meta_lock_full(inode, NULL, 1, flags);
+        status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
        if (status < 0) {
                if (status != -EAGAIN)
                        mlog_errno(status);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 bail:
        if (inode)
                iput(inode);
@@ -1231,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
        /* Skip bad inodes so that recovery can continue */
        iter = ocfs2_iget(p->osb, ino,
-                          OCFS2_FI_FLAG_ORPHAN_RECOVERY);
+                          OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
        if (IS_ERR(iter))
                return 0;
@@ -1267,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
        }       
        mutex_lock(&orphan_dir_inode->i_mutex);
-        status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);
+        status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
        if (status < 0) {
                mlog_errno(status);
                goto out;
@@ -1277,12 +1290,13 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
                                   ocfs2_orphan_filldir);
        if (status) {
                mlog_errno(status);
-                goto out;
+                goto out_cluster;
        }
        *head = priv.head;
-        ocfs2_meta_unlock(orphan_dir_inode, 0);
+out_cluster:
+        ocfs2_inode_unlock(orphan_dir_inode, 0);
 out:
        mutex_unlock(&orphan_dir_inode->i_mutex);
        iput(orphan_dir_inode);
@@ -1369,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                iter = oi->ip_next_orphan;
                spin_lock(&oi->ip_lock);
-                /* Delete voting may have set these on the assumption
+                /* The remote delete code may have set these on the
-                 * that the other node would wipe them successfully.
+                 * assumption that the other node would wipe them
-                 * If they are still in the node's orphan dir, we need
+                 * successfully.  If they are still in the node's
-                 * to reset that state. */
+                 * orphan dir, we need to reset that state. */
                oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
                /* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e0961568..220f3e818e78 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* simple file updates like chmod, etc. */
 #define OCFS2_INODE_UPDATE_CREDITS 1
+/* group extend. inode update and last group update. */
+#define OCFS2_GROUP_EXTEND_CREDITS      (OCFS2_INODE_UPDATE_CREDITS + 1)
+/* group add. inode update and the new group update. */
+#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
 /* get one bit out of a suballocator: dinode + group descriptor +
 * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index d272847d5a07..add1ffdc5c6c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
                                          struct inode *local_alloc_inode);
-/*
- * Determine how large our local alloc window should be, in bits.
- *
- * These values (and the behavior in ocfs2_alloc_should_use_local) have
- * been chosen so that most allocations, including new block groups go
- * through local alloc.
- */
 static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
 {
-        BUG_ON(osb->s_clustersize_bits < 12);
+        BUG_ON(osb->s_clustersize_bits > 20);
-        return 2048 >> (osb->s_clustersize_bits - 12);
+        /* Size local alloc windows by the megabyte */
+        return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
 }
 /*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 {
        int la_bits = ocfs2_local_alloc_window_bits(osb);
+        int ret = 0;
        if (osb->local_alloc_state != OCFS2_LA_ENABLED)
-                return 0;
+                goto bail;
        /* la_bits should be at least twice the size (in clusters) of
         * a new block group. We want to be sure block group
         * allocations go through the local alloc, so allow an
         * allocation to take up to half the bitmap. */
        if (bits > (la_bits / 2))
-                return 0;
+                goto bail;
-        return 1;
+        ret = 1;
+bail:
+        mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
+             osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+        return ret;
 }
 int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
        mlog_entry_void();
+        if (ocfs2_mount_local(osb))
+                goto bail;
+        if (osb->local_alloc_size == 0)
+                goto bail;
+        if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+                mlog(ML_NOTICE, "Requested local alloc window %d is larger "
+                     "than max possible %u. Using defaults.\n",
+                     ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
+                osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+        }
        /* read the alloc off disk */
        inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
                                            osb->slot_num);
@@ -181,6 +193,9 @@ bail:
        if (inode)
                iput(inode);
+        mlog(0, "Local alloc window bits = %d\n",
+             ocfs2_local_alloc_window_bits(osb));
        mlog_exit(status);
        return status;
 }
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        mutex_lock(&main_bm_inode->i_mutex);
-        status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+        status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
        if (main_bm_bh)
                brelse(main_bm_bh);
-        ocfs2_meta_unlock(main_bm_inode, 1);
+        ocfs2_inode_unlock(main_bm_inode, 1);
 out_mutex:
        mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
        mutex_lock(&main_bm_inode->i_mutex);
-        status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+        status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
        ocfs2_commit_trans(osb, handle);
 out_unlock:
-        ocfs2_meta_unlock(main_bm_inode, 1);
+        ocfs2_inode_unlock(main_bm_inode, 1);
 out_mutex:
        mutex_unlock(&main_bm_inode->i_mutex);
@@ -484,6 +499,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
        alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+#ifdef OCFS2_DEBUG_FS
        if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
            ocfs2_local_alloc_count_bits(alloc)) {
                ocfs2_error(osb->sb, "local alloc inode %llu says it has "
@@ -494,6 +510,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
                status = -EIO;
                goto bail;
        }
+#endif
        free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
                le32_to_cpu(alloc->id1.bitmap1.i_used);
@@ -519,6 +536,9 @@ bail:
                iput(local_alloc_inode);
        }
+        mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
+             status);
        mlog_exit(status);
        return status;
 }
@@ -712,9 +732,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
        void *bitmap;
        struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
-        mlog_entry("total = %u, COUNT = %u, used = %u\n",
+        mlog_entry("total = %u, used = %u\n",
                   le32_to_cpu(alloc->id1.bitmap1.i_total),
-                   ocfs2_local_alloc_count_bits(alloc),
                   le32_to_cpu(alloc->id1.bitmap1.i_used));
        if (!alloc->id1.bitmap1.i_total) {
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 000000000000..203f87143877
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.c
+ *
+ * Userspace file locking support
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/fs.h>
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "locks.h"
+/*
+ * Take (or convert) the cluster flock for @file and then record the lock
+ * with the local flock code via flock_lock_file_wait().
+ *
+ * @file:  open file; its private data holds the flock lock resource and
+ *         the mutex that serializes this function
+ * @inode: inode backing @file (not referenced in this function)
+ * @cmd:   lock command; anything that is not IS_SETLKW() is a trylock
+ * @fl:    lock request; F_WRLCK requests level 1, anything else level 0
+ *
+ * Returns 0 on success, -EWOULDBLOCK when a trylock could not be granted,
+ * or the error returned by ocfs2_file_lock() / flock_lock_file_wait().
+ */
+static int ocfs2_do_flock(struct file *file, struct inode *inode,
+                          int cmd, struct file_lock *fl)
+{
+        int ret = 0, level = 0, trylock = 0;
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_lock_res *lockres = &fp->fp_flock;
+        if (fl->fl_type == F_WRLCK)
+                level = 1;
+        if (!IS_SETLKW(cmd))
+                trylock = 1;
+        mutex_lock(&fp->fp_mutex);
+        if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+            lockres->l_level > LKM_NLMODE) {
+                int old_level = 0;
+                if (lockres->l_level == LKM_EXMODE)
+                        old_level = 1;
+                /* Already held at the requested level: nothing to do. */
+                if (level == old_level)
+                        goto out;
+                /*
+                 * Converting an existing lock is not guaranteed to be
+                 * atomic, so we can get away with simply unlocking
+                 * here and allowing the lock code to try at the new
+                 * level.
+                 */
+                flock_lock_file_wait(file,
+                                     &(struct file_lock){.fl_type = F_UNLCK});
+                ocfs2_file_unlock(file);
+        }
+        ret = ocfs2_file_lock(file, level, trylock);
+        if (ret) {
+                if (ret == -EAGAIN && trylock)
+                        ret = -EWOULDBLOCK;
+                else
+                        mlog_errno(ret);
+                goto out;
+        }
+        ret = flock_lock_file_wait(file, fl);
+        /*
+         * The cluster lock was granted above. If the local lock could not
+         * be recorded, drop the cluster lock again so that a failed call
+         * does not leave it held.
+         */
+        if (ret)
+                ocfs2_file_unlock(file);
+out:
+        mutex_unlock(&fp->fp_mutex);
+        return ret;
+}
+/*
+ * Release the flock held through @file: the cluster lock is dropped first,
+ * then the unlock request @fl is passed to flock_lock_file_wait(), all
+ * under the per-file fp_mutex. @cmd is not referenced here.
+ *
+ * Returns whatever flock_lock_file_wait() returns.
+ */
+static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
+{
+        struct ocfs2_file_private *priv = file->private_data;
+        int status;
+        mutex_lock(&priv->fp_mutex);
+        ocfs2_file_unlock(file);
+        status = flock_lock_file_wait(file, fl);
+        mutex_unlock(&priv->fp_mutex);
+        return status;
+}
+/*
+ * flock() entry point. Overall flow of ocfs2_flock() was influenced by
+ * gfs2_flock().
+ *
+ * Requests without FL_FLOCK set, or on an inode for which
+ * __mandatory_lock() is true, are refused with -ENOLCK. When the
+ * OCFS2_MOUNT_LOCALFLOCKS mount option is set or ocfs2_mount_local() is
+ * true, the request goes straight to flock_lock_file_wait(). Otherwise
+ * it is dispatched on fl_type to the unlock or lock helper.
+ */
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+        struct inode *inode = file->f_mapping->host;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        int local_only;
+        if (!(fl->fl_flags & FL_FLOCK) || __mandatory_lock(inode))
+                return -ENOLCK;
+        local_only = (osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+                     ocfs2_mount_local(osb);
+        if (local_only)
+                return flock_lock_file_wait(file, fl);
+        if (fl->fl_type != F_UNLCK)
+                return ocfs2_do_flock(file, inode, cmd, fl);
+        return ocfs2_do_funlock(file, cmd, fl);
+}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de31..9743ef2324ec 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
 /* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
- * vote.h
+ * locks.h
 *
- * description here
+ * Function prototypes for Userspace file locking support
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
@@ -23,26 +23,9 @@
 * Boston, MA 021110-1307, USA.
 */
+#ifndef OCFS2_LOCKS_H
+#define OCFS2_LOCKS_H
-#ifndef VOTE_H
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
-#define VOTE_H
-int ocfs2_vote_thread(void *arg);
+#endif /* OCFS2_LOCKS_H */
-static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
-{
-        spin_lock(&osb->vote_task_lock);
-        /* make sure the voting thread gets a swipe at whatever changes
-         * the caller may have made to the voting state */
-        osb->vote_wake_sequence++;
-        spin_unlock(&osb->vote_task_lock);
-        wake_up(&osb->vote_event);
-}
-int ocfs2_request_mount_vote(struct ocfs2_super *osb);
-int ocfs2_request_umount_vote(struct ocfs2_super *osb);
-int ocfs2_register_net_handlers(struct ocfs2_super *osb);
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
-                                        int node_num);
-#endif
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d298..3dc18d67557c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
         * node. Taking the data lock will also ensure that we don't
         * attempt page truncation as part of a downconvert.
         */
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
-        ret = ocfs2_data_lock(inode, 1);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_meta_unlock;
-        }
        ret = __ocfs2_page_mkwrite(inode, di_bh, page);
-        ocfs2_data_unlock(inode, 1);
-out_meta_unlock:
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        brelse(di_bh);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 out:
        ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
        int ret = 0, lock_level = 0;
-        ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
+        ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
                                    file->f_vfsmnt, &lock_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
-        ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
+        ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
 out:
        vma->vm_ops = &ocfs2_file_vm_ops;
        vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 729259016c18..ae9ad9587516 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
             dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-        status = ocfs2_meta_lock(dir, NULL, 0);
+        status = ocfs2_inode_lock(dir, NULL, 0);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        if (status < 0)
                goto bail_add;
-        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
        if (IS_ERR(inode)) {
                ret = ERR_PTR(-EACCES);
                goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
        /* Don't drop the cluster lock until *after* the d_add --
         * unlink on another node will message us to remove that
         * dentry under this lock so otherwise we can race this with
-         * the vote thread and have a stale dentry. */
+         * the downconvert thread and have a stale dentry. */
-        ocfs2_meta_unlock(dir, 0);
+        ocfs2_inode_unlock(dir, 0);
 bail:
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
        /* get our super block */
        osb = OCFS2_SB(dir->i_sb);
-        status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+        status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
        if (handle)
                ocfs2_commit_trans(osb, handle);
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (status == -ENOSPC)
                mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        if (S_ISDIR(inode->i_mode))
                return -EPERM;
-        err = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+        err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
        if (err < 0) {
                if (err != -ENOENT)
                        mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out;
        }
-        err = ocfs2_meta_lock(inode, &fe_bh, 1);
+        err = ocfs2_inode_lock(inode, &fe_bh, 1);
        if (err < 0) {
                if (err != -ENOENT)
                        mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
 out_commit:
        ocfs2_commit_trans(osb, handle);
 out_unlock_inode:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 out:
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (de_bh)
                brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
                return -EPERM;
        }
-        status = ocfs2_meta_lock(dir, &parent_node_bh, 1);
+        status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
                goto leave;
        }
-        status = ocfs2_meta_lock(inode, &fe_bh, 1);
+        status = ocfs2_inode_lock(inode, &fe_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
        status = ocfs2_remote_dentry_delete(dentry);
        if (status < 0) {
-                /* This vote should succeed under all normal
+                /* This remote delete should succeed under all normal
                 * circumstances. */
                mlog_errno(status);
                goto leave;
@@ -841,13 +840,13 @@ leave:
                ocfs2_commit_trans(osb, handle);
        if (child_locked)
-                ocfs2_meta_unlock(inode, 1);
+                ocfs2_inode_unlock(inode, 1);
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
-                ocfs2_meta_unlock(orphan_dir, 1);
+                ocfs2_inode_unlock(orphan_dir, 1);
                mutex_unlock(&orphan_dir->i_mutex);
                iput(orphan_dir);
        }
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                        inode1 = tmpinode;
                }
                /* lock id2 */
-                status = ocfs2_meta_lock(inode2, bh2, 1);
+                status = ocfs2_inode_lock(inode2, bh2, 1);
                if (status < 0) {
                        if (status != -ENOENT)
                                mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
        }
        /* lock id1 */
-        status = ocfs2_meta_lock(inode1, bh1, 1);
+        status = ocfs2_inode_lock(inode1, bh1, 1);
        if (status < 0) {
                /*
                 * An error return must mean that no cluster locks
                 * were held on function exit.
                 */
                if (oi1->ip_blkno != oi2->ip_blkno)
-                        ocfs2_meta_unlock(inode2, 1);
+                        ocfs2_inode_unlock(inode2, 1);
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
 static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
 {
-        ocfs2_meta_unlock(inode1, 1);
+        ocfs2_inode_unlock(inode1, 1);
        if (inode1 != inode2)
-                ocfs2_meta_unlock(inode2, 1);
+                ocfs2_inode_unlock(inode2, 1);
 }
 static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
        /*
         * Aside from allowing a meta data update, the locking here
-         * also ensures that the vote thread on other nodes won't have
+         * also ensures that the downconvert thread on other nodes
-         * to concurrently downconvert the inode and the dentry locks.
+         * won't have to concurrently downconvert the inode and the
+         * dentry locks.
         */
-        status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
+        status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1105,9 +1105,16 @@ static int ocfs2_rename(struct inode *old_dir,
                goto bail;
        }
-        if (!new_de && new_inode)
+        if (!new_de && new_inode) {
-                mlog(ML_ERROR, "inode %lu does not exist in it's parent "
+                /*
-                     "directory!", new_inode->i_ino);
+                 * Target was unlinked by another node while we were
+                 * waiting to get to ocfs2_rename(). There isn't
+                 * anything we can do here to help the situation, so
+                 * bubble up the appropriate error.
+                 */
+                status = -ENOENT;
+                goto bail;
+        }
        /* In case we need to overwrite an existing file, we blow it
         * away first */
@@ -1136,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
                        goto bail;
                }
-                status = ocfs2_meta_lock(new_inode, &newfe_bh, 1);
+                status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
                if (status < 0) {
                        if (status != -ENOENT)
                                mlog_errno(status);
@@ -1348,14 +1355,14 @@ bail:
                ocfs2_double_unlock(old_dir, new_dir);
        if (old_child_locked)
-                ocfs2_meta_unlock(old_inode, 1);
+                ocfs2_inode_unlock(old_inode, 1);
        if (new_child_locked)
-                ocfs2_meta_unlock(new_inode, 1);
+                ocfs2_inode_unlock(new_inode, 1);
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
-                ocfs2_meta_unlock(orphan_dir, 1);
+                ocfs2_inode_unlock(orphan_dir, 1);
                mutex_unlock(&orphan_dir->i_mutex);
                iput(orphan_dir);
        }
@@ -1523,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
        credits = ocfs2_calc_symlink_credits(sb);
        /* lock the parent directory */
-        status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+        status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1650,7 +1657,7 @@ bail:
        if (handle)
                ocfs2_commit_trans(osb, handle);
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (new_fe_bh)
                brelse(new_fe_bh);
@@ -1728,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
        mutex_lock(&orphan_dir_inode->i_mutex);
-        status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+        status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1738,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                              orphan_dir_bh, name,
                                              OCFS2_ORPHAN_NAMELEN, de_bh);
        if (status < 0) {
-                ocfs2_meta_unlock(orphan_dir_inode, 1);
+                ocfs2_inode_unlock(orphan_dir_inode, 1);
                mlog_errno(status);
                goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b0..d08480580470 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
                                               * about to be
                                               * dropped. */
 #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+#define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
 struct ocfs2_lock_res_ops;
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
        OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
        OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
        OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+        OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 };
 #define OCFS2_OSB_SOFT_RO       0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
        struct ocfs2_slot_info *slot_info;
        spinlock_t node_map_lock;
-        struct ocfs2_node_map mounted_map;
        struct ocfs2_node_map recovery_map;
-        struct ocfs2_node_map umount_map;
        u64 root_blkno;
        u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
        wait_queue_head_t checkpoint_event;
        atomic_t needs_checkpoint;
        struct ocfs2_journal *journal;
+        unsigned long osb_commit_interval;
+        int local_alloc_size;
        enum ocfs2_local_alloc_state local_alloc_state;
        struct buffer_head *local_alloc_bh;
        u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
        wait_queue_head_t recovery_event;
-        spinlock_t vote_task_lock;
+        spinlock_t dc_task_lock;
-        struct task_struct *vote_task;
+        struct task_struct *dc_task;
-        wait_queue_head_t vote_event;
+        wait_queue_head_t dc_event;
-        unsigned long vote_wake_sequence;
+        unsigned long dc_wake_sequence;
-        unsigned long vote_work_sequence;
+        unsigned long dc_work_sequence;
+        /*
+         * Any thread can add locks to the list, but the downconvert
+         * thread is the only one allowed to remove locks. Any change
+         * to this rule requires updating
+         * ocfs2_downconvert_thread_do_work().
+         */
        struct list_head blocked_lock_list;
        unsigned long blocked_lock_count;
-        struct list_head vote_list;
-        int vote_count;
-        u32 net_key;
-        spinlock_t net_response_lock;
-        unsigned int net_response_ids;
-        struct list_head net_response_list;
-        struct o2hb_callback_func osb_hb_up;
-        struct o2hb_callback_func osb_hb_down;
-        struct list_head        osb_net_handlers;
        wait_queue_head_t               osb_mount_event;
        /* Truncate log info */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a73..3633edd3982f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
 #define OCFS2_IOC_RESVSP64      _IOW ('X', 42, struct ocfs2_space_resv)
 #define OCFS2_IOC_UNRESVSP64    _IOW ('X', 43, struct ocfs2_space_resv)
+/*
+ * Used to pass group descriptor data when online resize is done.
+ * Carried by OCFS2_IOC_GROUP_ADD / OCFS2_IOC_GROUP_ADD64; the resize code
+ * cross-checks every field against the on-disk group descriptor at 'group'.
+ */
+struct ocfs2_new_group_input {
+        __u64 group;            /* Group descriptor's blkno. */
+        __u32 clusters;         /* Total number of clusters in this group */
+        __u32 frees;            /* Total free clusters in this group */
+        __u16 chain;            /* Chain for this group */
+        __u16 reserved1;        /* not read by the resize code shown here; presumably padding -- TODO confirm */
+        __u32 reserved2;        /* not read by the resize code shown here; presumably padding -- TODO confirm */
+};
+#define OCFS2_IOC_GROUP_EXTEND  _IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD     _IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64   _IOW('o', 3,struct ocfs2_new_group_input)
 /*
 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
 */
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE          (4 * 1024 * 1024)
+/*
+ * Default local alloc size (in megabytes)
+ *
+ * The value chosen should be such that most allocations, including new
+ * block groups, use local alloc.
+ */
+#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE  8
 struct ocfs2_system_inode_info {
        char    *si_name;
        int     si_iflags;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38ac..86f3e3799c2b 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_RW,
        OCFS2_LOCK_TYPE_DENTRY,
        OCFS2_LOCK_TYPE_OPEN,
+        OCFS2_LOCK_TYPE_FLOCK,
        OCFS2_NUM_LOCK_TYPES
 };
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_OPEN:
                        c = 'O';
                        break;
+                case OCFS2_LOCK_TYPE_FLOCK:
+                        c = 'F';
+                        break;
                default:
                        c = '\0';
        }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
        [OCFS2_LOCK_TYPE_RW] = "Write/Read",
        [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
        [OCFS2_LOCK_TYPE_OPEN] = "Open",
+        [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 };
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 000000000000..37835ffcb039
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.c
+ *
+ * volume resize.
+ * Inspired by ext3/resize.c.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "resize.h"
+/*
+ * Find the backup superblocks that fall inside the clusters being added
+ * to the last group, i.e. in the cluster range
+ * [first_new_cluster, first_new_cluster + new_clusters), and mark them
+ * (set != 0) or clear them (set == 0) in the group bitmap.
+ *
+ * Backups located in clusters that were already part of the group before
+ * the extend are skipped: counting them again would make the caller
+ * subtract them from the free counts a second time.  Backups that lie in
+ * the same group but past the end of the added range are skipped as well,
+ * since those bits are not covered by the group yet.
+ *
+ * @inode:             global bitmap inode, used to map clusters to groups.
+ * @gd:                descriptor of the last group; its bitmap is modified.
+ * @new_clusters:      number of clusters being added to the group.
+ * @first_new_cluster: first cluster of the added range.
+ * @cl_cpg:            clusters per group, used to get the bit offset.
+ * @set:               non-zero to set the bits, zero to clear them.
+ *
+ * Return how many backups were found in the added range.
+ */
+static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+                                       struct ocfs2_group_desc *gd,
+                                       int new_clusters,
+                                       u32 first_new_cluster,
+                                       u16 cl_cpg,
+                                       int set)
+{
+        int i;
+        u16 backups = 0;
+        u32 cluster;
+        u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+        mlog_entry_void();
+        for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+                blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+                cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+                gd_blkno = ocfs2_which_cluster_group(inode, cluster);
+                if (gd_blkno < lgd_blkno)
+                        continue;
+                else if (gd_blkno > lgd_blkno)
+                        break;
+                /* This backup was in the volume before the extend. */
+                if (cluster < first_new_cluster)
+                        continue;
+                /* This backup is past the end of the added clusters. */
+                if (cluster - first_new_cluster >= (u32)new_clusters)
+                        continue;
+                if (set)
+                        ocfs2_set_bit(cluster % cl_cpg,
+                                      (unsigned long *)gd->bg_bitmap);
+                else
+                        ocfs2_clear_bit(cluster % cl_cpg,
+                                        (unsigned long *)gd->bg_bitmap);
+                backups++;
+        }
+        mlog_exit_void();
+        return backups;
+}
+/*
+ * Account for new_clusters additional clusters in the last group
+ * descriptor (group_bh) and in the global bitmap dinode (bm_bh), all
+ * under the running transaction 'handle'.
+ *
+ * The group's bit counts are raised first, then any backup superblocks
+ * reported by ocfs2_calc_new_backup_super() are marked used, and finally
+ * the chain record, the dinode totals, i_clusters and i_size are updated.
+ * If journalling the group or getting write access to the dinode fails,
+ * the in-memory changes made to the group descriptor are undone.
+ *
+ * Returns 0 on success or a negative error from the journal calls.
+ */
+static int ocfs2_update_last_group_and_inode(handle_t *handle,
+                                             struct inode *bm_inode,
+                                             struct buffer_head *bm_bh,
+                                             struct buffer_head *group_bh,
+                                             u32 first_new_cluster,
+                                             int new_clusters)
+{
+        int ret = 0;
+        struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
+        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
+        struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+        struct ocfs2_chain_rec *cr;
+        struct ocfs2_group_desc *group;
+        u16 chain, num_bits, backups = 0;
+        u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+        u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+        mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
+                   new_clusters, first_new_cluster);
+        ret = ocfs2_journal_access(handle, bm_inode, group_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
+        group = (struct ocfs2_group_desc *)group_bh->b_data;
+        /* update the group first. */
+        num_bits = new_clusters * cl_bpc;
+        le16_add_cpu(&group->bg_bits, num_bits);
+        le16_add_cpu(&group->bg_free_bits_count, num_bits);
+        /*
+         * Check whether any new backup superblocks exist in this
+         * group and update the group bitmap accordingly.
+         */
+        if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+                                     OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+                backups = ocfs2_calc_new_backup_super(bm_inode,
+                                                     group,
+                                                     new_clusters,
+                                                     first_new_cluster,
+                                                     cl_cpg, 1);
+                le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+        }
+        ret = ocfs2_journal_dirty(handle, group_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_rollback;
+        }
+        /* update the inode accordingly. */
+        ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_rollback;
+        }
+        chain = le16_to_cpu(group->bg_chain);
+        cr = (&cl->cl_recs[chain]);
+        le32_add_cpu(&cr->c_total, num_bits);
+        le32_add_cpu(&cr->c_free, num_bits);
+        le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
+        le32_add_cpu(&fe->i_clusters, new_clusters);
+        if (backups) {
+                le32_add_cpu(&cr->c_free, -1 * backups);
+                le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
+        }
+        spin_lock(&OCFS2_I(bm_inode)->ip_lock);
+        OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+        /*
+         * Widen before shifting: the byte count of the added clusters
+         * can exceed what fits in an int.
+         */
+        le64_add_cpu(&fe->i_size,
+                     (u64)new_clusters << osb->s_clustersize_bits);
+        spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
+        i_size_write(bm_inode, le64_to_cpu(fe->i_size));
+        /*
+         * NOTE(review): the result of this call is not checked, so 'ret'
+         * stays 0 here and the rollback below is not entered.
+         */
+        ocfs2_journal_dirty(handle, bm_bh);
+out_rollback:
+        if (ret < 0) {
+                /* Undo the in-memory changes made to the group above. */
+                ocfs2_calc_new_backup_super(bm_inode,
+                                            group,
+                                            new_clusters,
+                                            first_new_cluster,
+                                            cl_cpg, 0);
+                le16_add_cpu(&group->bg_free_bits_count, backups);
+                le16_add_cpu(&group->bg_bits, -1 * num_bits);
+                le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+        }
+out:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Copy the superblock image in 'data' over every backup superblock that
+ * lies inside a volume of 'clusters' clusters.
+ *
+ * Each backup gets a full block copy of 'data' with i_blkno rewritten to
+ * the backup's own block number before it is written out.  The loop stops
+ * at the first backup located outside the volume, or at the first read or
+ * write failure.
+ *
+ * Returns 0 on success or the negative error of the failing read/write.
+ */
+static int update_backups(struct inode * inode, u32 clusters, char *data)
+{
+        int i, ret = 0;
+        u32 cluster;
+        u64 blkno;
+        struct buffer_head *backup = NULL;
+        struct ocfs2_dinode *backup_di = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        /* calculate the real backups we need to update. */
+        for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+                blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+                cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+                /*
+                 * 'clusters' is a count, so the last cluster of the
+                 * volume is clusters - 1; a backup at index 'clusters'
+                 * is already outside of it.
+                 */
+                if (cluster >= clusters)
+                        break;
+                ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        break;
+                }
+                memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
+                backup_di = (struct ocfs2_dinode *)backup->b_data;
+                backup_di->i_blkno = cpu_to_le64(blkno);
+                ret = ocfs2_write_super_or_backup(osb, backup);
+                brelse(backup);
+                backup = NULL;
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        break;
+                }
+        }
+        return ret;
+}
+/*
+ * Add new_clusters to the cluster count stored in the primary superblock,
+ * write it out, and then refresh the backup superblocks when the volume
+ * has the backup-superblock compat feature.
+ *
+ * Failures are not returned to the caller; they only produce a warning
+ * asking for fsck.ocfs2 to be run.
+ */
+static void ocfs2_update_super_and_backups(struct inode *inode,
+                                           int new_clusters)
+{
+        int status;
+        struct buffer_head *sb_bh = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        /*
+         * The superblock is updated last; a failed write here is
+         * tolerated and merely reported below.
+         */
+        status = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
+                                  &sb_bh, 0, NULL);
+        if (status < 0) {
+                mlog_errno(status);
+        } else {
+                struct ocfs2_dinode *sb_di =
+                        (struct ocfs2_dinode *)sb_bh->b_data;
+                u32 total_clusters;
+                le32_add_cpu(&sb_di->i_clusters, new_clusters);
+                total_clusters = le32_to_cpu(sb_di->i_clusters);
+                status = ocfs2_write_super_or_backup(osb, sb_bh);
+                if (status < 0) {
+                        mlog_errno(status);
+                } else if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+                                        OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+                        status = update_backups(inode, total_clusters,
+                                                sb_bh->b_data);
+                }
+        }
+        brelse(sb_bh);
+        if (status)
+                printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
+                        " during fs resize. This condition is not fatal,"
+                        " but fsck.ocfs2 should be run to fix it\n",
+                        osb->dev_str);
+}
+/*
+ * Grow the filesystem by new_clusters clusters.  This entry point is only
+ * used to extend the current filesystem up to the end of the last existing
+ * group; it never creates a new group.
+ *
+ * The global bitmap inode is locked (i_mutex plus cluster inode lock) for
+ * the whole operation.  After the group descriptor and bitmap dinode have
+ * been updated in a transaction, the superblock and its backups are
+ * updated on a best-effort basis.
+ *
+ * @inode:        any inode of the filesystem, used to reach the superblock.
+ * @new_clusters: number of clusters to add to the last group.
+ *
+ * Returns 0 on success or when new_clusters is 0, -EROFS on a read-only
+ * filesystem, -EINVAL when new_clusters is negative or does not fit in the
+ * last group, -EIO on an invalid bitmap dinode, or the error returned by
+ * the locking, block read or transaction helpers.
+ */
+int ocfs2_group_extend(struct inode * inode, int new_clusters)
+{
+        int ret;
+        handle_t *handle;
+        struct buffer_head *main_bm_bh = NULL;
+        struct buffer_head *group_bh = NULL;
+        struct inode *main_bm_inode = NULL;
+        struct ocfs2_dinode *fe = NULL;
+        struct ocfs2_group_desc *group = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        u16 cl_bpc;
+        u32 first_new_cluster;
+        u64 lgd_blkno;
+        mlog_entry_void();
+        /* Leave through 'out' so that every entry trace has its exit. */
+        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+                ret = -EROFS;
+                goto out;
+        }
+        if (new_clusters < 0) {
+                ret = -EINVAL;
+                goto out;
+        } else if (new_clusters == 0) {
+                ret = 0;
+                goto out;
+        }
+        main_bm_inode = ocfs2_get_system_file_inode(osb,
+                                                    GLOBAL_BITMAP_SYSTEM_INODE,
+                                                    OCFS2_INVALID_SLOT);
+        if (!main_bm_inode) {
+                ret = -EINVAL;
+                mlog_errno(ret);
+                goto out;
+        }
+        mutex_lock(&main_bm_inode->i_mutex);
+        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_mutex;
+        }
+        fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+        /* Validate the dinode before trusting any of its fields. */
+        if (!OCFS2_IS_VALID_DINODE(fe)) {
+                OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
+                ret = -EIO;
+                goto out_unlock;
+        }
+        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+                                 ocfs2_group_bitmap_size(osb->sb) * 8) {
+                mlog(ML_ERROR, "The disk is too old and small. "
+                     "Force to do offline resize.\n");
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        first_new_cluster = le32_to_cpu(fe->i_clusters);
+        /* The last existing cluster identifies the last group. */
+        lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
+                                              first_new_cluster - 1);
+        ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
+                               main_bm_inode);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        group = (struct ocfs2_group_desc *)group_bh->b_data;
+        ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        /* The extended group must not exceed the clusters-per-group limit. */
+        cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+        if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
+                le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        mlog(0, "extend the last group at %llu, new clusters = %d\n",
+             (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
+        handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
+        if (IS_ERR(handle)) {
+                /* Report the real failure instead of a generic -EINVAL. */
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        /* update the last group descriptor and inode. */
+        ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
+                                                main_bm_bh, group_bh,
+                                                first_new_cluster,
+                                                new_clusters);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out_unlock:
+        brelse(group_bh);
+        brelse(main_bm_bh);
+        ocfs2_inode_unlock(main_bm_inode, 1);
+out_mutex:
+        mutex_unlock(&main_bm_inode->i_mutex);
+        iput(main_bm_inode);
+out:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Sanity-check the group descriptor held in group_bh against the bitmap
+ * dinode 'di' and against the values supplied in 'input'.
+ *
+ * The descriptor must carry a valid signature, point back to 'di', have
+ * bit counts that are consistent with each other and with the chain list
+ * limits, and agree with input->chain, input->clusters and input->frees.
+ * The first failing check is logged.
+ *
+ * Returns 0 when every check passes, -EIO otherwise.
+ */
+static int ocfs2_check_new_group(struct inode *inode,
+                                 struct ocfs2_dinode *di,
+                                 struct ocfs2_new_group_input *input,
+                                 struct buffer_head *group_bh)
+{
+        int ret;
+        struct ocfs2_group_desc *gd;
+        u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
+        unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
+                                cl_bpc;
+        gd = (struct ocfs2_group_desc *)group_bh->b_data;
+        ret = -EIO;
+        if (!OCFS2_IS_VALID_GROUP_DESC(gd))
+                mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno));
+        else if (di->i_blkno != gd->bg_parent_dinode)
+                mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
+                     "pointer (%llu, expected %llu)\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+                     (unsigned long long)le64_to_cpu(di->i_blkno));
+        else if (le16_to_cpu(gd->bg_bits) > max_bits)
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits));
+        else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+                     "claims that %u are free\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits),
+                     le16_to_cpu(gd->bg_free_bits_count));
+        else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+                     "max bitmap bits of %u\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits),
+                     8 * le16_to_cpu(gd->bg_size));
+        else if (le16_to_cpu(gd->bg_chain) != input->chain)
+                mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
+                     "while input has %u set.\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_chain), input->chain);
+        else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+                     "input has %u clusters set\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits), input->clusters);
+        else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
+                /* Report the free bit count, which is what was compared. */
+                mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
+                     "but it should have %u set\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_free_bits_count),
+                     input->frees * cl_bpc);
+        else
+                ret = 0;
+        return ret;
+}
+/*
+ * Validate a group-add request before anything is modified.
+ *
+ * The checks on 'input' against the bitmap dinode 'di' run first and each
+ * failure is logged and reported as -EINVAL.  Only when all of them pass
+ * is the on-disk descriptor in group_bh examined through
+ * ocfs2_check_new_group(), whose result is then returned as is.
+ *
+ * Returns 0 when the request is acceptable.
+ */
+static int ocfs2_verify_group_and_input(struct inode *inode,
+                                        struct ocfs2_dinode *di,
+                                        struct ocfs2_new_group_input *input,
+                                        struct buffer_head *group_bh)
+{
+        struct ocfs2_chain_list *cl = &di->id2.i_chain;
+        u16 nr_chains = le16_to_cpu(cl->cl_count);
+        u16 cpg = le16_to_cpu(cl->cl_cpg);
+        u16 next_chain = le16_to_cpu(cl->cl_next_free_rec);
+        u32 new_cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
+        u32 cur_clusters = le32_to_cpu(di->i_clusters);
+        int status;
+        if (new_cluster < cur_clusters) {
+                mlog(ML_ERROR, "add a group which is in the current volume.\n");
+                return -EINVAL;
+        }
+        if (input->chain >= nr_chains) {
+                mlog(ML_ERROR, "input chain exceeds the limit.\n");
+                return -EINVAL;
+        }
+        /* While unused chain records remain, the next free one must be used. */
+        if (next_chain != nr_chains && next_chain != input->chain) {
+                mlog(ML_ERROR,
+                     "the add group should be in chain %u\n", next_chain);
+                return -EINVAL;
+        }
+        /* Unsigned wrap-around check on the new total. */
+        if (cur_clusters + input->clusters < cur_clusters) {
+                mlog(ML_ERROR, "add group's clusters overflow.\n");
+                return -EINVAL;
+        }
+        if (input->clusters > cpg) {
+                mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
+                return -EINVAL;
+        }
+        if (input->frees > input->clusters) {
+                mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
+                return -EINVAL;
+        }
+        if (cur_clusters % cpg != 0) {
+                mlog(ML_ERROR,
+                     "the last group isn't full. Use group extend first.\n");
+                return -EINVAL;
+        }
+        if (input->group != ocfs2_which_cluster_group(inode, new_cluster)) {
+                mlog(ML_ERROR, "group blkno is invalid\n");
+                return -EINVAL;
+        }
+        status = ocfs2_check_new_group(inode, di, input, group_bh);
+        if (status) {
+                mlog(ML_ERROR, "group descriptor check failed.\n");
+        }
+        return status;
+}
+/*
+ * Add a new group descriptor to global_bitmap.
+ *
+ * @inode: inode the request arrived on; supplies the superblock and is
+ *         used to mark the descriptor buffer uptodate.
+ * @input: the new group: descriptor block number, target chain, and the
+ *         number of total and free clusters it carries.
+ *
+ * The descriptor block is read and validated against the global bitmap
+ * dinode. Then, inside one journal transaction, the descriptor is linked
+ * in as the head of chain input->chain and the chain record, bitmap
+ * totals, i_clusters and i_size of the global bitmap are grown to match.
+ *
+ * Returns 0 on success; otherwise -EROFS for a read-only volume, -EINVAL,
+ * or the error returned by the helper that failed.
+ */
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
+{
+        int ret;
+        handle_t *handle;
+        struct buffer_head *main_bm_bh = NULL;
+        struct inode *main_bm_inode = NULL;
+        struct ocfs2_dinode *fe = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct buffer_head *group_bh = NULL;
+        struct ocfs2_group_desc *group = NULL;
+        struct ocfs2_chain_list *cl;
+        struct ocfs2_chain_rec *cr;
+        u16 cl_bpc;
+        mlog_entry_void();
+        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+                return -EROFS;
+        main_bm_inode = ocfs2_get_system_file_inode(osb,
+                                                    GLOBAL_BITMAP_SYSTEM_INODE,
+                                                    OCFS2_INVALID_SLOT);
+        if (!main_bm_inode) {
+                ret = -EINVAL;
+                mlog_errno(ret);
+                goto out;
+        }
+        mutex_lock(&main_bm_inode->i_mutex);
+        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_mutex;
+        }
+        fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+        /* Refuse volumes whose clusters-per-group differs from the
+         * value derived from the group bitmap size. */
+        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+                                 ocfs2_group_bitmap_size(osb->sb) * 8) {
+                mlog(ML_ERROR, "The disk is too old and small."
+                     " Force to do offline resize.\n");
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+        if (ret < 0) {
+                mlog(ML_ERROR, "Can't read the group descriptor # %llu "
+                     "from the device.\n", (unsigned long long)input->group);
+                goto out_unlock;
+        }
+        ocfs2_set_new_buffer_uptodate(inode, group_bh);
+        ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        mlog(0, "Add a new group  %llu in chain = %u, length = %u\n",
+             (unsigned long long)input->group, input->chain, input->clusters);
+        handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
+        if (IS_ERR(handle)) {
+                mlog_errno(PTR_ERR(handle));
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+        cl = &fe->id2.i_chain;
+        cr = &cl->cl_recs[input->chain];
+        /* Step 1: point the new descriptor at the chain's current head. */
+        ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        group = (struct ocfs2_group_desc *)group_bh->b_data;
+        group->bg_next_group = cr->c_blkno;
+        ret = ocfs2_journal_dirty(handle, group_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /* Step 2: make the new descriptor the head and grow the totals. */
+        ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /* First group on a so-far unused chain: claim and clear its record. */
+        if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
+                le16_add_cpu(&cl->cl_next_free_rec, 1);
+                memset(cr, 0, sizeof(struct ocfs2_chain_rec));
+        }
+        /* input->group is CPU-endian; c_blkno is stored little-endian. */
+        cr->c_blkno = cpu_to_le64(input->group);
+        le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
+        le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
+        le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
+        le32_add_cpu(&fe->id1.bitmap1.i_used,
+                     (input->clusters - input->frees) * cl_bpc);
+        le32_add_cpu(&fe->i_clusters, input->clusters);
+        ocfs2_journal_dirty(handle, main_bm_bh);
+        spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
+        OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+        /* Widen before shifting so the byte count cannot wrap at 32 bits. */
+        le64_add_cpu(&fe->i_size,
+                     (u64)input->clusters << osb->s_clustersize_bits);
+        spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
+        i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
+        ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out_unlock:
+        brelse(group_bh);
+        brelse(main_bm_bh);
+        ocfs2_inode_unlock(main_bm_inode, 1);
+out_mutex:
+        mutex_unlock(&main_bm_inode->i_mutex);
+        iput(main_bm_inode);
+out:
+        mlog_exit_void();
+        return ret;
+}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 000000000000..f38841abf10b
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef OCFS2_RESIZE_H
+#define OCFS2_RESIZE_H
+int ocfs2_group_extend(struct inode * inode, int new_clusters);
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
+#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cfa..3a50ce555e64 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
                              s16 slot_num,
                              s16 node_num);
-/* Use the slot information we've collected to create a map of mounted
- * nodes. Should be holding an EX on super block. assumes slot info is
- * up to date. Note that we call this *after* we find a slot, so our
- * own node should be set in the map too... */
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
-{
-        int i;
-        struct ocfs2_slot_info *si = osb->slot_info;
-        spin_lock(&si->si_lock);
-        for (i = 0; i < si->si_size; i++)
-                if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
-                        ocfs2_node_map_set_bit(osb, &osb->mounted_map,
-                                              si->si_global_node_nums[i]);
-        spin_unlock(&si->si_lock);
-}
 /* post the slot information on disk into our slot_info struct. */
 void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 {
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031b..1025872aaade 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
 void ocfs2_clear_slot(struct ocfs2_slot_info *si,
                      s16 slot_num);
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
 static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
                                      int slot_num)
 {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3a..7e397e2c25dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
                                                   u64 bg_blkno,
                                                   u16 bg_bit_off);
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
-                                            u32 cluster);
 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
                                                u64 data_blkno,
                                                u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
        if (inode) {
                if (ac->ac_which != OCFS2_AC_USE_LOCAL)
-                        ocfs2_meta_unlock(inode, 1);
+                        ocfs2_inode_unlock(inode, 1);
                mutex_unlock(&inode->i_mutex);
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 }
 /* somewhat more expensive than our other checks, so use sparingly. */
-static int ocfs2_check_group_descriptor(struct super_block *sb,
+int ocfs2_check_group_descriptor(struct super_block *sb,
-                                        struct ocfs2_dinode *di,
+                                 struct ocfs2_dinode *di,
-                                        struct ocfs2_group_desc *gd)
+                                 struct ocfs2_group_desc *gd)
 {
        unsigned int max_bits;
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
        mutex_lock(&alloc_inode->i_mutex);
-        status = ocfs2_meta_lock(alloc_inode, &bh, 1);
+        status = ocfs2_inode_lock(alloc_inode, &bh, 1);
        if (status < 0) {
                mutex_unlock(&alloc_inode->i_mutex);
                iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 /* given a cluster offset, calculate which block group it belongs to
 * and return that block offset. */
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
-                                            u32 cluster)
 {
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
                if (min_clusters > (osb->bitmap_cpg - 1)) {
                        /* The only paths asking for contiguousness
                         * should know about this already. */
-                        mlog(ML_ERROR, "minimum allocation requested exceeds "
+                        mlog(ML_ERROR, "minimum allocation requested %u exceeds "
-                                       "group bitmap size!");
+                             "group bitmap size %u!\n", min_clusters,
+                             osb->bitmap_cpg);
                        status = -ENOSPC;
                        goto bail;
                }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe93703095..8799033bb459 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
                                      struct ocfs2_alloc_context *ac);
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
+/* somewhat more expensive than our other checks, so use sparingly. */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+                                 struct ocfs2_dinode *di,
+                                 struct ocfs2_group_desc *gd);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index be562ac3e89c..01fe40ee5ea9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "ver.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
 struct mount_options
 {
+        unsigned long   commit_interval;
        unsigned long   mount_opt;
        unsigned int    atime_quantum;
        signed short    slot;
+        unsigned int    localalloc_opt;
 };
 static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
        Opt_data_writeback,
        Opt_atime_quantum,
        Opt_slot,
+        Opt_commit,
+        Opt_localalloc,
+        Opt_localflocks,
        Opt_err,
 };
@@ -165,6 +169,9 @@ static match_table_t tokens = {
        {Opt_data_writeback, "data=writeback"},
        {Opt_atime_quantum, "atime_quantum=%u"},
        {Opt_slot, "preferred_slot=%u"},
+        {Opt_commit, "commit=%u"},
+        {Opt_localalloc, "localalloc=%d"},
+        {Opt_localflocks, "localflocks"},
        {Opt_err, NULL}
 };
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
        mlog_entry_void();
-        new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
+        new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
        if (IS_ERR(new)) {
                status = PTR_ERR(new);
                mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
        }
        osb->root_inode = new;
-        new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
+        new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
        if (IS_ERR(new)) {
                status = PTR_ERR(new);
                mlog_errno(status);
@@ -438,14 +445,16 @@ unlock_osb:
        }
        if (!ret) {
-                if (!ocfs2_is_hard_readonly(osb))
-                        ocfs2_set_journal_params(osb);
                /* Only save off the new mount options in case of a successful
                 * remount. */
                osb->s_mount_opt = parsed_options.mount_opt;
                osb->s_atime_quantum = parsed_options.atime_quantum;
                osb->preferred_slot = parsed_options.slot;
+                if (parsed_options.commit_interval)
+                        osb->osb_commit_interval = parsed_options.commit_interval;
+                if (!ocfs2_is_hard_readonly(osb))
+                        ocfs2_set_journal_params(osb);
        }
 out:
        return ret;
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        osb->s_mount_opt = parsed_options.mount_opt;
        osb->s_atime_quantum = parsed_options.atime_quantum;
        osb->preferred_slot = parsed_options.slot;
+        osb->osb_commit_interval = parsed_options.commit_interval;
+        osb->local_alloc_size = parsed_options.localalloc_opt;
        sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
        mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
                   options ? options : "(none)");
+        mopt->commit_interval = 0;
        mopt->mount_opt = 0;
        mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
        mopt->slot = OCFS2_INVALID_SLOT;
+        mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
        if (!options) {
                status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
                        if (option)
                                mopt->slot = (s16)option;
                        break;
+                case Opt_commit:
+                        option = 0;
+                        if (match_int(&args[0], &option)) {
+                                status = 0;
+                                goto bail;
+                        }
+                        if (option < 0)
+                                return 0;
+                        if (option == 0)
+                                option = JBD_DEFAULT_MAX_COMMIT_AGE;
+                        mopt->commit_interval = HZ * option;
+                        break;
+                case Opt_localalloc:
+                        option = 0;
+                        if (match_int(&args[0], &option)) {
+                                status = 0;
+                                goto bail;
+                        }
+                        if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+                                mopt->localalloc_opt = option;
+                        break;
+                case Opt_localflocks:
+                        /*
+                         * Changing this during remount could race
+                         * flock() requests, or "unbalance" existing
+                         * ones (e.g., a lock is taken in one mode but
+                         * dropped in the other). If users care enough
+                         * to flip locking modes during remount, we
+                         * could add a "local" flag to individual
+                         * flock structures for proper tracking of
+                         * state.
+                         */
+                        if (!is_remount)
+                                mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+                        break;
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
                seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+        if (osb->osb_commit_interval)
+                seq_printf(s, ",commit=%u",
+                           (unsigned) (osb->osb_commit_interval / HZ));
+        if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+                seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+        if (opts & OCFS2_MOUNT_LOCALFLOCKS)
+                seq_printf(s, ",localflocks,");
        return 0;
 }
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
                goto bail;
        }
-        status = ocfs2_meta_lock(inode, &bh, 0);
+        status = ocfs2_inode_lock(inode, &bh, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
        brelse(bh);
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
        status = 0;
 bail:
        if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
        oi->ip_clusters = 0;
        ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
-        ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
+        ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
-        ocfs2_lock_res_init_once(&oi->ip_data_lockres);
        ocfs2_lock_res_init_once(&oi->ip_open_lockres);
        ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
                goto leave;
        }
-        status = ocfs2_register_hb_callbacks(osb);
-        if (status < 0) {
-                mlog_errno(status);
-                goto leave;
-        }
        status = ocfs2_dlm_init(osb);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        /* requires vote_thread to be running. */
-        status = ocfs2_register_net_handlers(osb);
-        if (status < 0) {
-                mlog_errno(status);
-                goto leave;
-        }
        status = ocfs2_super_lock(osb, 1);
        if (status < 0) {
                mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
                goto leave;
        }
-        ocfs2_populate_mounted_map(osb);
        /* load all node-local system inodes */
        status = ocfs2_init_local_system_inodes(osb);
        if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
        if (ocfs2_mount_local(osb))
                goto leave;
-        /* This should be sent *after* we recovered our journal as it
-         * will cause other nodes to unmark us as needing
-         * recovery. However, we need to send it *before* dropping the
-         * super block lock as otherwise their recovery threads might
-         * try to clean us up while we're live! */
-        status = ocfs2_request_mount_vote(osb);
-        if (status < 0)
-                mlog_errno(status);
 leave:
        if (unlock_super)
                ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
                        mlog_errno(tmp);
                        return;
                }
-                tmp = ocfs2_request_umount_vote(osb);
-                if (tmp < 0)
-                        mlog_errno(tmp);
        }
        if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        ocfs2_release_system_inodes(osb);
-        if (osb->dlm) {
+        if (osb->dlm)
-                ocfs2_unregister_net_handlers(osb);
                ocfs2_dlm_shutdown(osb);
-        }
-        ocfs2_clear_hb_callbacks(osb);
        debugfs_remove(osb->osb_debug_root);
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        int i, cbits, bbits;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
        struct inode *inode = NULL;
-        struct buffer_head *bitmap_bh = NULL;
        struct ocfs2_journal *journal;
        __le32 uuid_net_key;
        struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
        osb->s_sectsize_bits = blksize_bits(sector_size);
        BUG_ON(!osb->s_sectsize_bits);
-        osb->net_response_ids = 0;
-        spin_lock_init(&osb->net_response_lock);
-        INIT_LIST_HEAD(&osb->net_response_list);
-        INIT_LIST_HEAD(&osb->osb_net_handlers);
        init_waitqueue_head(&osb->recovery_event);
-        spin_lock_init(&osb->vote_task_lock);
+        spin_lock_init(&osb->dc_task_lock);
-        init_waitqueue_head(&osb->vote_event);
+        init_waitqueue_head(&osb->dc_event);
-        osb->vote_work_sequence = 0;
+        osb->dc_work_sequence = 0;
-        osb->vote_wake_sequence = 0;
+        osb->dc_wake_sequence = 0;
        INIT_LIST_HEAD(&osb->blocked_lock_list);
        osb->blocked_lock_count = 0;
-        INIT_LIST_HEAD(&osb->vote_list);
        spin_lock_init(&osb->osb_lock);
        atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
-        osb->net_key = le32_to_cpu(uuid_net_key);
        strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
        osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
-        /* We don't have a cluster lock on the bitmap here because
-         * we're only interested in static information and the extra
-         * complexity at mount time isn't worht it. Don't pass the
-         * inode in to the read function though as we don't want it to
-         * be put in the cache. */
-        status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
-                                  NULL);
        iput(inode);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        di = (struct ocfs2_dinode *) bitmap_bh->b_data;
+        osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
-        osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
-        brelse(bitmap_bh);
-        mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
-             (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
        status = ocfs2_init_slot_info(osb);
        if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6f..ab713ebdd546 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
                goto bail;
        }
-        inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE);
+        inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
        if (IS_ERR(inode)) {
                mlog_errno(PTR_ERR(inode));
                inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c99..e2488f4128a2 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
 #include "ver.h"
-#define OCFS2_BUILD_VERSION "1.3.3"
+#define OCFS2_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2b..000000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * vote.c
- *
- * description here
- *
- * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-#include <dlm/dlmapi.h>
-#define MLOG_MASK_PREFIX ML_VOTE
-#include <cluster/masklog.h>
-#include "ocfs2.h"
-#include "alloc.h"
-#include "dlmglue.h"
-#include "extent_map.h"
-#include "heartbeat.h"
-#include "inode.h"
-#include "journal.h"
-#include "slot_map.h"
-#include "vote.h"
-#include "buffer_head_io.h"
-#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
-#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
-struct ocfs2_msg_hdr
-{
-        __be32 h_response_id; /* used to lookup message handle on sending
-                            * node. */
-        __be32 h_request;
-        __be64 h_blkno;
-        __be32 h_generation;
-        __be32 h_node_num;    /* node sending this particular message. */
-};
-struct ocfs2_vote_msg
-{
-        struct ocfs2_msg_hdr v_hdr;
-        __be32 v_reserved1;
-} __attribute__ ((packed));
-/* Responses are given these values to maintain backwards
- * compatibility with older ocfs2 versions */
-#define OCFS2_RESPONSE_OK               (0)
-#define OCFS2_RESPONSE_BUSY             (-16)
-#define OCFS2_RESPONSE_BAD_MSG          (-22)
-struct ocfs2_response_msg
-{
-        struct ocfs2_msg_hdr r_hdr;
-        __be32 r_response;
-} __attribute__ ((packed));
-struct ocfs2_vote_work {
-        struct list_head   w_list;
-        struct ocfs2_vote_msg w_msg;
-};
-enum ocfs2_vote_request {
-        OCFS2_VOTE_REQ_INVALID = 0,
-        OCFS2_VOTE_REQ_MOUNT,
-        OCFS2_VOTE_REQ_UMOUNT,
-        OCFS2_VOTE_REQ_LAST
-};
-static inline int ocfs2_is_valid_vote_request(int request)
-{
-        return OCFS2_VOTE_REQ_INVALID < request &&
-                request < OCFS2_VOTE_REQ_LAST;
-}
-typedef void (*ocfs2_net_response_callback)(void *priv,
-                                            struct ocfs2_response_msg *resp);
-struct ocfs2_net_response_cb {
-        ocfs2_net_response_callback     rc_cb;
-        void                            *rc_priv;
-};
-struct ocfs2_net_wait_ctxt {
-        struct list_head        n_list;
-        u32                     n_response_id;
-        wait_queue_head_t       n_event;
-        struct ocfs2_node_map   n_node_map;
-        int                     n_response; /* an agreggate response. 0 if
-                                             * all nodes are go, < 0 on any
-                                             * negative response from any
-                                             * node or network error. */
-        struct ocfs2_net_response_cb *n_callback;
-};
-static void ocfs2_process_mount_request(struct ocfs2_super *osb,
-                                        unsigned int node_num)
-{
-        mlog(0, "MOUNT vote from node %u\n", node_num);
-        /* The other node only sends us this message when he has an EX
-         * on the superblock, so our recovery threads (if having been
-         * launched) are waiting on it.*/
-        ocfs2_recovery_map_clear(osb, node_num);
-        ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
-        /* We clear the umount map here because a node may have been
-         * previously mounted, safely unmounted but never stopped
-         * heartbeating - in which case we'd have a stale entry. */
-        ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-static void ocfs2_process_umount_request(struct ocfs2_super *osb,
-                                         unsigned int node_num)
-{
-        mlog(0, "UMOUNT vote from node %u\n", node_num);
-        ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
-        ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
-}
-static void ocfs2_process_vote(struct ocfs2_super *osb,
-                               struct ocfs2_vote_msg *msg)
-{
-        int net_status, vote_response;
-        unsigned int node_num;
-        u64 blkno;
-        enum ocfs2_vote_request request;
-        struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
-        struct ocfs2_response_msg response;
-        /* decode the network mumbo jumbo into local variables. */
-        request = be32_to_cpu(hdr->h_request);
-        blkno = be64_to_cpu(hdr->h_blkno);
-        node_num = be32_to_cpu(hdr->h_node_num);
-        mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
-             request, (unsigned long long)blkno, node_num);
-        if (!ocfs2_is_valid_vote_request(request)) {
-                mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
-                     request, node_num);
-                vote_response = OCFS2_RESPONSE_BAD_MSG;
-                goto respond;
-        }
-        vote_response = OCFS2_RESPONSE_OK;
-        switch (request) {
-        case OCFS2_VOTE_REQ_UMOUNT:
-                ocfs2_process_umount_request(osb, node_num);
-                goto respond;
-        case OCFS2_VOTE_REQ_MOUNT:
-                ocfs2_process_mount_request(osb, node_num);
-                goto respond;
-        default:
-                /* avoids a gcc warning */
-                break;
-        }
-respond:
-        /* Response struture is small so we just put it on the stack
-         * and stuff it inline. */
-        memset(&response, 0, sizeof(struct ocfs2_response_msg));
-        response.r_hdr.h_response_id = hdr->h_response_id;
-        response.r_hdr.h_blkno = hdr->h_blkno;
-        response.r_hdr.h_generation = hdr->h_generation;
-        response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
-        response.r_response = cpu_to_be32(vote_response);
-        net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
-                                        osb->net_key,
-                                        &response,
-                                        sizeof(struct ocfs2_response_msg),
-                                        node_num,
-                                        NULL);
-        /* We still want to error print for ENOPROTOOPT here. The
-         * sending node shouldn't have unregistered his net handler
-         * without sending an unmount vote 1st */
-        if (net_status < 0
-            && net_status != -ETIMEDOUT
-            && net_status != -ENOTCONN)
-                mlog(ML_ERROR, "message to node %u fails with error %d!\n",
-                     node_num, net_status);
-}
-static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
-{
-        unsigned long processed;
-        struct ocfs2_lock_res *lockres;
-        struct ocfs2_vote_work *work;
-        mlog_entry_void();
-        spin_lock(&osb->vote_task_lock);
-        /* grab this early so we know to try again if a state change and
-         * wake happens part-way through our work  */
-        osb->vote_work_sequence = osb->vote_wake_sequence;
-        processed = osb->blocked_lock_count;
-        while (processed) {
-                BUG_ON(list_empty(&osb->blocked_lock_list));
-                lockres = list_entry(osb->blocked_lock_list.next,
-                                     struct ocfs2_lock_res, l_blocked_list);
-                list_del_init(&lockres->l_blocked_list);
-                osb->blocked_lock_count--;
-                spin_unlock(&osb->vote_task_lock);
-                BUG_ON(!processed);
-                processed--;
-                ocfs2_process_blocked_lock(osb, lockres);
-                spin_lock(&osb->vote_task_lock);
-        }
-        while (osb->vote_count) {
-                BUG_ON(list_empty(&osb->vote_list));
-                work = list_entry(osb->vote_list.next,
-                                  struct ocfs2_vote_work, w_list);
-                list_del(&work->w_list);
-                osb->vote_count--;
-                spin_unlock(&osb->vote_task_lock);
-                ocfs2_process_vote(osb, &work->w_msg);
-                kfree(work);
-                spin_lock(&osb->vote_task_lock);
-        }
-        spin_unlock(&osb->vote_task_lock);
-        mlog_exit_void();
-}
-static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
-{
-        int empty = 0;
-        spin_lock(&osb->vote_task_lock);
-        if (list_empty(&osb->blocked_lock_list) &&
-            list_empty(&osb->vote_list))
-                empty = 1;
-        spin_unlock(&osb->vote_task_lock);
-        return empty;
-}
-static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
-{
-        int should_wake = 0;
-        spin_lock(&osb->vote_task_lock);
-        if (osb->vote_work_sequence != osb->vote_wake_sequence)
-                should_wake = 1;
-        spin_unlock(&osb->vote_task_lock);
-        return should_wake;
-}
-int ocfs2_vote_thread(void *arg)
-{
-        int status = 0;
-        struct ocfs2_super *osb = arg;
-        /* only quit once we've been asked to stop and there is no more
-         * work available */
-        while (!(kthread_should_stop() &&
-                 ocfs2_vote_thread_lists_empty(osb))) {
-                wait_event_interruptible(osb->vote_event,
-                                         ocfs2_vote_thread_should_wake(osb) ||
-                                         kthread_should_stop());
-                mlog(0, "vote_thread: awoken\n");
-                ocfs2_vote_thread_do_work(osb);
-        }
-        osb->vote_task = NULL;
-        return status;
-}
-static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
-{
-        struct ocfs2_net_wait_ctxt *w;
-        w = kzalloc(sizeof(*w), GFP_NOFS);
-        if (!w) {
-                mlog_errno(-ENOMEM);
-                goto bail;
-        }
-        INIT_LIST_HEAD(&w->n_list);
-        init_waitqueue_head(&w->n_event);
-        ocfs2_node_map_init(&w->n_node_map);
-        w->n_response_id = response_id;
-        w->n_callback = NULL;
-bail:
-        return w;
-}
-static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
-{
-        unsigned int ret;
-        spin_lock(&osb->net_response_lock);
-        ret = ++osb->net_response_ids;
-        spin_unlock(&osb->net_response_lock);
-        return ret;
-}
-static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
-                                        struct ocfs2_net_wait_ctxt *w)
-{
-        spin_lock(&osb->net_response_lock);
-        list_del(&w->n_list);
-        spin_unlock(&osb->net_response_lock);
-}
-static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
-                                      struct ocfs2_net_wait_ctxt *w)
-{
-        spin_lock(&osb->net_response_lock);
-        list_add_tail(&w->n_list,
-                      &osb->net_response_list);
-        spin_unlock(&osb->net_response_lock);
-}
-static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
-                                        struct ocfs2_net_wait_ctxt *w,
-                                        int node_num)
-{
-        assert_spin_locked(&osb->net_response_lock);
-        ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
-        if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
-                wake_up(&w->n_event);
-}
-/* Intended to be called from the node down callback, we fake remove
- * the node from all our response contexts */
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
-                                        int node_num)
-{
-        struct list_head *p;
-        struct ocfs2_net_wait_ctxt *w = NULL;
-        spin_lock(&osb->net_response_lock);
-        list_for_each(p, &osb->net_response_list) {
-                w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-                __ocfs2_mark_node_responded(osb, w, node_num);
-        }
-        spin_unlock(&osb->net_response_lock);
-}
-static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
-                                struct ocfs2_vote_msg *request,
-                                unsigned int response_id,
-                                int *response,
-                                struct ocfs2_net_response_cb *callback)
-{
-        int status, i, remote_err;
-        struct ocfs2_net_wait_ctxt *w = NULL;
-        int dequeued = 0;
-        mlog_entry_void();
-        w = ocfs2_new_net_wait_ctxt(response_id);
-        if (!w) {
-                status = -ENOMEM;
-                mlog_errno(status);
-                goto bail;
-        }
-        w->n_callback = callback;
-        /* we're pretty much ready to go at this point, and this fills
-         * in n_response which we need anyway... */
-        ocfs2_queue_net_wait_ctxt(osb, w);
-        i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
-        while (i != O2NM_INVALID_NODE_NUM) {
-                if (i != osb->node_num) {
-                        mlog(0, "trying to send request to node %i\n", i);
-                        ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
-                        remote_err = 0;
-                        status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
-                                                    osb->net_key,
-                                                    request,
-                                                    sizeof(*request),
-                                                    i,
-                                                    &remote_err);
-                        if (status == -ETIMEDOUT) {
-                                mlog(0, "remote node %d timed out!\n", i);
-                                status = -EAGAIN;
-                                goto bail;
-                        }
-                        if (remote_err < 0) {
-                                status = remote_err;
-                                mlog(0, "remote error %d on node %d!\n",
-                                     remote_err, i);
-                                mlog_errno(status);
-                                goto bail;
-                        }
-                        if (status < 0) {
-                                mlog_errno(status);
-                                goto bail;
-                        }
-                }
-                i++;
-                i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
-                mlog(0, "next is %d, i am %d\n", i, osb->node_num);
-        }
-        mlog(0, "done sending, now waiting on responses...\n");
-        wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
-        ocfs2_dequeue_net_wait_ctxt(osb, w);
-        dequeued = 1;
-        *response = w->n_response;
-        status = 0;
-bail:
-        if (w) {
-                if (!dequeued)
-                        ocfs2_dequeue_net_wait_ctxt(osb, w);
-                kfree(w);
-        }
-        mlog_exit(status);
-        return status;
-}
-static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
-                                                      u64 blkno,
-                                                      unsigned int generation,
-                                                      enum ocfs2_vote_request type)
-{
-        struct ocfs2_vote_msg *request;
-        struct ocfs2_msg_hdr *hdr;
-        BUG_ON(!ocfs2_is_valid_vote_request(type));
-        request = kzalloc(sizeof(*request), GFP_NOFS);
-        if (!request) {
-                mlog_errno(-ENOMEM);
-        } else {
-                hdr = &request->v_hdr;
-                hdr->h_node_num = cpu_to_be32(osb->node_num);
-                hdr->h_request = cpu_to_be32(type);
-                hdr->h_blkno = cpu_to_be64(blkno);
-                hdr->h_generation = cpu_to_be32(generation);
-        }
-        return request;
-}
-/* Complete the buildup of a new vote request and process the
- * broadcast return value. */
-static int ocfs2_do_request_vote(struct ocfs2_super *osb,
-                                 struct ocfs2_vote_msg *request,
-                                 struct ocfs2_net_response_cb *callback)
-{
-        int status, response = -EBUSY;
-        unsigned int response_id;
-        struct ocfs2_msg_hdr *hdr;
-        response_id = ocfs2_new_response_id(osb);
-        hdr = &request->v_hdr;
-        hdr->h_response_id = cpu_to_be32(response_id);
-        status = ocfs2_broadcast_vote(osb, request, response_id, &response,
-                                      callback);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        status = response;
-bail:
-        return status;
-}
-int ocfs2_request_mount_vote(struct ocfs2_super *osb)
-{
-        int status;
-        struct ocfs2_vote_msg *request = NULL;
-        request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
-        if (!request) {
-                status = -ENOMEM;
-                goto bail;
-        }
-        status = -EAGAIN;
-        while (status == -EAGAIN) {
-                if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
-                    signal_pending(current)) {
-                        status = -ERESTARTSYS;
-                        goto bail;
-                }
-                if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-                                           osb->node_num)) {
-                        status = 0;
-                        goto bail;
-                }
-                status = ocfs2_do_request_vote(osb, request, NULL);
-        }
-bail:
-        kfree(request);
-        return status;
-}
-int ocfs2_request_umount_vote(struct ocfs2_super *osb)
-{
-        int status;
-        struct ocfs2_vote_msg *request = NULL;
-        request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
-        if (!request) {
-                status = -ENOMEM;
-                goto bail;
-        }
-        status = -EAGAIN;
-        while (status == -EAGAIN) {
-                /* Do not check signals on this vote... We really want
-                 * this one to go all the way through. */
-                if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-                                           osb->node_num)) {
-                        status = 0;
-                        goto bail;
-                }
-                status = ocfs2_do_request_vote(osb, request, NULL);
-        }
-bail:
-        kfree(request);
-        return status;
-}
-/* TODO: This should eventually be a hash table! */
-static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
-                                                               u32 response_id)
-{
-        struct list_head *p;
-        struct ocfs2_net_wait_ctxt *w = NULL;
-        list_for_each(p, &osb->net_response_list) {
-                w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-                if (response_id == w->n_response_id)
-                        break;
-                w = NULL;
-        }
-        return w;
-}
-/* Translate response codes into local node errno values */
-static inline int ocfs2_translate_response(int response)
-{
-        int ret;
-        switch (response) {
-        case OCFS2_RESPONSE_OK:
-                ret = 0;
-                break;
-        case OCFS2_RESPONSE_BUSY:
-                ret = -EBUSY;
-                break;
-        default:
-                ret = -EINVAL;
-        }
-        return ret;
-}
-static int ocfs2_handle_response_message(struct o2net_msg *msg,
-                                         u32 len,
-                                         void *data, void **ret_data)
-{
-        unsigned int response_id, node_num;
-        int response_status;
-        struct ocfs2_super *osb = data;
-        struct ocfs2_response_msg *resp;
-        struct ocfs2_net_wait_ctxt * w;
-        struct ocfs2_net_response_cb *resp_cb;
-        resp = (struct ocfs2_response_msg *) msg->buf;
-        response_id = be32_to_cpu(resp->r_hdr.h_response_id);
-        node_num = be32_to_cpu(resp->r_hdr.h_node_num);
-        response_status = 
-                ocfs2_translate_response(be32_to_cpu(resp->r_response));
-        mlog(0, "received response message:\n");
-        mlog(0, "h_response_id = %u\n", response_id);
-        mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
-        mlog(0, "h_blkno = %llu\n",
-             (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
-        mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
-        mlog(0, "h_node_num = %u\n", node_num);
-        mlog(0, "r_response = %d\n", response_status);
-        spin_lock(&osb->net_response_lock);
-        w = __ocfs2_find_net_wait_ctxt(osb, response_id);
-        if (!w) {
-                mlog(0, "request not found!\n");
-                goto bail;
-        }
-        resp_cb = w->n_callback;
-        if (response_status && (!w->n_response)) {
-                /* we only really need one negative response so don't
-                 * set it twice. */
-                w->n_response = response_status;
-        }
-        if (resp_cb) {
-                spin_unlock(&osb->net_response_lock);
-                resp_cb->rc_cb(resp_cb->rc_priv, resp);
-                spin_lock(&osb->net_response_lock);
-        }
-        __ocfs2_mark_node_responded(osb, w, node_num);
-bail:
-        spin_unlock(&osb->net_response_lock);
-        return 0;
-}
-static int ocfs2_handle_vote_message(struct o2net_msg *msg,
-                                     u32 len,
-                                     void *data, void **ret_data)
-{
-        int status;
-        struct ocfs2_super *osb = data;
-        struct ocfs2_vote_work *work;
-        work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
-        if (!work) {
-                status = -ENOMEM;
-                mlog_errno(status);
-                goto bail;
-        }
-        INIT_LIST_HEAD(&work->w_list);
-        memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
-        mlog(0, "scheduling vote request:\n");
-        mlog(0, "h_response_id = %u\n",
-             be32_to_cpu(work->w_msg.v_hdr.h_response_id));
-        mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
-        mlog(0, "h_blkno = %llu\n",
-             (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
-        mlog(0, "h_generation = %u\n",
-             be32_to_cpu(work->w_msg.v_hdr.h_generation));
-        mlog(0, "h_node_num = %u\n",
-             be32_to_cpu(work->w_msg.v_hdr.h_node_num));
-        spin_lock(&osb->vote_task_lock);
-        list_add_tail(&work->w_list, &osb->vote_list);
-        osb->vote_count++;
-        spin_unlock(&osb->vote_task_lock);
-        ocfs2_kick_vote_thread(osb);
-        status = 0;
-bail:
-        return status;
-}
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
-{
-        if (!osb->net_key)
-                return;
-        o2net_unregister_handler_list(&osb->osb_net_handlers);
-        if (!list_empty(&osb->net_response_list))
-                mlog(ML_ERROR, "net response list not empty!\n");
-        osb->net_key = 0;
-}
-int ocfs2_register_net_handlers(struct ocfs2_super *osb)
-{
-        int status = 0;
-        if (ocfs2_mount_local(osb))
-                return 0;
-        status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
-                                        osb->net_key,
-                                        sizeof(struct ocfs2_response_msg),
-                                        ocfs2_handle_response_message,
-                                        osb, NULL, &osb->osb_net_handlers);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
-        status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
-                                        osb->net_key,
-                                        sizeof(struct ocfs2_vote_msg),
-                                        ocfs2_handle_vote_message,
-                                        osb, NULL, &osb->osb_net_handlers);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
-bail:
-        if (status < 0)
-                ocfs2_unregister_net_handlers(osb);
-        return status;
-}
diff --git a/fs/open.c b/fs/open.c
index 3b69c53e1837..4932b4d1da05 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1061,7 +1061,7 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
        prevent_tail_call(ret);
        return ret;
 }
-EXPORT_SYMBOL_GPL(sys_open);
+EXPORT_UNUSED_SYMBOL_GPL(sys_open); /* To be deleted for 2.6.25 */
 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
                           int mode)
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d88173840082..6b7ff1618945 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -131,7 +131,7 @@ static void property_stop(struct seq_file *f, void *v)
        /* Nothing to do */
 }
-static struct seq_operations property_op = {
+static const struct seq_operations property_op = {
        .start          = property_start,
        .next           = property_next,
        .stop           = property_stop,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 722e12e5acc7..739da701ae7b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,96 +195,45 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        return ERR_PTR(res);
 }
-/*
+static ssize_t part_start_show(struct device *dev,
- * sysfs bindings for partitions
+                               struct device_attribute *attr, char *buf)
- */
-struct part_attribute {
-        struct attribute attr;
-        ssize_t (*show)(struct hd_struct *,char *);
-        ssize_t (*store)(struct hd_struct *,const char *, size_t);
-};
-static ssize_t 
-part_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
 {
-        struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
+        struct hd_struct *p = dev_to_part(dev);
-        struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
-        ssize_t ret = 0;
-        if (part_attr->show)
-                ret = part_attr->show(p, page);
-        return ret;
-}
-static ssize_t
-part_attr_store(struct kobject * kobj, struct attribute * attr,
-                const char *page, size_t count)
-{
-        struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
-        struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
-        ssize_t ret = 0;
-        if (part_attr->store)
+        return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
-                ret = part_attr->store(p, page, count);
-        return ret;
 }
-static struct sysfs_ops part_sysfs_ops = {
+static ssize_t part_size_show(struct device *dev,
-        .show   =       part_attr_show,
+                              struct device_attribute *attr, char *buf)
-        .store  =       part_attr_store,
-};
-static ssize_t part_uevent_store(struct hd_struct * p,
-                                 const char *page, size_t count)
 {
-        kobject_uevent(&p->kobj, KOBJ_ADD);
+        struct hd_struct *p = dev_to_part(dev);
-        return count;
+        return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
-static ssize_t part_dev_read(struct hd_struct * p, char *page)
-{
+static ssize_t part_stat_show(struct device *dev,
-        struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj);
+                              struct device_attribute *attr, char *buf)
-        dev_t dev = MKDEV(disk->major, disk->first_minor + p->partno); 
-        return print_dev_t(page, dev);
-}
-static ssize_t part_start_read(struct hd_struct * p, char *page)
-{
-        return sprintf(page, "%llu\n",(unsigned long long)p->start_sect);
-}
-static ssize_t part_size_read(struct hd_struct * p, char *page)
-{
-        return sprintf(page, "%llu\n",(unsigned long long)p->nr_sects);
-}
-static ssize_t part_stat_read(struct hd_struct * p, char *page)
 {
-        return sprintf(page, "%8u %8llu %8u %8llu\n",
+        struct hd_struct *p = dev_to_part(dev);
+        return sprintf(buf, "%8u %8llu %8u %8llu\n",
                       p->ios[0], (unsigned long long)p->sectors[0],
                       p->ios[1], (unsigned long long)p->sectors[1]);
 }
-static struct part_attribute part_attr_uevent = {
-        .attr = {.name = "uevent", .mode = S_IWUSR },
-        .store  = part_uevent_store
-};
-static struct part_attribute part_attr_dev = {
-        .attr = {.name = "dev", .mode = S_IRUGO },
-        .show   = part_dev_read
-};
-static struct part_attribute part_attr_start = {
-        .attr = {.name = "start", .mode = S_IRUGO },
-        .show   = part_start_read
-};
-static struct part_attribute part_attr_size = {
-        .attr = {.name = "size", .mode = S_IRUGO },
-        .show   = part_size_read
-};
-static struct part_attribute part_attr_stat = {
-        .attr = {.name = "stat", .mode = S_IRUGO },
-        .show   = part_stat_read
-};
 #ifdef CONFIG_FAIL_MAKE_REQUEST
+static ssize_t part_fail_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+        struct hd_struct *p = dev_to_part(dev);
-static ssize_t part_fail_store(struct hd_struct * p,
+        return sprintf(buf, "%d\n", p->make_it_fail);
+}
+static ssize_t part_fail_store(struct device *dev,
+                               struct device_attribute *attr,
                               const char *buf, size_t count)
 {
+        struct hd_struct *p = dev_to_part(dev);
        int i;
        if (count > 0 && sscanf(buf, "%d", &i) > 0)
@@ -292,50 +241,53 @@ static ssize_t part_fail_store(struct hd_struct * p,
        return count;
 }
-static ssize_t part_fail_read(struct hd_struct * p, char *page)
+#endif
-{
-        return sprintf(page, "%d\n", p->make_it_fail);
-}
-static struct part_attribute part_attr_fail = {
-        .attr = {.name = "make-it-fail", .mode = S_IRUGO | S_IWUSR },
-        .store  = part_fail_store,
-        .show   = part_fail_read
-};
+static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
+static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+static struct device_attribute dev_attr_fail =
+        __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
 #endif
-static struct attribute * default_attrs[] = {
+static struct attribute *part_attrs[] = {
-        &part_attr_uevent.attr,
+        &dev_attr_start.attr,
-        &part_attr_dev.attr,
+        &dev_attr_size.attr,
-        &part_attr_start.attr,
+        &dev_attr_stat.attr,
-        &part_attr_size.attr,
-        &part_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-        &part_attr_fail.attr,
+        &dev_attr_fail.attr,
 #endif
-        NULL,
+        NULL
 };
-extern struct kset block_subsys;
+static struct attribute_group part_attr_group = {
+        .attrs = part_attrs,
+};
-static void part_release(struct kobject *kobj)
+static struct attribute_group *part_attr_groups[] = {
+        &part_attr_group,
+        NULL
+};
+static void part_release(struct device *dev)
 {
-        struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
+        struct hd_struct *p = dev_to_part(dev);
        kfree(p);
 }
-struct kobj_type ktype_part = {
+struct device_type part_type = {
+        .name           = "partition",
+        .groups         = part_attr_groups,
        .release        = part_release,
-        .default_attrs  = default_attrs,
-        .sysfs_ops      = &part_sysfs_ops,
 };
 static inline void partition_sysfs_add_subdir(struct hd_struct *p)
 {
        struct kobject *k;
-        k = kobject_get(&p->kobj);
+        k = kobject_get(&p->dev.kobj);
-        p->holder_dir = kobject_add_dir(k, "holders");
+        p->holder_dir = kobject_create_and_add("holders", k);
        kobject_put(k);
 }
@@ -343,15 +295,16 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
        struct kobject *k;
-        k = kobject_get(&disk->kobj);
+        k = kobject_get(&disk->dev.kobj);
-        disk->holder_dir = kobject_add_dir(k, "holders");
+        disk->holder_dir = kobject_create_and_add("holders", k);
-        disk->slave_dir = kobject_add_dir(k, "slaves");
+        disk->slave_dir = kobject_create_and_add("slaves", k);
        kobject_put(k);
 }
 void delete_partition(struct gendisk *disk, int part)
 {
        struct hd_struct *p = disk->part[part-1];
        if (!p)
                return;
        if (!p->nr_sects)
@@ -361,113 +314,55 @@ void delete_partition(struct gendisk *disk, int part)
        p->nr_sects = 0;
        p->ios[0] = p->ios[1] = 0;
        p->sectors[0] = p->sectors[1] = 0;
-        sysfs_remove_link(&p->kobj, "subsystem");
+        kobject_put(p->holder_dir);
-        kobject_unregister(p->holder_dir);
+        device_del(&p->dev);
-        kobject_uevent(&p->kobj, KOBJ_REMOVE);
+        put_device(&p->dev);
-        kobject_del(&p->kobj);
-        kobject_put(&p->kobj);
 }
 void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
 {
        struct hd_struct *p;
+        int err;
        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
                return;
-        
        p->start_sect = start;
        p->nr_sects = len;
        p->partno = part;
        p->policy = disk->policy;
-        if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1]))
+        if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
-                kobject_set_name(&p->kobj, "%sp%d",
+                snprintf(p->dev.bus_id, BUS_ID_SIZE,
-                                 kobject_name(&disk->kobj), part);
+                "%sp%d", disk->dev.bus_id, part);
        else
-                kobject_set_name(&p->kobj, "%s%d",
+                snprintf(p->dev.bus_id, BUS_ID_SIZE,
-                                 kobject_name(&disk->kobj),part);
+                         "%s%d", disk->dev.bus_id, part);
-        p->kobj.parent = &disk->kobj;
-        p->kobj.ktype = &ktype_part;
+        device_initialize(&p->dev);
-        kobject_init(&p->kobj);
+        p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
-        kobject_add(&p->kobj);
+        p->dev.class = &block_class;
-        if (!disk->part_uevent_suppress)
+        p->dev.type = &part_type;
-                kobject_uevent(&p->kobj, KOBJ_ADD);
+        p->dev.parent = &disk->dev;
-        sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
+        disk->part[part-1] = p;
+        /* delay uevent until 'holders' subdir is created */
+        p->dev.uevent_suppress = 1;
+        device_add(&p->dev);
+        partition_sysfs_add_subdir(p);
+        p->dev.uevent_suppress = 0;
        if (flags & ADDPART_FLAG_WHOLEDISK) {
                static struct attribute addpartattr = {
                        .name = "whole_disk",
                        .mode = S_IRUSR | S_IRGRP | S_IROTH,
                };
+                err = sysfs_create_file(&p->dev.kobj, &addpartattr);
-                sysfs_create_file(&p->kobj, &addpartattr);
        }
-        partition_sysfs_add_subdir(p);
-        disk->part[part-1] = p;
-}
-static char *make_block_name(struct gendisk *disk)
+        /* suppress uevent if the disk supresses it */
-{
+        if (!disk->dev.uevent_suppress)
-        char *name;
+                kobject_uevent(&p->dev.kobj, KOBJ_ADD);
-        static char *block_str = "block:";
-        int size;
-        char *s;
-        size = strlen(block_str) + strlen(disk->disk_name) + 1;
-        name = kmalloc(size, GFP_KERNEL);
-        if (!name)
-                return NULL;
-        strcpy(name, block_str);
-        strcat(name, disk->disk_name);
-        /* ewww... some of these buggers have / in name... */
-        s = strchr(name, '/');
-        if (s)
-                *s = '!';
-        return name;
-}
-static int disk_sysfs_symlinks(struct gendisk *disk)
-{
-        struct device *target = get_device(disk->driverfs_dev);
-        int err;
-        char *disk_name = NULL;
-        if (target) {
-                disk_name = make_block_name(disk);
-                if (!disk_name) {
-                        err = -ENOMEM;
-                        goto err_out;
-                }
-                err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
-                if (err)
-                        goto err_out_disk_name;
-                err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
-                if (err)
-                        goto err_out_dev_link;
-        }
-        err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
-                                "subsystem");
-        if (err)
-                goto err_out_disk_name_lnk;
-        kfree(disk_name);
-        return 0;
-err_out_disk_name_lnk:
-        if (target) {
-                sysfs_remove_link(&target->kobj, disk_name);
-err_out_dev_link:
-                sysfs_remove_link(&disk->kobj, "device");
-err_out_disk_name:
-                kfree(disk_name);
-err_out:
-                put_device(target);
-        }
-        return err;
 }
 /* Not exported, helper to add_disk(). */
@@ -479,19 +374,29 @@ void register_disk(struct gendisk *disk)
        struct hd_struct *p;
        int err;
-        kobject_set_name(&disk->kobj, "%s", disk->disk_name);
+        disk->dev.parent = disk->driverfs_dev;
-        /* ewww... some of these buggers have / in name... */
+        disk->dev.devt = MKDEV(disk->major, disk->first_minor);
-        s = strchr(disk->kobj.k_name, '/');
+        strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN);
+        /* ewww... some of these buggers have / in the name... */
+        s = strchr(disk->dev.bus_id, '/');
        if (s)
                *s = '!';
-        if ((err = kobject_add(&disk->kobj)))
+        /* delay uevents, until we scanned partition table */
+        disk->dev.uevent_suppress = 1;
+        if (device_add(&disk->dev))
                return;
-        err = disk_sysfs_symlinks(disk);
+#ifndef CONFIG_SYSFS_DEPRECATED
+        err = sysfs_create_link(block_depr, &disk->dev.kobj,
+                                kobject_name(&disk->dev.kobj));
        if (err) {
-                kobject_del(&disk->kobj);
+                device_del(&disk->dev);
                return;
        }
-        disk_sysfs_add_subdirs(disk);
+#endif
+        disk_sysfs_add_subdirs(disk);
        /* No minors to use for partitions */
        if (disk->minors == 1)
@@ -505,25 +410,23 @@ void register_disk(struct gendisk *disk)
        if (!bdev)
                goto exit;
-        /* scan partition table, but suppress uevents */
        bdev->bd_invalidated = 1;
-        disk->part_uevent_suppress = 1;
        err = blkdev_get(bdev, FMODE_READ, 0);
-        disk->part_uevent_suppress = 0;
        if (err < 0)
                goto exit;
        blkdev_put(bdev);
 exit:
-        /* announce disk after possible partitions are already created */
+        /* announce disk after possible partitions are created */
-        kobject_uevent(&disk->kobj, KOBJ_ADD);
+        disk->dev.uevent_suppress = 0;
+        kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
        /* announce possible partitions */
        for (i = 1; i < disk->minors; i++) {
                p = disk->part[i-1];
                if (!p || !p->nr_sects)
                        continue;
-                kobject_uevent(&p->kobj, KOBJ_ADD);
+                kobject_uevent(&p->dev.kobj, KOBJ_ADD);
        }
 }
@@ -602,19 +505,11 @@ void del_gendisk(struct gendisk *disk)
        disk_stat_set_all(disk, 0);
        disk->stamp = 0;
-        kobject_uevent(&disk->kobj, KOBJ_REMOVE);
+        kobject_put(disk->holder_dir);
-        kobject_unregister(disk->holder_dir);
+        kobject_put(disk->slave_dir);
-        kobject_unregister(disk->slave_dir);
+        disk->driverfs_dev = NULL;
-        if (disk->driverfs_dev) {
+#ifndef CONFIG_SYSFS_DEPRECATED
-                char *disk_name = make_block_name(disk);
+        sysfs_remove_link(block_depr, disk->dev.bus_id);
-                sysfs_remove_link(&disk->kobj, "device");
+#endif
-                if (disk_name) {
+        device_del(&disk->dev);
-                        sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name);
-                        kfree(disk_name);
-                }
-                put_device(disk->driverfs_dev);
-                disk->driverfs_dev = NULL;
-        }
-        sysfs_remove_link(&disk->kobj, "subsystem");
-        kobject_del(&disk->kobj);
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 63c95afb561f..b380313092bd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -141,12 +141,7 @@ static const char *task_state_array[] = {
 static inline const char *get_task_state(struct task_struct *tsk)
 {
-        unsigned int state = (tsk->state & (TASK_RUNNING |
+        unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
-                                            TASK_INTERRUPTIBLE |
-                                            TASK_UNINTERRUPTIBLE |
-                                            TASK_STOPPED |
-                                            TASK_TRACED)) |
-                                           tsk->exit_state;
        const char **p = &task_state_array[0];
        while (state) {
@@ -169,7 +164,7 @@ static inline char *task_state(struct task_struct *p, char *buffer)
        ppid = pid_alive(p) ?
                task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
        tpid = pid_alive(p) && p->ptrace ?
-                task_ppid_nr_ns(rcu_dereference(p->parent), ns) : 0;
+                task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
        buffer += sprintf(buffer,
                "State:\t%s\n"
                "Tgid:\t%d\n"
@@ -358,7 +353,8 @@ static cputime_t task_utime(struct task_struct *p)
        }
        utime = (clock_t)temp;
-        return clock_t_to_cputime(utime);
+        p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+        return p->prev_utime;
 }
 static cputime_t task_stime(struct task_struct *p)
@@ -373,7 +369,10 @@ static cputime_t task_stime(struct task_struct *p)
        stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
                        cputime_to_clock_t(task_utime(p));
-        return clock_t_to_cputime(stime);
+        if (stime >= 0)
+                p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+        return p->prev_stime;
 }
 #endif
@@ -460,8 +459,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
                }
                sid = task_session_nr_ns(task, ns);
+                ppid = task_tgid_nr_ns(task->real_parent, ns);
                pgid = task_pgrp_nr_ns(task, ns);
-                ppid = task_ppid_nr_ns(task, ns);
                unlock_task_sighand(task, &flags);
        }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aeaf0d0f2f51..33537487f5ab 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -199,9 +199,29 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
        (task == current || \
        (task->parent == current && \
        (task->ptrace & PT_PTRACED) && \
-         (task->state == TASK_STOPPED || task->state == TASK_TRACED) && \
+         (task_is_stopped_or_traced(task)) && \
         security_ptrace(current,task) == 0))
+struct mm_struct *mm_for_maps(struct task_struct *task)
+{
+        struct mm_struct *mm = get_task_mm(task);
+        if (!mm)
+                return NULL;
+        down_read(&mm->mmap_sem);
+        task_lock(task);
+        if (task->mm != mm)
+                goto out;
+        if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+                goto out;
+        task_unlock(task);
+        return mm;
+out:
+        task_unlock(task);
+        up_read(&mm->mmap_sem);
+        mmput(mm);
+        return NULL;
+}
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 {
        int res = 0;
@@ -290,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 }
 #endif
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+        int i;
+        struct task_struct *task = m->private;
+        seq_puts(m, "Latency Top version : v0.1\n");
+        for (i = 0; i < 32; i++) {
+                if (task->latency_record[i].backtrace[0]) {
+                        int q;
+                        seq_printf(m, "%i %li %li ",
+                                task->latency_record[i].count,
+                                task->latency_record[i].time,
+                                task->latency_record[i].max);
+                        for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+                                char sym[KSYM_NAME_LEN];
+                                char *c;
+                                if (!task->latency_record[i].backtrace[q])
+                                        break;
+                                if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+                                        break;
+                                sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+                                c = strchr(sym, '+');
+                                if (c)
+                                        *c = 0;
+                                seq_printf(m, "%s ", sym);
+                        }
+                        seq_printf(m, "\n");
+                }
+        }
+        return 0;
+}
+static int lstats_open(struct inode *inode, struct file *file)
+{
+        int ret;
+        struct seq_file *m;
+        struct task_struct *task = get_proc_task(inode);
+        ret = single_open(file, lstats_show_proc, NULL);
+        if (!ret) {
+                m = file->private_data;
+                m->private = task;
+        }
+        return ret;
+}
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+                            size_t count, loff_t *offs)
+{
+        struct seq_file *m;
+        struct task_struct *task;
+        m = file->private_data;
+        task = m->private;
+        clear_all_latency_tracing(task);
+        return count;
+}
+static const struct file_operations proc_lstats_operations = {
+        .open           = lstats_open,
+        .read           = seq_read,
+        .write          = lstats_write,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
+#endif
 /* The badness from the OOM killer */
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -893,7 +984,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
        if (!task)
                return -ESRCH;
        length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
-                                audit_get_loginuid(task->audit_context));
+                                audit_get_loginuid(task));
        put_task_struct(task);
        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 }
@@ -1000,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
 };
 #endif
 #ifdef CONFIG_SCHED_DEBUG
 /*
 * Print out various scheduling related per-task fields:
@@ -2210,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
        INF("schedstat",  S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+        REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
        REG("cpuset",     S_IRUGO, cpuset),
 #endif
@@ -2328,21 +2423,18 @@ out:
 void proc_flush_task(struct task_struct *task)
 {
-        int i, leader;
+        int i;
-        struct pid *pid, *tgid;
+        struct pid *pid, *tgid = NULL;
        struct upid *upid;
-        leader = thread_group_leader(task);
-        proc_flush_task_mnt(proc_mnt, task->pid, leader ? task->tgid : 0);
        pid = task_pid(task);
-        if (pid->level == 0)
+        if (thread_group_leader(task))
-                return;
+                tgid = task_tgid(task);
-        tgid = task_tgid(task);
+        for (i = 0; i <= pid->level; i++) {
-        for (i = 1; i <= pid->level; i++) {
                upid = &pid->numbers[i];
                proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
-                                leader ? 0 : tgid->numbers[i].nr);
+                        tgid ? tgid->numbers[i].nr : 0);
        }
        upid = &pid->numbers[pid->level];
@@ -2414,19 +2506,23 @@ out:
 * Find the first task with tgid >= tgid
 *
 */
-static struct task_struct *next_tgid(unsigned int tgid,
+struct tgid_iter {
-                struct pid_namespace *ns)
+        unsigned int tgid;
-{
        struct task_struct *task;
+};
+static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
+{
        struct pid *pid;
+        if (iter.task)
+                put_task_struct(iter.task);
        rcu_read_lock();
 retry:
-        task = NULL;
+        iter.task = NULL;
-        pid = find_ge_pid(tgid, ns);
+        pid = find_ge_pid(iter.tgid, ns);
        if (pid) {
-                tgid = pid_nr_ns(pid, ns) + 1;
+                iter.tgid = pid_nr_ns(pid, ns);
-                task = pid_task(pid, PIDTYPE_PID);
+                iter.task = pid_task(pid, PIDTYPE_PID);
                /* What we to know is if the pid we have find is the
                 * pid of a thread_group_leader.  Testing for task
                 * being a thread_group_leader is the obvious thing
@@ -2439,23 +2535,25 @@ retry:
                 * found doesn't happen to be a thread group leader.
                 * As we don't care in the case of readdir.
                 */
-                if (!task || !has_group_leader_pid(task))
+                if (!iter.task || !has_group_leader_pid(iter.task)) {
+                        iter.tgid += 1;
                        goto retry;
-                get_task_struct(task);
+                }
+                get_task_struct(iter.task);
        }
        rcu_read_unlock();
-        return task;
+        return iter;
 }
 #define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-        struct task_struct *task, int tgid)
+        struct tgid_iter iter)
 {
        char name[PROC_NUMBUF];
-        int len = snprintf(name, sizeof(name), "%d", tgid);
+        int len = snprintf(name, sizeof(name), "%d", iter.tgid);
        return proc_fill_cache(filp, dirent, filldir, name, len,
-                                proc_pid_instantiate, task, NULL);
+                                proc_pid_instantiate, iter.task, NULL);
 }
 /* for the /proc/ directory itself, after non-process stuff has been done */
@@ -2463,8 +2561,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
        unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
        struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
-        struct task_struct *task;
+        struct tgid_iter iter;
-        int tgid;
        struct pid_namespace *ns;
        if (!reaper)
@@ -2477,14 +2574,14 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
        }
        ns = filp->f_dentry->d_sb->s_fs_info;
-        tgid = filp->f_pos - TGID_OFFSET;
+        iter.task = NULL;
-        for (task = next_tgid(tgid, ns);
+        iter.tgid = filp->f_pos - TGID_OFFSET;
-             task;
+        for (iter = next_tgid(ns, iter);
-             put_task_struct(task), task = next_tgid(tgid + 1, ns)) {
+             iter.task;
-                tgid = task_pid_nr_ns(task, ns);
+             iter.tgid += 1, iter = next_tgid(ns, iter)) {
-                filp->f_pos = tgid + TGID_OFFSET;
+                filp->f_pos = iter.tgid + TGID_OFFSET;
-                if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
+                if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
-                        put_task_struct(task);
+                        put_task_struct(iter.task);
                        goto out;
                }
        }
@@ -2533,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
        INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+        REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
        REG("cpuset",    S_IRUGO, cpuset),
 #endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 1bdb62435758..6a2fe5187b62 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -397,8 +397,11 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
                        if (de->namelen != dentry->d_name.len)
                                continue;
                        if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
-                                unsigned int ino = de->low_ino;
+                                unsigned int ino;
+                                if (de->shadow_proc)
+                                        de = de->shadow_proc(current, de);
+                                ino = de->low_ino;
                                de_get(de);
                                spin_unlock(&proc_subdir_lock);
                                error = -EINVAL;
@@ -555,36 +558,6 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
        return 0;
 }
-/*
- * Kill an inode that got unregistered..
- */
-static void proc_kill_inodes(struct proc_dir_entry *de)
-{
-        struct list_head *p;
-        struct super_block *sb = proc_mnt->mnt_sb;
-        /*
-         * Actually it's a partial revoke().
-         */
-        file_list_lock();
-        list_for_each(p, &sb->s_files) {
-                struct file * filp = list_entry(p, struct file, f_u.fu_list);
-                struct dentry * dentry = filp->f_path.dentry;
-                struct inode * inode;
-                const struct file_operations *fops;
-                if (dentry->d_op != &proc_dentry_operations)
-                        continue;
-                inode = dentry->d_inode;
-                if (PDE(inode) != de)
-                        continue;
-                fops = filp->f_op;
-                filp->f_op = NULL;
-                fops_put(fops);
-        }
-        file_list_unlock();
-}
 static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
                                          const char *name,
                                          mode_t mode,
@@ -615,6 +588,7 @@ static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
        ent->namelen = len;
        ent->mode = mode;
        ent->nlink = nlink;
+        atomic_set(&ent->count, 1);
        ent->pde_users = 0;
        spin_lock_init(&ent->pde_unload_lock);
        ent->pde_unload_completion = NULL;
@@ -712,7 +686,6 @@ void free_proc_entry(struct proc_dir_entry *de)
 /*
 * Remove a /proc entry and free it if it's not currently in use.
- * If it is in use, we set the 'deleted' flag.
 */
 void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 {
@@ -759,17 +732,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 continue_removing:
                if (S_ISDIR(de->mode))
                        parent->nlink--;
-                if (!S_ISREG(de->mode))
-                        proc_kill_inodes(de);
                de->nlink = 0;
                WARN_ON(de->subdir);
-                if (!atomic_read(&de->count))
+                if (atomic_dec_and_test(&de->count))
                        free_proc_entry(de);
-                else {
-                        de->deleted = 1;
-                        printk("remove_proc_entry: %s/%s busy, count=%d\n",
-                                parent->name, de->name, atomic_read(&de->count));
-                }
                break;
        }
        spin_unlock(&proc_subdir_lock);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index abe6a3f04368..1a551d92e1d8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,13 +43,8 @@ void de_put(struct proc_dir_entry *de)
                        return;
                }
-                if (atomic_dec_and_test(&de->count)) {
+                if (atomic_dec_and_test(&de->count))
-                        if (de->deleted) {
+                        free_proc_entry(de);
-                                printk("de_put: deferred delete of %s\n",
-                                        de->name);
-                                free_proc_entry(de);
-                        }
-                }               
                unlock_kernel();
        }
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1820eb2ef762..05b3e9006262 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -27,6 +27,8 @@ struct vmalloc_info {
        unsigned long   largest_chunk;
 };
+extern struct mm_struct *mm_for_maps(struct task_struct *);
 #ifdef CONFIG_MMU
 #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
 extern void get_vmalloc_info(struct vmalloc_info *vmi);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e0d064e9764e..3462bfde89f6 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -410,7 +410,7 @@ static const struct file_operations proc_modules_operations = {
 };
 #endif
-#ifdef CONFIG_SLAB
+#ifdef CONFIG_SLABINFO
 static int slabinfo_open(struct inode *inode, struct file *file)
 {
        return seq_open(file, &slabinfo_op);
@@ -728,7 +728,7 @@ void __init proc_misc_init(void)
 #endif
        create_seq_entry("stat", 0, &proc_stat_operations);
        create_seq_entry("interrupts", 0, &proc_interrupts_operations);
-#ifdef CONFIG_SLAB
+#ifdef CONFIG_SLABINFO
        create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
 #ifdef CONFIG_DEBUG_SLAB_LEAK
        create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 749def054a34..4823c9677fac 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -22,16 +22,47 @@
 #include <linux/mount.h>
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
+#include <linux/seq_file.h>
 #include "internal.h"
-struct proc_dir_entry *proc_net_create(struct net *net,
+int seq_open_net(struct inode *ino, struct file *f,
-        const char *name, mode_t mode, get_info_t *get_info)
+                 const struct seq_operations *ops, int size)
 {
-        return create_proc_info_entry(name,mode, net->proc_net, get_info);
+        struct net *net;
+        struct seq_net_private *p;
+        BUG_ON(size < sizeof(*p));
+        net = get_proc_net(ino);
+        if (net == NULL)
+                return -ENXIO;
+        p = __seq_open_private(f, ops, size);
+        if (p == NULL) {
+                put_net(net);
+                return -ENOMEM;
+        }
+        p->net = net;
+        return 0;
 }
-EXPORT_SYMBOL_GPL(proc_net_create);
+EXPORT_SYMBOL_GPL(seq_open_net);
+int seq_release_net(struct inode *ino, struct file *f)
+{
+        struct seq_file *seq;
+        struct seq_net_private *p;
+        seq = f->private_data;
+        p = seq->private;
+        put_net(p->net);
+        seq_release_private(ino, f);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(seq_release_net);
 struct proc_dir_entry *proc_net_fops_create(struct net *net,
        const char *name, mode_t mode, const struct file_operations *fops)
@@ -57,88 +88,24 @@ struct net *get_proc_net(const struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(get_proc_net);
-static struct proc_dir_entry *proc_net_shadow;
+static struct proc_dir_entry *shadow_pde;
-static struct dentry *proc_net_shadow_dentry(struct dentry *parent,
+static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
                                                struct proc_dir_entry *de)
 {
-        struct dentry *shadow = NULL;
+        return task->nsproxy->net_ns->proc_net;
-        struct inode *inode;
-        if (!de)
-                goto out;
-        de_get(de);
-        inode = proc_get_inode(parent->d_inode->i_sb, de->low_ino, de);
-        if (!inode)
-                goto out_de_put;
-        shadow = d_alloc_name(parent, de->name);
-        if (!shadow)
-                goto out_iput;
-        shadow->d_op = parent->d_op; /* proc_dentry_operations */
-        d_instantiate(shadow, inode);
-out:
-        return shadow;
-out_iput:
-        iput(inode);
-out_de_put:
-        de_put(de);
-        goto out;
-}
-static void *proc_net_follow_link(struct dentry *parent, struct nameidata *nd)
-{
-        struct net *net = current->nsproxy->net_ns;
-        struct dentry *shadow;
-        shadow = proc_net_shadow_dentry(parent, net->proc_net);
-        if (!shadow)
-                return ERR_PTR(-ENOENT);
-        dput(nd->dentry);
-        /* My dentry count is 1 and that should be enough as the
-         * shadow dentry is thrown away immediately.
-         */
-        nd->dentry = shadow;
-        return NULL;
 }
-static struct dentry *proc_net_lookup(struct inode *dir, struct dentry *dentry,
+struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
-                                      struct nameidata *nd)
+                struct proc_dir_entry *parent)
 {
-        struct net *net = current->nsproxy->net_ns;
+        struct proc_dir_entry *pde;
-        struct dentry *shadow;
+        pde = proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
+        if (pde != NULL)
-        shadow = proc_net_shadow_dentry(nd->dentry, net->proc_net);
+                pde->data = net;
-        if (!shadow)
+        return pde;
-                return ERR_PTR(-ENOENT);
-        dput(nd->dentry);
-        nd->dentry = shadow;
-        return shadow->d_inode->i_op->lookup(shadow->d_inode, dentry, nd);
 }
+EXPORT_SYMBOL_GPL(proc_net_mkdir);
-static int proc_net_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-        struct net *net = current->nsproxy->net_ns;
-        struct dentry *shadow;
-        int ret;
-        shadow = proc_net_shadow_dentry(dentry->d_parent, net->proc_net);
-        if (!shadow)
-                return -ENOENT;
-        ret = shadow->d_inode->i_op->setattr(shadow, iattr);
-        dput(shadow);
-        return ret;
-}
-static const struct file_operations proc_net_dir_operations = {
-        .read                   = generic_read_dir,
-};
-static struct inode_operations proc_net_dir_inode_operations = {
-        .follow_link    = proc_net_follow_link,
-        .lookup         = proc_net_lookup,
-        .setattr        = proc_net_setattr,
-};
 static __net_init int proc_net_ns_init(struct net *net)
 {
@@ -151,18 +118,16 @@ static __net_init int proc_net_ns_init(struct net *net)
                goto out;
        err = -EEXIST;
-        netd = proc_mkdir("net", root);
+        netd = proc_net_mkdir(net, "net", root);
        if (!netd)
                goto free_root;
        err = -EEXIST;
-        net_statd = proc_mkdir("stat", netd);
+        net_statd = proc_net_mkdir(net, "stat", netd);
        if (!net_statd)
                goto free_net;
        root->data = net;
-        netd->data = net;
-        net_statd->data = net;
        net->proc_net_root = root;
        net->proc_net = netd;
@@ -185,16 +150,15 @@ static __net_exit void proc_net_ns_exit(struct net *net)
        kfree(net->proc_net_root);
 }
-static struct pernet_operations proc_net_ns_ops = {
+static struct pernet_operations __net_initdata proc_net_ns_ops = {
        .init = proc_net_ns_init,
        .exit = proc_net_ns_exit,
 };
 int __init proc_net_init(void)
 {
-        proc_net_shadow = proc_mkdir("net", NULL);
+        shadow_pde = proc_mkdir("net", NULL);
-        proc_net_shadow->proc_iops = &proc_net_dir_inode_operations;
+        shadow_pde->shadow_proc = proc_net_shadow;
-        proc_net_shadow->proc_fops = &proc_net_dir_operations;
        return register_pernet_subsys(&proc_net_ns_ops);
 }
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ec9cb3b6c93b..81f99e691f99 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -207,6 +207,7 @@ struct proc_dir_entry proc_root = {
        .name           = "/proc",
        .mode           = S_IFDIR | S_IRUGO | S_IXUGO, 
        .nlink          = 2, 
+        .count          = ATOMIC_INIT(1),
        .proc_iops      = &proc_root_inode_operations, 
        .proc_fops      = &proc_root_operations,
        .parent         = &proc_root,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c24d81a5a040..8043a3eab52c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,12 +397,11 @@ static void *m_start(struct seq_file *m, loff_t *pos)
        if (!priv->task)
                return NULL;
-        mm = get_task_mm(priv->task);
+        mm = mm_for_maps(priv->task);
        if (!mm)
                return NULL;
        priv->tail_vma = tail_vma = get_gate_vma(priv->task);
-        down_read(&mm->mmap_sem);
        /* Start with last addr hint */
        if (last_addr && (vma = find_vma(mm, last_addr))) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index d8b8c7183c24..1932c2ca3457 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -165,15 +165,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
        if (!priv->task)
                return NULL;
-        mm = get_task_mm(priv->task);
+        mm = mm_for_maps(priv->task);
        if (!mm) {
                put_task_struct(priv->task);
                priv->task = NULL;
                return NULL;
        }
-        down_read(&mm->mmap_sem);
        /* start from the Nth VMA */
        for (vml = mm->context.vmlist; vml; vml = vml->next)
                if (n-- == 0)
diff --git a/fs/read_write.c b/fs/read_write.c
index 124693e8d3fa..1c177f29e1b7 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -197,25 +197,27 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 {
        struct inode *inode;
        loff_t pos;
+        int retval = -EINVAL;
        inode = file->f_path.dentry->d_inode;
        if (unlikely((ssize_t) count < 0))
-                goto Einval;
+                return retval;
        pos = *ppos;
        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-                goto Einval;
+                return retval;
        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
-                int retval = locks_mandatory_area(
+                retval = locks_mandatory_area(
                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
                        inode, file, pos, count);
                if (retval < 0)
                        return retval;
        }
+        retval = security_file_permission(file,
+                                read_write == READ ? MAY_READ : MAY_WRITE);
+        if (retval)
+                return retval;
        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
-Einval:
-        return -EINVAL;
 }
 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
@@ -267,18 +269,15 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
        ret = rw_verify_area(READ, file, pos, count);
        if (ret >= 0) {
                count = ret;
-                ret = security_file_permission (file, MAY_READ);
+                if (file->f_op->read)
-                if (!ret) {
+                        ret = file->f_op->read(file, buf, count, pos);
-                        if (file->f_op->read)
+                else
-                                ret = file->f_op->read(file, buf, count, pos);
+                        ret = do_sync_read(file, buf, count, pos);
-                        else
+                if (ret > 0) {
-                                ret = do_sync_read(file, buf, count, pos);
+                        fsnotify_access(file->f_path.dentry);
-                        if (ret > 0) {
+                        add_rchar(current, ret);
-                                fsnotify_access(file->f_path.dentry);
-                                add_rchar(current, ret);
-                        }
-                        inc_syscr(current);
                }
+                inc_syscr(current);
        }
        return ret;
@@ -325,18 +324,15 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
        ret = rw_verify_area(WRITE, file, pos, count);
        if (ret >= 0) {
                count = ret;
-                ret = security_file_permission (file, MAY_WRITE);
+                if (file->f_op->write)
-                if (!ret) {
+                        ret = file->f_op->write(file, buf, count, pos);
-                        if (file->f_op->write)
+                else
-                                ret = file->f_op->write(file, buf, count, pos);
+                        ret = do_sync_write(file, buf, count, pos);
-                        else
+                if (ret > 0) {
-                                ret = do_sync_write(file, buf, count, pos);
+                        fsnotify_modify(file->f_path.dentry);
-                        if (ret > 0) {
+                        add_wchar(current, ret);
-                                fsnotify_modify(file->f_path.dentry);
-                                add_wchar(current, ret);
-                        }
-                        inc_syscw(current);
                }
+                inc_syscw(current);
        }
        return ret;
@@ -370,7 +366,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
        return ret;
 }
-EXPORT_SYMBOL_GPL(sys_read);
+EXPORT_UNUSED_SYMBOL_GPL(sys_read); /* to be deleted for 2.6.25 */
 asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
 {
@@ -450,6 +446,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
        }
        return seg;
 }
+EXPORT_SYMBOL(iov_shorten);
 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
@@ -603,9 +600,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
        ret = rw_verify_area(type, file, pos, tot_len);
        if (ret < 0)
                goto out;
-        ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
-        if (ret)
-                goto out;
        fnv = NULL;
        if (type == READ) {
@@ -737,10 +731,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                goto fput_in;
        count = retval;
-        retval = security_file_permission (in_file, MAY_READ);
-        if (retval)
-                goto fput_in;
        /*
         * Get output file, and verify that it is ok..
         */
@@ -759,10 +749,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                goto fput_out;
        count = retval;
-        retval = security_file_permission (out_file, MAY_WRITE);
-        if (retval)
-                goto fput_out;
        if (!max)
                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
diff --git a/fs/readdir.c b/fs/readdir.c
index efe52e676577..4e026e5407fb 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -30,7 +30,10 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
        if (res)
                goto out;
-        mutex_lock(&inode->i_mutex);
+        res = mutex_lock_killable(&inode->i_mutex);
+        if (res)
+                goto out;
        res = -ENOENT;
        if (!IS_DEADDIR(inode)) {
                res = file->f_op->readdir(file, buf, filler);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index c438a8f83f26..e0f0f098a523 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -57,6 +57,9 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                        if (get_user(flags, (int __user *)arg))
                                return -EFAULT;
+                        /* Is it quota file? Do not allow user to mess with it. */
+                        if (IS_NOQUOTA(inode))
+                                return -EPERM;
                        if (((flags ^ REISERFS_I(inode)->
                              i_attrs) & (REISERFS_IMMUTABLE_FL |
                                          REISERFS_APPEND_FL))
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9aa7a06e093f..001144621672 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -420,12 +420,6 @@ static void *r_start(struct seq_file *m, loff_t * pos)
                return NULL;
        up_write(&s->s_umount);
-        if (de->deleted) {
-                deactivate_super(s);
-                return NULL;
-        }
        return s;
 }
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index ca41567d7890..d2db2417b2bd 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1458,9 +1458,6 @@ static void unmap_buffers(struct page *page, loff_t pos)
                                }
                                bh = next;
                        } while (bh != head);
-                        if (PAGE_SIZE == bh->b_size) {
-                                cancel_dirty_page(page, PAGE_CACHE_SIZE);
-                        }
                }
        }
 }
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
index 6673ee82cb4c..4faf8c4722c3 100644
--- a/fs/smbfs/Makefile
+++ b/fs/smbfs/Makefile
@@ -16,23 +16,3 @@ EXTRA_CFLAGS += -DSMBFS_PARANOIA
 #EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
 #EXTRA_CFLAGS += -Werror
-#
-# Maintainer rules
-#
-# getopt.c not included. It is intentionally separate
-SRC = proc.c dir.c cache.c sock.c inode.c file.c ioctl.c smbiod.c request.c \
-        symlink.c
-proto:
-        -rm -f proto.h
-        @echo >  proto2.h "/*"
-        @echo >> proto2.h " *  Autogenerated with cproto on: " `date`
-        @echo >> proto2.h " */"
-        @echo >> proto2.h ""
-        @echo >> proto2.h "struct smb_request;"
-        @echo >> proto2.h "struct sock;"
-        @echo >> proto2.h "struct statfs;"
-        @echo >> proto2.h ""
-        cproto -E "gcc -E" -e -v -I $(TOPDIR)/include -DMAKING_PROTO -D__KERNEL__ $(SRC) >> proto2.h
-        mv proto2.h proto.h
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index f5d14cebc75a..efbe29af3d7a 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -234,7 +234,7 @@ smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
                (long)dentry->d_inode->i_size,
-                dentry->d_inode->i_flags, dentry->d_inode->i_atime);
+                dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
        status = generic_file_aio_read(iocb, iov, nr_segs, pos);
 out:
@@ -269,7 +269,7 @@ smb_file_splice_read(struct file *file, loff_t *ppos,
        struct dentry *dentry = file->f_path.dentry;
        ssize_t status;
-        VERBOSE("file %s/%s, pos=%Ld, count=%d\n",
+        VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
                DENTRY_PATH(dentry), *ppos, count);
        status = smb_revalidate_inode(dentry);
@@ -363,7 +363,8 @@ smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                result = generic_file_aio_write(iocb, iov, nr_segs, pos);
                VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
                        (long) file->f_pos, (long) dentry->d_inode->i_size,
-                        dentry->d_inode->i_mtime, dentry->d_inode->i_atime);
+                        dentry->d_inode->i_mtime.tv_sec,
+                        dentry->d_inode->i_atime.tv_sec);
        }
 out:
        return result;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index ab517755ece0..9416ead0c7aa 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -536,7 +536,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
        /* Allocate the global temp buffer and some superblock helper structs */
        /* FIXME: move these to the smb_sb_info struct */
-        VERBOSE("alloc chunk = %d\n", sizeof(struct smb_ops) +
+        VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
                sizeof(struct smb_mount_data_kernel));
        mem = kmalloc(sizeof(struct smb_ops) +
                      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index feac46050619..d517a27b7f4b 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -2593,7 +2593,7 @@ smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
        fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
        fattr->f_mtime.tv_nsec = 0;
        VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
-                mask, date, time, fattr->f_mtime);
+                mask, date, time, fattr->f_mtime.tv_sec);
        fattr->f_size = DVAL(req->rq_data, 12);
        /* ULONG allocation size */
        fattr->attr = WVAL(req->rq_data, 20);
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index ca4b2d59c0ca..45f45933e862 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -105,7 +105,7 @@ struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
                if (nfs_try_to_free_pages(server))
                        continue;
-                if (signalled() && (server->flags & NFS_MOUNT_INTR))
+                if (fatal_signal_pending(current))
                        return ERR_PTR(-ERESTARTSYS);
                current->policy = SCHED_YIELD;
                schedule();
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 283c5720c9de..fae8e85af0ed 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -227,7 +227,7 @@ int smbiod_retry(struct smb_sb_info *server)
                printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
                goto out;
        }
-        VERBOSE("signalled pid %d\n", pid);
+        VERBOSE("signalled pid %d\n", pid_nr(pid));
        /* FIXME: The retried requests should perhaps get a "time boost". */
diff --git a/fs/splice.c b/fs/splice.c
index 6bdcb6107bc3..4ee49e86edde 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -254,11 +254,16 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
        }
        while (page_nr < spd_pages)
-                page_cache_release(spd->pages[page_nr++]);
+                spd->spd_release(spd, page_nr++);
        return ret;
 }
+static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
+{
+        page_cache_release(spd->pages[i]);
+}
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
                           struct pipe_inode_info *pipe, size_t len,
@@ -277,6 +282,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                .partial = partial,
                .flags = flags,
                .ops = &page_cache_pipe_buf_ops,
+                .spd_release = spd_release_page,
        };
        index = *ppos >> PAGE_CACHE_SHIFT;
@@ -908,10 +914,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
        if (unlikely(ret < 0))
                return ret;
-        ret = security_file_permission(out, MAY_WRITE);
-        if (unlikely(ret < 0))
-                return ret;
        return out->f_op->splice_write(pipe, out, ppos, len, flags);
 }
@@ -934,10 +936,6 @@ static long do_splice_to(struct file *in, loff_t *ppos,
        if (unlikely(ret < 0))
                return ret;
-        ret = security_file_permission(in, MAY_READ);
-        if (unlikely(ret < 0))
-                return ret;
        return in->f_op->splice_read(in, ppos, pipe, len, flags);
 }
@@ -1033,7 +1031,9 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                        goto out_release;
        }
+done:
        pipe->nrbufs = pipe->curbuf = 0;
+        file_accessed(in);
        return bytes;
 out_release:
@@ -1049,16 +1049,11 @@ out_release:
                        buf->ops = NULL;
                }
        }
-        pipe->nrbufs = pipe->curbuf = 0;
-        /*
+        if (!bytes)
-         * If we transferred some data, return the number of bytes:
+                bytes = ret;
-         */
-        if (bytes > 0)
-                return bytes;
-        return ret;
+        goto done;
 }
 EXPORT_SYMBOL(splice_direct_to_actor);
@@ -1440,6 +1435,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
                .partial = partial,
                .flags = flags,
                .ops = &user_page_pipe_buf_ops,
+                .spd_release = spd_release_page,
        };
        pipe = pipe_info(file->f_path.dentry->d_inode);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7a8ce9e98b32..4948d9bc405d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -132,7 +132,7 @@ struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
 *      RETURNS:
 *      Pointer to @sd on success, NULL on failure.
 */
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
        if (unlikely(!sd))
                return NULL;
@@ -161,7 +161,7 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 *      Put an active reference to @sd.  This function is noop if @sd
 *      is NULL.
 */
-void sysfs_put_active(struct sysfs_dirent *sd)
+static void sysfs_put_active(struct sysfs_dirent *sd)
 {
        struct completion *cmpl;
        int v;
@@ -440,7 +440,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 /**
 *      sysfs_remove_one - remove sysfs_dirent from parent
 *      @acxt: addrm context to use
- *      @sd: sysfs_dirent to be added
+ *      @sd: sysfs_dirent to be removed
 *
 *      Mark @sd removed and drop nlink of parent inode if @sd is a
 *      directory.  @sd is unlinked from the children list.
@@ -678,8 +678,10 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
        sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
        /* no such entry */
-        if (!sd)
+        if (!sd) {
+                ret = ERR_PTR(-ENOENT);
                goto out_unlock;
+        }
        /* attach dentry and inode */
        inode = sysfs_get_inode(sd);
@@ -781,6 +783,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
        old_dentry = sysfs_get_dentry(sd);
        if (IS_ERR(old_dentry)) {
                error = PTR_ERR(old_dentry);
+                old_dentry = NULL;
                goto out;
        }
@@ -848,6 +851,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
        old_dentry = sysfs_get_dentry(sd);
        if (IS_ERR(old_dentry)) {
                error = PTR_ERR(old_dentry);
+                old_dentry = NULL;
                goto out;
        }
        old_parent = old_dentry->d_parent;
@@ -855,6 +859,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
        new_parent = sysfs_get_dentry(new_parent_sd);
        if (IS_ERR(new_parent)) {
                error = PTR_ERR(new_parent);
+                new_parent = NULL;
                goto out;
        }
@@ -878,7 +883,6 @@ again:
        error = 0;
        d_add(new_dentry, NULL);
        d_move(old_dentry, new_dentry);
-        dput(new_dentry);
        /* Remove from old parent's list and insert into new parent's list. */
        sysfs_unlink_sibling(sd);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 27d1785b7644..a271c87c4472 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -20,43 +20,6 @@
 #include "sysfs.h"
-#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
-/*
- * Subsystem file operations.
- * These operations allow subsystems to have files that can be 
- * read/written. 
- */
-static ssize_t 
-subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
-{
-        struct kset *kset = to_kset(kobj);
-        struct subsys_attribute * sattr = to_sattr(attr);
-        ssize_t ret = -EIO;
-        if (sattr->show)
-                ret = sattr->show(kset, page);
-        return ret;
-}
-static ssize_t 
-subsys_attr_store(struct kobject * kobj, struct attribute * attr, 
-                  const char * page, size_t count)
-{
-        struct kset *kset = to_kset(kobj);
-        struct subsys_attribute * sattr = to_sattr(attr);
-        ssize_t ret = -EIO;
-        if (sattr->store)
-                ret = sattr->store(kset, page, count);
-        return ret;
-}
-static struct sysfs_ops subsys_sysfs_ops = {
-        .show   = subsys_attr_show,
-        .store  = subsys_attr_store,
-};
 /*
 * There's one sysfs_buffer for each open file and one
 * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -66,7 +29,7 @@ static struct sysfs_ops subsys_sysfs_ops = {
 * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open
 * is protected by sysfs_open_dirent_lock.
 */
-static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
 struct sysfs_open_dirent {
        atomic_t                refcnt;
@@ -119,7 +82,11 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
        sysfs_put_active_two(attr_sd);
-        BUG_ON(count > (ssize_t)PAGE_SIZE);
+        /*
+         * The code works fine with PAGE_SIZE return but it's likely to
+         * indicate truncated result or overflow in normal use cases.
+         */
+        BUG_ON(count >= (ssize_t)PAGE_SIZE);
        if (count >= 0) {
                buffer->needs_read_fill = 0;
                buffer->count = count;
@@ -350,31 +317,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 {
        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
        struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-        struct sysfs_buffer * buffer;
+        struct sysfs_buffer *buffer;
-        struct sysfs_ops * ops = NULL;
+        struct sysfs_ops *ops;
-        int error;
+        int error = -EACCES;
        /* need attr_sd for attr and ops, its parent for kobj */
        if (!sysfs_get_active_two(attr_sd))
                return -ENODEV;
-        /* if the kobject has no ktype, then we assume that it is a subsystem
+        /* every kobject with an attribute needs a ktype assigned */
-         * itself, and use ops for it.
+        if (kobj->ktype && kobj->ktype->sysfs_ops)
-         */
-        if (kobj->kset && kobj->kset->ktype)
-                ops = kobj->kset->ktype->sysfs_ops;
-        else if (kobj->ktype)
                ops = kobj->ktype->sysfs_ops;
-        else
+        else {
-                ops = &subsys_sysfs_ops;
+                printk(KERN_ERR "missing sysfs attribute operations for "
+                       "kobject: %s\n", kobject_name(kobj));
-        error = -EACCES;
+                WARN_ON(1);
-        /* No sysfs operations, either from having no subsystem,
-         * or the subsystem have no operations.
-         */
-        if (!ops)
                goto err_out;
+        }
        /* File needs write support.
         * The inode's perms must say it's ok, 
@@ -564,7 +523,11 @@ int sysfs_add_file_to_group(struct kobject *kobj,
        struct sysfs_dirent *dir_sd;
        int error;
-        dir_sd = sysfs_get_dirent(kobj->sd, group);
+        if (group)
+                dir_sd = sysfs_get_dirent(kobj->sd, group);
+        else
+                dir_sd = sysfs_get(kobj->sd);
        if (!dir_sd)
                return -ENOENT;
@@ -652,7 +615,10 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 {
        struct sysfs_dirent *dir_sd;
-        dir_sd = sysfs_get_dirent(kobj->sd, group);
+        if (group)
+                dir_sd = sysfs_get_dirent(kobj->sd, group);
+        else
+                dir_sd = sysfs_get(kobj->sd);
        if (dir_sd) {
                sysfs_hash_and_remove(dir_sd, attr->name);
                sysfs_put(dir_sd);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index d1972374655a..0871c3dadce1 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -16,25 +16,31 @@
 #include "sysfs.h"
-static void remove_files(struct sysfs_dirent *dir_sd,
+static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
                         const struct attribute_group *grp)
 {
        struct attribute *const* attr;
+        int i;
-        for (attr = grp->attrs; *attr; attr++)
+        for (i = 0, attr = grp->attrs; *attr; i++, attr++)
-                sysfs_hash_and_remove(dir_sd, (*attr)->name);
+                if (!grp->is_visible ||
+                    grp->is_visible(kobj, *attr, i))
+                        sysfs_hash_and_remove(dir_sd, (*attr)->name);
 }
-static int create_files(struct sysfs_dirent *dir_sd,
+static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
                        const struct attribute_group *grp)
 {
        struct attribute *const* attr;
-        int error = 0;
+        int error = 0, i;
-        for (attr = grp->attrs; *attr && !error; attr++)
+        for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++)
-                error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
+                if (!grp->is_visible ||
+                    grp->is_visible(kobj, *attr, i))
+                        error |=
+                                sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
        if (error)
-                remove_files(dir_sd, grp);
+                remove_files(dir_sd, kobj, grp);
        return error;
 }
@@ -54,7 +60,7 @@ int sysfs_create_group(struct kobject * kobj,
        } else
                sd = kobj->sd;
        sysfs_get(sd);
-        error = create_files(sd, grp);
+        error = create_files(sd, kobj, grp);
        if (error) {
                if (grp->name)
                        sysfs_remove_subdir(sd);
@@ -75,7 +81,7 @@ void sysfs_remove_group(struct kobject * kobj,
        } else
                sd = sysfs_get(dir_sd);
-        remove_files(sd, grp);
+        remove_files(sd, kobj, grp);
        if (grp->name)
                sysfs_remove_subdir(sd);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3eac20c63c41..5f66c4466151 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,39 +19,6 @@
 #include "sysfs.h"
-static int object_depth(struct sysfs_dirent *sd)
-{
-        int depth = 0;
-        for (; sd->s_parent; sd = sd->s_parent)
-                depth++;
-        return depth;
-}
-static int object_path_length(struct sysfs_dirent * sd)
-{
-        int length = 1;
-        for (; sd->s_parent; sd = sd->s_parent)
-                length += strlen(sd->s_name) + 1;
-        return length;
-}
-static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length)
-{
-        --length;
-        for (; sd->s_parent; sd = sd->s_parent) {
-                int cur = strlen(sd->s_name);
-                /* back up enough to print this bus id with '/' */
-                length -= cur;
-                strncpy(buffer + length, sd->s_name, cur);
-                *(buffer + --length) = '/';
-        }
-}
 /**
 *      sysfs_create_link - create symlink between two objects.
 *      @kobj:  object whose directory we're creating the link in.
@@ -112,7 +79,6 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
        return error;
 }
 /**
 *      sysfs_remove_link - remove symlink in object's directory.
 *      @kobj:  object we're acting for.
@@ -124,24 +90,54 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
        sysfs_hash_and_remove(kobj->sd, name);
 }
-static int sysfs_get_target_path(struct sysfs_dirent * parent_sd,
+static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
-                                 struct sysfs_dirent * target_sd, char *path)
+                                 struct sysfs_dirent *target_sd, char *path)
 {
-        char * s;
+        struct sysfs_dirent *base, *sd;
-        int depth, size;
+        char *s = path;
+        int len = 0;
+        /* go up to the root, stop at the base */
+        base = parent_sd;
+        while (base->s_parent) {
+                sd = target_sd->s_parent;
+                while (sd->s_parent && base != sd)
+                        sd = sd->s_parent;
+                if (base == sd)
+                        break;
+                strcpy(s, "../");
+                s += 3;
+                base = base->s_parent;
+        }
+        /* determine end of target string for reverse fillup */
+        sd = target_sd;
+        while (sd->s_parent && sd != base) {
+                len += strlen(sd->s_name) + 1;
+                sd = sd->s_parent;
+        }
-        depth = object_depth(parent_sd);
+        /* check limits */
-        size = object_path_length(target_sd) + depth * 3 - 1;
+        if (len < 2)
-        if (size > PATH_MAX)
+                return -EINVAL;
+        len--;
+        if ((s - path) + len > PATH_MAX)
                return -ENAMETOOLONG;
-        pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+        /* reverse fillup of target string from target to base */
+        sd = target_sd;
+        while (sd->s_parent && sd != base) {
+                int slen = strlen(sd->s_name);
-        for (s = path; depth--; s += 3)
+                len -= slen;
-                strcpy(s,"../");
+                strncpy(s + len, sd->s_name, slen);
+                if (len)
+                        s[--len] = '/';
-        fill_object_path(target_sd, path, size);
+                sd = sd->s_parent;
-        pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+        }
        return 0;
 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index f8417988f6b0..ff17f8da9b43 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -103,8 +103,6 @@ extern const struct file_operations sysfs_dir_operations;
 extern const struct inode_operations sysfs_dir_inode_operations;
 struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-void sysfs_put_active(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
 void sysfs_put_active_two(struct sysfs_dirent *sd);
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 30f8c2bb0c3e..aaf2878305ce 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -179,7 +179,7 @@ bad_entry:
        goto fail;
 Eend:
        p = (struct ufs_dir_entry *)(kaddr + offs);
-        ufs_error (sb, "ext2_check_page",
+        ufs_error(sb, __FUNCTION__,
                   "entry in directory #%lu spans the page boundary"
                   "offset=%lu",
                   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 584cf12cc40f..0072cb33ebec 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -755,13 +755,13 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        
        case UFS_MOUNT_UFSTYPE_NEXTSTEP:
-                /*TODO: check may be we need set special dir block size?*/
                UFSD("ufstype=nextstep\n");
                uspi->s_fsize = block_size = 1024;
                uspi->s_fmask = ~(1024 - 1);
                uspi->s_fshift = 10;
                uspi->s_sbsize = super_block_size = 2048;
                uspi->s_sbbase = 0;
+                uspi->s_dirblksize = 1024;
                flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
                if (!(sb->s_flags & MS_RDONLY)) {
                        if (!silent)
@@ -771,13 +771,13 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        
        case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
-                /*TODO: check may be we need set special dir block size?*/
                UFSD("ufstype=nextstep-cd\n");
                uspi->s_fsize = block_size = 2048;
                uspi->s_fmask = ~(2048 - 1);
                uspi->s_fshift = 11;
                uspi->s_sbsize = super_block_size = 2048;
                uspi->s_sbbase = 0;
+                uspi->s_dirblksize = 1024;
                flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
                if (!(sb->s_flags & MS_RDONLY)) {
                        if (!silent)
@@ -933,20 +933,19 @@ magic_found:
                goto again;
        }
-        /* Set sbi->s_flags here, used by ufs_get_fs_state() below */
+        sbi->s_flags = flags;/*after that line some functions use s_flags*/
-        sbi->s_flags = flags;
        ufs_print_super_stuff(sb, usb1, usb2, usb3);
        /*
         * Check, if file system was correctly unmounted.
         * If not, make it read only.
         */
-        if ((((flags & UFS_ST_MASK) == UFS_ST_44BSD)    ||
+        if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) ||
-             ((flags & UFS_ST_MASK) == UFS_ST_OLD)      ||
+          ((flags & UFS_ST_MASK) == UFS_ST_OLD) ||
-             ((flags & UFS_ST_MASK) == UFS_ST_SUN)      ||
+          (((flags & UFS_ST_MASK) == UFS_ST_SUN ||
-             ((flags & UFS_ST_MASK) == UFS_ST_SUNOS)    ||
+            (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
-             ((flags & UFS_ST_MASK) == UFS_ST_SUNx86))  &&
+          (flags & UFS_ST_MASK) == UFS_ST_SUNx86) &&
-            (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)))) {
+          (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
                switch(usb1->fs_clean) {
                case UFS_FSCLEAN:
                        UFSD("fs is clean\n");
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b9c8589e05c2..a49dd8d4b069 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -725,15 +725,15 @@ xfs_buf_associate_memory(
 {
        int                     rval;
        int                     i = 0;
-        size_t                  ptr;
+        unsigned long           pageaddr;
-        size_t                  end, end_cur;
+        unsigned long           offset;
-        off_t                   offset;
+        size_t                  buflen;
        int                     page_count;
-        page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+        pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
-        offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+        offset = (unsigned long)mem - pageaddr;
-        if (offset && (len > PAGE_CACHE_SIZE))
+        buflen = PAGE_CACHE_ALIGN(len + offset);
-                page_count++;
+        page_count = buflen >> PAGE_CACHE_SHIFT;
        /* Free any previous set of page pointers */
        if (bp->b_pages)
@@ -747,22 +747,15 @@ xfs_buf_associate_memory(
                return rval;
        bp->b_offset = offset;
-        ptr = (size_t) mem & PAGE_CACHE_MASK;
-        end = PAGE_CACHE_ALIGN((size_t) mem + len);
+        for (i = 0; i < bp->b_page_count; i++) {
-        end_cur = end;
+                bp->b_pages[i] = mem_to_page((void *)pageaddr);
-        /* set up first page */
+                pageaddr += PAGE_CACHE_SIZE;
-        bp->b_pages[0] = mem_to_page(mem);
-        ptr += PAGE_CACHE_SIZE;
-        bp->b_page_count = ++i;
-        while (ptr < end) {
-                bp->b_pages[i] = mem_to_page((void *)ptr);
-                bp->b_page_count = ++i;
-                ptr += PAGE_CACHE_SIZE;
        }
        bp->b_locked = 0;
-        bp->b_count_desired = bp->b_buffer_length = len;
+        bp->b_count_desired = len;
+        bp->b_buffer_length = buflen;
        bp->b_flags |= XBF_MAPPED;
        return 0;
@@ -1032,7 +1025,7 @@ xfs_buf_ioend(
        xfs_buf_t               *bp,
        int                     schedule)
 {
-        bp->b_flags &= ~(XBF_READ | XBF_WRITE);
+        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
        if (bp->b_error == 0)
                bp->b_flags |= XBF_DONE;
@@ -1750,6 +1743,8 @@ xfsbufd(
        current->flags |= PF_MEMALLOC;
+        set_freezable();
        do {
                if (unlikely(freezing(current))) {
                        set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index fb8dd34041eb..21a1c2b1c5fc 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -218,6 +218,15 @@ xfs_vm_fault(
 }
 #endif /* CONFIG_XFS_DMAPI */
+/*
+ * Unfortunately we can't just use the clean and simple readdir implementation
+ * below, because nfs might call back into ->lookup from the filldir callback
+ * and that will deadlock the low-level btree code.
+ *
+ * Hopefully we'll find a better workaround that allows us to use the optimal
+ * version at least for local readdirs for 2.6.25.
+ */
+#if 0
 STATIC int
 xfs_file_readdir(
        struct file     *filp,
@@ -249,6 +258,126 @@ xfs_file_readdir(
                return -error;
        return 0;
 }
+#else
+/*
+ * One directory entry as staged in the temporary buffer by
+ * xfs_hack_filldir().  Records are laid out back to back; each one occupies
+ * ALIGN(sizeof(struct hack_dirent) + namlen, sizeof(u64)) bytes.
+ */
+struct hack_dirent {
+        u64             ino;            /* inode number passed to the callback */
+        loff_t          offset;         /* directory offset passed to the callback */
+        int             namlen;         /* number of bytes stored in name[] */
+        unsigned int    d_type;         /* d_type value passed to the callback */
+        char            name[];         /* namlen bytes, copied without a terminator */
+};
+/*
+ * Cursor over the temporary buffer that xfs_hack_filldir() appends
+ * struct hack_dirent records to.
+ */
+struct hack_callback {
+        char            *dirent;        /* start of the staging buffer */
+        size_t          len;            /* size of the staging buffer in bytes */
+        size_t          used;           /* bytes of the buffer filled so far */
+};
+/*
+ * Callback with the filldir argument layout that, instead of handing the
+ * entry to the real consumer, appends it as a struct hack_dirent to the
+ * staging buffer described by @__buf (a struct hack_callback).
+ *
+ * Returns 0 after storing the entry, or -EINVAL when the aligned record
+ * would not fit in the space remaining in the buffer; in that case nothing
+ * is written and buf->used is left unchanged.
+ */
+STATIC int
+xfs_hack_filldir(
+        void            *__buf,
+        const char      *name,
+        int             namlen,
+        loff_t          offset,
+        u64             ino,
+        unsigned int    d_type)
+{
+        struct hack_callback *buf = __buf;
+        /* Next free slot: records are appended directly after one another. */
+        struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used);
+        unsigned int reclen;
+        /* Round the record up to a multiple of sizeof(u64). */
+        reclen = ALIGN(sizeof(struct hack_dirent) + namlen, sizeof(u64));
+        /* Refuse the entry before touching *de if it would overrun the buffer. */
+        if (buf->used + reclen > buf->len)
+                return -EINVAL;
+        de->namlen = namlen;
+        de->offset = offset;
+        de->ino = ino;
+        de->d_type = d_type;
+        /* Exactly namlen bytes are copied; no NUL terminator is added. */
+        memcpy(de->name, name, namlen);
+        buf->used += reclen;
+        return 0;
+}
+/*
+ * Two-stage readdir: xfs_readdir() is first run with xfs_hack_filldir() so
+ * that entries are collected in a kmalloc'ed staging buffer, and only then
+ * are the staged entries replayed to the caller's @filldir.  The caller's
+ * callback is therefore never invoked from inside xfs_readdir().
+ *
+ * Returns -ENOMEM if no staging buffer could be allocated, otherwise the
+ * negated result of the last xfs_readdir() call (0 on success).  On success
+ * filp->f_pos is updated to where the next call should resume.
+ */
+STATIC int
+xfs_file_readdir(
+        struct file     *filp,
+        void            *dirent,
+        filldir_t       filldir)
+{
+        struct inode    *inode = filp->f_path.dentry->d_inode;
+        xfs_inode_t     *ip = XFS_I(inode);
+        struct hack_callback buf;
+        struct hack_dirent *de;
+        int             error;
+        loff_t          size;
+        /* NOTE(review): eof is never set below; the loop ends via break/goto. */
+        int             eof = 0;
+        xfs_off_t       start_offset, curr_offset, offset;
+        /*
+         * Try fairly hard to get memory: start with PAGE_CACHE_SIZE and
+         * halve the request after each failed kmalloc, giving up once the
+         * size would drop below 1024 bytes.
+         */
+        buf.len = PAGE_CACHE_SIZE;
+        do {
+                buf.dirent = kmalloc(buf.len, GFP_KERNEL);
+                if (buf.dirent)
+                        break;
+                buf.len >>= 1;
+        } while (buf.len >= 1024);
+        if (!buf.dirent)
+                return -ENOMEM;
+        /*
+         * f_pos values handed out below are masked with 0x7fffffff, so a
+         * stored 0x7fffffff is expanded back to 0xffffffff here.
+         * NOTE(review): presumably an end-of-directory cookie -- confirm.
+         */
+        curr_offset = filp->f_pos;
+        if (curr_offset == 0x7fffffff)
+                offset = 0xffffffff;
+        else
+                offset = filp->f_pos;
+        while (!eof) {
+                unsigned int reclen;
+                start_offset = offset;
+                /* Restart the staging buffer for this batch. */
+                buf.used = 0;
+                error = -xfs_readdir(ip, &buf, buf.len, &offset,
+                                     xfs_hack_filldir);
+                /* Stop on error or when xfs_readdir() made no progress. */
+                if (error || offset == start_offset) {
+                        size = 0;
+                        break;
+                }
+                /* Replay the staged records to the caller's callback. */
+                size = buf.used;
+                de = (struct hack_dirent *)buf.dirent;
+                curr_offset = de->offset /* & 0x7fffffff */;
+                while (size > 0) {
+                        /*
+                         * A non-zero return ends the replay; curr_offset
+                         * still holds this entry's offset, so it is
+                         * what f_pos gets set to at done.
+                         */
+                        if (filldir(dirent, de->name, de->namlen,
+                                        curr_offset & 0x7fffffff,
+                                        de->ino, de->d_type)) {
+                                goto done;
+                        }
+                        reclen = ALIGN(sizeof(struct hack_dirent) + de->namlen,
+                                       sizeof(u64));
+                        size -= reclen;
+                        de = (struct hack_dirent *)((char *)de + reclen);
+                        /*
+                         * NOTE(review): after the last record this reads
+                         * de->offset from beyond the buf.used bytes that
+                         * were filled in -- confirm this is harmless.
+                         */
+                        curr_offset = de->offset /* & 0x7fffffff */;
+                }
+        }
+ done:
+        if (!error) {
+                /*
+                 * size == 0: everything staged was consumed (or nothing
+                 * was read), so resume from xfs_readdir()'s offset.
+                 * Otherwise resume at the entry the callback rejected.
+                 */
+                if (size == 0)
+                        filp->f_pos = offset & 0x7fffffff;
+                else if (de)
+                        filp->f_pos = curr_offset;
+        }
+        kfree(buf.dirent);
+        return error;
+}
+#endif
 STATIC int
 xfs_file_mmap(
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2b34bad48b07..98a56568bb24 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1047,24 +1047,20 @@ xfs_ioc_bulkstat(
        if ((count = bulkreq.icount) <= 0)
                return -XFS_ERROR(EINVAL);
+        if (bulkreq.ubuffer == NULL)
+                return -XFS_ERROR(EINVAL);
        if (cmd == XFS_IOC_FSINUMBERS)
                error = xfs_inumbers(mp, &inlast, &count,
                                        bulkreq.ubuffer, xfs_inumbers_fmt);
        else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
                error = xfs_bulkstat_single(mp, &inlast,
                                                bulkreq.ubuffer, &done);
-        else {  /* XFS_IOC_FSBULKSTAT */
+        else    /* XFS_IOC_FSBULKSTAT */
-                if (count == 1 && inlast != 0) {
+                error = xfs_bulkstat(mp, &inlast, &count,
-                        inlast++;
+                        (bulkstat_one_pf)xfs_bulkstat_one, NULL,
-                        error = xfs_bulkstat_single(mp, &inlast,
+                        sizeof(xfs_bstat_t), bulkreq.ubuffer,
-                                        bulkreq.ubuffer, &done);
+                        BULKSTAT_FG_QUICK, &done);
-                } else {
-                        error = xfs_bulkstat(mp, &inlast, &count,
-                                (bulkstat_one_pf)xfs_bulkstat_one, NULL,
-                                sizeof(xfs_bstat_t), bulkreq.ubuffer,
-                                BULKSTAT_FG_QUICK, &done);
-                }
-        }
        if (error)
                return -error;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0046bdd5b7f1..bf2a956b63c2 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -291,6 +291,9 @@ xfs_ioc_bulkstat_compat(
        if ((count = bulkreq.icount) <= 0)
                return -XFS_ERROR(EINVAL);
+        if (bulkreq.ubuffer == NULL)
+                return -XFS_ERROR(EINVAL);
        if (cmd == XFS_IOC_FSINUMBERS)
                error = xfs_inumbers(mp, &inlast, &count,
                                bulkreq.ubuffer, xfs_inumbers_fmt_compat);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index ac50f8a37582..5e8bb7f71b5a 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -117,7 +117,7 @@ xfs_ichgtime(
         */
        SYNCHRONIZE();
        ip->i_update_core = 1;
-        if (!(inode->i_state & I_SYNC))
+        if (!(inode->i_state & I_NEW))
                mark_inode_dirty_sync(inode);
 }
@@ -169,7 +169,7 @@ xfs_ichgtime_fast(
         */
        SYNCHRONIZE();
        ip->i_update_core = 1;
-        if (!(inode->i_state & I_SYNC))
+        if (!(inode->i_state & I_NEW))
                mark_inode_dirty_sync(inode);
 }
@@ -332,9 +332,7 @@ xfs_vn_mknod(
                ASSERT(vp);
                ip = vn_to_inode(vp);
-                if (S_ISCHR(mode) || S_ISBLK(mode))
+                if (S_ISDIR(mode))
-                        ip->i_rdev = rdev;
-                else if (S_ISDIR(mode))
                        xfs_validate_fields(ip);
                d_instantiate(dentry, ip);
                xfs_validate_fields(dir);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index b5f91281b707..d488645f833d 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1008,6 +1008,9 @@ xfs_qm_sync(
        boolean_t       nowait;
        int             error;
+        if (! XFS_IS_QUOTA_ON(mp))
+                return 0;
        restarts = 0;
        /*
         * We won't block unless we are asked to.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index c171767e242a..a5f4f4fb8868 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -508,7 +508,7 @@ xfs_dir2_block_getdents(
                        continue;
                cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                                    ptr - (char *)block);
+                                            (char *)dep - (char *)block);
                ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
                ino += mp->m_inoadd;
@@ -519,9 +519,7 @@ xfs_dir2_block_getdents(
                 */
                if (filldir(dirent, dep->name, dep->namelen, cook,
                            ino, DT_UNKNOWN)) {
-                        *offset = xfs_dir2_db_off_to_dataptr(mp,
+                        *offset = cook;
-                                        mp->m_dirdatablk,
-                                        (char *)dep - (char *)block);
                        xfs_da_brelse(NULL, bp);
                        return 0;
                }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e7c12fa1303e..0ca0020ba09f 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1091,7 +1091,7 @@ xfs_dir2_leaf_getdents(
                 * Won't fit.  Return to caller.
                 */
                if (filldir(dirent, dep->name, dep->namelen,
-                            xfs_dir2_byte_to_dataptr(mp, curoff + length),
+                            xfs_dir2_byte_to_dataptr(mp, curoff),
                            ino, DT_UNKNOWN))
                        break;
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 182c70315ad1..919d275a1cef 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -752,7 +752,7 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
                ino += mp->m_inoadd;
 #endif
-                if (filldir(dirent, ".", 1, dotdot_offset, ino, DT_DIR)) {
+                if (filldir(dirent, ".", 1, dot_offset, ino, DT_DIR)) {
                        *offset = dot_offset;
                        return 0;
                }
@@ -762,13 +762,11 @@ xfs_dir2_sf_getdents(
         * Put .. entry unless we're starting past it.
         */
        if (*offset <= dotdot_offset) {
-                off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                                  XFS_DIR2_DATA_FIRST_OFFSET);
                ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
 #if XFS_BIG_INUMS
                ino += mp->m_inoadd;
 #endif
-                if (filldir(dirent, "..", 2, off, ino, DT_DIR)) {
+                if (filldir(dirent, "..", 2, dotdot_offset, ino, DT_DIR)) {
                        *offset = dotdot_offset;
                        return 0;
                }
@@ -793,8 +791,7 @@ xfs_dir2_sf_getdents(
 #endif
                if (filldir(dirent, sfep->name, sfep->namelen,
-                            off + xfs_dir2_data_entsize(sfep->namelen),
+                                            off, ino, DT_UNKNOWN)) {
-                            ino, DT_UNKNOWN)) {
                        *offset = off;
                        return 0;
                }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 488836e204a3..fb69ef180b27 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -267,7 +267,7 @@ finish_inode:
        icl = NULL;
        if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
                                                        first_index, 1)) {
-                if ((iq->i_ino & mask) == first_index)
+                if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
                        icl = iq->i_cluster;
        }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index abf509a88915..344948082819 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1459,8 +1459,10 @@ xfs_itruncate_start(
        mp = ip->i_mount;
        vp = XFS_ITOV(ip);
-        vn_iowait(ip);  /* wait for the completion of any pending DIOs */
+        /* wait for the completion of any pending DIOs */
-        
+        if (new_size < ip->i_size)
+                vn_iowait(ip);
        /*
         * Call toss_pages or flushinval_pages to get rid of pages
         * overlapping the region being removed.  We have to use
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 9972992fd3c3..9fc4c2886529 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -316,6 +316,8 @@ xfs_bulkstat_use_dinode(
        return 1;
 }
+#define XFS_BULKSTAT_UBLEFT(ubleft)     ((ubleft) >= statstruct_size)
 /*
 * Return stat information in bulk (by-inode) for the filesystem.
 */
@@ -353,7 +355,7 @@ xfs_bulkstat(
        xfs_inobt_rec_incore_t  *irbp;  /* current irec buffer pointer */
        xfs_inobt_rec_incore_t  *irbuf; /* start of irec buffer */
        xfs_inobt_rec_incore_t  *irbufend; /* end of good irec buffer entries */
-        xfs_ino_t               lastino=0; /* last inode number returned */
+        xfs_ino_t               lastino; /* last inode number returned */
        int                     nbcluster; /* # of blocks in a cluster */
        int                     nicluster; /* # of inodes in a cluster */
        int                     nimask; /* mask for inode clusters */
@@ -373,6 +375,7 @@ xfs_bulkstat(
         * Get the last inode value, see if there's nothing to do.
         */
        ino = (xfs_ino_t)*lastinop;
+        lastino = ino;
        dip = NULL;
        agno = XFS_INO_TO_AGNO(mp, ino);
        agino = XFS_INO_TO_AGINO(mp, ino);
@@ -382,6 +385,9 @@ xfs_bulkstat(
                *ubcountp = 0;
                return 0;
        }
+        if (!ubcountp || *ubcountp <= 0) {
+                return EINVAL;
+        }
        ubcount = *ubcountp; /* statstruct's */
        ubleft = ubcount * statstruct_size; /* bytes */
        *ubcountp = ubelem = 0;
@@ -402,7 +408,8 @@ xfs_bulkstat(
         * inode returned; 0 means start of the allocation group.
         */
        rval = 0;
-        while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) {
+        while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
+                cond_resched();
                bp = NULL;
                down_read(&mp->m_peraglock);
                error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
@@ -499,6 +506,7 @@ xfs_bulkstat(
                                        break;
                                error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
                                                            &tmp);
+                                cond_resched();
                        }
                        /*
                         * If ran off the end of the ag either with an error,
@@ -542,6 +550,7 @@ xfs_bulkstat(
                         */
                        agino = gino + XFS_INODES_PER_CHUNK;
                        error = xfs_inobt_increment(cur, 0, &tmp);
+                        cond_resched();
                }
                /*
                 * Drop the btree buffers and the agi buffer.
@@ -555,12 +564,12 @@ xfs_bulkstat(
                 */
                irbufend = irbp;
                for (irbp = irbuf;
-                     irbp < irbufend && ubleft >= statstruct_size; irbp++) {
+                     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
                        /*
                         * Now process this chunk of inodes.
                         */
                        for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
-                             ubleft > 0 &&
+                             XFS_BULKSTAT_UBLEFT(ubleft) &&
                                irbp->ir_freecount < XFS_INODES_PER_CHUNK;
                             chunkidx++, clustidx++, agino++) {
                                ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
@@ -663,15 +672,13 @@ xfs_bulkstat(
                                                ubleft, private_data,
                                                bno, &ubused, dip, &fmterror);
                                if (fmterror == BULKSTAT_RV_NOTHING) {
-                                        if (error == EFAULT) {
+                                        if (error && error != ENOENT &&
-                                                ubleft = 0;
+                                                error != EINVAL) {
-                                                rval = error;
-                                                break;
-                                        }
-                                        else if (error == ENOMEM)
                                                ubleft = 0;
-                                        else
+                                                rval = error;
-                                                lastino = ino;
+                                                break;
+                                        }
+                                        lastino = ino;
                                        continue;
                                }
                                if (fmterror == BULKSTAT_RV_GIVEUP) {
@@ -686,6 +693,8 @@ xfs_bulkstat(
                                ubelem++;
                                lastino = ino;
                        }
+                        cond_resched();
                }
                if (bp)
@@ -694,11 +703,12 @@ xfs_bulkstat(
                /*
                 * Set up for the next loop iteration.
                 */
-                if (ubleft > 0) {
+                if (XFS_BULKSTAT_UBLEFT(ubleft)) {
                        if (end_of_ag) {
                                agno++;
                                agino = 0;
-                        }
+                        } else
+                                agino = XFS_INO_TO_AGINO(mp, lastino);
                } else
                        break;
        }
@@ -707,6 +717,11 @@ xfs_bulkstat(
         */
        kmem_free(irbuf, irbsize);
        *ubcountp = ubelem;
+        /*
+         * Found some inodes, return them now and return the error next time.
+         */
+        if (ubelem)
+                rval = 0;
        if (agno >= mp->m_sb.sb_agcount) {
                /*
                 * If we ran out of filesystem, mark lastino as off
author	David Woodhouse <dwmw2@infradead.org>	2008-02-03 02:29:41 -0500
committer	David Woodhouse <dwmw2@infradead.org>	2008-02-03 02:30:32 -0500
commit	c1f3ee120bb61045b1c0a3ead620d1d65af47130 (patch)
tree	908430bf2b47fe8e96ac623ae7ab6dd5698d0938 /fs
parent	e619a75ff6201b567a539e787aa9af9bc63a3187 (diff)
parent	9135f1901ee6449dfe338adf6e40e9c2025b8150 (diff)