aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-04-07 06:05:21 -0400
committerIngo Molnar <mingo@elte.hu>2009-04-07 06:05:25 -0400
commit6c009ecef8cca28c7c09eb16d0802e37915a76e1 (patch)
tree11c773f780186fdb9fbc9c80a73fb7c8426b1fba /fs
parent98c2aaf8be5baf7193be37fb28bce8e7327158bc (diff)
parentd508afb437daee7cf07da085b635c44a4ebf9b38 (diff)
Merge branch 'linus' into perfcounters/core
Merge reason: need the upstream facility added by: 7f1e2ca: hrtimer: fix rq->lock inversion (again) Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--fs/befs/debug.c1
-rw-r--r--fs/buffer.c22
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/ext3/Kconfig19
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/jbd/commit.c7
-rw-r--r--fs/jbd2/commit.c13
-rw-r--r--fs/jffs2/acl.c4
-rw-r--r--fs/jffs2/malloc.c6
-rw-r--r--fs/libfs.c16
-rw-r--r--fs/lockd/svclock.c13
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfs3proc.c10
-rw-r--r--fs/nfsd/nfs4callback.c47
-rw-r--r--fs/nfsd/nfs4proc.c246
-rw-r--r--fs/nfsd/nfs4recover.c74
-rw-r--r--fs/nfsd/nfs4state.c1196
-rw-r--r--fs/nfsd/nfs4xdr.c633
-rw-r--r--fs/nfsd/nfsctl.c38
-rw-r--r--fs/nfsd/nfsproc.c3
-rw-r--r--fs/nfsd/nfssvc.c88
-rw-r--r--fs/nfsd/vfs.c37
-rw-r--r--fs/romfs/Kconfig48
-rw-r--r--fs/romfs/Makefile9
-rw-r--r--fs/romfs/inode.c665
-rw-r--r--fs/romfs/internal.h47
-rw-r--r--fs/romfs/mmap-nommu.c75
-rw-r--r--fs/romfs/storage.c261
-rw-r--r--fs/romfs/super.c648
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/ubifs/budget.c37
-rw-r--r--fs/ubifs/debug.c6
-rw-r--r--fs/ubifs/file.c16
-rw-r--r--fs/ubifs/find.c12
-rw-r--r--fs/ubifs/gc.c428
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/key.h6
-rw-r--r--fs/ubifs/log.c5
-rw-r--r--fs/ubifs/lpt_commit.c34
-rw-r--r--fs/ubifs/recovery.c70
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/sb.c36
-rw-r--r--fs/ubifs/shrinker.c6
-rw-r--r--fs/ubifs/super.c37
-rw-r--r--fs/ubifs/tnc.c2
-rw-r--r--fs/ubifs/ubifs-media.h30
-rw-r--r--fs/ubifs/ubifs.h13
48 files changed, 3656 insertions, 1331 deletions
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20 21
21#endif /* __KERNEL__ */ 22#endif /* __KERNEL__ */
22 23
diff --git a/fs/buffer.c b/fs/buffer.c
index 5d55a896ff78..6e35762b6169 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
737{ 737{
738 struct buffer_head *bh; 738 struct buffer_head *bh;
739 struct list_head tmp; 739 struct list_head tmp;
740 struct address_space *mapping; 740 struct address_space *mapping, *prev_mapping = NULL;
741 int err = 0, err2; 741 int err = 0, err2;
742 742
743 INIT_LIST_HEAD(&tmp); 743 INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
762 * contents - it is a noop if I/O is still in 762 * contents - it is a noop if I/O is still in
763 * flight on potentially older contents. 763 * flight on potentially older contents.
764 */ 764 */
765 ll_rw_block(SWRITE_SYNC, 1, &bh); 765 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
766
767 /*
768 * Kick off IO for the previous mapping. Note
769 * that we will not run the very last mapping,
770 * wait_on_buffer() will do that for us
771 * through sync_buffer().
772 */
773 if (prev_mapping && prev_mapping != mapping)
774 blk_run_address_space(prev_mapping);
775 prev_mapping = mapping;
776
766 brelse(bh); 777 brelse(bh);
767 spin_lock(lock); 778 spin_lock(lock);
768 } 779 }
@@ -2957,12 +2968,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2957 for (i = 0; i < nr; i++) { 2968 for (i = 0; i < nr; i++) {
2958 struct buffer_head *bh = bhs[i]; 2969 struct buffer_head *bh = bhs[i];
2959 2970
2960 if (rw == SWRITE || rw == SWRITE_SYNC) 2971 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
2961 lock_buffer(bh); 2972 lock_buffer(bh);
2962 else if (!trylock_buffer(bh)) 2973 else if (!trylock_buffer(bh))
2963 continue; 2974 continue;
2964 2975
2965 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 2976 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
2977 rw == SWRITE_SYNC_PLUG) {
2966 if (test_clear_buffer_dirty(bh)) { 2978 if (test_clear_buffer_dirty(bh)) {
2967 bh->b_end_io = end_buffer_write_sync; 2979 bh->b_end_io = end_buffer_write_sync;
2968 get_bh(bh); 2980 get_bh(bh);
@@ -2998,7 +3010,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2998 if (test_clear_buffer_dirty(bh)) { 3010 if (test_clear_buffer_dirty(bh)) {
2999 get_bh(bh); 3011 get_bh(bh);
3000 bh->b_end_io = end_buffer_write_sync; 3012 bh->b_end_io = end_buffer_write_sync;
3001 ret = submit_bh(WRITE, bh); 3013 ret = submit_bh(WRITE_SYNC, bh);
3002 wait_on_buffer(bh); 3014 wait_on_buffer(bh);
3003 if (buffer_eopnotsupp(bh)) { 3015 if (buffer_eopnotsupp(bh)) {
3004 clear_buffer_eopnotsupp(bh); 3016 clear_buffer_eopnotsupp(bh);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..da258e7249cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 int acquire_i_mutex = 0; 1126 int acquire_i_mutex = 0;
1127 1127
1128 if (rw & WRITE) 1128 if (rw & WRITE)
1129 rw = WRITE_SYNC; 1129 rw = WRITE_ODIRECT;
1130 1130
1131 if (bdev) 1131 if (bdev)
1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
28 To compile this file system support as a module, choose M here: the 28 To compile this file system support as a module, choose M here: the
29 module will be called ext3. 29 module will be called ext3.
30 30
31config EXT3_DEFAULTS_TO_ORDERED
32 bool "Default to 'data=ordered' in ext3 (legacy option)"
33 depends on EXT3_FS
34 help
35 If a filesystem does not explicitly specify a data ordering
36 mode, and the journal capability allowed it, ext3 used to
37 historically default to 'data=ordered'.
38
39 That was a rather unfortunate choice, because it leads to all
40 kinds of latency problems, and the 'data=writeback' mode is more
41 appropriate these days.
42
43 You should probably always answer 'n' here, and if you really
44 want to use 'data=ordered' mode, set it in the filesystem itself
45 with 'tune2fs -o journal_data_ordered'.
46
47 But if you really want to enable the legacy default, you can do
48 so by answering 'y' to this question.
49
31config EXT3_FS_XATTR 50config EXT3_FS_XATTR
32 bool "Ext3 extended attributes" 51 bool "Ext3 extended attributes"
33 depends on EXT3_FS 52 depends on EXT3_FS
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
44#include "acl.h" 44#include "acl.h"
45#include "namei.h" 45#include "namei.h"
46 46
47#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
48 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
49#else
50 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
51#endif
52
47static int ext3_load_journal(struct super_block *, struct ext3_super_block *, 53static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum); 54 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 55static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1919 cope, else JOURNAL_DATA */ 1925 cope, else JOURNAL_DATA */
1920 if (journal_check_available_features 1926 if (journal_check_available_features
1921 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) 1927 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
1922 set_opt(sbi->s_mount_opt, ORDERED_DATA); 1928 set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
1923 else 1929 else
1924 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 1930 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1925 break; 1931 break;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8077b9c8981..a8e8513a78a9 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal)
351 spin_lock(&journal->j_state_lock); 351 spin_lock(&journal->j_state_lock);
352 commit_transaction->t_state = T_LOCKED; 352 commit_transaction->t_state = T_LOCKED;
353 353
354 /*
355 * Use plugged writes here, since we want to submit several before
356 * we unplug the device. We don't do explicit unplugging in here,
357 * instead we rely on sync_buffer() doing the unplug for us.
358 */
354 if (commit_transaction->t_synchronous_commit) 359 if (commit_transaction->t_synchronous_commit)
355 write_op = WRITE_SYNC; 360 write_op = WRITE_SYNC_PLUG;
356 spin_lock(&commit_transaction->t_handle_lock); 361 spin_lock(&commit_transaction->t_handle_lock);
357 while (commit_transaction->t_updates) { 362 while (commit_transaction->t_updates) {
358 DEFINE_WAIT(wait); 363 DEFINE_WAIT(wait);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a2..073c8c3df7cd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
138 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
139 barrier_done = 1; 139 barrier_done = 1;
140 } 140 }
141 ret = submit_bh(WRITE_SYNC, bh); 141 ret = submit_bh(WRITE_SYNC_PLUG, bh);
142 if (barrier_done) 142 if (barrier_done)
143 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
144 144
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
159 lock_buffer(bh); 159 lock_buffer(bh);
160 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
161 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
162 ret = submit_bh(WRITE_SYNC, bh); 162 ret = submit_bh(WRITE_SYNC_PLUG, bh);
163 } 163 }
164 *cbh = bh; 164 *cbh = bh;
165 return ret; 165 return ret;
@@ -190,7 +190,7 @@ retry:
190 set_buffer_uptodate(bh); 190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync; 191 bh->b_end_io = journal_end_buffer_io_sync;
192 192
193 ret = submit_bh(WRITE_SYNC, bh); 193 ret = submit_bh(WRITE_SYNC_PLUG, bh);
194 if (ret) { 194 if (ret) {
195 unlock_buffer(bh); 195 unlock_buffer(bh);
196 return ret; 196 return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
402 spin_lock(&journal->j_state_lock); 402 spin_lock(&journal->j_state_lock);
403 commit_transaction->t_state = T_LOCKED; 403 commit_transaction->t_state = T_LOCKED;
404 404
405 /*
406 * Use plugged writes here, since we want to submit several before
407 * we unplug the device. We don't do explicit unplugging in here,
408 * instead we rely on sync_buffer() doing the unplug for us.
409 */
405 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
406 write_op = WRITE_SYNC; 411 write_op = WRITE_SYNC_PLUG;
407 stats.u.run.rs_wait = commit_transaction->t_max_wait; 412 stats.u.run.rs_wait = commit_transaction->t_max_wait;
408 stats.u.run.rs_locked = jiffies; 413 stats.u.run.rs_locked = jiffies;
409 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 414 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 77ccf8cb0823..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
38 size_t s; 38 size_t s;
39 39
40 size -= sizeof(struct jffs2_acl_header); 40 size -= sizeof(struct jffs2_acl_header);
41 s = size - 4 * sizeof(struct jffs2_acl_entry_short); 41 if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
42 if (s < 0) {
43 if (size % sizeof(struct jffs2_acl_entry_short)) 42 if (size % sizeof(struct jffs2_acl_entry_short))
44 return -1; 43 return -1;
45 return size / sizeof(struct jffs2_acl_entry_short); 44 return size / sizeof(struct jffs2_acl_entry_short);
46 } else { 45 } else {
46 s = size - 4 * sizeof(struct jffs2_acl_entry_short);
47 if (s % sizeof(struct jffs2_acl_entry)) 47 if (s % sizeof(struct jffs2_acl_entry))
48 return -1; 48 return -1;
49 return s / sizeof(struct jffs2_acl_entry) + 4; 49 return s / sizeof(struct jffs2_acl_entry) + 4;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) 284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
285{ 285{
286 struct jffs2_xattr_datum *xd; 286 struct jffs2_xattr_datum *xd;
287 xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); 287 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
288 dbg_memalloc("%p\n", xd); 288 dbg_memalloc("%p\n", xd);
289 289
290 memset(xd, 0, sizeof(struct jffs2_xattr_datum));
291 xd->class = RAWNODE_CLASS_XATTR_DATUM; 290 xd->class = RAWNODE_CLASS_XATTR_DATUM;
292 xd->node = (void *)xd; 291 xd->node = (void *)xd;
293 INIT_LIST_HEAD(&xd->xindex); 292 INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
303struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) 302struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
304{ 303{
305 struct jffs2_xattr_ref *ref; 304 struct jffs2_xattr_ref *ref;
306 ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); 305 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
307 dbg_memalloc("%p\n", ref); 306 dbg_memalloc("%p\n", ref);
308 307
309 memset(ref, 0, sizeof(struct jffs2_xattr_ref));
310 ref->class = RAWNODE_CLASS_XATTR_REF; 308 ref->class = RAWNODE_CLASS_XATTR_REF;
311 ref->node = (void *)ref; 309 ref->node = (void *)ref;
312 return ref; 310 return ref;
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..cd223190c4e9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
575 * possibly a read which collects the result - which is stored in a 575 * possibly a read which collects the result - which is stored in a
576 * file-local buffer. 576 * file-local buffer.
577 */ 577 */
578
579void simple_transaction_set(struct file *file, size_t n)
580{
581 struct simple_transaction_argresp *ar = file->private_data;
582
583 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
584
585 /*
586 * The barrier ensures that ar->size will really remain zero until
587 * ar->data is ready for reading.
588 */
589 smp_mb();
590 ar->size = n;
591}
592
578char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) 593char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
579{ 594{
580 struct simple_transaction_argresp *ar; 595 struct simple_transaction_argresp *ar;
@@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
820EXPORT_SYMBOL(simple_unlink); 835EXPORT_SYMBOL(simple_unlink);
821EXPORT_SYMBOL(simple_read_from_buffer); 836EXPORT_SYMBOL(simple_read_from_buffer);
822EXPORT_SYMBOL(memory_read_from_buffer); 837EXPORT_SYMBOL(memory_read_from_buffer);
838EXPORT_SYMBOL(simple_transaction_set);
823EXPORT_SYMBOL(simple_transaction_get); 839EXPORT_SYMBOL(simple_transaction_get);
824EXPORT_SYMBOL(simple_transaction_read); 840EXPORT_SYMBOL(simple_transaction_read);
825EXPORT_SYMBOL(simple_transaction_release); 841EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
426 ret = nlm_granted; 426 ret = nlm_granted;
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 /*
430 * If this is a blocking request for an
431 * already pending lock request then we need
432 * to put it back on lockd's block list
433 */
434 if (wait)
435 break;
429 ret = nlm_lck_denied; 436 ret = nlm_lck_denied;
430 break; 437 goto out;
431 case FILE_LOCK_DEFERRED: 438 case FILE_LOCK_DEFERRED:
432 if (wait) 439 if (wait)
433 break; 440 break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 450 goto out;
444 } 451 }
445 452
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
450 ret = nlm_lck_blocked; 453 ret = nlm_lck_blocked;
451 454
452 /* Append to list of blocked */ 455 /* Append to list of blocked */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 82eaadbff408..6717200923fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1228,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
1228 goto out_nomem; 1228 goto out_nomem;
1229 token = match_token(string, 1229 token = match_token(string,
1230 nfs_xprt_protocol_tokens, args); 1230 nfs_xprt_protocol_tokens, args);
1231 kfree(string);
1232 1231
1233 switch (token) { 1232 switch (token) {
1234 case Opt_xprt_udp: 1233 case Opt_xprt_udp:
@@ -1258,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw,
1258 goto out_nomem; 1257 goto out_nomem;
1259 token = match_token(string, 1258 token = match_token(string,
1260 nfs_xprt_protocol_tokens, args); 1259 nfs_xprt_protocol_tokens, args);
1260 kfree(string);
1261 1261
1262 switch (token) { 1262 switch (token) {
1263 case Opt_xprt_udp: 1263 case Opt_xprt_udp:
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
1config NFSD 1config NFSD
2 tristate "NFS server support" 2 tristate "NFS server support"
3 depends on INET 3 depends on INET
4 depends on FILE_LOCKING
4 select LOCKD 5 select LOCKD
5 select SUNRPC 6 select SUNRPC
6 select EXPORTFS 7 select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
18#include <linux/unistd.h> 18#include <linux/unistd.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/major.h> 20#include <linux/major.h>
21#include <linux/magic.h>
21 22
22#include <linux/sunrpc/svc.h> 23#include <linux/sunrpc/svc.h>
23#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
202 struct nfsd3_writeres *resp) 203 struct nfsd3_writeres *resp)
203{ 204{
204 __be32 nfserr; 205 __be32 nfserr;
206 unsigned long cnt = argp->len;
205 207
206 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 208 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
207 SVCFH_fmt(&argp->fh), 209 SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
214 nfserr = nfsd_write(rqstp, &resp->fh, NULL, 216 nfserr = nfsd_write(rqstp, &resp->fh, NULL,
215 argp->offset, 217 argp->offset,
216 rqstp->rq_vec, argp->vlen, 218 rqstp->rq_vec, argp->vlen,
217 argp->len, 219 &cnt,
218 &resp->committed); 220 &resp->committed);
219 resp->count = argp->count; 221 resp->count = cnt;
220 RETURN_STATUS(nfserr); 222 RETURN_STATUS(nfserr);
221} 223}
222 224
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
569 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; 571 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
570 572
571 /* Note that we don't care for remote fs's here */ 573 /* Note that we don't care for remote fs's here */
572 if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { 574 if (sb->s_magic == MSDOS_SUPER_MAGIC) {
573 resp->f_properties = NFS3_FSF_BILLYBOY; 575 resp->f_properties = NFS3_FSF_BILLYBOY;
574 } 576 }
575 resp->f_maxfilesize = sb->s_maxbytes; 577 resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
610 resp->p_link_max = EXT2_LINK_MAX; 612 resp->p_link_max = EXT2_LINK_MAX;
611 resp->p_name_max = EXT2_NAME_LEN; 613 resp->p_name_max = EXT2_NAME_LEN;
612 break; 614 break;
613 case 0x4d44: /* MSDOS_SUPER_MAGIC */ 615 case MSDOS_SUPER_MAGIC:
614 resp->p_case_insensitive = 1; 616 resp->p_case_insensitive = 1;
615 resp->p_case_preserving = 0; 617 resp->p_case_preserving = 0;
616 break; 618 break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
219{ 219{
220 __be32 *p; 220 __be32 *p;
221 int len = cb_rec->cbr_fhlen; 221 int len = cb_rec->cbr_fh.fh_size;
222 222
223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
224 WRITE32(OP_CB_RECALL); 224 WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); 226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
227 WRITE32(cb_rec->cbr_trunc); 227 WRITE32(cb_rec->cbr_trunc);
228 WRITE32(len); 228 WRITE32(len);
229 WRITEMEM(cb_rec->cbr_fhval, len); 229 WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
230 return 0; 230 return 0;
231} 231}
232 232
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
361/* Reference counting, callback cleanup, etc., all look racy as heck. 361/* Reference counting, callback cleanup, etc., all look racy as heck.
362 * And why is cb_set an atomic? */ 362 * And why is cb_set an atomic? */
363 363
364static int do_probe_callback(void *data) 364static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
365{ 365{
366 struct nfs4_client *clp = data;
367 struct sockaddr_in addr; 366 struct sockaddr_in addr;
368 struct nfs4_callback *cb = &clp->cl_callback; 367 struct nfs4_callback *cb = &clp->cl_callback;
369 struct rpc_timeout timeparms = { 368 struct rpc_timeout timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 383 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 .client_name = clp->cl_principal, 384 .client_name = clp->cl_principal,
386 }; 385 };
387 struct rpc_message msg = {
388 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
389 .rpc_argp = clp,
390 };
391 struct rpc_clnt *client; 386 struct rpc_clnt *client;
392 int status;
393 387
394 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { 388 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
395 status = nfserr_cb_path_down; 389 return ERR_PTR(-EINVAL);
396 goto out_err;
397 }
398 390
399 /* Initialize address */ 391 /* Initialize address */
400 memset(&addr, 0, sizeof(addr)); 392 memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
404 396
405 /* Create RPC client */ 397 /* Create RPC client */
406 client = rpc_create(&args); 398 client = rpc_create(&args);
399 if (IS_ERR(client))
400 dprintk("NFSD: couldn't create callback client: %ld\n",
401 PTR_ERR(client));
402 return client;
403
404}
405
406static int do_probe_callback(void *data)
407{
408 struct nfs4_client *clp = data;
409 struct nfs4_callback *cb = &clp->cl_callback;
410 struct rpc_message msg = {
411 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
412 .rpc_argp = clp,
413 };
414 struct rpc_clnt *client;
415 int status;
416
417 client = setup_callback_client(clp);
407 if (IS_ERR(client)) { 418 if (IS_ERR(client)) {
408 dprintk("NFSD: couldn't create callback client\n");
409 status = PTR_ERR(client); 419 status = PTR_ERR(client);
420 dprintk("NFSD: couldn't create callback client: %d\n",
421 status);
410 goto out_err; 422 goto out_err;
411 } 423 }
412 424
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
422out_release_client: 434out_release_client:
423 rpc_shutdown_client(client); 435 rpc_shutdown_client(client);
424out_err: 436out_err:
425 dprintk("NFSD: warning: no callback path to client %.*s\n", 437 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
426 (int)clp->cl_name.len, clp->cl_name.data); 438 (int)clp->cl_name.len, clp->cl_name.data, status);
427 put_nfs4_client(clp); 439 put_nfs4_client(clp);
428 return status; 440 return 0;
429} 441}
430 442
431/* 443/*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
451 463
452/* 464/*
453 * called with dp->dl_count inc'ed. 465 * called with dp->dl_count inc'ed.
454 * nfs4_lock_state() may or may not have been called.
455 */ 466 */
456void 467void
457nfsd4_cb_recall(struct nfs4_delegation *dp) 468nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
93 open->op_truncate = 0; 93 open->op_truncate = 0;
94 94
95 if (open->op_create) { 95 if (open->op_create) {
96 /* FIXME: check session persistence and pnfs flags.
97 * The nfsv4.1 spec requires the following semantics:
98 *
99 * Persistent | pNFS | Server REQUIRED | Client Allowed
100 * Reply Cache | server | |
101 * -------------+--------+-----------------+--------------------
102 * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1
103 * | | | (SHOULD)
104 * | | and EXCLUSIVE4 | or EXCLUSIVE4
105 * | | | (SHOULD NOT)
106 * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1
107 * yes | no | GUARDED4 | GUARDED4
108 * yes | yes | GUARDED4 | GUARDED4
109 */
110
96 /* 111 /*
97 * Note: create modes (UNCHECKED,GUARDED...) are the same 112 * Note: create modes (UNCHECKED,GUARDED...) are the same
98 * in NFSv4 as in v3. 113 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
103 (u32 *)open->op_verf.data, 118 (u32 *)open->op_verf.data,
104 &open->op_truncate, &created); 119 &open->op_truncate, &created);
105 120
106 /* If we ever decide to use different attrs to store the 121 /*
107 * verifier in nfsd_create_v3, then we'll need to change this 122 * Following rfc 3530 14.2.16, use the returned bitmask
123 * to indicate which attributes we used to store the
124 * verifier:
108 */ 125 */
109 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) 126 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
110 open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | 127 open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
111 FATTR4_WORD1_TIME_MODIFY); 128 FATTR4_WORD1_TIME_MODIFY);
112 } else { 129 } else {
113 status = nfsd_lookup(rqstp, current_fh, 130 status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
118 goto out; 135 goto out;
119 136
120 set_change_info(&open->op_cinfo, current_fh); 137 set_change_info(&open->op_cinfo, current_fh);
121
122 /* set reply cache */
123 fh_dup2(current_fh, &resfh); 138 fh_dup2(current_fh, &resfh);
124 open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
125 memcpy(open->op_stateowner->so_replay.rp_openfh,
126 &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
127 139
140 /* set reply cache */
141 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
142 &resfh.fh_handle);
128 if (!created) 143 if (!created)
129 status = do_open_permission(rqstp, current_fh, open, 144 status = do_open_permission(rqstp, current_fh, open,
130 NFSD_MAY_NOP); 145 NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
150 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); 165 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
151 166
152 /* set replay cache */ 167 /* set replay cache */
153 open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; 168 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
154 memcpy(open->op_stateowner->so_replay.rp_openfh, 169 &current_fh->fh_handle);
155 &current_fh->fh_handle.fh_base,
156 current_fh->fh_handle.fh_size);
157 170
158 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 171 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
159 (open->op_iattr.ia_size == 0); 172 (open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
164 return status; 177 return status;
165} 178}
166 179
180static void
181copy_clientid(clientid_t *clid, struct nfsd4_session *session)
182{
183 struct nfsd4_sessionid *sid =
184 (struct nfsd4_sessionid *)session->se_sessionid.data;
185
186 clid->cl_boot = sid->clientid.cl_boot;
187 clid->cl_id = sid->clientid.cl_id;
188}
167 189
168static __be32 190static __be32
169nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 191nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
170 struct nfsd4_open *open) 192 struct nfsd4_open *open)
171{ 193{
172 __be32 status; 194 __be32 status;
195 struct nfsd4_compoundres *resp;
196
173 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 197 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
174 (int)open->op_fname.len, open->op_fname.data, 198 (int)open->op_fname.len, open->op_fname.data,
175 open->op_stateowner); 199 open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
178 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) 202 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
179 return nfserr_inval; 203 return nfserr_inval;
180 204
205 if (nfsd4_has_session(cstate))
206 copy_clientid(&open->op_clientid, cstate->session);
207
181 nfs4_lock_state(); 208 nfs4_lock_state();
182 209
183 /* check seqid for replay. set nfs4_owner */ 210 /* check seqid for replay. set nfs4_owner */
184 status = nfsd4_process_open1(open); 211 resp = rqstp->rq_resp;
212 status = nfsd4_process_open1(&resp->cstate, open);
185 if (status == nfserr_replay_me) { 213 if (status == nfserr_replay_me) {
186 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 214 struct nfs4_replay *rp = &open->op_stateowner->so_replay;
187 fh_put(&cstate->current_fh); 215 fh_put(&cstate->current_fh);
188 cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; 216 fh_copy_shallow(&cstate->current_fh.fh_handle,
189 memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, 217 &rp->rp_openfh);
190 rp->rp_openfh_len);
191 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); 218 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
192 if (status) 219 if (status)
193 dprintk("nfsd4_open: replay failed" 220 dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
209 236
210 switch (open->op_claim_type) { 237 switch (open->op_claim_type) {
211 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 238 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
212 status = nfserr_inval;
213 if (open->op_create)
214 goto out;
215 /* fall through */
216 case NFS4_OPEN_CLAIM_NULL: 239 case NFS4_OPEN_CLAIM_NULL:
217 /* 240 /*
218 * (1) set CURRENT_FH to the file being opened, 241 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
455 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 478 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
456 return nfserr_inval; 479 return nfserr_inval;
457 480
458 getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 481 getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
459 getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 482 getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
483 getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
460 484
461 getattr->ga_fhp = &cstate->current_fh; 485 getattr->ga_fhp = &cstate->current_fh;
462 return nfs_ok; 486 return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
520 544
521 nfs4_lock_state(); 545 nfs4_lock_state();
522 /* check stateid */ 546 /* check stateid */
523 if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, 547 if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
524 &read->rd_stateid, 548 RD_STATE, &read->rd_filp))) {
525 CHECK_FH | RD_STATE, &read->rd_filp))) {
526 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); 549 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
527 goto out; 550 goto out;
528 } 551 }
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
548 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 571 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
549 return nfserr_inval; 572 return nfserr_inval;
550 573
551 readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 574 readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
552 readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 575 readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
576 readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
553 577
554 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || 578 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
555 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) 579 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
653 677
654 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 678 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
655 nfs4_lock_state(); 679 nfs4_lock_state();
656 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 680 status = nfs4_preprocess_stateid_op(cstate,
657 &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); 681 &setattr->sa_stateid, WR_STATE, NULL);
658 nfs4_unlock_state(); 682 nfs4_unlock_state();
659 if (status) { 683 if (status) {
660 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); 684 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
685 struct file *filp = NULL; 709 struct file *filp = NULL;
686 u32 *p; 710 u32 *p;
687 __be32 status = nfs_ok; 711 __be32 status = nfs_ok;
712 unsigned long cnt;
688 713
689 /* no need to check permission - this will be done in nfsd_write() */ 714 /* no need to check permission - this will be done in nfsd_write() */
690 715
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
692 return nfserr_inval; 717 return nfserr_inval;
693 718
694 nfs4_lock_state(); 719 nfs4_lock_state();
695 status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, 720 status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
696 CHECK_FH | WR_STATE, &filp);
697 if (filp) 721 if (filp)
698 get_file(filp); 722 get_file(filp);
699 nfs4_unlock_state(); 723 nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
703 return status; 727 return status;
704 } 728 }
705 729
706 write->wr_bytes_written = write->wr_buflen; 730 cnt = write->wr_buflen;
707 write->wr_how_written = write->wr_stable_how; 731 write->wr_how_written = write->wr_stable_how;
708 p = (u32 *)write->wr_verifier.data; 732 p = (u32 *)write->wr_verifier.data;
709 *p++ = nfssvc_boot.tv_sec; 733 *p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
711 735
712 status = nfsd_write(rqstp, &cstate->current_fh, filp, 736 status = nfsd_write(rqstp, &cstate->current_fh, filp,
713 write->wr_offset, rqstp->rq_vec, write->wr_vlen, 737 write->wr_offset, rqstp->rq_vec, write->wr_vlen,
714 write->wr_buflen, &write->wr_how_written); 738 &cnt, &write->wr_how_written);
715 if (filp) 739 if (filp)
716 fput(filp); 740 fput(filp);
717 741
742 write->wr_bytes_written = cnt;
743
718 if (status == nfserr_symlink) 744 if (status == nfserr_symlink)
719 status = nfserr_inval; 745 status = nfserr_inval;
720 return status; 746 return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
737 if (status) 763 if (status)
738 return status; 764 return status;
739 765
740 if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) 766 if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
741 || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 767 || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
768 || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
742 return nfserr_attrnotsupp; 769 return nfserr_attrnotsupp;
743 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) 770 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
744 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) 771 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
766 if (status) 793 if (status)
767 goto out_kfree; 794 goto out_kfree;
768 795
769 p = buf + 3; 796 /* skip bitmap */
797 p = buf + 1 + ntohl(buf[0]);
770 status = nfserr_not_same; 798 status = nfserr_not_same;
771 if (ntohl(*p++) != verify->ve_attrlen) 799 if (ntohl(*p++) != verify->ve_attrlen)
772 goto out_kfree; 800 goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
813 nfsdstats.nfs4_opcount[opnum]++; 841 nfsdstats.nfs4_opcount[opnum]++;
814} 842}
815 843
816static void cstate_free(struct nfsd4_compound_state *cstate)
817{
818 if (cstate == NULL)
819 return;
820 fh_put(&cstate->current_fh);
821 fh_put(&cstate->save_fh);
822 BUG_ON(cstate->replay_owner);
823 kfree(cstate);
824}
825
826static struct nfsd4_compound_state *cstate_alloc(void)
827{
828 struct nfsd4_compound_state *cstate;
829
830 cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
831 if (cstate == NULL)
832 return NULL;
833 fh_init(&cstate->current_fh, NFS4_FHSIZE);
834 fh_init(&cstate->save_fh, NFS4_FHSIZE);
835 cstate->replay_owner = NULL;
836 return cstate;
837}
838
839typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, 844typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
840 void *); 845 void *);
846enum nfsd4_op_flags {
847 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
848 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */
849 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */
850};
841 851
842struct nfsd4_operation { 852struct nfsd4_operation {
843 nfsd4op_func op_func; 853 nfsd4op_func op_func;
844 u32 op_flags; 854 u32 op_flags;
845/* Most ops require a valid current filehandle; a few don't: */
846#define ALLOWED_WITHOUT_FH 1
847/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
848#define ALLOWED_ON_ABSENT_FS 2
849 char *op_name; 855 char *op_name;
850}; 856};
851 857
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
854static const char *nfsd4_op_name(unsigned opnum); 860static const char *nfsd4_op_name(unsigned opnum);
855 861
856/* 862/*
863 * This is a replay of a compound for which no cache entry pages
864 * were used. Encode the sequence operation, and if cachethis is FALSE
865 * encode the uncache rep error on the next operation.
866 */
867static __be32
868nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
869 struct nfsd4_compoundres *resp)
870{
871 struct nfsd4_op *op;
872
873 dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
874 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
875
876 /* Encode the replayed sequence operation */
877 BUG_ON(resp->opcnt != 1);
878 op = &args->ops[resp->opcnt - 1];
879 nfsd4_encode_operation(resp, op);
880
881 /*return nfserr_retry_uncached_rep in next operation. */
882 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
883 op = &args->ops[resp->opcnt++];
884 op->status = nfserr_retry_uncached_rep;
885 nfsd4_encode_operation(resp, op);
886 }
887 return op->status;
888}
889
890/*
891 * Enforce NFSv4.1 COMPOUND ordering rules.
892 *
893 * TODO:
894 * - enforce NFS4ERR_NOT_ONLY_OP,
895 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
896 */
897static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
898{
899 if (args->minorversion && args->opcnt > 0) {
900 struct nfsd4_op *op = &args->ops[0];
901 return (op->status == nfserr_op_illegal) ||
902 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
903 }
904 return true;
905}
906
907/*
857 * COMPOUND call. 908 * COMPOUND call.
858 */ 909 */
859static __be32 910static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
863{ 914{
864 struct nfsd4_op *op; 915 struct nfsd4_op *op;
865 struct nfsd4_operation *opdesc; 916 struct nfsd4_operation *opdesc;
866 struct nfsd4_compound_state *cstate = NULL; 917 struct nfsd4_compound_state *cstate = &resp->cstate;
867 int slack_bytes; 918 int slack_bytes;
868 __be32 status; 919 __be32 status;
869 920
870 resp->xbuf = &rqstp->rq_res; 921 resp->xbuf = &rqstp->rq_res;
871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 922 resp->p = rqstp->rq_res.head[0].iov_base +
923 rqstp->rq_res.head[0].iov_len;
872 resp->tagp = resp->p; 924 resp->tagp = resp->p;
873 /* reserve space for: taglen, tag, and opcnt */ 925 /* reserve space for: taglen, tag, and opcnt */
874 resp->p += 2 + XDR_QUADLEN(args->taglen); 926 resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
877 resp->tag = args->tag; 929 resp->tag = args->tag;
878 resp->opcnt = 0; 930 resp->opcnt = 0;
879 resp->rqstp = rqstp; 931 resp->rqstp = rqstp;
932 resp->cstate.minorversion = args->minorversion;
933 resp->cstate.replay_owner = NULL;
934 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
935 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
936 /* Use the deferral mechanism only for NFSv4.0 compounds */
937 rqstp->rq_usedeferral = (args->minorversion == 0);
880 938
881 /* 939 /*
882 * According to RFC3010, this takes precedence over all other errors. 940 * According to RFC3010, this takes precedence over all other errors.
883 */ 941 */
884 status = nfserr_minor_vers_mismatch; 942 status = nfserr_minor_vers_mismatch;
885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 943 if (args->minorversion > nfsd_supported_minorversion)
886 goto out; 944 goto out;
887 945
888 status = nfserr_resource; 946 if (!nfs41_op_ordering_ok(args)) {
889 cstate = cstate_alloc(); 947 op = &args->ops[0];
890 if (cstate == NULL) 948 op->status = nfserr_sequence_pos;
891 goto out; 949 goto encode_op;
950 }
892 951
893 status = nfs_ok; 952 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 953 while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
897 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", 956 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
898 resp->opcnt, args->opcnt, op->opnum, 957 resp->opcnt, args->opcnt, op->opnum,
899 nfsd4_op_name(op->opnum)); 958 nfsd4_op_name(op->opnum));
900
901 /* 959 /*
902 * The XDR decode routines may have pre-set op->status; 960 * The XDR decode routines may have pre-set op->status;
903 * for example, if there is a miscellaneous XDR error 961 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
938 BUG_ON(op->status == nfs_ok); 996 BUG_ON(op->status == nfs_ok);
939 997
940encode_op: 998encode_op:
999 /* Only from SEQUENCE or CREATE_SESSION */
1000 if (resp->cstate.status == nfserr_replay_cache) {
1001 dprintk("%s NFS4.1 replay from cache\n", __func__);
1002 if (nfsd4_not_cached(resp))
1003 status = nfsd4_enc_uncached_replay(args, resp);
1004 else
1005 status = op->status;
1006 goto out;
1007 }
941 if (op->status == nfserr_replay_me) { 1008 if (op->status == nfserr_replay_me) {
942 op->replay = &cstate->replay_owner->so_replay; 1009 op->replay = &cstate->replay_owner->so_replay;
943 nfsd4_encode_replay(resp, op); 1010 nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
961 1028
962 nfsd4_increment_op_stats(op->opnum); 1029 nfsd4_increment_op_stats(op->opnum);
963 } 1030 }
1031 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1032 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1033 status = nfserr_jukebox;
1034 }
964 1035
965 cstate_free(cstate); 1036 resp->cstate.status = status;
1037 fh_put(&resp->cstate.current_fh);
1038 fh_put(&resp->cstate.save_fh);
1039 BUG_ON(resp->cstate.replay_owner);
966out: 1040out:
967 nfsd4_release_compoundargs(args); 1041 nfsd4_release_compoundargs(args);
1042 /* Reset deferral mechanism for RPC deferrals */
1043 rqstp->rq_usedeferral = 1;
968 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 1044 dprintk("nfsv4 compound returned %d\n", ntohl(status));
969 return status; 1045 return status;
970} 1046}
971 1047
972static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { 1048static struct nfsd4_operation nfsd4_ops[] = {
973 [OP_ACCESS] = { 1049 [OP_ACCESS] = {
974 .op_func = (nfsd4op_func)nfsd4_access, 1050 .op_func = (nfsd4op_func)nfsd4_access,
975 .op_name = "OP_ACCESS", 1051 .op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1045 .op_name = "OP_PUTFH", 1121 .op_name = "OP_PUTFH",
1046 }, 1122 },
1047 [OP_PUTPUBFH] = { 1123 [OP_PUTPUBFH] = {
1048 /* unsupported, just for future reference: */ 1124 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1049 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1125 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1050 .op_name = "OP_PUTPUBFH", 1126 .op_name = "OP_PUTPUBFH",
1051 }, 1127 },
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1119 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1195 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1120 .op_name = "OP_RELEASE_LOCKOWNER", 1196 .op_name = "OP_RELEASE_LOCKOWNER",
1121 }, 1197 },
1198
1199 /* NFSv4.1 operations */
1200 [OP_EXCHANGE_ID] = {
1201 .op_func = (nfsd4op_func)nfsd4_exchange_id,
1202 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1203 .op_name = "OP_EXCHANGE_ID",
1204 },
1205 [OP_CREATE_SESSION] = {
1206 .op_func = (nfsd4op_func)nfsd4_create_session,
1207 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1208 .op_name = "OP_CREATE_SESSION",
1209 },
1210 [OP_DESTROY_SESSION] = {
1211 .op_func = (nfsd4op_func)nfsd4_destroy_session,
1212 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1213 .op_name = "OP_DESTROY_SESSION",
1214 },
1215 [OP_SEQUENCE] = {
1216 .op_func = (nfsd4op_func)nfsd4_sequence,
1217 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1218 .op_name = "OP_SEQUENCE",
1219 },
1122}; 1220};
1123 1221
1124static const char *nfsd4_op_name(unsigned opnum) 1222static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..3444c0052a87 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
182 182
183typedef int (recdir_func)(struct dentry *, struct dentry *); 183typedef int (recdir_func)(struct dentry *, struct dentry *);
184 184
185struct dentry_list { 185struct name_list {
186 struct dentry *dentry; 186 char name[HEXDIR_LEN];
187 struct list_head list; 187 struct list_head list;
188}; 188};
189 189
190struct dentry_list_arg {
191 struct list_head dentries;
192 struct dentry *parent;
193};
194
195static int 190static int
196nfsd4_build_dentrylist(void *arg, const char *name, int namlen, 191nfsd4_build_namelist(void *arg, const char *name, int namlen,
197 loff_t offset, u64 ino, unsigned int d_type) 192 loff_t offset, u64 ino, unsigned int d_type)
198{ 193{
199 struct dentry_list_arg *dla = arg; 194 struct list_head *names = arg;
200 struct list_head *dentries = &dla->dentries; 195 struct name_list *entry;
201 struct dentry *parent = dla->parent;
202 struct dentry *dentry;
203 struct dentry_list *child;
204 196
205 if (name && isdotent(name, namlen)) 197 if (namlen != HEXDIR_LEN - 1)
206 return 0; 198 return 0;
207 dentry = lookup_one_len(name, parent, namlen); 199 entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
208 if (IS_ERR(dentry)) 200 if (entry == NULL)
209 return PTR_ERR(dentry);
210 child = kmalloc(sizeof(*child), GFP_KERNEL);
211 if (child == NULL)
212 return -ENOMEM; 201 return -ENOMEM;
213 child->dentry = dentry; 202 memcpy(entry->name, name, HEXDIR_LEN - 1);
214 list_add(&child->list, dentries); 203 entry->name[HEXDIR_LEN - 1] = '\0';
204 list_add(&entry->list, names);
215 return 0; 205 return 0;
216} 206}
217 207
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
220{ 210{
221 const struct cred *original_cred; 211 const struct cred *original_cred;
222 struct file *filp; 212 struct file *filp;
223 struct dentry_list_arg dla = { 213 LIST_HEAD(names);
224 .parent = dir, 214 struct name_list *entry;
225 }; 215 struct dentry *dentry;
226 struct list_head *dentries = &dla.dentries;
227 struct dentry_list *child;
228 int status; 216 int status;
229 217
230 if (!rec_dir_init) 218 if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
233 status = nfs4_save_creds(&original_cred); 221 status = nfs4_save_creds(&original_cred);
234 if (status < 0) 222 if (status < 0)
235 return status; 223 return status;
236 INIT_LIST_HEAD(dentries);
237 224
238 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 225 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
239 current_cred()); 226 current_cred());
240 status = PTR_ERR(filp); 227 status = PTR_ERR(filp);
241 if (IS_ERR(filp)) 228 if (IS_ERR(filp))
242 goto out; 229 goto out;
243 INIT_LIST_HEAD(dentries); 230 status = vfs_readdir(filp, nfsd4_build_namelist, &names);
244 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
245 fput(filp); 231 fput(filp);
246 while (!list_empty(dentries)) { 232 while (!list_empty(&names)) {
247 child = list_entry(dentries->next, struct dentry_list, list); 233 entry = list_entry(names.next, struct name_list, list);
248 status = f(dir, child->dentry); 234
235 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
236 if (IS_ERR(dentry)) {
237 status = PTR_ERR(dentry);
238 goto out;
239 }
240 status = f(dir, dentry);
241 dput(dentry);
249 if (status) 242 if (status)
250 goto out; 243 goto out;
251 list_del(&child->list); 244 list_del(&entry->list);
252 dput(child->dentry); 245 kfree(entry);
253 kfree(child);
254 } 246 }
255out: 247out:
256 while (!list_empty(dentries)) { 248 while (!list_empty(&names)) {
257 child = list_entry(dentries->next, struct dentry_list, list); 249 entry = list_entry(names.next, struct name_list, list);
258 list_del(&child->list); 250 list_del(&entry->list);
259 dput(child->dentry); 251 kfree(entry);
260 kfree(child);
261 } 252 }
262 nfs4_reset_creds(original_cred); 253 nfs4_reset_creds(original_cred);
263 return status; 254 return status;
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
353{ 344{
354 int status; 345 int status;
355 346
356 if (nfs4_has_reclaimed_state(child->d_name.name)) 347 /* note: we currently use this path only for minorversion 0 */
348 if (nfs4_has_reclaimed_state(child->d_name.name, false))
357 return 0; 349 return 0;
358 350
359 status = nfsd4_clear_clid_dir(parent, child); 351 status = nfsd4_clear_clid_dir(parent, child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..c65a27b76a9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
68static u32 nfs4_init; 68static u32 nfs4_init;
69static stateid_t zerostateid; /* bits all 0 */ 69static stateid_t zerostateid; /* bits all 0 */
70static stateid_t onestateid; /* bits all 1 */ 70static stateid_t onestateid; /* bits all 1 */
71static u64 current_sessionid = 1;
71 72
72#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 73#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
73#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 74#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
75/* forward declarations */ 76/* forward declarations */
76static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); 77static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
77static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); 78static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
78static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
80static void nfs4_set_recdir(char *recdir); 80static void nfs4_set_recdir(char *recdir);
81 81
82/* Locking: 82/* Locking: */
83 * 83
84 * client_mutex: 84/* Currently used for almost all code touching nfsv4 state: */
85 * protects clientid_hashtbl[], clientstr_hashtbl[],
86 * unconfstr_hashtbl[], uncofid_hashtbl[].
87 */
88static DEFINE_MUTEX(client_mutex); 85static DEFINE_MUTEX(client_mutex);
89 86
87/*
88 * Currently used for the del_recall_lru and file hash table. In an
89 * effort to decrease the scope of the client_mutex, this spinlock may
90 * eventually cover more:
91 */
92static DEFINE_SPINLOCK(recall_lock);
93
90static struct kmem_cache *stateowner_slab = NULL; 94static struct kmem_cache *stateowner_slab = NULL;
91static struct kmem_cache *file_slab = NULL; 95static struct kmem_cache *file_slab = NULL;
92static struct kmem_cache *stateid_slab = NULL; 96static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
117 return x; 121 return x;
118} 122}
119 123
120/* forward declarations */
121static void release_stateowner(struct nfs4_stateowner *sop);
122static void release_stateid(struct nfs4_stateid *stp, int flags);
123
124/*
125 * Delegation state
126 */
127
128/* recall_lock protects the del_recall_lru */
129static DEFINE_SPINLOCK(recall_lock);
130static struct list_head del_recall_lru; 124static struct list_head del_recall_lru;
131 125
132static void
133free_nfs4_file(struct kref *kref)
134{
135 struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
136 list_del(&fp->fi_hash);
137 iput(fp->fi_inode);
138 kmem_cache_free(file_slab, fp);
139}
140
141static inline void 126static inline void
142put_nfs4_file(struct nfs4_file *fi) 127put_nfs4_file(struct nfs4_file *fi)
143{ 128{
144 kref_put(&fi->fi_ref, free_nfs4_file); 129 if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
130 list_del(&fi->fi_hash);
131 spin_unlock(&recall_lock);
132 iput(fi->fi_inode);
133 kmem_cache_free(file_slab, fi);
134 }
145} 135}
146 136
147static inline void 137static inline void
148get_nfs4_file(struct nfs4_file *fi) 138get_nfs4_file(struct nfs4_file *fi)
149{ 139{
150 kref_get(&fi->fi_ref); 140 atomic_inc(&fi->fi_ref);
151} 141}
152 142
153static int num_delegations; 143static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
220 dp->dl_stateid.si_stateownerid = current_delegid++; 210 dp->dl_stateid.si_stateownerid = current_delegid++;
221 dp->dl_stateid.si_fileid = 0; 211 dp->dl_stateid.si_fileid = 0;
222 dp->dl_stateid.si_generation = 0; 212 dp->dl_stateid.si_generation = 0;
223 dp->dl_fhlen = current_fh->fh_handle.fh_size; 213 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
224 memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
225 current_fh->fh_handle.fh_size);
226 dp->dl_time = 0; 214 dp->dl_time = 0;
227 atomic_set(&dp->dl_count, 1); 215 atomic_set(&dp->dl_count, 1);
228 list_add(&dp->dl_perfile, &fp->fi_delegations); 216 list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
416
417/*
418 * Give the client the number of slots it requests bound by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
423 * slot usage to make rooom for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
466 /* Set the max response cached size our default which is
467 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached;
470
471 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops;
475
476 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
478 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
479
480 /* FIXME: Error means no more DRC pages so the server should
481 * recover pages from existing sessions. For now fail session
482 * creation.
483 */
484 status = set_forechannel_maxreqs(fchan);
485
486 session->se_fnumslots = fchan->maxreqs;
487 return status;
488}
489
490static int
491alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
492 struct nfsd4_create_session *cses)
493{
494 struct nfsd4_session *new, tmp;
495 int idx, status = nfserr_resource, slotsize;
496
497 memset(&tmp, 0, sizeof(tmp));
498
499 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
501 if (status)
502 goto out;
503
504 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new)
508 goto out;
509
510 memcpy(new, &tmp, sizeof(*new));
511
512 new->se_client = clp;
513 gen_sessionid(new);
514 idx = hash_sessionid(&new->se_sessionid);
515 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
516 NFS4_MAX_SESSIONID_LEN);
517
518 new->se_flags = cses->flags;
519 kref_init(&new->se_ref);
520 spin_lock(&sessionid_lock);
521 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
522 list_add(&new->se_perclnt, &clp->cl_sessions);
523 spin_unlock(&sessionid_lock);
524
525 status = nfs_ok;
526out:
527 return status;
528}
529
530/* caller must hold sessionid_lock */
531static struct nfsd4_session *
532find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
533{
534 struct nfsd4_session *elem;
535 int idx;
536
537 dump_sessionid(__func__, sessionid);
538 idx = hash_sessionid(sessionid);
539 dprintk("%s: idx is %d\n", __func__, idx);
540 /* Search in the appropriate list */
541 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
542 dump_sessionid("list traversal", &elem->se_sessionid);
543 if (!memcmp(elem->se_sessionid.data, sessionid->data,
544 NFS4_MAX_SESSIONID_LEN)) {
545 return elem;
546 }
547 }
548
549 dprintk("%s: session not found\n", __func__);
550 return NULL;
551}
552
553/* caller must hold sessionid_lock */
554static void
555unhash_session(struct nfsd4_session *ses)
556{
557 list_del(&ses->se_hash);
558 list_del(&ses->se_perclnt);
559}
560
561static void
562release_session(struct nfsd4_session *ses)
563{
564 spin_lock(&sessionid_lock);
565 unhash_session(ses);
566 spin_unlock(&sessionid_lock);
567 nfsd4_put_session(ses);
568}
569
570static void nfsd4_release_respages(struct page **respages, short resused);
571
572void
573free_session(struct kref *kref)
574{
575 struct nfsd4_session *ses;
576 int i;
577
578 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 }
583 kfree(ses->se_slots);
584 kfree(ses);
585}
586
314static inline void 587static inline void
315renew_client(struct nfs4_client *clp) 588renew_client(struct nfs4_client *clp)
316{ 589{
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
330{ 603{
331 if (clid->cl_boot == boot_time) 604 if (clid->cl_boot == boot_time)
332 return 0; 605 return 0;
333 dprintk("NFSD stale clientid (%08x/%08x)\n", 606 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
334 clid->cl_boot, clid->cl_id); 607 clid->cl_boot, clid->cl_id, boot_time);
335 return 1; 608 return 1;
336} 609}
337 610
@@ -376,6 +649,8 @@ static inline void
376free_client(struct nfs4_client *clp) 649free_client(struct nfs4_client *clp)
377{ 650{
378 shutdown_callback_client(clp); 651 shutdown_callback_client(clp);
652 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
653 clp->cl_slot.sl_cache_entry.ce_resused);
379 if (clp->cl_cred.cr_group_info) 654 if (clp->cl_cred.cr_group_info)
380 put_group_info(clp->cl_cred.cr_group_info); 655 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal); 656 kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
420 list_del(&clp->cl_lru); 695 list_del(&clp->cl_lru);
421 while (!list_empty(&clp->cl_openowners)) { 696 while (!list_empty(&clp->cl_openowners)) {
422 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 697 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
423 release_stateowner(sop); 698 release_openowner(sop);
699 }
700 while (!list_empty(&clp->cl_sessions)) {
701 struct nfsd4_session *ses;
702 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
703 se_perclnt);
704 release_session(ses);
424 } 705 }
425 put_nfs4_client(clp); 706 put_nfs4_client(clp);
426} 707}
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
439 INIT_LIST_HEAD(&clp->cl_strhash); 720 INIT_LIST_HEAD(&clp->cl_strhash);
440 INIT_LIST_HEAD(&clp->cl_openowners); 721 INIT_LIST_HEAD(&clp->cl_openowners);
441 INIT_LIST_HEAD(&clp->cl_delegations); 722 INIT_LIST_HEAD(&clp->cl_delegations);
723 INIT_LIST_HEAD(&clp->cl_sessions);
442 INIT_LIST_HEAD(&clp->cl_lru); 724 INIT_LIST_HEAD(&clp->cl_lru);
443 return clp; 725 return clp;
444} 726}
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
568 return NULL; 850 return NULL;
569} 851}
570 852
853/*
854 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
855 * parameter. Matching is based on the fact the at least one of the
856 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
857 *
858 * FIXME: we need to unify the clientid namespaces for nfsv4.x
859 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
860 * and SET_CLIENTID{,_CONFIRM}
861 */
862static inline int
863match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
864{
865 bool has_exchange_flags = (clp->cl_exchange_flags != 0);
866 return use_exchange_id == has_exchange_flags;
867}
868
571static struct nfs4_client * 869static struct nfs4_client *
572find_confirmed_client_by_str(const char *dname, unsigned int hashval) 870find_confirmed_client_by_str(const char *dname, unsigned int hashval,
871 bool use_exchange_id)
573{ 872{
574 struct nfs4_client *clp; 873 struct nfs4_client *clp;
575 874
576 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 875 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
577 if (same_name(clp->cl_recdir, dname)) 876 if (same_name(clp->cl_recdir, dname) &&
877 match_clientid_establishment(clp, use_exchange_id))
578 return clp; 878 return clp;
579 } 879 }
580 return NULL; 880 return NULL;
581} 881}
582 882
583static struct nfs4_client * 883static struct nfs4_client *
584find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) 884find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
885 bool use_exchange_id)
585{ 886{
586 struct nfs4_client *clp; 887 struct nfs4_client *clp;
587 888
588 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 889 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
589 if (same_name(clp->cl_recdir, dname)) 890 if (same_name(clp->cl_recdir, dname) &&
891 match_clientid_establishment(clp, use_exchange_id))
590 return clp; 892 return clp;
591 } 893 }
592 return NULL; 894 return NULL;
@@ -685,6 +987,534 @@ out_err:
685 return; 987 return;
686} 988}
687 989
990void
991nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
992{
993 struct nfsd4_compoundres *resp = rqstp->rq_resp;
994
995 resp->cstate.statp = statp;
996}
997
998/*
999 * Dereference the result pages.
1000 */
1001static void
1002nfsd4_release_respages(struct page **respages, short resused)
1003{
1004 int i;
1005
1006 dprintk("--> %s\n", __func__);
1007 for (i = 0; i < resused; i++) {
1008 if (!respages[i])
1009 continue;
1010 put_page(respages[i]);
1011 respages[i] = NULL;
1012 }
1013}
1014
/* Copy page pointers, taking a reference on each non-NULL page. */
static void
nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
{
	int i;

	for (i = 0; i < count; i++) {
		struct page *page = frompages[i];

		topages[i] = page;
		if (page)
			get_page(page);
	}
}
1027
/*
 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
 * length of the XDR response is less than se_fmaxresp_cached
 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for a
 * larger part of the reply (e.g. readdir).
 *
 * Store the base and length of the rq_req.head[0] page
 * of the NFSv4.1 data, just past the rpc header.
 */
void
nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
	struct svc_rqst *rqstp = resp->rqstp;
	struct nfsd4_compoundargs *args = rqstp->rq_argp;
	struct nfsd4_op *op = &args->ops[resp->opcnt];
	struct kvec *resv = &rqstp->rq_res.head[0];

	dprintk("--> %s entry %p\n", __func__, entry);

	/* Don't cache a failed OP_SEQUENCE. */
	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
		return;

	/* Drop any pages cached from a previous use of this slot. */
	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
	entry->ce_opcnt = resp->opcnt;
	entry->ce_status = resp->cstate.status;

	/*
	 * Don't need a page to cache just the sequence operation - the slot
	 * does this for us!
	 */

	if (nfsd4_not_cached(resp)) {
		entry->ce_resused = 0;
		entry->ce_rpchdrlen = 0;
		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
			resp->cstate.slot->sl_cache_entry.ce_cachethis);
		return;
	}
	/* Cap at NFSD_PAGES_PER_SLOT + 1 pages (see comment above). */
	entry->ce_resused = rqstp->rq_resused;
	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
			 entry->ce_resused);
	/* NFS data starts at statp, just past the rpc header in head[0]. */
	entry->ce_datav.iov_base = resp->cstate.statp;
	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
				(char *)page_address(rqstp->rq_respages[0]));
	/* Current request rpc header length*/
	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
				(char *)page_address(rqstp->rq_respages[0]);
}
1081
/*
 * We keep the rpc header, but take the nfs reply from the replycache.
 *
 * Returns 1 on success; 0 when the cached reply does not fit after the
 * current request's rpc header (caller then falls back to the cached pages).
 */
static int
nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
			struct nfsd4_cache_entry *entry)
{
	struct svc_rqst *rqstp = resp->rqstp;
	struct kvec *resv = &resp->rqstp->rq_res.head[0];
	int len;

	/* Current request rpc header length*/
	len = (char *)resp->cstate.statp -
			(char *)page_address(rqstp->rq_respages[0]);
	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
			entry->ce_datav.iov_len);
		return 0;
	}
	/* copy the cached reply nfsd data past the current rpc header */
	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
		entry->ce_datav.iov_len);
	resv->iov_len = len + entry->ce_datav.iov_len;
	return 1;
}
1107
/*
 * Keep the first page of the replay. Copy the NFSv4.1 data from the first
 * cached page. Replace any further replay pages from the cache.
 */
__be32
nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
			 struct nfsd4_sequence *seq)
{
	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
	__be32 status;

	dprintk("--> %s entry %p\n", __func__, entry);

	/*
	 * If this is just the sequence operation, we did not keep
	 * a page in the cache entry because we can just use the
	 * slot info stored in struct nfsd4_sequence that was checked
	 * against the slot in nfsd4_sequence().
	 *
	 * This occurs when seq->cachethis is FALSE, or when the client
	 * session inactivity timer fires and a solo sequence operation
	 * is sent (lease renewal).
	 */
	if (seq && nfsd4_not_cached(resp)) {
		seq->maxslots = resp->cstate.session->se_fnumslots;
		return nfs_ok;
	}

	if (!nfsd41_copy_replay_data(resp, entry)) {
		/*
		 * Not enough room to use the replay rpc header, send the
		 * cached header. Release all the allocated result pages.
		 */
		svc_free_res_pages(resp->rqstp);
		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
			entry->ce_resused);
	} else {
		/* Release all but the first allocated result page */

		resp->rqstp->rq_resused--;
		svc_free_res_pages(resp->rqstp);

		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
				 &entry->ce_respages[1],
				 entry->ce_resused - 1);
	}

	/* Restore the cached compound state into the current response. */
	resp->rqstp->rq_resused = entry->ce_resused;
	resp->opcnt = entry->ce_opcnt;
	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
	status = entry->ce_status;

	return status;
}
1162
1163/*
1164 * Set the exchange_id flags returned by the server.
1165 */
1166static void
1167nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
1168{
1169 /* pNFS is not supported */
1170 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
1171
1172 /* Referrals are supported, Migration is not. */
1173 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
1174
1175 /* set the wire flags to return to client. */
1176 clid->flags = new->cl_exchange_flags;
1177}
1178
/*
 * EXCHANGE_ID (RFC 5661, section 18.35): establish or update a v4.1 client
 * record. The numbered comments below refer to the request-processing cases
 * of section 18.35.4.
 */
__be32
nfsd4_exchange_id(struct svc_rqst *rqstp,
		  struct nfsd4_compound_state *cstate,
		  struct nfsd4_exchange_id *exid)
{
	struct nfs4_client *unconf, *conf, *new;
	int status;
	unsigned int strhashval;
	char dname[HEXDIR_LEN];
	nfs4_verifier verf = exid->verifier;
	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;

	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
		" ip_addr=%u flags %x, spa_how %d\n",
		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
		ip_addr, exid->flags, exid->spa_how);

	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
		return nfserr_inval;

	/* Currently only support SP4_NONE */
	switch (exid->spa_how) {
	case SP4_NONE:
		break;
	case SP4_SSV:
		return nfserr_encr_alg_unsupp;
	default:
		BUG();				/* checked by xdr code */
	case SP4_MACH_CRED:
		return nfserr_serverfault;	/* no excuse :-/ */
	}

	status = nfs4_make_rec_clidname(dname, &exid->clname);

	if (status)
		goto error;

	strhashval = clientstr_hashval(dname);

	nfs4_lock_state();
	status = nfs_ok;

	conf = find_confirmed_client_by_str(dname, strhashval, true);
	if (conf) {
		if (!same_verf(&verf, &conf->cl_verifier)) {
			/* 18.35.4 case 8 */
			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
				status = nfserr_not_same;
				goto out;
			}
			/* Client reboot: destroy old state */
			expire_client(conf);
			goto out_new;
		}
		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
			/* 18.35.4 case 9 */
			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
				status = nfserr_perm;
				goto out;
			}
			expire_client(conf);
			goto out_new;
		}
		if (ip_addr != conf->cl_addr &&
		    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
			/* Client collision. 18.35.4 case 3 */
			status = nfserr_clid_inuse;
			goto out;
		}
		/*
		 * Set bit when the owner id and verifier map to an already
		 * confirmed client id (18.35.3).
		 */
		exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;

		/*
		 * Falling into 18.35.4 case 2, possible router replay.
		 * Leave confirmed record intact and return same result.
		 */
		copy_verf(conf, &verf);
		new = conf;
		goto out_copy;
	} else {
		/* 18.35.4 case 7 */
		if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
			status = nfserr_noent;
			goto out;
		}
	}

	unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
	if (unconf) {
		/*
		 * Possible retry or client restart. Per 18.35.4 case 4,
		 * a new unconfirmed record should be generated regardless
		 * of whether any properties have changed.
		 */
		expire_client(unconf);
	}

out_new:
	/* Normal case */
	new = create_client(exid->clname, dname);
	if (new == NULL) {
		status = nfserr_resource;
		goto out;
	}

	copy_verf(new, &verf);
	copy_cred(&new->cl_cred, &rqstp->rq_cred);
	new->cl_addr = ip_addr;
	gen_clid(new);
	gen_confirm(new);
	add_to_unconfirmed(new, strhashval);
out_copy:
	/* Return the (new or existing) clientid to the client. */
	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
	exid->clientid.cl_id = new->cl_clientid.cl_id;

	/* Reset the create_session slot; client starts at seqid 1. */
	new->cl_slot.sl_seqid = 0;
	exid->seqid = 1;
	nfsd4_set_ex_flags(new, exid);

	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
		new->cl_slot.sl_seqid, new->cl_exchange_flags);
	status = nfs_ok;

out:
	nfs4_unlock_state();
error:
	dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
	return status;
}
1311
1312static int
1313check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
1314{
1315 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
1316 slot->sl_seqid);
1317
1318 /* The slot is in use, and no response has been sent. */
1319 if (slot->sl_inuse) {
1320 if (seqid == slot->sl_seqid)
1321 return nfserr_jukebox;
1322 else
1323 return nfserr_seq_misordered;
1324 }
1325 /* Normal */
1326 if (likely(seqid == slot->sl_seqid + 1))
1327 return nfs_ok;
1328 /* Replay */
1329 if (seqid == slot->sl_seqid)
1330 return nfserr_replay_cache;
1331 /* Wraparound */
1332 if (seqid == 1 && (slot->sl_seqid + 1) == 0)
1333 return nfs_ok;
1334 /* Misordered replay or misordered new request */
1335 return nfserr_seq_misordered;
1336}
1337
/*
 * CREATE_SESSION (RFC 5661, section 18.36): build a session for a client
 * previously established via EXCHANGE_ID, replaying the cached reply when
 * the client retransmits with the same sequence id.
 */
__be32
nfsd4_create_session(struct svc_rqst *rqstp,
		     struct nfsd4_compound_state *cstate,
		     struct nfsd4_create_session *cr_ses)
{
	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
	struct nfsd4_compoundres *resp = rqstp->rq_resp;
	struct nfs4_client *conf, *unconf;
	struct nfsd4_slot *slot = NULL;
	int status = 0;

	nfs4_lock_state();
	unconf = find_unconfirmed_client(&cr_ses->clientid);
	conf = find_confirmed_client(&cr_ses->clientid);

	if (conf) {
		/* Confirmed client: check the single create_session slot. */
		slot = &conf->cl_slot;
		status = check_slot_seqid(cr_ses->seqid, slot);
		if (status == nfserr_replay_cache) {
			dprintk("Got a create_session replay! seqid= %d\n",
				slot->sl_seqid);
			cstate->slot = slot;
			cstate->status = status;
			/* Return the cached reply status */
			status = nfsd4_replay_cache_entry(resp, NULL);
			goto out;
		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
			status = nfserr_seq_misordered;
			dprintk("Sequence misordered!\n");
			dprintk("Expected seqid= %d but got seqid= %d\n",
				slot->sl_seqid, cr_ses->seqid);
			goto out;
		}
		conf->cl_slot.sl_seqid++;
	} else if (unconf) {
		/* Unconfirmed client: creds and address must match. */
		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
		    (ip_addr != unconf->cl_addr)) {
			status = nfserr_clid_inuse;
			goto out;
		}

		slot = &unconf->cl_slot;
		status = check_slot_seqid(cr_ses->seqid, slot);
		if (status) {
			/* an unconfirmed replay returns misordered */
			status = nfserr_seq_misordered;
			goto out;
		}

		slot->sl_seqid++; /* from 0 to 1 */
		move_to_confirmed(unconf);

		/*
		 * We do not support RDMA or persistent sessions
		 */
		cr_ses->flags &= ~SESSION4_PERSIST;
		cr_ses->flags &= ~SESSION4_RDMA;

		conf = unconf;
	} else {
		status = nfserr_stale_clientid;
		goto out;
	}

	status = alloc_init_session(rqstp, conf, cr_ses);
	if (status)
		goto out;

	/* Return the new session id and the seqid we just accepted. */
	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
	       NFS4_MAX_SESSIONID_LEN);
	cr_ses->seqid = slot->sl_seqid;

	slot->sl_inuse = true;
	cstate->slot = slot;
	/* Ensure a page is used for the cache */
	slot->sl_cache_entry.ce_cachethis = 1;
out:
	nfs4_unlock_state();
	dprintk("%s returns %d\n", __func__, ntohl(status));
	return status;
}
1419
/*
 * DESTROY_SESSION (RFC 5661, section 18.37): unhash the session, shut down
 * its callback client, and drop the hash table's reference.
 */
__be32
nfsd4_destroy_session(struct svc_rqst *r,
		      struct nfsd4_compound_state *cstate,
		      struct nfsd4_destroy_session *sessionid)
{
	struct nfsd4_session *ses;
	u32 status = nfserr_badsession;

	/* Notes:
	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessionid
	 * - Should we return nfserr_back_chan_busy if waiting for
	 *   callbacks on to-be-destroyed session?
	 * - Do we need to clear any callback info from previous session?
	 */

	dump_sessionid(__func__, &sessionid->sessionid);
	spin_lock(&sessionid_lock);
	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
	if (!ses) {
		spin_unlock(&sessionid_lock);
		goto out;
	}

	unhash_session(ses);
	spin_unlock(&sessionid_lock);

	/* wait for callbacks */
	shutdown_callback_client(ses->se_client);
	nfsd4_put_session(ses);
	status = nfs_ok;
out:
	dprintk("%s returns %d\n", __func__, ntohl(status));
	return status;
}
1454
/*
 * SEQUENCE (RFC 5661, section 18.46): validate the session and slot,
 * detect replays via the slot sequence id, and bind the slot/session to
 * the compound state for the remainder of processing.
 */
__be32
nfsd4_sequence(struct svc_rqst *rqstp,
	       struct nfsd4_compound_state *cstate,
	       struct nfsd4_sequence *seq)
{
	struct nfsd4_compoundres *resp = rqstp->rq_resp;
	struct nfsd4_session *session;
	struct nfsd4_slot *slot;
	int status;

	/* SEQUENCE must be the first operation of the compound. */
	if (resp->opcnt != 1)
		return nfserr_sequence_pos;

	spin_lock(&sessionid_lock);
	status = nfserr_badsession;
	session = find_in_sessionid_hashtbl(&seq->sessionid);
	if (!session)
		goto out;

	status = nfserr_badslot;
	if (seq->slotid >= session->se_fnumslots)
		goto out;

	slot = &session->se_slots[seq->slotid];
	dprintk("%s: slotid %d\n", __func__, seq->slotid);

	status = check_slot_seqid(seq->seqid, slot);
	if (status == nfserr_replay_cache) {
		cstate->slot = slot;
		cstate->session = session;
		/* Return the cached reply status and set cstate->status
		 * for nfsd4_svc_encode_compoundres processing */
		status = nfsd4_replay_cache_entry(resp, seq);
		cstate->status = nfserr_replay_cache;
		goto replay_cache;
	}
	if (status)
		goto out;

	/* Success! bump slot seqid */
	slot->sl_inuse = true;
	slot->sl_seqid = seq->seqid;
	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
	/* Always set the cache entry cachethis for solo sequence */
	if (nfsd4_is_solo_sequence(resp))
		slot->sl_cache_entry.ce_cachethis = 1;

	cstate->slot = slot;
	cstate->session = session;

replay_cache:
	/* Renew the clientid on success and on replay.
	 * Hold a session reference until done processing the compound:
	 * nfsd4_put_session called only if the cstate slot is set.
	 */
	renew_client(session->se_client);
	nfsd4_get_session(session);
out:
	spin_unlock(&sessionid_lock);
	dprintk("%s: return %d\n", __func__, ntohl(status));
	return status;
}
1517
688__be32 1518__be32
689nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1519nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
690 struct nfsd4_setclientid *setclid) 1520 struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
716 strhashval = clientstr_hashval(dname); 1546 strhashval = clientstr_hashval(dname);
717 1547
718 nfs4_lock_state(); 1548 nfs4_lock_state();
719 conf = find_confirmed_client_by_str(dname, strhashval); 1549 conf = find_confirmed_client_by_str(dname, strhashval, false);
720 if (conf) { 1550 if (conf) {
721 /* RFC 3530 14.2.33 CASE 0: */ 1551 /* RFC 3530 14.2.33 CASE 0: */
722 status = nfserr_clid_inuse; 1552 status = nfserr_clid_inuse;
723 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 1553 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
724 || conf->cl_addr != sin->sin_addr.s_addr) { 1554 dprintk("NFSD: setclientid: string in use by client"
725 dprintk("NFSD: setclientid: string in use by clientat %pI4\n", 1555 " at %pI4\n", &conf->cl_addr);
726 &conf->cl_addr);
727 goto out; 1556 goto out;
728 } 1557 }
729 } 1558 }
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
732 * has a description of SETCLIENTID request processing consisting 1561 * has a description of SETCLIENTID request processing consisting
733 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1562 * of 5 bullet points, labeled as CASE0 - CASE4 below.
734 */ 1563 */
735 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1564 unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
736 status = nfserr_resource; 1565 status = nfserr_resource;
737 if (!conf) { 1566 if (!conf) {
738 /* 1567 /*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
887 unsigned int hash = 1716 unsigned int hash =
888 clientstr_hashval(unconf->cl_recdir); 1717 clientstr_hashval(unconf->cl_recdir);
889 conf = find_confirmed_client_by_str(unconf->cl_recdir, 1718 conf = find_confirmed_client_by_str(unconf->cl_recdir,
890 hash); 1719 hash, false);
891 if (conf) { 1720 if (conf) {
892 nfsd4_remove_clid_dir(conf); 1721 nfsd4_remove_clid_dir(conf);
893 expire_client(conf); 1722 expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
923 1752
924 fp = kmem_cache_alloc(file_slab, GFP_KERNEL); 1753 fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
925 if (fp) { 1754 if (fp) {
926 kref_init(&fp->fi_ref); 1755 atomic_set(&fp->fi_ref, 1);
927 INIT_LIST_HEAD(&fp->fi_hash); 1756 INIT_LIST_HEAD(&fp->fi_hash);
928 INIT_LIST_HEAD(&fp->fi_stateids); 1757 INIT_LIST_HEAD(&fp->fi_stateids);
929 INIT_LIST_HEAD(&fp->fi_delegations); 1758 INIT_LIST_HEAD(&fp->fi_delegations);
1759 spin_lock(&recall_lock);
930 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1760 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1761 spin_unlock(&recall_lock);
931 fp->fi_inode = igrab(ino); 1762 fp->fi_inode = igrab(ino);
932 fp->fi_id = current_fileid++; 1763 fp->fi_id = current_fileid++;
933 fp->fi_had_conflict = false; 1764 fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
1037 return sop; 1868 return sop;
1038} 1869}
1039 1870
1040static void
1041release_stateid_lockowners(struct nfs4_stateid *open_stp)
1042{
1043 struct nfs4_stateowner *lock_sop;
1044
1045 while (!list_empty(&open_stp->st_lockowners)) {
1046 lock_sop = list_entry(open_stp->st_lockowners.next,
1047 struct nfs4_stateowner, so_perstateid);
1048 /* list_del(&open_stp->st_lockowners); */
1049 BUG_ON(lock_sop->so_is_open_owner);
1050 release_stateowner(lock_sop);
1051 }
1052}
1053
1054static void
1055unhash_stateowner(struct nfs4_stateowner *sop)
1056{
1057 struct nfs4_stateid *stp;
1058
1059 list_del(&sop->so_idhash);
1060 list_del(&sop->so_strhash);
1061 if (sop->so_is_open_owner)
1062 list_del(&sop->so_perclient);
1063 list_del(&sop->so_perstateid);
1064 while (!list_empty(&sop->so_stateids)) {
1065 stp = list_entry(sop->so_stateids.next,
1066 struct nfs4_stateid, st_perstateowner);
1067 if (sop->so_is_open_owner)
1068 release_stateid(stp, OPEN_STATE);
1069 else
1070 release_stateid(stp, LOCK_STATE);
1071 }
1072}
1073
1074static void
1075release_stateowner(struct nfs4_stateowner *sop)
1076{
1077 unhash_stateowner(sop);
1078 list_del(&sop->so_close_lru);
1079 nfs4_put_stateowner(sop);
1080}
1081
1082static inline void 1871static inline void
1083init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 1872init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
1084 struct nfs4_stateowner *sop = open->op_stateowner; 1873 struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1100 stp->st_stateid.si_generation = 0; 1889 stp->st_stateid.si_generation = 0;
1101 stp->st_access_bmap = 0; 1890 stp->st_access_bmap = 0;
1102 stp->st_deny_bmap = 0; 1891 stp->st_deny_bmap = 0;
1103 __set_bit(open->op_share_access, &stp->st_access_bmap); 1892 __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
1893 &stp->st_access_bmap);
1104 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 1894 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
1105 stp->st_openstp = NULL; 1895 stp->st_openstp = NULL;
1106} 1896}
1107 1897
1108static void 1898static void
1109release_stateid(struct nfs4_stateid *stp, int flags)
1110{
1111 struct file *filp = stp->st_vfs_file;
1112
1113 list_del(&stp->st_hash);
1114 list_del(&stp->st_perfile);
1115 list_del(&stp->st_perstateowner);
1116 if (flags & OPEN_STATE) {
1117 release_stateid_lockowners(stp);
1118 stp->st_vfs_file = NULL;
1119 nfsd_close(filp);
1120 } else if (flags & LOCK_STATE)
1121 locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
1122 put_nfs4_file(stp->st_file);
1123 kmem_cache_free(stateid_slab, stp);
1124}
1125
1126static void
1127move_to_close_lru(struct nfs4_stateowner *sop) 1899move_to_close_lru(struct nfs4_stateowner *sop)
1128{ 1900{
1129 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); 1901 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
1160 unsigned int hashval = file_hashval(ino); 1932 unsigned int hashval = file_hashval(ino);
1161 struct nfs4_file *fp; 1933 struct nfs4_file *fp;
1162 1934
1935 spin_lock(&recall_lock);
1163 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 1936 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
1164 if (fp->fi_inode == ino) { 1937 if (fp->fi_inode == ino) {
1165 get_nfs4_file(fp); 1938 get_nfs4_file(fp);
1939 spin_unlock(&recall_lock);
1166 return fp; 1940 return fp;
1167 } 1941 }
1168 } 1942 }
1943 spin_unlock(&recall_lock);
1169 return NULL; 1944 return NULL;
1170} 1945}
1171 1946
1172static inline int access_valid(u32 x) 1947static inline int access_valid(u32 x, u32 minorversion)
1173{ 1948{
1174 if (x < NFS4_SHARE_ACCESS_READ) 1949 if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
1175 return 0; 1950 return 0;
1176 if (x > NFS4_SHARE_ACCESS_BOTH) 1951 if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
1952 return 0;
1953 x &= ~NFS4_SHARE_ACCESS_MASK;
1954 if (minorversion && x) {
1955 if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
1956 return 0;
1957 if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
1958 return 0;
1959 x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
1960 }
1961 if (x)
1177 return 0; 1962 return 0;
1178 return 1; 1963 return 1;
1179} 1964}
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1409 2194
1410 2195
1411__be32 2196__be32
1412nfsd4_process_open1(struct nfsd4_open *open) 2197nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2198 struct nfsd4_open *open)
1413{ 2199{
1414 clientid_t *clientid = &open->op_clientid; 2200 clientid_t *clientid = &open->op_clientid;
1415 struct nfs4_client *clp = NULL; 2201 struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
1432 return nfserr_expired; 2218 return nfserr_expired;
1433 goto renew; 2219 goto renew;
1434 } 2220 }
2221 /* When sessions are used, skip open sequenceid processing */
2222 if (nfsd4_has_session(cstate))
2223 goto renew;
1435 if (!sop->so_confirmed) { 2224 if (!sop->so_confirmed) {
1436 /* Replace unconfirmed owners without checking for replay. */ 2225 /* Replace unconfirmed owners without checking for replay. */
1437 clp = sop->so_client; 2226 clp = sop->so_client;
1438 release_stateowner(sop); 2227 release_openowner(sop);
1439 open->op_stateowner = NULL; 2228 open->op_stateowner = NULL;
1440 goto renew; 2229 goto renew;
1441 } 2230 }
@@ -1709,6 +2498,7 @@ out:
1709__be32 2498__be32
1710nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 2499nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1711{ 2500{
2501 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1712 struct nfs4_file *fp = NULL; 2502 struct nfs4_file *fp = NULL;
1713 struct inode *ino = current_fh->fh_dentry->d_inode; 2503 struct inode *ino = current_fh->fh_dentry->d_inode;
1714 struct nfs4_stateid *stp = NULL; 2504 struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1716 __be32 status; 2506 __be32 status;
1717 2507
1718 status = nfserr_inval; 2508 status = nfserr_inval;
1719 if (!access_valid(open->op_share_access) 2509 if (!access_valid(open->op_share_access, resp->cstate.minorversion)
1720 || !deny_valid(open->op_share_deny)) 2510 || !deny_valid(open->op_share_deny))
1721 goto out; 2511 goto out;
1722 /* 2512 /*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1764 init_stateid(stp, fp, open); 2554 init_stateid(stp, fp, open);
1765 status = nfsd4_truncate(rqstp, current_fh, open); 2555 status = nfsd4_truncate(rqstp, current_fh, open);
1766 if (status) { 2556 if (status) {
1767 release_stateid(stp, OPEN_STATE); 2557 release_open_stateid(stp);
1768 goto out; 2558 goto out;
1769 } 2559 }
2560 if (nfsd4_has_session(&resp->cstate))
2561 update_stateid(&stp->st_stateid);
1770 } 2562 }
1771 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2563 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
1772 2564
2565 if (nfsd4_has_session(&resp->cstate))
2566 open->op_stateowner->so_confirmed = 1;
2567
1773 /* 2568 /*
1774 * Attempt to hand out a delegation. No error return, because the 2569 * Attempt to hand out a delegation. No error return, because the
1775 * OPEN succeeds even if we fail. 2570 * OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
1790 * To finish the open response, we just need to set the rflags. 2585 * To finish the open response, we just need to set the rflags.
1791 */ 2586 */
1792 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; 2587 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
1793 if (!open->op_stateowner->so_confirmed) 2588 if (!open->op_stateowner->so_confirmed &&
2589 !nfsd4_has_session(&resp->cstate))
1794 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 2590 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
1795 2591
1796 return status; 2592 return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
1898 } 2694 }
1899 dprintk("NFSD: purging unused open stateowner (so_id %d)\n", 2695 dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
1900 sop->so_id); 2696 sop->so_id);
1901 release_stateowner(sop); 2697 release_openowner(sop);
1902 } 2698 }
1903 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) 2699 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
1904 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; 2700 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
1983static inline __be32 2779static inline __be32
1984check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2780check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1985{ 2781{
1986 /* Trying to call delegreturn with a special stateid? Yuch: */ 2782 if (ONE_STATEID(stateid) && (flags & RD_STATE))
1987 if (!(flags & (RD_STATE | WR_STATE)))
1988 return nfserr_bad_stateid;
1989 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1990 return nfs_ok; 2783 return nfs_ok;
1991 else if (locks_in_grace()) { 2784 else if (locks_in_grace()) {
1992 /* Answer in remaining cases depends on existance of 2785 /* Answer in remaining cases depends on existance of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2005 * that are not able to provide mandatory locking. 2798 * that are not able to provide mandatory locking.
2006 */ 2799 */
2007static inline int 2800static inline int
2008io_during_grace_disallowed(struct inode *inode, int flags) 2801grace_disallows_io(struct inode *inode)
2009{ 2802{
2010 return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) 2803 return locks_in_grace() && mandatory_lock(inode);
2011 && mandatory_lock(inode);
2012} 2804}
2013 2805
2014static int check_stateid_generation(stateid_t *in, stateid_t *ref) 2806static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
2015{ 2807{
2808 /*
2809 * When sessions are used the stateid generation number is ignored
2810 * when it is zero.
2811 */
2812 if ((flags & HAS_SESSION) && in->si_generation == 0)
2813 goto out;
2814
2016 /* If the client sends us a stateid from the future, it's buggy: */ 2815 /* If the client sends us a stateid from the future, it's buggy: */
2017 if (in->si_generation > ref->si_generation) 2816 if (in->si_generation > ref->si_generation)
2018 return nfserr_bad_stateid; 2817 return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
2028 */ 2827 */
2029 if (in->si_generation < ref->si_generation) 2828 if (in->si_generation < ref->si_generation)
2030 return nfserr_old_stateid; 2829 return nfserr_old_stateid;
2830out:
2031 return nfs_ok; 2831 return nfs_ok;
2032} 2832}
2033 2833
2834static int is_delegation_stateid(stateid_t *stateid)
2835{
2836 return stateid->si_fileid == 0;
2837}
2838
2034/* 2839/*
2035* Checks for stateid operations 2840* Checks for stateid operations
2036*/ 2841*/
2037__be32 2842__be32
2038nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2843nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2844 stateid_t *stateid, int flags, struct file **filpp)
2039{ 2845{
2040 struct nfs4_stateid *stp = NULL; 2846 struct nfs4_stateid *stp = NULL;
2041 struct nfs4_delegation *dp = NULL; 2847 struct nfs4_delegation *dp = NULL;
2042 stateid_t *stidp; 2848 struct svc_fh *current_fh = &cstate->current_fh;
2043 struct inode *ino = current_fh->fh_dentry->d_inode; 2849 struct inode *ino = current_fh->fh_dentry->d_inode;
2044 __be32 status; 2850 __be32 status;
2045 2851
2046 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2047 stateid->si_boot, stateid->si_stateownerid,
2048 stateid->si_fileid, stateid->si_generation);
2049 if (filpp) 2852 if (filpp)
2050 *filpp = NULL; 2853 *filpp = NULL;
2051 2854
2052 if (io_during_grace_disallowed(ino, flags)) 2855 if (grace_disallows_io(ino))
2053 return nfserr_grace; 2856 return nfserr_grace;
2054 2857
2858 if (nfsd4_has_session(cstate))
2859 flags |= HAS_SESSION;
2860
2055 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 2861 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2056 return check_special_stateids(current_fh, stateid, flags); 2862 return check_special_stateids(current_fh, stateid, flags);
2057 2863
2058 /* STALE STATEID */
2059 status = nfserr_stale_stateid; 2864 status = nfserr_stale_stateid;
2060 if (STALE_STATEID(stateid)) 2865 if (STALE_STATEID(stateid))
2061 goto out; 2866 goto out;
2062 2867
2063 /* BAD STATEID */
2064 status = nfserr_bad_stateid; 2868 status = nfserr_bad_stateid;
2065 if (!stateid->si_fileid) { /* delegation stateid */ 2869 if (is_delegation_stateid(stateid)) {
2066 if(!(dp = find_delegation_stateid(ino, stateid))) { 2870 dp = find_delegation_stateid(ino, stateid);
2067 dprintk("NFSD: delegation stateid not found\n"); 2871 if (!dp)
2068 goto out; 2872 goto out;
2069 } 2873 status = check_stateid_generation(stateid, &dp->dl_stateid,
2070 stidp = &dp->dl_stateid; 2874 flags);
2875 if (status)
2876 goto out;
2877 status = nfs4_check_delegmode(dp, flags);
2878 if (status)
2879 goto out;
2880 renew_client(dp->dl_client);
2881 if (filpp)
2882 *filpp = dp->dl_vfs_file;
2071 } else { /* open or lock stateid */ 2883 } else { /* open or lock stateid */
2072 if (!(stp = find_stateid(stateid, flags))) { 2884 stp = find_stateid(stateid, flags);
2073 dprintk("NFSD: open or lock stateid not found\n"); 2885 if (!stp)
2074 goto out; 2886 goto out;
2075 } 2887 if (nfs4_check_fh(current_fh, stp))
2076 if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
2077 goto out; 2888 goto out;
2078 if (!stp->st_stateowner->so_confirmed) 2889 if (!stp->st_stateowner->so_confirmed)
2079 goto out; 2890 goto out;
2080 stidp = &stp->st_stateid; 2891 status = check_stateid_generation(stateid, &stp->st_stateid,
2081 } 2892 flags);
2082 status = check_stateid_generation(stateid, stidp); 2893 if (status)
2083 if (status) 2894 goto out;
2084 goto out; 2895 status = nfs4_check_openmode(stp, flags);
2085 if (stp) { 2896 if (status)
2086 if ((status = nfs4_check_openmode(stp,flags)))
2087 goto out; 2897 goto out;
2088 renew_client(stp->st_stateowner->so_client); 2898 renew_client(stp->st_stateowner->so_client);
2089 if (filpp) 2899 if (filpp)
2090 *filpp = stp->st_vfs_file; 2900 *filpp = stp->st_vfs_file;
2091 } else {
2092 if ((status = nfs4_check_delegmode(dp, flags)))
2093 goto out;
2094 renew_client(dp->dl_client);
2095 if (flags & DELEG_RET)
2096 unhash_delegation(dp);
2097 if (filpp)
2098 *filpp = dp->dl_vfs_file;
2099 } 2901 }
2100 status = nfs_ok; 2902 status = nfs_ok;
2101out: 2903out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
2113 * Checks for sequence id mutating operations. 2915 * Checks for sequence id mutating operations.
2114 */ 2916 */
2115static __be32 2917static __be32
2116nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2918nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2919 stateid_t *stateid, int flags,
2920 struct nfs4_stateowner **sopp,
2921 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2117{ 2922{
2118 struct nfs4_stateid *stp; 2923 struct nfs4_stateid *stp;
2119 struct nfs4_stateowner *sop; 2924 struct nfs4_stateowner *sop;
2925 struct svc_fh *current_fh = &cstate->current_fh;
2120 __be32 status; 2926 __be32 status;
2121 2927
2122 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2928 dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2134 2940
2135 if (STALE_STATEID(stateid)) 2941 if (STALE_STATEID(stateid))
2136 return nfserr_stale_stateid; 2942 return nfserr_stale_stateid;
2943
2944 if (nfsd4_has_session(cstate))
2945 flags |= HAS_SESSION;
2946
2137 /* 2947 /*
2138 * We return BAD_STATEID if filehandle doesn't match stateid, 2948 * We return BAD_STATEID if filehandle doesn't match stateid,
2139 * the confirmed flag is incorrecly set, or the generation 2949 * the confirmed flag is incorrecly set, or the generation
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2166 if (lock->lk_is_new) { 2976 if (lock->lk_is_new) {
2167 if (!sop->so_is_open_owner) 2977 if (!sop->so_is_open_owner)
2168 return nfserr_bad_stateid; 2978 return nfserr_bad_stateid;
2169 if (!same_clid(&clp->cl_clientid, lockclid)) 2979 if (!(flags & HAS_SESSION) &&
2170 return nfserr_bad_stateid; 2980 !same_clid(&clp->cl_clientid, lockclid))
2981 return nfserr_bad_stateid;
2171 /* stp is the open stateid */ 2982 /* stp is the open stateid */
2172 status = nfs4_check_openmode(stp, lkflg); 2983 status = nfs4_check_openmode(stp, lkflg);
2173 if (status) 2984 if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2190 * For the moment, we ignore the possibility of 3001 * For the moment, we ignore the possibility of
2191 * generation number wraparound. 3002 * generation number wraparound.
2192 */ 3003 */
2193 if (seqid != sop->so_seqid) 3004 if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
2194 goto check_replay; 3005 goto check_replay;
2195 3006
2196 if (sop->so_confirmed && flags & CONFIRM) { 3007 if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2203 " confirmed yet!\n"); 3014 " confirmed yet!\n");
2204 return nfserr_bad_stateid; 3015 return nfserr_bad_stateid;
2205 } 3016 }
2206 status = check_stateid_generation(stateid, &stp->st_stateid); 3017 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
2207 if (status) 3018 if (status)
2208 return status; 3019 return status;
2209 renew_client(sop->so_client); 3020 renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2239 3050
2240 nfs4_lock_state(); 3051 nfs4_lock_state();
2241 3052
2242 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3053 if ((status = nfs4_preprocess_seqid_op(cstate,
2243 oc->oc_seqid, &oc->oc_req_stateid, 3054 oc->oc_seqid, &oc->oc_req_stateid,
2244 CONFIRM | OPEN_STATE, 3055 CONFIRM | OPEN_STATE,
2245 &oc->oc_stateowner, &stp, NULL))) 3056 &oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
2304 (int)cstate->current_fh.fh_dentry->d_name.len, 3115 (int)cstate->current_fh.fh_dentry->d_name.len,
2305 cstate->current_fh.fh_dentry->d_name.name); 3116 cstate->current_fh.fh_dentry->d_name.name);
2306 3117
2307 if (!access_valid(od->od_share_access) 3118 if (!access_valid(od->od_share_access, cstate->minorversion)
2308 || !deny_valid(od->od_share_deny)) 3119 || !deny_valid(od->od_share_deny))
2309 return nfserr_inval; 3120 return nfserr_inval;
2310 3121
2311 nfs4_lock_state(); 3122 nfs4_lock_state();
2312 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3123 if ((status = nfs4_preprocess_seqid_op(cstate,
2313 od->od_seqid, 3124 od->od_seqid,
2314 &od->od_stateid, 3125 &od->od_stateid,
2315 OPEN_STATE, 3126 OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2362 3173
2363 nfs4_lock_state(); 3174 nfs4_lock_state();
2364 /* check close_lru for replay */ 3175 /* check close_lru for replay */
2365 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3176 if ((status = nfs4_preprocess_seqid_op(cstate,
2366 close->cl_seqid, 3177 close->cl_seqid,
2367 &close->cl_stateid, 3178 &close->cl_stateid,
2368 OPEN_STATE | CLOSE_STATE, 3179 OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2373 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); 3184 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
2374 3185
2375 /* release_stateid() calls nfsd_close() if needed */ 3186 /* release_stateid() calls nfsd_close() if needed */
2376 release_stateid(stp, OPEN_STATE); 3187 release_open_stateid(stp);
2377 3188
2378 /* place unused nfs4_stateowners on so_close_lru list to be 3189 /* place unused nfs4_stateowners on so_close_lru list to be
2379 * released by the laundromat service after the lease period 3190 * released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
2394nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3205nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2395 struct nfsd4_delegreturn *dr) 3206 struct nfsd4_delegreturn *dr)
2396{ 3207{
3208 struct nfs4_delegation *dp;
3209 stateid_t *stateid = &dr->dr_stateid;
3210 struct inode *inode;
2397 __be32 status; 3211 __be32 status;
3212 int flags = 0;
2398 3213
2399 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3214 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
2400 goto out; 3215 return status;
3216 inode = cstate->current_fh.fh_dentry->d_inode;
2401 3217
3218 if (nfsd4_has_session(cstate))
3219 flags |= HAS_SESSION;
2402 nfs4_lock_state(); 3220 nfs4_lock_state();
2403 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 3221 status = nfserr_bad_stateid;
2404 &dr->dr_stateid, DELEG_RET, NULL); 3222 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2405 nfs4_unlock_state(); 3223 goto out;
3224 status = nfserr_stale_stateid;
3225 if (STALE_STATEID(stateid))
3226 goto out;
3227 status = nfserr_bad_stateid;
3228 if (!is_delegation_stateid(stateid))
3229 goto out;
3230 dp = find_delegation_stateid(inode, stateid);
3231 if (!dp)
3232 goto out;
3233 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3234 if (status)
3235 goto out;
3236 renew_client(dp->dl_client);
3237
3238 unhash_delegation(dp);
2406out: 3239out:
3240 nfs4_unlock_state();
3241
2407 return status; 3242 return status;
2408} 3243}
2409 3244
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2684 struct nfs4_file *fp; 3519 struct nfs4_file *fp;
2685 3520
2686 status = nfserr_stale_clientid; 3521 status = nfserr_stale_clientid;
2687 if (STALE_CLIENTID(&lock->lk_new_clientid)) 3522 if (!nfsd4_has_session(cstate) &&
3523 STALE_CLIENTID(&lock->lk_new_clientid))
2688 goto out; 3524 goto out;
2689 3525
2690 /* validate and update open stateid and open seqid */ 3526 /* validate and update open stateid and open seqid */
2691 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3527 status = nfs4_preprocess_seqid_op(cstate,
2692 lock->lk_new_open_seqid, 3528 lock->lk_new_open_seqid,
2693 &lock->lk_new_open_stateid, 3529 &lock->lk_new_open_stateid,
2694 OPEN_STATE, 3530 OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2715 goto out; 3551 goto out;
2716 } else { 3552 } else {
2717 /* lock (lock owner + lock stateid) already exists */ 3553 /* lock (lock owner + lock stateid) already exists */
2718 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3554 status = nfs4_preprocess_seqid_op(cstate,
2719 lock->lk_old_lock_seqid, 3555 lock->lk_old_lock_seqid,
2720 &lock->lk_old_lock_stateid, 3556 &lock->lk_old_lock_stateid,
2721 LOCK_STATE, 3557 LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 } 3624 }
2789out: 3625out:
2790 if (status && lock->lk_is_new && lock_sop) 3626 if (status && lock->lk_is_new && lock_sop)
2791 release_stateowner(lock_sop); 3627 release_lockowner(lock_sop);
2792 if (lock->lk_replay_owner) { 3628 if (lock->lk_replay_owner) {
2793 nfs4_get_stateowner(lock->lk_replay_owner); 3629 nfs4_get_stateowner(lock->lk_replay_owner);
2794 cstate->replay_owner = lock->lk_replay_owner; 3630 cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2838 nfs4_lock_state(); 3674 nfs4_lock_state();
2839 3675
2840 status = nfserr_stale_clientid; 3676 status = nfserr_stale_clientid;
2841 if (STALE_CLIENTID(&lockt->lt_clientid)) 3677 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
2842 goto out; 3678 goto out;
2843 3679
2844 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 3680 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2911 3747
2912 nfs4_lock_state(); 3748 nfs4_lock_state();
2913 3749
2914 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3750 if ((status = nfs4_preprocess_seqid_op(cstate,
2915 locku->lu_seqid, 3751 locku->lu_seqid,
2916 &locku->lu_stateid, 3752 &locku->lu_stateid,
2917 LOCK_STATE, 3753 LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
3037 /* unhash_stateowner deletes so_perclient only 3873 /* unhash_stateowner deletes so_perclient only
3038 * for openowners. */ 3874 * for openowners. */
3039 list_del(&sop->so_perclient); 3875 list_del(&sop->so_perclient);
3040 release_stateowner(sop); 3876 release_lockowner(sop);
3041 } 3877 }
3042out: 3878out:
3043 nfs4_unlock_state(); 3879 nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
3051} 3887}
3052 3888
3053int 3889int
3054nfs4_has_reclaimed_state(const char *name) 3890nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3055{ 3891{
3056 unsigned int strhashval = clientstr_hashval(name); 3892 unsigned int strhashval = clientstr_hashval(name);
3057 struct nfs4_client *clp; 3893 struct nfs4_client *clp;
3058 3894
3059 clp = find_confirmed_client_by_str(name, strhashval); 3895 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
3060 return clp ? 1 : 0; 3896 return clp ? 1 : 0;
3061} 3897}
3062 3898
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
3153 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 3989 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3154 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 3990 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
3155 } 3991 }
3992 for (i = 0; i < SESSION_HASH_SIZE; i++)
3993 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
3156 for (i = 0; i < FILE_HASH_SIZE; i++) { 3994 for (i = 0; i < FILE_HASH_SIZE; i++) {
3157 INIT_LIST_HEAD(&file_hashtbl[i]); 3995 INIT_LIST_HEAD(&file_hashtbl[i]);
3158 } 3996 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b820c311931c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
196
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
1222 for (i = 0; i < dummy; ++i)
1223 READ32(dummy);
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
1062static __be32 1401static __be32
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2943}
2573 2944
2574static __be32 2945static __be32
2946nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2947 struct nfsd4_exchange_id *exid)
2948{
2949 ENCODE_HEAD;
2950 char *major_id;
2951 char *server_scope;
2952 int major_id_sz;
2953 int server_scope_sz;
2954 uint64_t minor_id = 0;
2955
2956 if (nfserr)
2957 return nfserr;
2958
2959 major_id = utsname()->nodename;
2960 major_id_sz = strlen(major_id);
2961 server_scope = utsname()->nodename;
2962 server_scope_sz = strlen(server_scope);
2963
2964 RESERVE_SPACE(
2965 8 /* eir_clientid */ +
2966 4 /* eir_sequenceid */ +
2967 4 /* eir_flags */ +
2968 4 /* spr_how (SP4_NONE) */ +
2969 8 /* so_minor_id */ +
2970 4 /* so_major_id.len */ +
2971 (XDR_QUADLEN(major_id_sz) * 4) +
2972 4 /* eir_server_scope.len */ +
2973 (XDR_QUADLEN(server_scope_sz) * 4) +
2974 4 /* eir_server_impl_id.count (0) */);
2975
2976 WRITEMEM(&exid->clientid, 8);
2977 WRITE32(exid->seqid);
2978 WRITE32(exid->flags);
2979
2980 /* state_protect4_r. Currently only support SP4_NONE */
2981 BUG_ON(exid->spa_how != SP4_NONE);
2982 WRITE32(exid->spa_how);
2983
2984 /* The server_owner struct */
2985 WRITE64(minor_id); /* Minor id */
2986 /* major id */
2987 WRITE32(major_id_sz);
2988 WRITEMEM(major_id, major_id_sz);
2989
2990 /* Server scope */
2991 WRITE32(server_scope_sz);
2992 WRITEMEM(server_scope, server_scope_sz);
2993
2994 /* Implementation id */
2995 WRITE32(0); /* zero length nfs_impl_id4 array */
2996 ADJUST_ARGS();
2997 return 0;
2998}
2999
3000static __be32
3001nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3002 struct nfsd4_create_session *sess)
3003{
3004 ENCODE_HEAD;
3005
3006 if (nfserr)
3007 return nfserr;
3008
3009 RESERVE_SPACE(24);
3010 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3011 WRITE32(sess->seqid);
3012 WRITE32(sess->flags);
3013 ADJUST_ARGS();
3014
3015 RESERVE_SPACE(28);
3016 WRITE32(0); /* headerpadsz */
3017 WRITE32(sess->fore_channel.maxreq_sz);
3018 WRITE32(sess->fore_channel.maxresp_sz);
3019 WRITE32(sess->fore_channel.maxresp_cached);
3020 WRITE32(sess->fore_channel.maxops);
3021 WRITE32(sess->fore_channel.maxreqs);
3022 WRITE32(sess->fore_channel.nr_rdma_attrs);
3023 ADJUST_ARGS();
3024
3025 if (sess->fore_channel.nr_rdma_attrs) {
3026 RESERVE_SPACE(4);
3027 WRITE32(sess->fore_channel.rdma_attrs);
3028 ADJUST_ARGS();
3029 }
3030
3031 RESERVE_SPACE(28);
3032 WRITE32(0); /* headerpadsz */
3033 WRITE32(sess->back_channel.maxreq_sz);
3034 WRITE32(sess->back_channel.maxresp_sz);
3035 WRITE32(sess->back_channel.maxresp_cached);
3036 WRITE32(sess->back_channel.maxops);
3037 WRITE32(sess->back_channel.maxreqs);
3038 WRITE32(sess->back_channel.nr_rdma_attrs);
3039 ADJUST_ARGS();
3040
3041 if (sess->back_channel.nr_rdma_attrs) {
3042 RESERVE_SPACE(4);
3043 WRITE32(sess->back_channel.rdma_attrs);
3044 ADJUST_ARGS();
3045 }
3046 return 0;
3047}
3048
3049static __be32
3050nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3051 struct nfsd4_destroy_session *destroy_session)
3052{
3053 return nfserr;
3054}
3055
3056__be32
3057nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3058 struct nfsd4_sequence *seq)
3059{
3060 ENCODE_HEAD;
3061
3062 if (nfserr)
3063 return nfserr;
3064
3065 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3066 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3067 WRITE32(seq->seqid);
3068 WRITE32(seq->slotid);
3069 WRITE32(seq->maxslots);
3070 /*
3071 * FIXME: for now:
3072 * target_maxslots = maxslots
3073 * status_flags = 0
3074 */
3075 WRITE32(seq->maxslots);
3076 WRITE32(0);
3077
3078 ADJUST_ARGS();
3079 return 0;
3080}
3081
3082static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3083nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3084{
2577 return nfserr; 3085 return nfserr;
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3087
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3088typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3089
3090/*
3091 * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
3092 * since we don't need to filter out obsolete ops as this is
3093 * done in the decoding phase.
3094 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3095static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3096 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3097 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3130 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3131 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3132 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3133
3134 /* NFSv4.1 operations */
3135 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3136 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3137 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3138 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3139 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3140 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3141 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3142 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3143 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3144 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3145 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3147 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3148 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3149 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3152 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3153 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3154};
2621 3155
3156/*
3157 * Calculate the total amount of memory that the compound response has taken
3158 * after encoding the current operation.
3159 *
3160 * pad: add on 8 bytes for the next operation's op_code and status so that
3161 * there is room to cache a failure on the next operation.
3162 *
3163 * Compare this length to the session se_fmaxresp_cached.
3164 *
3165 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3166 * will be at least a page and will therefore hold the xdr_buf head.
3167 */
3168static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3169{
3170 int status = 0;
3171 struct xdr_buf *xb = &resp->rqstp->rq_res;
3172 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3173 struct nfsd4_session *session = NULL;
3174 struct nfsd4_slot *slot = resp->cstate.slot;
3175 u32 length, tlen = 0, pad = 8;
3176
3177 if (!nfsd4_has_session(&resp->cstate))
3178 return status;
3179
3180 session = resp->cstate.session;
3181 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3182 return status;
3183
3184 if (resp->opcnt >= args->opcnt)
3185 pad = 0; /* this is the last operation */
3186
3187 if (xb->page_len == 0) {
3188 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3189 } else {
3190 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3191 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3192
3193 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3194 }
3195 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3196 length, xb->page_len, tlen, pad);
3197
3198 if (length <= session->se_fmaxresp_cached)
3199 return status;
3200 else
3201 return nfserr_rep_too_big_to_cache;
3202}
3203
2622void 3204void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3205nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3206{
@@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3217 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3218 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3219 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3220 /* nfsd4_check_drc_limit guarantees enough room for error status */
3221 if (!op->status && nfsd4_check_drc_limit(resp))
3222 op->status = nfserr_rep_too_big_to_cache;
2638status: 3223status:
2639 /* 3224 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3225 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3320 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3321 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3322 BUG_ON(iov->iov_len > PAGE_SIZE);
3323 if (nfsd4_has_session(&resp->cstate)) {
3324 if (resp->cstate.status == nfserr_replay_cache &&
3325 !nfsd4_not_cached(resp)) {
3326 iov->iov_len = resp->cstate.iovlen;
3327 } else {
3328 nfsd4_store_cache_entry(resp);
3329 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3330 resp->cstate.slot->sl_inuse = 0;
3331 }
3332 if (resp->cstate.session)
3333 nfsd4_put_session(resp->cstate.session);
3334 }
2738 return 1; 3335 return 1;
2739} 3336}
2740 3337
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69c..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1248 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1249 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1251 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1252 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1253 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7c09852be713..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
225/*
226 * Each session guarantees a negotiated per slot memory cache for replies
227 * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated
228 * NFSv4.1 server might want to use more memory for a DRC than a machine
229 * with mutiple services.
230 *
231 * Impose a hard limit on the number of pages for the DRC which varies
232 * according to the machines free pages. This is of course only a default.
233 *
234 * For now this is a #defined shift which could be under admin control
235 * in the future.
236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
234 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
235 280
236 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
237 return err; 282 return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
375 return error; 420 return error;
376} 421}
377 422
378static inline void
379update_thread_usage(int busy_threads)
380{
381 unsigned long prev_call;
382 unsigned long diff;
383 int decile;
384
385 spin_lock(&nfsd_call_lock);
386 prev_call = nfsd_last_call;
387 nfsd_last_call = jiffies;
388 decile = busy_threads*10/nfsdstats.th_cnt;
389 if (decile>0 && decile <= 10) {
390 diff = nfsd_last_call - prev_call;
391 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
392 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
393 if (decile == 10)
394 nfsdstats.th_fullcnt++;
395 }
396 spin_unlock(&nfsd_call_lock);
397}
398 423
399/* 424/*
400 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -460,8 +485,6 @@ nfsd(void *vrqstp)
460 continue; 485 continue;
461 } 486 }
462 487
463 update_thread_usage(atomic_read(&nfsd_busy));
464 atomic_inc(&nfsd_busy);
465 488
466 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
467 exp_readlock(); 490 exp_readlock();
@@ -470,8 +493,6 @@ nfsd(void *vrqstp)
470 493
471 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
472 exp_readunlock(); 495 exp_readunlock();
473 update_thread_usage(atomic_read(&nfsd_busy));
474 atomic_dec(&nfsd_busy);
475 } 496 }
476 497
477 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
539 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
540 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
541 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
542 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
543 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
544 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
570 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
571 return 1; 596 return 1;
572} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..ab93fcfef254 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 366 }
367 367
368 /* Revoke setuid/setgid on chown */ 368 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 369 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 370 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
371 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 372 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 373 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 374 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 961static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 962nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 963 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 964 unsigned long *cnt, int *stablep)
964{ 965{
965 struct svc_export *exp; 966 struct svc_export *exp;
966 struct dentry *dentry; 967 struct dentry *dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 975 err = nfserr_perm;
975 976
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 977 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 978 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 979 goto out;
979#endif 980#endif
980 981
@@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1010 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1011 set_fs(oldfs);
1011 if (host_err >= 0) { 1012 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1013 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1014 fsnotify_modify(file->f_path.dentry);
1014 } 1015 }
1015 1016
@@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1054 } 1055 }
1055 1056
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1057 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1058 if (host_err >= 0) {
1058 err = 0; 1059 err = 0;
1059 else 1060 *cnt = host_err;
1061 } else
1060 err = nfserrno(host_err); 1062 err = nfserrno(host_err);
1061out: 1063out:
1062 return err; 1064 return err;
@@ -1098,7 +1100,7 @@ out:
1098 */ 1100 */
1099__be32 1101__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1102nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1103 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1104 int *stablep)
1103{ 1105{
1104 __be32 err = 0; 1106 __be32 err = 0;
@@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1181 return 0;
1180} 1182}
1181 1183
1184/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
1185 * setting size to 0 may fail for some specific file systems by the permission
1186 * checking which requires WRITE permission but the mode is 000.
1187 * we ignore the resizing(to 0) on the just new created file, since the size is
1188 * 0 after file created.
1189 *
1190 * call this only after vfs_create() is called.
1191 * */
1192static void
1193nfsd_check_ignore_resizing(struct iattr *iap)
1194{
1195 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1196 iap->ia_valid &= ~ATTR_SIZE;
1197}
1198
1182/* 1199/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1200 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1201 * not yet implemented.
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1291 switch (type) {
1275 case S_IFREG: 1292 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1293 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1294 if (!host_err)
1295 nfsd_check_ignore_resizing(iap);
1277 break; 1296 break;
1278 case S_IFDIR: 1297 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1298 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1446 /* setattr will sync the child (or not) */
1428 } 1447 }
1429 1448
1449 nfsd_check_ignore_resizing(iap);
1450
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1451 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1452 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1453 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..06044a9dc62d
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
18 * - only supported for NOMMU at the moment (MMU can't doesn't copy private
19 * mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
60 * permit a R/O mapping to be made directly through onto an MTD device if
61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..7e3e1e12a081
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
25 * read data from an romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[16];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time */
78 while (size > 0) {
79 segment = min_t(size_t, size, 16);
80 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
81 if (ret < 0)
82 return ret;
83 if (memcmp(buf, str, len) != 0)
84 return 0;
85 size -= len;
86 pos += len;
87 str += len;
88 }
89
90 return 1;
91}
92#endif /* CONFIG_ROMFS_ON_MTD */
93
94#ifdef CONFIG_ROMFS_ON_BLOCK
95/*
96 * read data from an romfs image on a block device
97 */
98static int romfs_blk_read(struct super_block *sb, unsigned long pos,
99 void *buf, size_t buflen)
100{
101 struct buffer_head *bh;
102 unsigned long offset;
103 size_t segment;
104
105 /* copy the string up to blocksize bytes at a time */
106 while (buflen > 0) {
107 offset = pos & (ROMBSIZE - 1);
108 segment = min_t(size_t, buflen, ROMBSIZE - offset);
109 bh = sb_bread(sb, pos >> ROMBSBITS);
110 if (!bh)
111 return -EIO;
112 memcpy(buf, bh->b_data + offset, segment);
113 brelse(bh);
114 buflen -= segment;
115 pos += segment;
116 }
117
118 return 0;
119}
120
121/*
122 * determine the length of a string in romfs on a block device
123 */
124static ssize_t romfs_blk_strnlen(struct super_block *sb,
125 unsigned long pos, size_t limit)
126{
127 struct buffer_head *bh;
128 unsigned long offset;
129 ssize_t n = 0;
130 size_t segment;
131 u_char *buf, *p;
132
133 /* scan the string up to blocksize bytes at a time */
134 while (limit > 0) {
135 offset = pos & (ROMBSIZE - 1);
136 segment = min_t(size_t, limit, ROMBSIZE - offset);
137 bh = sb_bread(sb, pos >> ROMBSBITS);
138 if (!bh)
139 return -EIO;
140 buf = bh->b_data + offset;
141 p = memchr(buf, 0, segment);
142 brelse(bh);
143 if (p)
144 return n + (p - buf);
145 limit -= segment;
146 pos += segment;
147 n += segment;
148 }
149
150 return n;
151}
152
153/*
154 * compare a string to one in a romfs image on a block device
155 * - return 1 if matched, 0 if differ, -ve if error
156 */
157static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
158 const char *str, size_t size)
159{
160 struct buffer_head *bh;
161 unsigned long offset;
162 size_t segment;
163 bool x;
164
165 /* scan the string up to 16 bytes at a time */
166 while (size > 0) {
167 offset = pos & (ROMBSIZE - 1);
168 segment = min_t(size_t, size, ROMBSIZE - offset);
169 bh = sb_bread(sb, pos >> ROMBSBITS);
170 if (!bh)
171 return -EIO;
172 x = (memcmp(bh->b_data + offset, str, segment) != 0);
173 brelse(bh);
174 if (x)
175 return 0;
176 size -= segment;
177 pos += segment;
178 str += segment;
179 }
180
181 return 1;
182}
183#endif /* CONFIG_ROMFS_ON_BLOCK */
184
185/*
186 * read data from the romfs image
187 */
188int romfs_dev_read(struct super_block *sb, unsigned long pos,
189 void *buf, size_t buflen)
190{
191 size_t limit;
192
193 limit = romfs_maxsize(sb);
194 if (pos >= limit)
195 return -EIO;
196 if (buflen > limit - pos)
197 buflen = limit - pos;
198
199#ifdef CONFIG_ROMFS_ON_MTD
200 if (sb->s_mtd)
201 return romfs_mtd_read(sb, pos, buf, buflen);
202#endif
203#ifdef CONFIG_ROMFS_ON_BLOCK
204 if (sb->s_bdev)
205 return romfs_blk_read(sb, pos, buf, buflen);
206#endif
207 return -EIO;
208}
209
210/*
211 * determine the length of a string in romfs
212 */
213ssize_t romfs_dev_strnlen(struct super_block *sb,
214 unsigned long pos, size_t maxlen)
215{
216 size_t limit;
217
218 limit = romfs_maxsize(sb);
219 if (pos >= limit)
220 return -EIO;
221 if (maxlen > limit - pos)
222 maxlen = limit - pos;
223
224#ifdef CONFIG_ROMFS_ON_MTD
225 if (sb->s_mtd)
226 return romfs_mtd_strnlen(sb, pos, limit);
227#endif
228#ifdef CONFIG_ROMFS_ON_BLOCK
229 if (sb->s_bdev)
230 return romfs_blk_strnlen(sb, pos, limit);
231#endif
232 return -EIO;
233}
234
235/*
236 * compare a string to one in romfs
237 * - return 1 if matched, 0 if differ, -ve if error
238 */
239int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
240 const char *str, size_t size)
241{
242 size_t limit;
243
244 limit = romfs_maxsize(sb);
245 if (pos >= limit)
246 return -EIO;
247 if (size > ROMFS_MAXFN)
248 return -ENAMETOOLONG;
249 if (size > limit - pos)
250 return -EIO;
251
252#ifdef CONFIG_ROMFS_ON_MTD
253 if (sb->s_mtd)
254 return romfs_mtd_strncmp(sb, pos, str, size);
255#endif
256#ifdef CONFIG_ROMFS_ON_BLOCK
257 if (sb->s_bdev)
258 return romfs_blk_strncmp(sb, pos, str, size);
259#endif
260 return -EIO;
261}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..1e548a4975ba
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,648 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
43 * (error, uptodate, locking) in
44 * in readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
/* slab cache for romfs's in-memory inode objects */
static struct kmem_cache *romfs_inode_cachep;

/* map the on-disk ROMFH_TYPE field (3 bits) to a base VFS file mode */
static const umode_t romfs_modemap[8] = {
	0,			/* hard link */
	S_IFDIR  | 0644,	/* directory */
	S_IFREG  | 0644,	/* regular file */
	S_IFLNK  | 0777,	/* symlink */
	S_IFBLK  | 0600,	/* blockdev */
	S_IFCHR  | 0600,	/* chardev */
	S_IFSOCK | 0644,	/* socket */
	S_IFIFO  | 0644		/* FIFO */
};

/* map the ROMFH_TYPE field to the d_type value reported by readdir */
static const unsigned char romfs_dtype_table[] = {
	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
};

static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
/* address-space operations: romfs is read-only, so readpage suffices */
static const struct address_space_operations romfs_aops = {
	.readpage	= romfs_readpage
};
144
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
270 * (Although as I see, it only matters on writable file
271 * systems).
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
301 unsigned nextfh, ret;
302 umode_t mode;
303
304 /* we might have to traverse a chain of "hard link" file entries to get
305 * to the actual file */
306 for (;;) {
307 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
308 if (ret < 0)
309 goto error;
310
311 /* XXX: do romfs_checksum here too (with name) */
312
313 nextfh = be32_to_cpu(ri.next);
314 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
315 break;
316
317 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
318 }
319
320 /* determine the length of the filename */
321 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
322 if (IS_ERR_VALUE(nlen))
323 goto eio;
324
325 /* get an inode for this image position */
326 i = iget_locked(sb, pos);
327 if (!i)
328 return ERR_PTR(-ENOMEM);
329
330 if (!(i->i_state & I_NEW))
331 return i;
332
333 /* precalculate the data offset */
334 inode = ROMFS_I(i);
335 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
336 inode->i_dataoffset = pos + inode->i_metasize;
337
338 i->i_nlink = 1; /* Hard to decide.. */
339 i->i_size = be32_to_cpu(ri.size);
340 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
341 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
342
343 /* set up mode and ops */
344 mode = romfs_modemap[nextfh & ROMFH_TYPE];
345
346 switch (nextfh & ROMFH_TYPE) {
347 case ROMFH_DIR:
348 i->i_size = ROMFS_I(i)->i_metasize;
349 i->i_op = &romfs_dir_inode_operations;
350 i->i_fop = &romfs_dir_operations;
351 if (nextfh & ROMFH_EXEC)
352 mode |= S_IXUGO;
353 break;
354 case ROMFH_REG:
355 i->i_fop = &romfs_ro_fops;
356 i->i_data.a_ops = &romfs_aops;
357 if (i->i_sb->s_mtd)
358 i->i_data.backing_dev_info =
359 i->i_sb->s_mtd->backing_dev_info;
360 if (nextfh & ROMFH_EXEC)
361 mode |= S_IXUGO;
362 break;
363 case ROMFH_SYM:
364 i->i_op = &page_symlink_inode_operations;
365 i->i_data.a_ops = &romfs_aops;
366 mode |= S_IRWXUGO;
367 break;
368 default:
369 /* depending on MBZ for sock/fifos */
370 nextfh = be32_to_cpu(ri.spec);
371 init_special_inode(i, mode, MKDEV(nextfh >> 16,
372 nextfh & 0xffff));
373 break;
374 }
375
376 i->i_mode = mode;
377
378 unlock_new_inode(i);
379 return i;
380
381eio:
382 ret = -EIO;
383error:
384 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
385 return ERR_PTR(ret);
386}
387
388/*
389 * allocate a new inode
390 */
391static struct inode *romfs_alloc_inode(struct super_block *sb)
392{
393 struct romfs_inode_info *inode;
394 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
395 return inode ? &inode->vfs_inode : NULL;
396}
397
398/*
399 * return a spent inode to the slab cache
400 */
401static void romfs_destroy_inode(struct inode *inode)
402{
403 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
404}
405
406/*
407 * get filesystem statistics
408 */
409static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
410{
411 buf->f_type = ROMFS_MAGIC;
412 buf->f_namelen = ROMFS_MAXFN;
413 buf->f_bsize = ROMBSIZE;
414 buf->f_bfree = buf->f_bavail = buf->f_ffree;
415 buf->f_blocks =
416 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
417 return 0;
418}
419
420/*
421 * remounting must involve read-only
422 */
423static int romfs_remount(struct super_block *sb, int *flags, char *data)
424{
425 *flags |= MS_RDONLY;
426 return 0;
427}
428
/* superblock operations: read-only fs, so no write_inode/put_super etc. */
static const struct super_operations romfs_super_ops = {
	.alloc_inode	= romfs_alloc_inode,
	.destroy_inode	= romfs_destroy_inode,
	.statfs		= romfs_statfs,
	.remount_fs	= romfs_remount,
};
435
436/*
437 * checksum check on part of a romfs filesystem
438 */
439static __u32 romfs_checksum(const void *data, int size)
440{
441 const __be32 *ptr = data;
442 __u32 sum;
443
444 sum = 0;
445 size >>= 2;
446 while (size > 0) {
447 sum += be32_to_cpu(*ptr++);
448 size--;
449 }
450 return sum;
451}
452
453/*
454 * fill in the superblock
455 */
456static int romfs_fill_super(struct super_block *sb, void *data, int silent)
457{
458 struct romfs_super_block *rsb;
459 struct inode *root;
460 unsigned long pos, img_size;
461 const char *storage;
462 size_t len;
463 int ret;
464
465#ifdef CONFIG_BLOCK
466 if (!sb->s_mtd) {
467 sb_set_blocksize(sb, ROMBSIZE);
468 } else {
469 sb->s_blocksize = ROMBSIZE;
470 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
471 }
472#endif
473
474 sb->s_maxbytes = 0xFFFFFFFF;
475 sb->s_magic = ROMFS_MAGIC;
476 sb->s_flags |= MS_RDONLY | MS_NOATIME;
477 sb->s_op = &romfs_super_ops;
478
479 /* read the image superblock and check it */
480 rsb = kmalloc(512, GFP_KERNEL);
481 if (!rsb)
482 return -ENOMEM;
483
484 sb->s_fs_info = (void *) 512;
485 ret = romfs_dev_read(sb, 0, rsb, 512);
486 if (ret < 0)
487 goto error_rsb;
488
489 img_size = be32_to_cpu(rsb->size);
490
491 if (sb->s_mtd && img_size > sb->s_mtd->size)
492 goto error_rsb_inval;
493
494 sb->s_fs_info = (void *) img_size;
495
496 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
497 img_size < ROMFH_SIZE) {
498 if (!silent)
499 printk(KERN_WARNING "VFS:"
500 " Can't find a romfs filesystem on dev %s.\n",
501 sb->s_id);
502 goto error_rsb_inval;
503 }
504
505 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
506 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
507 sb->s_id);
508 goto error_rsb_inval;
509 }
510
511 storage = sb->s_mtd ? "MTD" : "the block layer";
512
513 len = strnlen(rsb->name, ROMFS_MAXFN);
514 if (!silent)
515 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
516 (unsigned) len, (unsigned) len, rsb->name, storage);
517
518 kfree(rsb);
519 rsb = NULL;
520
521 /* find the root directory */
522 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
523
524 root = romfs_iget(sb, pos);
525 if (!root)
526 goto error;
527
528 sb->s_root = d_alloc_root(root);
529 if (!sb->s_root)
530 goto error_i;
531
532 return 0;
533
534error_i:
535 iput(root);
536error:
537 return -EINVAL;
538error_rsb_inval:
539 ret = -EINVAL;
540error_rsb:
541 return ret;
542}
543
544/*
545 * get a superblock for mounting
546 */
547static int romfs_get_sb(struct file_system_type *fs_type,
548 int flags, const char *dev_name,
549 void *data, struct vfsmount *mnt)
550{
551 int ret = -EINVAL;
552
553#ifdef CONFIG_ROMFS_ON_MTD
554 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
555 mnt);
556#endif
557#ifdef CONFIG_ROMFS_ON_BLOCK
558 if (ret == -EINVAL)
559 ret = get_sb_bdev(fs_type, flags, dev_name, data,
560 romfs_fill_super, mnt);
561#endif
562 return ret;
563}
564
565/*
566 * destroy a romfs superblock in the appropriate manner
567 */
568static void romfs_kill_sb(struct super_block *sb)
569{
570#ifdef CONFIG_ROMFS_ON_MTD
571 if (sb->s_mtd) {
572 kill_mtd_super(sb);
573 return;
574 }
575#endif
576#ifdef CONFIG_ROMFS_ON_BLOCK
577 if (sb->s_bdev) {
578 kill_block_super(sb);
579 return;
580 }
581#endif
582}
583
/* filesystem registration: romfs always needs a backing device */
static struct file_system_type romfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "romfs",
	.get_sb		= romfs_get_sb,
	.kill_sb	= romfs_kill_sb,
	.fs_flags	= FS_REQUIRES_DEV,
};
591
592/*
593 * inode storage initialiser
594 */
595static void romfs_i_init_once(void *_inode)
596{
597 struct romfs_inode_info *inode = _inode;
598
599 inode_init_once(&inode->vfs_inode);
600}
601
602/*
603 * romfs module initialisation
604 */
605static int __init init_romfs_fs(void)
606{
607 int ret;
608
609 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
610
611 romfs_inode_cachep =
612 kmem_cache_create("romfs_i",
613 sizeof(struct romfs_inode_info), 0,
614 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
615 romfs_i_init_once);
616
617 if (!romfs_inode_cachep) {
618 printk(KERN_ERR
619 "ROMFS error: Failed to initialise inode cache\n");
620 return -ENOMEM;
621 }
622 ret = register_filesystem(&romfs_fs_type);
623 if (ret) {
624 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
625 goto error_register;
626 }
627 return 0;
628
629error_register:
630 kmem_cache_destroy(romfs_inode_cachep);
631 return ret;
632}
633
634/*
635 * romfs module removal
636 */
637static void __exit exit_romfs_fs(void)
638{
639 unregister_filesystem(&romfs_fs_type);
640 kmem_cache_destroy(romfs_inode_cachep);
641}
642
643module_init(init_romfs_fs);
644module_exit(exit_romfs_fs);
645
646MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
647MODULE_AUTHOR("Red Hat, Inc.");
648MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h> 42#include <linux/zlib.h>
43#include <linux/slab.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
194} 194}
195 195
196/** 196/**
197 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 197 * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
198 * @c: UBIFS file-system description object 198 * @c: UBIFS file-system description object
199 * 199 *
200 * This function calculates and returns the number of eraseblocks which should 200 * This function calculates and returns the number of LEBs which should be kept
201 * be kept for index usage. 201 * for index usage.
202 */ 202 */
203int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
204{ 204{
205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 205 int idx_lebs;
206 long long idx_size; 206 long long idx_size;
207 207
208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
209
210 /* And make sure we have thrice the index size of space reserved */ 209 /* And make sure we have thrice the index size of space reserved */
211 idx_size = idx_size + (idx_size << 1); 210 idx_size += idx_size << 1;
212
213 /* 211 /*
214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
215 * pair, nor similarly the two variables for the new index size, so we 213 * pair, nor similarly the two variables for the new index size, so we
216 * have to do this costly 64-bit division on fast-path. 214 * have to do this costly 64-bit division on fast-path.
217 */ 215 */
218 idx_size += eff_leb_size - 1; 216 idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
219 idx_lebs = div_u64(idx_size, eff_leb_size);
220 /* 217 /*
221 * The index head is not available for the in-the-gaps method, so add an 218 * The index head is not available for the in-the-gaps method, so add an
222 * extra LEB to compensate. 219 * extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
310 * do_budget_space - reserve flash space for index and data growth. 307 * do_budget_space - reserve flash space for index and data growth.
311 * @c: UBIFS file-system description object 308 * @c: UBIFS file-system description object
312 * 309 *
313 * This function makes sure UBIFS has enough free eraseblocks for index growth 310 * This function makes sure UBIFS has enough free LEBs for index growth and
314 * and data. 311 * data.
315 * 312 *
316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
317 * would take if it was consolidated and written to the flash. This guarantees 314 * would take if it was consolidated and written to the flash. This guarantees
318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
319 * be able to commit dirty index. So this function basically adds amount of 316 * be able to commit dirty index. So this function basically adds amount of
320 * budgeted index space to the size of the current index, multiplies this by 3, 317 * budgeted index space to the size of the current index, multiplies this by 3,
321 * and makes sure this does not exceed the amount of free eraseblocks. 318 * and makes sure this does not exceed the amount of free LEBs.
322 * 319 *
323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
325 * be large, because UBIFS does not do any index consolidation as long as 322 * be large, because UBIFS does not do any index consolidation as long as
326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
327 * will contain a lot of dirt. 324 * will contain a lot of dirt.
328 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 325 * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
329 * consolidated to take up to @c->min_idx_lebs LEBs. 326 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
330 * 327 *
331 * This function returns zero in case of success, and %-ENOSPC in case of 328 * This function returns zero in case of success, and %-ENOSPC in case of
332 * failure. 329 * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
695 * This function calculates amount of free space to report to user-space. 692 * This function calculates amount of free space to report to user-space.
696 * 693 *
697 * Because UBIFS may introduce substantial overhead (the index, node headers, 694 * Because UBIFS may introduce substantial overhead (the index, node headers,
698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real 695 * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
699 * amount of free flash space it has (well, because not all dirty space is 696 * free flash space it has (well, because not all dirty space is reclaimable,
700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 697 * UBIFS does not actually know the real amount). If UBIFS did so, it would
701 * it would bread user expectations about what free space is. Users seem to 698 * bread user expectations about what free space is. Users seem to accustomed
702 * accustomed to assume that if the file-system reports N bytes of free space, 699 * to assume that if the file-system reports N bytes of free space, they would
703 * they would be able to fit a file of N bytes to the FS. This almost works for 700 * be able to fit a file of N bytes to the FS. This almost works for
704 * traditional file-systems, because they have way less overhead than UBIFS. 701 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 702 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 703 */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
479 "bad or corrupted node)"); 479 "bad or corrupted node)");
480 else { 480 else {
481 for (i = 0; i < nlen && dent->name[i]; i++) 481 for (i = 0; i < nlen && dent->name[i]; i++)
482 printk("%c", dent->name[i]); 482 printk(KERN_CONT "%c", dent->name[i]);
483 } 483 }
484 printk("\n"); 484 printk(KERN_CONT "\n");
485 485
486 break; 486 break;
487 } 487 }
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
1214 1214
1215 /* 1215 /*
1216 * Make sure the last key in our znode is less or 1216 * Make sure the last key in our znode is less or
1217 * equivalent than the the key in zbranch which goes 1217 * equivalent than the key in the zbranch which goes
1218 * after our pointing zbranch. 1218 * after our pointing zbranch.
1219 */ 1219 */
1220 cmp = keys_cmp(c, max, 1220 cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
430 struct ubifs_inode *ui = ubifs_inode(inode); 430 struct ubifs_inode *ui = ubifs_inode(inode);
431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 int skipped_read = 0;
433 struct page *page; 434 struct page *page;
434 435
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
453 * the media. 454 * the media.
454 */ 455 */
455 SetPageChecked(page); 456 SetPageChecked(page);
456 else { 457 skipped_read = 1;
458 } else {
457 err = do_readpage(page); 459 err = do_readpage(page);
458 if (err) { 460 if (err) {
459 unlock_page(page); 461 unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
470 if (unlikely(err)) { 472 if (unlikely(err)) {
471 ubifs_assert(err == -ENOSPC); 473 ubifs_assert(err == -ENOSPC);
472 /* 474 /*
475 * If we skipped reading the page because we were going to
476 * write all of it, then it is not up to date.
477 */
478 if (skipped_read) {
479 ClearPageChecked(page);
480 ClearPageUptodate(page);
481 }
482 /*
473 * Budgeting failed which means it would have to force 483 * Budgeting failed which means it would have to force
474 * write-back but didn't, because we set the @fast flag in the 484 * write-back but didn't, because we set the @fast flag in the
475 * request. Write-back cannot be done now, while we have the 485 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
949 * whole index and correct all inode sizes, which is long an unacceptable. 959 * whole index and correct all inode sizes, which is long an unacceptable.
950 * 960 *
951 * To prevent situations like this, UBIFS writes pages back only if they are 961 * To prevent situations like this, UBIFS writes pages back only if they are
952 * within last synchronized inode size, i.e. the the size which has been 962 * within the last synchronized inode size, i.e. the size which has been
953 * written to the flash media last time. Otherwise, UBIFS forces inode 963 * written to the flash media last time. Otherwise, UBIFS forces inode
954 * write-back, thus making sure the on-flash inode contains current inode size, 964 * write-back, thus making sure the on-flash inode contains current inode size,
955 * and then keeps writing pages back. 965 * and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
478 * ubifs_find_free_space - find a data LEB with free space. 478 * ubifs_find_free_space - find a data LEB with free space.
479 * @c: the UBIFS file-system description object 479 * @c: the UBIFS file-system description object
480 * @min_space: minimum amount of required free space 480 * @min_space: minimum amount of required free space
481 * @free: contains amount of free space in the LEB on exit 481 * @offs: contains offset of where free space starts on exit
482 * @squeeze: whether to try to find space in a non-empty LEB first 482 * @squeeze: whether to try to find space in a non-empty LEB first
483 * 483 *
484 * This function looks for an LEB with at least @min_space bytes of free space. 484 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
490 * failed to find a LEB with @min_space bytes of free space and other a negative 490 * failed to find a LEB with @min_space bytes of free space and other a negative
491 * error codes in case of failure. 491 * error codes in case of failure.
492 */ 492 */
493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
494 int squeeze) 494 int squeeze)
495{ 495{
496 const struct ubifs_lprops *lprops; 496 const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
558 spin_unlock(&c->space_lock); 558 spin_unlock(&c->space_lock);
559 } 559 }
560 560
561 *free = lprops->free; 561 *offs = c->leb_size - lprops->free;
562 ubifs_release_lprops(c); 562 ubifs_release_lprops(c);
563 563
564 if (*free == c->leb_size) { 564 if (*offs == 0) {
565 /* 565 /*
566 * Ensure that empty LEBs have been unmapped. They may not have 566 * Ensure that empty LEBs have been unmapped. They may not have
567 * been, for example, because of an unclean unmount. Also 567 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
573 return err; 573 return err;
574 } 574 }
575 575
576 dbg_find("found LEB %d, free %d", lnum, *free); 576 dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
577 ubifs_assert(*free >= min_space); 577 ubifs_assert(*offs <= c->leb_size - min_space);
578 return lnum; 578 return lnum;
579 579
580out: 580out:
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47 * have to waste large pieces of free space at the end of LEB B, because nodes 47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of 48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so 52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57#include "ubifs.h" 57#include "ubifs.h"
58 58
59/* 59/*
60 * GC tries to optimize the way it fit nodes to available space, and it sorts
61 * nodes a little. The below constants are watermarks which define "large",
62 * "medium", and "small" nodes.
63 */
64#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67/*
68 * GC may need to move more than one LEB to make progress. The below constants 60 * GC may need to move more than one LEB to make progress. The below constants
69 * define "soft" and "hard" limits on the number of LEBs the garbage collector 61 * define "soft" and "hard" limits on the number of LEBs the garbage collector
70 * may move. 62 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116} 108}
117 109
118/** 110/**
119 * joinup - bring data nodes for an inode together. 111 * list_sort - sort a list.
120 * @c: UBIFS file-system description object 112 * @priv: private data, passed to @cmp
121 * @sleb: describes scanned LEB 113 * @head: the list to sort
122 * @inum: inode number 114 * @cmp: the elements comparison function
123 * @blk: block number
124 * @data: list to which to add data nodes
125 * 115 *
126 * This function looks at the first few nodes in the scanned LEB @sleb and adds 116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
127 * them to @data if they are data nodes from @inum and have a larger block 117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
128 * number than @blk. This function returns %0 on success and a negative error 118 * in ascending order.
129 * code on failure. 119 *
 120 * The comparison function @cmp is supposed to return a negative value if @a is
 121 * less than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
130 */ 123 */
131static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124static void list_sort(void *priv, struct list_head *head,
132 unsigned int blk, struct list_head *data) 125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
133{ 127{
134 int err, cnt = 6, lnum = sleb->lnum, offs; 128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
135 struct ubifs_scan_node *snod, *tmp; 129 int insize, nmerges, psize, qsize, i;
136 union ubifs_key *key; 130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
137 152
138 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 153 qsize = insize;
139 key = &snod->key; 154 while (psize > 0 || (qsize > 0 && q)) {
140 if (key_inum(c, key) == inum && 155 if (!psize) {
141 key_type(c, key) == UBIFS_DATA_KEY && 156 e = q;
142 key_block(c, key) > blk) { 157 q = q->next;
143 offs = snod->offs; 158 qsize--;
144 err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 159 if (q == oldhead)
145 if (err < 0) 160 q = NULL;
146 return err; 161 } else if (!qsize || !q) {
147 list_del(&snod->list); 162 e = p;
148 if (err) { 163 p = p->next;
149 list_add_tail(&snod->list, data); 164 psize--;
150 blk = key_block(c, key); 165 if (p == oldhead)
151 } else 166 p = NULL;
152 kfree(snod); 167 } else if (cmp(priv, p, q) <= 0) {
153 cnt = 6; 168 e = p;
154 } else if (--cnt == 0) 169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
155 break; 194 break;
195
196 insize *= 2;
156 } 197 }
157 return 0; 198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
158} 203}
159 204
160/** 205/**
161 * move_nodes - move nodes. 206 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object
208 * @a: first data node
 209 * @b: second data node
210 *
211 * This function compares data nodes @a and @b. Returns %1 if @a has greater
212 * inode or block number, and %-1 otherwise.
213 */
214int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
215{
216 ino_t inuma, inumb;
217 struct ubifs_info *c = priv;
218 struct ubifs_scan_node *sa, *sb;
219
220 cond_resched();
221 sa = list_entry(a, struct ubifs_scan_node, list);
222 sb = list_entry(b, struct ubifs_scan_node, list);
223 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
224 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
225
226 inuma = key_inum(c, &sa->key);
227 inumb = key_inum(c, &sb->key);
228
229 if (inuma == inumb) {
230 unsigned int blka = key_block(c, &sa->key);
231 unsigned int blkb = key_block(c, &sb->key);
232
233 if (blka <= blkb)
234 return -1;
235 } else if (inuma <= inumb)
236 return -1;
237
238 return 1;
239}
240
241/*
242 * nondata_nodes_cmp - compare 2 non-data nodes.
243 * @priv: UBIFS file-system description object
244 * @a: first node
 245 * @b: second node
246 *
247 * This function compares nodes @a and @b. It makes sure that inode nodes go
248 * first and sorted by length in descending order. Directory entry nodes go
 249 * after inode nodes and are sorted in ascending hash value order.
250 */
251int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
252{
253 int typea, typeb;
254 ino_t inuma, inumb;
255 struct ubifs_info *c = priv;
256 struct ubifs_scan_node *sa, *sb;
257
258 cond_resched();
259 sa = list_entry(a, struct ubifs_scan_node, list);
260 sb = list_entry(b, struct ubifs_scan_node, list);
261 typea = key_type(c, &sa->key);
262 typeb = key_type(c, &sb->key);
263 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
264
265 /* Inodes go before directory entries */
266 if (typea == UBIFS_INO_KEY) {
267 if (typeb == UBIFS_INO_KEY)
268 return sb->len - sa->len;
269 return -1;
270 }
271 if (typeb == UBIFS_INO_KEY)
272 return 1;
273
274 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
275 inuma = key_inum(c, &sa->key);
276 inumb = key_inum(c, &sb->key);
277
278 if (inuma == inumb) {
279 uint32_t hasha = key_hash(c, &sa->key);
280 uint32_t hashb = key_hash(c, &sb->key);
281
282 if (hasha <= hashb)
283 return -1;
284 } else if (inuma <= inumb)
285 return -1;
286
287 return 1;
288}
289
290/**
291 * sort_nodes - sort nodes for GC.
162 * @c: UBIFS file-system description object 292 * @c: UBIFS file-system description object
163 * @sleb: describes nodes to move 293 * @sleb: describes nodes to sort and contains the result on exit
294 * @nondata: contains non-data nodes on exit
295 * @min: minimum node size is returned here
164 * 296 *
165 * This function moves valid nodes from data LEB described by @sleb to the GC 297 * This function sorts the list of inodes to garbage collect. First of all, it
166 * journal head. The obsolete nodes are dropped. 298 * kills obsolete nodes and separates data and non-data nodes to the
299 * @sleb->nodes and @nondata lists correspondingly.
300 *
301 * Data nodes are then sorted in block number order - this is important for
302 * bulk-read; data nodes with lower inode number go before data nodes with
303 * higher inode number, and data nodes with lower block number go before data
304 * nodes with higher block number;
167 * 305 *
168 * When moving nodes we have to deal with classical bin-packing problem: the 306 * Non-data nodes are sorted as follows.
169 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 307 * o First go inode nodes - they are sorted in descending length order.
170 * where the nodes in the @sleb->nodes list are the elements which should be 308 * o Then go directory entry nodes - they are sorted in hash order, which
171 * fit optimally to the bins. This function uses the "first fit decreasing" 309 * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172 * strategy, although it does not really sort the nodes but just split them on 310 * inode number go before direntry nodes with higher parent inode number,
173 * 3 classes - large, medium, and small, so they are roughly sorted. 311 * and direntry nodes with lower name hash values go before direntry nodes
312 * with higher name hash values.
174 * 313 *
175 * This function returns zero in case of success, %-EAGAIN if commit is 314 * This function returns zero in case of success and a negative error code in
176 * required, and other negative error codes in case of other failures. 315 * case of failure.
177 */ 316 */
178static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 317static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
318 struct list_head *nondata, int *min)
179{ 319{
180 struct ubifs_scan_node *snod, *tmp; 320 struct ubifs_scan_node *snod, *tmp;
181 struct list_head data, large, medium, small;
182 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183 int avail, err, min = INT_MAX;
184 unsigned int blk = 0;
185 ino_t inum = 0;
186 321
187 INIT_LIST_HEAD(&data); 322 *min = INT_MAX;
188 INIT_LIST_HEAD(&large);
189 INIT_LIST_HEAD(&medium);
190 INIT_LIST_HEAD(&small);
191 323
192 while (!list_empty(&sleb->nodes)) { 324 /* Separate data nodes and non-data nodes */
193 struct list_head *lst = sleb->nodes.next; 325 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194 326 int err;
195 snod = list_entry(lst, struct ubifs_scan_node, list);
196 327
197 ubifs_assert(snod->type != UBIFS_IDX_NODE); 328 ubifs_assert(snod->type != UBIFS_IDX_NODE);
198 ubifs_assert(snod->type != UBIFS_REF_NODE); 329 ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202 snod->offs, 0); 333 snod->offs, 0);
203 if (err < 0) 334 if (err < 0)
204 goto out; 335 return err;
205 336
206 list_del(lst);
207 if (!err) { 337 if (!err) {
208 /* The node is obsolete, remove it from the list */ 338 /* The node is obsolete, remove it from the list */
339 list_del(&snod->list);
209 kfree(snod); 340 kfree(snod);
210 continue; 341 continue;
211 } 342 }
212 343
213 /* 344 if (snod->len < *min)
214 * Sort the list of nodes so that data nodes go first, large 345 *min = snod->len;
215 * nodes go second, and small nodes go last. 346
216 */ 347 if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217 if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 348 list_move_tail(&snod->list, nondata);
218 if (inum != key_inum(c, &snod->key)) {
219 if (inum) {
220 /*
221 * Try to move data nodes from the same
222 * inode together.
223 */
224 err = joinup(c, sleb, inum, blk, &data);
225 if (err)
226 goto out;
227 }
228 inum = key_inum(c, &snod->key);
229 blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
 361 * @snod: the node to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
 139 * Write buffer wasn't seek'ed or there is not enough space - look for an 139 * Write buffer wasn't seek'ed or there is not enough space - look for an
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
1765 * o %3 - check that we've written the right number of bytes.
1766 * o %4 - wasted @len bytes;
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if more data is not found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
483/** 459/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
 147 * race with each other. This is not a problem but it does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..faa44f90608a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
41 * format is changing. If this happens, UBIFS is will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
43 * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
46 * UBIFS went into mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders which only need R/O mounting. With
59 * this flag it is possible to do UBIFS format changes without a need to update
60 * boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
 57 * node uncompressed, because it'll be read faster. 79 * node uncompressed, because it'll be read faster.
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
618/** 642/**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,