99 files changed, 11299 insertions, 760 deletions
diff --git a/fs/9p/9p.c b/fs/9p/9p.c
new file mode 100644
index 000000000000..e847f504a47c
--- /dev/null
+++ b/fs/9p/9p.c
@@ -0,0 +1,359 @@
+/*
+ *  linux/fs/9p/9p.c
+ *
+ *  This file contains the 9P2000 protocol request (T-message) functions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "mux.h"
+/**
+ * v9fs_t_version - negotiate protocol parameters with server
+ * @v9ses: 9P2000 session information
+ * @msize: maximum packet size being requested
+ * @version: version.extension string being requested
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * Builds a TVERSION request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.
+ */
+int
+v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
+               char *version, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version);
+
+        tc.id = TVERSION;
+        tc.params.tversion.version = version;
+        tc.params.tversion.msize = msize;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_attach - mount the server
+ * @v9ses: 9P2000 session information
+ * @uname: name of the user doing the attach
+ * @aname: remote name being attached to
+ * @fid: mount fid to attach to the root node
+ * @afid: authentication fid (in this case result key)
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * Builds a TATTACH request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.
+ */
+int
+v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
+              u32 fid, u32 afid, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname,
+                aname, fid, afid);
+
+        tc.id = TATTACH;
+        tc.params.tattach.uname = uname;
+        tc.params.tattach.aname = aname;
+        tc.params.tattach.fid = fid;
+        tc.params.tattach.afid = afid;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_clunk - release a fid (finish a transaction)
+ * @v9ses: 9P2000 session information
+ * @fid: fid to release
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * Builds a TCLUNK request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.
+ */
+int
+v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
+             struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "fid %d\n", fid);
+
+        tc.params.tclunk.fid = fid;
+        tc.id = TCLUNK;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_flush - flush a pending transaction
+ * @v9ses: 9P2000 session information
+ * @tag: tag of the transaction to flush
+ *
+ * Builds a TFLUSH request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.  No response pointer is requested (NULL is
+ * passed to the mux).
+ */
+int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "oldtag %d\n", tag);
+
+        tc.params.tflush.oldtag = tag;
+        tc.id = TFLUSH;
+
+        return v9fs_mux_rpc(v9ses, &tc, NULL);
+}
+/**
+ * v9fs_t_stat - read a file's meta-data
+ * @v9ses: 9P2000 session information
+ * @fid: fid pointing to the file or directory to get info about
+ * @fcall: pointer to response fcall pointer; may be NULL
+ *
+ * When @fcall is given it is cleared to NULL before the request is
+ * issued.  Returns the result of v9fs_mux_rpc() for the TSTAT request.
+ */
+int
+v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "fid %d\n", fid);
+
+        if (fcall != NULL)
+                *fcall = NULL;
+
+        tc.params.tstat.fid = fid;
+        tc.id = TSTAT;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_wstat - write a file's meta-data
+ * @v9ses: 9P2000 session information
+ * @fid: fid pointing to the file or directory to write info about
+ * @stat: metadata to write; dereferenced here, so it must not be NULL
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * Builds a TWSTAT request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.
+ */
+int
+v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
+             struct v9fs_stat *stat, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "fid %d length %d\n", fid, (int)stat->length);
+
+        tc.id = TWSTAT;
+        tc.params.twstat.stat = stat;
+        tc.params.twstat.fid = fid;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_walk - walk a fid to a new file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to walk
+ * @newfid: new fid (for clone operations)
+ * @name: single path element to walk to, or NULL for a zero-element walk
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * With a non-NULL @name the request carries exactly one name and wnames
+ * points at the @name parameter itself; with NULL the name count is 0
+ * and wnames is left unset.  Returns the result of v9fs_mux_rpc().
+ */
+/* TODO: support multiple walk */
+int
+v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
+            char *name, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name);
+
+        tc.id = TWALK;
+        tc.params.twalk.newfid = newfid;
+        tc.params.twalk.fid = fid;
+        tc.params.twalk.nwname = name ? 1 : 0;
+        if (name)
+                tc.params.twalk.wnames = &name;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_open - open a file
+ * @v9ses: 9P2000 session information
+ * @fid: fid to open
+ * @mode: mode to open file (R, RW, etc)
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * Builds a TOPEN request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.
+ */
+int
+v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
+            struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode);
+
+        tc.id = TOPEN;
+        tc.params.topen.mode = mode;
+        tc.params.topen.fid = fid;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_remove - remove a file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid of the file or directory to remove
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * Builds a TREMOVE request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.
+ */
+int
+v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
+              struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "fid %d\n", fid);
+
+        tc.params.tremove.fid = fid;
+        tc.id = TREMOVE;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_create - create a file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to create
+ * @name: name of the file or directory to create
+ * @perm: permissions to create with
+ * @mode: mode to open file (R, RW, etc)
+ * @fcall: pointer to response fcall pointer, handed on to v9fs_mux_rpc()
+ *
+ * Builds a TCREATE request on the stack and returns the result of
+ * v9fs_mux_rpc() for it.
+ */
+int
+v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
+              u32 perm, u8 mode, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+
+        dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n",
+                fid, name, perm, mode);
+
+        tc.id = TCREATE;
+        tc.params.tcreate.mode = mode;
+        tc.params.tcreate.perm = perm;
+        tc.params.tcreate.name = name;
+        tc.params.tcreate.fid = fid;
+
+        return v9fs_mux_rpc(v9ses, &tc, fcall);
+}
+/**
+ * v9fs_t_read - read data
+ * @v9ses: 9P2000 session information
+ * @fid: fid to read from
+ * @offset: offset to start read at
+ * @count: how many bytes to read
+ * @fcall: pointer to response fcall pointer (with data); may be NULL
+ *
+ * Returns the count field of the Rread reply when v9fs_mux_rpc() returns
+ * 0, otherwise the value v9fs_mux_rpc() returned.  The reply is handed to
+ * the caller through @fcall, or released with kfree() when @fcall is NULL.
+ */
+int
+v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
+            u32 count, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall msg;
+        struct v9fs_fcall *rc = NULL;
+        long errorno;
+
+        /*
+         * offset is 64 bits wide; print all of it (as v9fs_t_write does)
+         * instead of truncating it to unsigned long on 32-bit builds.
+         */
+        dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid,
+                (unsigned long long)offset, count);
+
+        msg.id = TREAD;
+        msg.params.tread.fid = fid;
+        msg.params.tread.offset = offset;
+        msg.params.tread.count = count;
+
+        errorno = v9fs_mux_rpc(v9ses, &msg, &rc);
+        if (!errorno) {
+                /* success: report how many bytes the reply carries */
+                errorno = rc->params.rread.count;
+                dump_data(rc->params.rread.data, rc->params.rread.count);
+        }
+
+        if (fcall)
+                *fcall = rc;
+        else
+                kfree(rc);
+
+        return errorno;
+}
+/**
+ * v9fs_t_write - write data
+ * @v9ses: 9P2000 session information
+ * @fid: fid to write to
+ * @offset: offset to start write at
+ * @count: how many bytes to write
+ * @data: buffer holding the @count bytes to send
+ * @fcall: pointer to response fcall pointer; may be NULL
+ *
+ * Returns the count field of the Rwrite reply when v9fs_mux_rpc() returns
+ * 0, otherwise the value v9fs_mux_rpc() returned.  The reply is handed to
+ * the caller through @fcall, or released with kfree() when @fcall is NULL.
+ */
+int
+v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid,
+             u64 offset, u32 count, void *data, struct v9fs_fcall **fcall)
+{
+        struct v9fs_fcall tc;
+        struct v9fs_fcall *reply = NULL;
+        long ret;
+
+        dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid,
+                (unsigned long long)offset, count);
+        dump_data(data, count);
+
+        tc.id = TWRITE;
+        tc.params.twrite.data = data;
+        tc.params.twrite.count = count;
+        tc.params.twrite.offset = offset;
+        tc.params.twrite.fid = fid;
+
+        ret = v9fs_mux_rpc(v9ses, &tc, &reply);
+        if (ret == 0)
+                ret = reply->params.rwrite.count;
+
+        if (!fcall)
+                kfree(reply);
+        else
+                *fcall = reply;
+
+        return ret;
+}
diff --git a/fs/9p/9p.h b/fs/9p/9p.h
new file mode 100644
index 000000000000..f55424216be2
--- /dev/null
+++ b/fs/9p/9p.h
@@ -0,0 +1,341 @@
+/*
+ * linux/fs/9p/9p.h
+ *
+ * 9P protocol definitions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+/*
+ * Message Types
+ *
+ * Values stored in v9fs_fcall.id.  Only the T-messages are numbered
+ * explicitly; each R-message implicitly takes the next value (T + 1).
+ */
+enum {
+        TVERSION = 100,
+        RVERSION,
+        TAUTH = 102,
+        RAUTH,
+        TATTACH = 104,
+        RATTACH,
+        TERROR = 106,
+        RERROR,
+        TFLUSH = 108,
+        RFLUSH,
+        TWALK = 110,
+        RWALK,
+        TOPEN = 112,
+        ROPEN,
+        TCREATE = 114,
+        RCREATE,
+        TREAD = 116,
+        RREAD,
+        TWRITE = 118,
+        RWRITE,
+        TCLUNK = 120,
+        RCLUNK,
+        TREMOVE = 122,
+        RREMOVE,
+        TSTAT = 124,
+        RSTAT,
+        TWSTAT = 126,
+        RWSTAT,
+};
+/*
+ * modes (V9FS_O*): 0x00-0x03 are consecutive values, everything from
+ * 0x04 up is a single bit -- presumably OR-ed onto one of the first
+ * four; confirm against the callers of v9fs_t_open()/v9fs_t_create()
+ */
+enum {
+        V9FS_OREAD = 0x00,
+        V9FS_OWRITE = 0x01,
+        V9FS_ORDWR = 0x02,
+        V9FS_OEXEC = 0x03,
+        V9FS_OEXCL = 0x04,
+        V9FS_OTRUNC = 0x10,
+        V9FS_OREXEC = 0x20,
+        V9FS_ORCLOSE = 0x40,
+        V9FS_OAPPEND = 0x80,
+};
+/*
+ * permissions (V9FS_DM*): each value is a single bit in the upper half
+ * of a 32-bit word.  The top eight (DMDIR..DMLINK) line up with the
+ * V9FS_QT* qid type bits shifted left by 24.
+ */
+enum {
+        V9FS_DMDIR = 0x80000000,
+        V9FS_DMAPPEND = 0x40000000,
+        V9FS_DMEXCL = 0x20000000,
+        V9FS_DMMOUNT = 0x10000000,
+        V9FS_DMAUTH = 0x08000000,
+        V9FS_DMTMP = 0x04000000,
+        V9FS_DMSYMLINK = 0x02000000,
+        V9FS_DMLINK = 0x01000000,
+        /* 9P2000.u extensions */
+        V9FS_DMDEVICE = 0x00800000,
+        V9FS_DMNAMEDPIPE = 0x00200000,
+        V9FS_DMSOCKET = 0x00100000,
+        V9FS_DMSETUID = 0x00080000,
+        V9FS_DMSETGID = 0x00040000,
+};
+/*
+ * qid.types: bit values for v9fs_qid.type.  Each V9FS_QTx equals the
+ * matching V9FS_DMx permission bit shifted right by 24; V9FS_QTFILE (0)
+ * is the absence of every bit.
+ */
+enum {
+        V9FS_QTDIR = 0x80,
+        V9FS_QTAPPEND = 0x40,
+        V9FS_QTEXCL = 0x20,
+        V9FS_QTMOUNT = 0x10,
+        V9FS_QTAUTH = 0x08,
+        V9FS_QTTMP = 0x04,
+        V9FS_QTSYMLINK = 0x02,
+        V9FS_QTLINK = 0x01,
+        V9FS_QTFILE = 0x00,
+};
+/*
+ * ample room for Twrite/Rread header (iounit); the fixed part of a
+ * Twrite is size[4] id[1] tag[2] fid[4] offset[8] count[4] = 23 bytes
+ */
+#define V9FS_IOHDRSZ    24
+/* qids are the unique ID for a file (like an inode) */
+struct v9fs_qid {
+        u8 type;                /* V9FS_QT* bits; 1 byte on the wire */
+        u32 version;            /* 4 bytes on the wire (qid.vers) */
+        u64 path;               /* 8 bytes on the wire */
+};
+/*
+ * Plan 9 file metadata (stat) structure
+ *
+ * In-memory form of a stat record.  The string members are pointers;
+ * v9fs_deserialize_stat() stores the decoded strings in the memory that
+ * directly follows the structure (see data[]).
+ */
+struct v9fs_stat {
+        u16 size;               /* encoded length, not counting this field */
+        u16 type;
+        u32 dev;
+        struct v9fs_qid qid;
+        u32 mode;
+        u32 atime;
+        u32 mtime;
+        u64 length;
+        char *name;
+        char *uid;
+        char *gid;
+        char *muid;
+        char *extension;        /* 9p2000.u extensions */
+        u32 n_uid;              /* 9p2000.u extensions */
+        u32 n_gid;              /* 9p2000.u extensions */
+        u32 n_muid;             /* 9p2000.u extensions */
+        char data[0];           /* start of the trailing string storage */
+};
+/*
+ * Structures for Protocol Operations
+ *
+ * One parameter structure per message type; they are overlaid in the
+ * params union of struct v9fs_fcall.
+ */
+struct Tversion {               /* wire: msize[4] version[s] */
+        u32 msize;
+        char *version;
+};
+struct Rversion {
+        u32 msize;
+        char *version;
+};
+struct Tauth {                  /* wire: afid[4] uname[s] aname[s] */
+        u32 afid;
+        char *uname;
+        char *aname;
+};
+struct Rauth {
+        struct v9fs_qid qid;
+};
+struct Rerror {
+        char *error;            /* the string FCALL_ERROR() yields */
+        u32 errno;              /* 9p2000.u extension */
+};
+struct Tflush {                 /* wire: oldtag[2] */
+        u32 oldtag;             /* serialized with buf_put_int16(): low 16 bits only */
+};
+struct Rflush {                 /* no parameters */
+};
+struct Tattach {                /* wire: fid[4] afid[4] uname[s] aname[s] */
+        u32 fid;
+        u32 afid;
+        char *uname;
+        char *aname;
+};
+struct Rattach {
+        struct v9fs_qid qid;
+};
+struct Twalk {                  /* wire: fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
+        u32 fid;
+        u32 newfid;
+        u32 nwname;             /* number of entries in wnames */
+        char **wnames;          /* v9fs_t_walk() leaves this unset when nwname is 0 */
+};
+struct Rwalk {
+        u32 nwqid;
+        struct v9fs_qid *wqids;
+};
+struct Topen {                  /* wire: fid[4] mode[1] */
+        u32 fid;
+        u8 mode;
+};
+struct Ropen {
+        struct v9fs_qid qid;
+        u32 iounit;
+};
+struct Tcreate {                /* wire: fid[4] name[s] perm[4] mode[1] */
+        u32 fid;
+        char *name;
+        u32 perm;
+        u8 mode;
+};
+struct Rcreate {
+        struct v9fs_qid qid;
+        u32 iounit;
+};
+struct Tread {                  /* wire: fid[4] offset[8] count[4] */
+        u32 fid;
+        u64 offset;
+        u32 count;
+};
+struct Rread {
+        u32 count;              /* v9fs_t_read() returns this on success */
+        u8 *data;
+};
+struct Twrite {                 /* wire: fid[4] offset[8] count[4] data[count] */
+        u32 fid;
+        u64 offset;
+        u32 count;
+        u8 *data;
+};
+struct Rwrite {
+        u32 count;              /* v9fs_t_write() returns this on success */
+};
+struct Tclunk {                 /* wire: fid[4] */
+        u32 fid;
+};
+struct Rclunk {                 /* no parameters */
+};
+struct Tremove {                /* wire: fid[4] */
+        u32 fid;
+};
+struct Rremove {                /* no parameters */
+};
+struct Tstat {                  /* wire: fid[4] */
+        u32 fid;
+};
+struct Rstat {
+        struct v9fs_stat *stat;
+};
+struct Twstat {                 /* wire: fid[4] stat[n] */
+        u32 fid;
+        struct v9fs_stat *stat; /* its size member is filled in by v9fs_size_fcall() */
+};
+struct Rwstat {                 /* no parameters */
+};
+/*
+ * fcall is the primary packet structure
+ *
+ * The in-memory form of one 9P message: the common header fields
+ * followed by a union holding the parameters of whichever message
+ * type the id field selects.
+ */
+struct v9fs_fcall {
+        u32 size;               /* total message size; set by v9fs_serialize_fcall() */
+        u8 id;                  /* message type (TVERSION ... RWSTAT) */
+        u16 tag;
+        union {
+                struct Tversion tversion;
+                struct Rversion rversion;
+                struct Tauth tauth;
+                struct Rauth rauth;
+                struct Rerror rerror;
+                struct Tflush tflush;
+                struct Rflush rflush;
+                struct Tattach tattach;
+                struct Rattach rattach;
+                struct Twalk twalk;
+                struct Rwalk rwalk;
+                struct Topen topen;
+                struct Ropen ropen;
+                struct Tcreate tcreate;
+                struct Rcreate rcreate;
+                struct Tread tread;
+                struct Rread rread;
+                struct Twrite twrite;
+                struct Rwrite rwrite;
+                struct Tclunk tclunk;
+                struct Rclunk rclunk;
+                struct Tremove tremove;
+                struct Rremove rremove;
+                struct Tstat tstat;
+                struct Rstat rstat;
+                struct Twstat twstat;
+                struct Rwstat rwstat;
+        } params;
+};
+#define FCALL_ERROR(fcall) (fcall ? fcall->params.rerror.error : "")
+int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
+                   char *version, struct v9fs_fcall **rcall);      /* sends TVERSION */
+int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
+                  u32 fid, u32 afid, struct v9fs_fcall **rcall);   /* sends TATTACH */
+int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
+                 struct v9fs_fcall **rcall);                       /* sends TCLUNK */
+int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag);     /* sends TFLUSH; no reply pointer */
+int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid,
+                struct v9fs_fcall **rcall);                        /* sends TSTAT */
+int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
+                 struct v9fs_stat *stat, struct v9fs_fcall **rcall);       /* sends TWSTAT */
+int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
+                char *name, struct v9fs_fcall **rcall);            /* sends TWALK with 0 or 1 names */
+int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
+                struct v9fs_fcall **rcall);                        /* sends TOPEN */
+int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
+                  struct v9fs_fcall **rcall);                      /* sends TREMOVE */
+int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
+                  u32 perm, u8 mode, struct v9fs_fcall **rcall);   /* sends TCREATE */
+int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid,
+                u64 offset, u32 count, struct v9fs_fcall **rcall); /* sends TREAD; returns reply count on success */
+int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
+                 u32 count, void *data, struct v9fs_fcall **rcall);        /* sends TWRITE; returns reply count on success */
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
new file mode 100644
index 000000000000..e4e4ffe5a7dc
--- /dev/null
+++ b/fs/9p/Makefile
@@ -0,0 +1,17 @@
+# 9p2000.o is a composite object built from every file listed in 9p2000-objs
+obj-$(CONFIG_9P_FS) := 9p2000.o
+9p2000-objs := \
+        vfs_super.o \
+        vfs_inode.o \
+        vfs_file.o \
+        vfs_dir.o \
+        vfs_dentry.o \
+        error.o \
+        mux.o \
+        trans_fd.o \
+        trans_sock.o \
+        9p.o \
+        conv.o \
+        v9fs.o \
+        fid.o
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
new file mode 100644
index 000000000000..1554731bd653
--- /dev/null
+++ b/fs/9p/conv.c
@@ -0,0 +1,693 @@
+/*
+ * linux/fs/9p/conv.c
+ *
+ * 9P protocol conversion functions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "conv.h"
+/*
+ * Buffer to help with string parsing
+ *
+ * A cursor over a byte range: buf_init() sets sp and p to the start and
+ * ep to start + length.  A cursor beyond ep (p > ep) is the "overflowed"
+ * state that buf_check_overflow() reports.
+ */
+struct cbuf {
+        unsigned char *sp;      /* start of the buffer */
+        unsigned char *p;       /* current read/write position */
+        unsigned char *ep;      /* end of the buffer (start + length) */
+};
+/* buf_init - aim @buf at the @datalen bytes at @data, cursor at the start */
+static inline void buf_init(struct cbuf *buf, void *data, int datalen)
+{
+        buf->sp = data;
+        buf->p = buf->sp;
+        buf->ep = data + datalen;
+}
+/* buf_check_overflow - non-zero once the cursor has moved beyond the end */
+static inline int buf_check_overflow(struct cbuf *buf)
+{
+        return buf->ep < buf->p;
+}
+/*
+ * buf_check_size - check that @len more bytes lie between the cursor and
+ * the end of @buf
+ *
+ * Returns 1 when the access is in bounds and 0 otherwise, so that callers
+ * can skip the access instead of touching memory outside the buffer.
+ * On failure the cursor is parked at ep + 1, the state that
+ * buf_check_overflow() reports; the error is logged only when the buffer
+ * was not already marked.  A negative @len is treated as out of bounds.
+ */
+static inline int buf_check_size(struct cbuf *buf, int len)
+{
+        if (len >= 0 && buf->p <= buf->ep && len <= buf->ep - buf->p)
+                return 1;
+
+        if (buf->p <= buf->ep) {
+                eprintk(KERN_ERR, "buffer overflow\n");
+                buf->p = buf->ep + 1;
+        }
+        return 0;
+}
+
+/*
+ * buf_alloc - reserve @len bytes at the cursor
+ *
+ * Returns a pointer to the reserved bytes, or NULL (with the buffer
+ * marked overflowed) when they do not fit.
+ */
+static inline void *buf_alloc(struct cbuf *buf, int len)
+{
+        void *ret = NULL;
+
+        if (buf_check_size(buf, len)) {
+                ret = buf->p;
+                buf->p += len;
+        }
+        return ret;
+}
+
+/*
+ * buf_put_* - append a value at the cursor (integers are little-endian)
+ *
+ * When the value does not fit nothing is written and the buffer is
+ * marked overflowed; callers test buf_check_overflow() afterwards.
+ */
+static inline void buf_put_int8(struct cbuf *buf, u8 val)
+{
+        if (!buf_check_size(buf, 1))
+                return;
+        buf->p[0] = val;
+        buf->p++;
+}
+
+static inline void buf_put_int16(struct cbuf *buf, u16 val)
+{
+        if (!buf_check_size(buf, 2))
+                return;
+        *(__le16 *)buf->p = cpu_to_le16(val);
+        buf->p += 2;
+}
+
+static inline void buf_put_int32(struct cbuf *buf, u32 val)
+{
+        if (!buf_check_size(buf, 4))
+                return;
+        *(__le32 *)buf->p = cpu_to_le32(val);
+        buf->p += 4;
+}
+
+static inline void buf_put_int64(struct cbuf *buf, u64 val)
+{
+        if (!buf_check_size(buf, 8))
+                return;
+        *(__le64 *)buf->p = cpu_to_le64(val);
+        buf->p += 8;
+}
+
+/* buf_put_stringn - append a 2-byte length followed by @slen bytes of @s */
+static inline void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
+{
+        /* check prefix and body together so a string is never half written */
+        if (!buf_check_size(buf, slen + 2))
+                return;
+        buf_put_int16(buf, slen);
+        memcpy(buf->p, s, slen);
+        buf->p += slen;
+}
+
+/*
+ * buf_put_string - append a NUL-terminated string (terminator not sent)
+ *
+ * A NULL @s is written as an empty string; v9fs_size_stat() sizes NULL
+ * stat strings the same way (length prefix only).
+ */
+static inline void buf_put_string(struct cbuf *buf, const char *s)
+{
+        if (!s)
+                s = "";
+        buf_put_stringn(buf, s, strlen(s));
+}
+
+/* buf_put_data - append @datalen raw bytes from @data */
+static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen)
+{
+        if (!buf_check_size(buf, datalen))
+                return;
+        memcpy(buf->p, data, datalen);
+        buf->p += datalen;
+}
+
+/*
+ * buf_get_int* - read a little-endian integer at the cursor
+ *
+ * When the value is not fully inside the buffer nothing is read, 0 is
+ * returned and the buffer is marked overflowed.
+ */
+static inline u8 buf_get_int8(struct cbuf *buf)
+{
+        u8 ret = 0;
+
+        if (buf_check_size(buf, 1)) {
+                ret = buf->p[0];
+                buf->p++;
+        }
+        return ret;
+}
+
+static inline u16 buf_get_int16(struct cbuf *buf)
+{
+        u16 ret = 0;
+
+        if (buf_check_size(buf, 2)) {
+                ret = le16_to_cpu(*(__le16 *)buf->p);
+                buf->p += 2;
+        }
+        return ret;
+}
+
+static inline u32 buf_get_int32(struct cbuf *buf)
+{
+        u32 ret = 0;
+
+        if (buf_check_size(buf, 4)) {
+                ret = le32_to_cpu(*(__le32 *)buf->p);
+                buf->p += 4;
+        }
+        return ret;
+}
+
+static inline u64 buf_get_int64(struct cbuf *buf)
+{
+        u64 ret = 0;
+
+        if (buf_check_size(buf, 8)) {
+                ret = le64_to_cpu(*(__le64 *)buf->p);
+                buf->p += 8;
+        }
+        return ret;
+}
+
+/*
+ * buf_get_string - copy a length-prefixed string out of @buf
+ * @data: destination; receives the string plus a terminating NUL
+ * @datalen: room available at @data
+ *
+ * Returns the number of bytes stored (string length + 1), or 0 when the
+ * string is not completely inside @buf or does not fit in @data.
+ *
+ * NOTE(review): when @data is too small the cursor is left on the string
+ * body and nothing is marked as overflowed; callers only see the 0.
+ */
+static inline int
+buf_get_string(struct cbuf *buf, char *data, unsigned int datalen)
+{
+        u16 len = buf_get_int16(buf);
+
+        /* also fails when reading the length prefix itself overflowed */
+        if (!buf_check_size(buf, len))
+                return 0;
+        if (len + 1 > datalen)
+                return 0;
+        memcpy(data, buf->p, len);
+        data[len] = 0;
+        buf->p += len;
+        return len + 1;
+}
+
+/*
+ * buf_get_stringb - read a string from @buf into the storage buffer @sbuf
+ *
+ * Returns a pointer to the NUL-terminated copy inside @sbuf, or NULL when
+ * the string could not be read or stored.
+ */
+static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf)
+{
+        char *ret = NULL;
+        int n;
+
+        /* an overflowed destination has p > ep: ep - p would be negative */
+        if (buf_check_overflow(sbuf))
+                return NULL;
+
+        n = buf_get_string(buf, (char *)sbuf->p, sbuf->ep - sbuf->p);
+        if (n > 0) {
+                ret = (char *)sbuf->p;
+                sbuf->p += n;
+        }
+        return ret;
+}
+
+/*
+ * buf_get_data - copy @datalen raw bytes out of @buf into @data
+ *
+ * Returns @datalen, or 0 (with @buf marked overflowed) when that many
+ * bytes are not available.
+ */
+static inline int buf_get_data(struct cbuf *buf, void *data, int datalen)
+{
+        if (!buf_check_size(buf, datalen))
+                return 0;
+        memcpy(data, buf->p, datalen);
+        buf->p += datalen;
+        return datalen;
+}
+
+/*
+ * buf_get_datab - read @datalen raw bytes from @buf into the storage
+ * buffer @dbuf
+ *
+ * Returns a pointer to the copy inside @dbuf, or NULL when @dbuf has no
+ * room, @buf is short, or @datalen is 0.
+ */
+static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf,
+                                  int datalen)
+{
+        void *ret = NULL;
+        int n;
+
+        if (!buf_check_size(dbuf, datalen))
+                return NULL;
+
+        n = buf_get_data(buf, dbuf->p, datalen);
+        if (n > 0) {
+                ret = dbuf->p;
+                dbuf->p += n;
+        }
+        return ret;
+}
+/**
+ * v9fs_size_stat - calculate the size of a variable length stat struct
+ * @v9ses: session information (selects the extended layout)
+ * @stat: metadata (stat) structure
+ *
+ * Returns the encoded length of @stat without its leading size[2] field,
+ * or 0 (after logging) when @stat is NULL.  NULL strings count as empty.
+ */
+static int v9fs_size_stat(struct v9fs_session_info *v9ses,
+                          struct v9fs_stat *stat)
+{
+        int total;
+
+        if (!stat) {
+                eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n");
+                return 0;
+        }
+
+        /* fixed-width part; the size[2] field itself is not counted */
+        total = 2 + 4           /* type[2] dev[4] */
+            + 1 + 4 + 8         /* qid.type[1] qid.vers[4] qid.path[8] */
+            + 4 + 4 + 4         /* mode[4] atime[4] mtime[4] */
+            + 8                 /* length[8] */
+            + 4 * 2;            /* length prefixes of name, uid, gid, muid */
+
+        total += stat->name ? strlen(stat->name) : 0;
+        total += stat->uid ? strlen(stat->uid) : 0;
+        total += stat->gid ? strlen(stat->gid) : 0;
+        total += stat->muid ? strlen(stat->muid) : 0;
+
+        if (!v9ses->extended)
+                return total;
+
+        /* 9P2000.u: n_uid[4] n_gid[4] n_muid[4] + extension length prefix[2] */
+        total += 4 + 4 + 4 + 2;
+        if (stat->extension)
+                total += strlen(stat->extension);
+
+        return total;
+}
+/**
+ * serialize_stat - safely format a stat structure for transmission
+ * @v9ses: session info
+ * @stat: metadata (stat) structure
+ * @bufp: buffer to serialize structure into
+ *
+ * Writes the fields in wire order, starting with stat->size, which the
+ * caller must already have filled in (v9fs_size_fcall() does so for
+ * TWSTAT).  The 9P2000.u fields are added only for extended sessions.
+ *
+ * Returns stat->size, or 0 if @bufp is marked overflowed afterwards.
+ */
+static int
+serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat,
+               struct cbuf *bufp)
+{
+        buf_put_int16(bufp, stat->size);
+        buf_put_int16(bufp, stat->type);
+        buf_put_int32(bufp, stat->dev);
+        buf_put_int8(bufp, stat->qid.type);
+        buf_put_int32(bufp, stat->qid.version);
+        buf_put_int64(bufp, stat->qid.path);
+        buf_put_int32(bufp, stat->mode);
+        buf_put_int32(bufp, stat->atime);
+        buf_put_int32(bufp, stat->mtime);
+        buf_put_int64(bufp, stat->length);
+        buf_put_string(bufp, stat->name);
+        buf_put_string(bufp, stat->uid);
+        buf_put_string(bufp, stat->gid);
+        buf_put_string(bufp, stat->muid);
+        if (v9ses->extended) {
+                buf_put_string(bufp, stat->extension);
+                buf_put_int32(bufp, stat->n_uid);
+                buf_put_int32(bufp, stat->n_gid);
+                buf_put_int32(bufp, stat->n_muid);
+        }
+        if (buf_check_overflow(bufp))
+                return 0;
+        return stat->size;
+}
+/**
+ * deserialize_stat - safely decode a received metadata (stat) structure
+ * @v9ses: session info
+ * @bufp: buffer to deserialize
+ * @stat: metadata (stat) structure
+ * @dbufp: buffer to deserialize variable strings into
+ *
+ * Reads the fields in wire order; the strings are copied into @dbufp and
+ * the pointers in @stat refer to those copies.  The 9P2000.u members
+ * (extension, n_uid, n_gid, n_muid) are assigned only for extended
+ * sessions and are otherwise left untouched.
+ *
+ * Returns stat->size + 2 (the record length including its size field),
+ * or 0 if either buffer is marked overflowed afterwards.
+ */
+static inline int
+deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp,
+                 struct v9fs_stat *stat, struct cbuf *dbufp)
+{
+        stat->size = buf_get_int16(bufp);
+        stat->type = buf_get_int16(bufp);
+        stat->dev = buf_get_int32(bufp);
+        stat->qid.type = buf_get_int8(bufp);
+        stat->qid.version = buf_get_int32(bufp);
+        stat->qid.path = buf_get_int64(bufp);
+        stat->mode = buf_get_int32(bufp);
+        stat->atime = buf_get_int32(bufp);
+        stat->mtime = buf_get_int32(bufp);
+        stat->length = buf_get_int64(bufp);
+        stat->name = buf_get_stringb(bufp, dbufp);
+        stat->uid = buf_get_stringb(bufp, dbufp);
+        stat->gid = buf_get_stringb(bufp, dbufp);
+        stat->muid = buf_get_stringb(bufp, dbufp);
+        if (v9ses->extended) {
+                stat->extension = buf_get_stringb(bufp, dbufp);
+                stat->n_uid = buf_get_int32(bufp);
+                stat->n_gid = buf_get_int32(bufp);
+                stat->n_muid = buf_get_int32(bufp);
+        }
+        if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
+                return 0;
+        return stat->size + 2;
+}
+/**
+ * deserialize_statb - wrapper for decoding a received metadata structure
+ * @v9ses: session info
+ * @bufp: buffer to deserialize
+ * @dbufp: buffer that supplies the structure and its variable strings
+ *
+ * Takes room for a struct v9fs_stat from @dbufp with buf_alloc() and
+ * decodes into it.  Returns the structure, or NULL when buf_alloc()
+ * yields NULL or deserialize_stat() reports failure.
+ */
+static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info
+                                                  *v9ses, struct cbuf *bufp,
+                                                  struct cbuf *dbufp)
+{
+        struct v9fs_stat *st = buf_alloc(dbufp, sizeof(struct v9fs_stat));
+
+        if (!st)
+                return st;
+
+        if (deserialize_stat(v9ses, bufp, st, dbufp) <= 0)
+                return NULL;
+
+        return st;
+}
+/**
+ * v9fs_deserialize_stat - decode a received metadata structure
+ * @v9ses: session info
+ * @buf: buffer to deserialize
+ * @buflen: length of received buffer
+ * @stat: metadata structure to decode into
+ * @statlen: length of destination metadata structure, including the
+ *           string storage that follows struct v9fs_stat
+ *
+ * The decoded strings are stored in the @statlen - sizeof(struct
+ * v9fs_stat) bytes directly behind @stat.
+ *
+ * Returns the value of deserialize_stat(): the record length on success
+ * or 0 on failure.  0 is also returned when @statlen cannot even hold
+ * the fixed structure.
+ */
+int
+v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf,
+                      u32 buflen, struct v9fs_stat *stat, u32 statlen)
+{
+        struct cbuf buffer;
+        struct cbuf *bufp = &buffer;
+        struct cbuf dbuffer;
+        struct cbuf *dbufp = &dbuffer;
+
+        /*
+         * The string area length below is an unsigned subtraction; a
+         * destination smaller than the structure would make it wrap and
+         * let the decoder write outside @stat.
+         */
+        if (statlen < sizeof(struct v9fs_stat))
+                return 0;
+
+        buf_init(bufp, buf, buflen);
+        buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat),
+                 statlen - sizeof(struct v9fs_stat));
+
+        return deserialize_stat(v9ses, bufp, stat, dbufp);
+}
+/*
+ * v9fs_size_fcall - compute the encoded size of a T-message
+ * @v9ses: session information (needed to size a Twstat stat record)
+ * @fcall: request to measure
+ *
+ * Returns the full packet size including the size[4] id[1] tag[2]
+ * header, or 0 (after logging) for a message type it does not know.
+ * For TWSTAT the size member of the stat record is filled in as a side
+ * effect.
+ */
+static inline int
+v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall)
+{
+        int len = 4 + 1 + 2;    /* size[4] msg[1] tag[2] */
+        int n = 0;
+        struct v9fs_stat *st;
+
+        switch (fcall->id) {
+        case TVERSION:          /* msize[4] version[s] */
+                len += 4 + 2 + strlen(fcall->params.tversion.version);
+                break;
+
+        case TAUTH:             /* afid[4] uname[s] aname[s] */
+                len += 4 + 2 + strlen(fcall->params.tauth.uname) +
+                    2 + strlen(fcall->params.tauth.aname);
+                break;
+
+        case TFLUSH:            /* oldtag[2] */
+                len += 2;
+                break;
+
+        case TATTACH:           /* fid[4] afid[4] uname[s] aname[s] */
+                len += 4 + 4 + 2 + strlen(fcall->params.tattach.uname) +
+                    2 + strlen(fcall->params.tattach.aname);
+                break;
+
+        case TWALK:             /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
+                len += 4 + 4 + 2;
+                /* every name costs its length prefix plus its characters */
+                while (n < fcall->params.twalk.nwname) {
+                        len += 2 + strlen(fcall->params.twalk.wnames[n]);
+                        n++;
+                }
+                break;
+
+        case TOPEN:             /* fid[4] mode[1] */
+                len += 4 + 1;
+                break;
+
+        case TCREATE:           /* fid[4] name[s] perm[4] mode[1] */
+                len += 4 + 2 + strlen(fcall->params.tcreate.name) + 4 + 1;
+                break;
+
+        case TREAD:             /* fid[4] offset[8] count[4] */
+                len += 4 + 8 + 4;
+                break;
+
+        case TWRITE:            /* fid[4] offset[8] count[4] data[count] */
+                len += 4 + 8 + 4 + fcall->params.twrite.count;
+                break;
+
+        case TCLUNK:            /* fid[4] */
+        case TREMOVE:           /* fid[4] */
+        case TSTAT:             /* fid[4] */
+                len += 4;
+                break;
+
+        case TWSTAT:            /* fid[4] stat[n] */
+                st = fcall->params.twstat.stat;
+                st->size = v9fs_size_stat(v9ses, st);
+                /* fid[4] + n[2] + the record's own size[2] + its body */
+                len += 4 + 2 + 2 + st->size;
+                break;
+
+        default:
+                eprintk(KERN_ERR, "bad msg type %d\n", fcall->id);
+                return 0;
+        }
+
+        return len;
+}
+/**
+ * v9fs_serialize_fcall - marshal an fcall structure into a wire packet
+ * @v9ses: session information
+ * @fcall: request structure to convert
+ * @data: buffer to serialize fcall into
+ * @datalen: length of buffer to serialize fcall into
+ *
+ * The packet size computed by v9fs_size_fcall() is stored in @fcall->size
+ * and written as the first header field, followed by the id, the tag and
+ * the fields specific to the message type.
+ *
+ * Returns the packet size on success, -EINVAL when @fcall is NULL, -EPROTO
+ * for an unhandled message id and -EIO when the buffer reports an overflow.
+ */
+int
+v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall,
+                     void *data, u32 datalen)
+{
+        struct cbuf pkt;
+        struct cbuf *out = &pkt;
+        struct v9fs_stat *wstat = NULL;
+        int n = 0;
+        buf_init(out, data, datalen);
+        if (fcall == NULL) {
+                eprintk(KERN_ERR, "no fcall\n");
+                return -EINVAL;
+        }
+        /* common header: size, id, tag */
+        fcall->size = v9fs_size_fcall(v9ses, fcall);
+        buf_put_int32(out, fcall->size);
+        buf_put_int8(out, fcall->id);
+        buf_put_int16(out, fcall->tag);
+        dprintk(DEBUG_CONV, "size %d id %d tag %d\n", fcall->size, fcall->id,
+                fcall->tag);
+        /* message specific payload */
+        switch (fcall->id) {
+        case TVERSION:
+                buf_put_int32(out, fcall->params.tversion.msize);
+                buf_put_string(out, fcall->params.tversion.version);
+                break;
+        case TAUTH:
+                buf_put_int32(out, fcall->params.tauth.afid);
+                buf_put_string(out, fcall->params.tauth.uname);
+                buf_put_string(out, fcall->params.tauth.aname);
+                break;
+        case TFLUSH:
+                buf_put_int16(out, fcall->params.tflush.oldtag);
+                break;
+        case TATTACH:
+                buf_put_int32(out, fcall->params.tattach.fid);
+                buf_put_int32(out, fcall->params.tattach.afid);
+                buf_put_string(out, fcall->params.tattach.uname);
+                buf_put_string(out, fcall->params.tattach.aname);
+                break;
+        case TWALK:
+                buf_put_int32(out, fcall->params.twalk.fid);
+                buf_put_int32(out, fcall->params.twalk.newfid);
+                buf_put_int16(out, fcall->params.twalk.nwname);
+                /* one string per path element */
+                while (n < fcall->params.twalk.nwname) {
+                        buf_put_string(out, fcall->params.twalk.wnames[n]);
+                        n++;
+                }
+                break;
+        case TOPEN:
+                buf_put_int32(out, fcall->params.topen.fid);
+                buf_put_int8(out, fcall->params.topen.mode);
+                break;
+        case TCREATE:
+                buf_put_int32(out, fcall->params.tcreate.fid);
+                buf_put_string(out, fcall->params.tcreate.name);
+                buf_put_int32(out, fcall->params.tcreate.perm);
+                buf_put_int8(out, fcall->params.tcreate.mode);
+                break;
+        case TREAD:
+                buf_put_int32(out, fcall->params.tread.fid);
+                buf_put_int64(out, fcall->params.tread.offset);
+                buf_put_int32(out, fcall->params.tread.count);
+                break;
+        case TWRITE:
+                buf_put_int32(out, fcall->params.twrite.fid);
+                buf_put_int64(out, fcall->params.twrite.offset);
+                buf_put_int32(out, fcall->params.twrite.count);
+                buf_put_data(out, fcall->params.twrite.data,
+                             fcall->params.twrite.count);
+                break;
+        case TCLUNK:
+                buf_put_int32(out, fcall->params.tclunk.fid);
+                break;
+        case TREMOVE:
+                buf_put_int32(out, fcall->params.tremove.fid);
+                break;
+        case TSTAT:
+                buf_put_int32(out, fcall->params.tstat.fid);
+                break;
+        case TWSTAT:
+                wstat = fcall->params.twstat.stat;
+                buf_put_int32(out, fcall->params.twstat.fid);
+                /* outer count covers the stat plus two more bytes */
+                buf_put_int16(out, wstat->size + 2);
+                serialize_stat(v9ses, wstat, out);
+                break;
+        default:
+                eprintk(KERN_ERR, "bad msg type: %d\n", fcall->id);
+                return -EPROTO;
+        }
+        if (buf_check_overflow(out))
+                return -EIO;
+        return fcall->size;
+}
+/**
+ * v9fs_deserialize_fcall - unmarshal a response
+ * @v9ses: session information
+ * @msgsize: size of rcall message
+ * @buf: received buffer
+ * @buflen: length of received buffer
+ * @rcall: fcall structure to populate
+ * @rcalllen: length of fcall structure to populate
+ *
+ * Decodes the id, the tag and the message specific fields from @buf into
+ * @rcall.  The memory that follows the fcall structure itself (up to
+ * @rcalllen bytes in total) is used as scratch space for variable length
+ * results such as strings, read data, stat structures and walk qids.
+ *
+ * Returns @msgsize on success, -EPROTO for an unknown message id and -EIO
+ * when either the receive buffer or the scratch space overflowed.
+ *
+ * NOTE(review): @rcalllen is not checked against sizeof(struct v9fs_fcall)
+ * before the subtraction below; callers presumably guarantee it is large
+ * enough -- confirm.
+ */
+int
+v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize,
+                       void *buf, u32 buflen, struct v9fs_fcall *rcall,
+                       int rcalllen)
+{
+        struct cbuf buffer;
+        struct cbuf *bufp = &buffer;
+        struct cbuf dbuffer;
+        struct cbuf *dbufp = &dbuffer;
+        int i = 0;
+        buf_init(bufp, buf, buflen);
+        /* scratch area starts right behind the fcall structure */
+        buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall),
+                 rcalllen - sizeof(struct v9fs_fcall));
+        rcall->size = msgsize;
+        rcall->id = buf_get_int8(bufp);
+        rcall->tag = buf_get_int16(bufp);
+        dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id,
+                rcall->tag);
+        switch (rcall->id) {
+        default:
+                eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id);
+                return -EPROTO;
+        case RVERSION:
+                rcall->params.rversion.msize = buf_get_int32(bufp);
+                rcall->params.rversion.version = buf_get_stringb(bufp, dbufp);
+                break;
+        case RFLUSH:
+                break;
+        case RATTACH:
+                rcall->params.rattach.qid.type = buf_get_int8(bufp);
+                rcall->params.rattach.qid.version = buf_get_int32(bufp);
+                rcall->params.rattach.qid.path = buf_get_int64(bufp);
+                break;
+        case RWALK:
+                rcall->params.rwalk.nwqid = buf_get_int16(bufp);
+                /*
+                 * The qid array is a decoded result, so it has to come out
+                 * of the scratch area like strings and read data do;
+                 * taking it from bufp would consume the very bytes that
+                 * are about to be parsed.
+                 */
+                rcall->params.rwalk.wqids = buf_alloc(dbufp,
+                      rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid));
+                if (rcall->params.rwalk.wqids)
+                        for (i = 0; i < rcall->params.rwalk.nwqid; i++) {
+                                rcall->params.rwalk.wqids[i].type =
+                                    buf_get_int8(bufp);
+                                /* qid version is 32 bits, as in RATTACH */
+                                rcall->params.rwalk.wqids[i].version =
+                                    buf_get_int32(bufp);
+                                rcall->params.rwalk.wqids[i].path =
+                                    buf_get_int64(bufp);
+                        }
+                break;
+        case ROPEN:
+                rcall->params.ropen.qid.type = buf_get_int8(bufp);
+                rcall->params.ropen.qid.version = buf_get_int32(bufp);
+                rcall->params.ropen.qid.path = buf_get_int64(bufp);
+                rcall->params.ropen.iounit = buf_get_int32(bufp);
+                break;
+        case RCREATE:
+                rcall->params.rcreate.qid.type = buf_get_int8(bufp);
+                rcall->params.rcreate.qid.version = buf_get_int32(bufp);
+                rcall->params.rcreate.qid.path = buf_get_int64(bufp);
+                rcall->params.rcreate.iounit = buf_get_int32(bufp);
+                break;
+        case RREAD:
+                rcall->params.rread.count = buf_get_int32(bufp);
+                rcall->params.rread.data = buf_get_datab(bufp, dbufp,
+                        rcall->params.rread.count);
+                break;
+        case RWRITE:
+                rcall->params.rwrite.count = buf_get_int32(bufp);
+                break;
+        case RCLUNK:
+                break;
+        case RREMOVE:
+                break;
+        case RSTAT:
+                /* skip the outer 16 bit count that precedes the stat */
+                buf_get_int16(bufp);
+                rcall->params.rstat.stat =
+                    deserialize_statb(v9ses, bufp, dbufp);
+                break;
+        case RWSTAT:
+                break;
+        case RERROR:
+                rcall->params.rerror.error = buf_get_stringb(bufp, dbufp);
+                /* the numeric error is only read for extended sessions */
+                if (v9ses->extended)
+                        rcall->params.rerror.errno = buf_get_int16(bufp);
+                break;
+        }
+        if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
+                return -EIO;
+        return rcall->size;
+}
diff --git a/fs/9p/conv.h b/fs/9p/conv.h
new file mode 100644
index 000000000000..ee849613c61a
--- /dev/null
+++ b/fs/9p/conv.h
@@ -0,0 +1,36 @@
+/*
+ * linux/fs/9p/conv.h
+ *
+ * 9P protocol conversion definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+/*
+ * Conversion entry points between fcall/stat structures and their packet
+ * representation.
+ */
+/* unmarshal a stat from @buf into @stat (definition not in this header) */
+int v9fs_deserialize_stat(struct v9fs_session_info *, void *buf,
+                          u32 buflen, struct v9fs_stat *stat, u32 statlen);
+/* marshal @tcall into @buf; returns the packet size or a negative errno */
+int v9fs_serialize_fcall(struct v9fs_session_info *, struct v9fs_fcall *tcall,
+                         void *buf, u32 buflen);
+/* unmarshal a response from @buf into @rcall; returns @msglen or a negative errno */
+int v9fs_deserialize_fcall(struct v9fs_session_info *, u32 msglen,
+                           void *buf, u32 buflen, struct v9fs_fcall *rcall,
+                           int rcalllen);
+/* this one is actually in error.c right now */
+int v9fs_errstr2errno(char *errstr);
diff --git a/fs/9p/debug.h b/fs/9p/debug.h
new file mode 100644
index 000000000000..4445f06919d9
--- /dev/null
+++ b/fs/9p/debug.h
@@ -0,0 +1,70 @@
+/*
+ *  linux/fs/9p/debug.h - V9FS Debug Definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+/*
+ * Debug categories.  Each one is a single bit that dprintk() tests against
+ * v9fs_debug_level.
+ */
+#define DEBUG_ERROR             (1<<0)
+#define DEBUG_CURRENT           (1<<1)
+#define DEBUG_9P                (1<<2)
+#define DEBUG_VFS               (1<<3)
+#define DEBUG_CONV              (1<<4)
+#define DEBUG_MUX               (1<<5)
+#define DEBUG_TRANS             (1<<6)
+#define DEBUG_SLABS             (1<<7)
+/* compile time switch: non-zero builds the real dump_data() below */
+#define DEBUG_DUMP_PKT          0
+/* bit mask of the enabled DEBUG_* categories */
+extern int v9fs_debug_level;
+/*
+ * dprintk - print a debug message when all bits of @level are enabled
+ *
+ * The message is prefixed with the calling function name and the pid of
+ * the current task.  @level is parenthesized so that compound expressions
+ * such as (DEBUG_9P | DEBUG_VFS) are tested as a whole.
+ */
+#define dprintk(level, format, arg...) \
+do {  \
+        if ((v9fs_debug_level & (level)) == (level)) \
+                printk(KERN_NOTICE "-- %s (%d): " \
+                format , __FUNCTION__, current->pid , ## arg); \
+} while (0)
+/*
+ * eprintk - unconditionally print a message at printk log level @level
+ *
+ * @level is pasted in front of the format string, so it has to be one of
+ * the KERN_* string literals.  The message is prefixed with "v9fs: ", the
+ * calling function name and the pid of the current task.
+ */
+#define eprintk(level, format, arg...) \
+do { \
+        printk(level "v9fs: %s (%d): " \
+                format , __FUNCTION__, current->pid , ## arg); \
+} while(0)
+#if DEBUG_DUMP_PKT
+/*
+ * dump_data - hex dump a packet to the kernel log
+ * @data: bytes to print
+ * @datalen: number of bytes in @data
+ *
+ * Bytes are printed as two hex digits each, in groups of up to four
+ * separated by a space.
+ */
+static inline void dump_data(const unsigned char *data, unsigned int datalen)
+{
+        int total = datalen;
+        int base = 0;
+        printk(KERN_DEBUG "data ");
+        while (base < total) {
+                int k;
+                /* one group: at most four bytes, fewer at the tail */
+                for (k = base; (k - base < 4) && (k < total); k++)
+                        printk(KERN_DEBUG "%02x", data[k]);
+                printk(KERN_DEBUG " ");
+                base += 4;
+        }
+        printk(KERN_DEBUG "\n");
+}
+#else                           /* DEBUG_DUMP_PKT */
+/* packet dumping compiled out: keep call sites valid with an empty stub */
+static inline void dump_data(const unsigned char *data, unsigned int datalen)
+{
+}
+#endif                          /* DEBUG_DUMP_PKT */
diff --git a/fs/9p/error.c b/fs/9p/error.c
new file mode 100644
index 000000000000..fee5d19179c5
--- /dev/null
+++ b/fs/9p/error.c
@@ -0,0 +1,93 @@
+/*
+ * linux/fs/9p/error.c
+ *
+ * Error string handling
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  These functions
+ * try to help manage that and provide for dynamically adding error
+ * mappings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include "debug.h"
+#include "error.h"
+/**
+ * v9fs_error_init - preload the error string hash table
+ *
+ * Empties every bucket of hash_errmap and then inserts each entry of the
+ * static errmap table, up to its NULL-name terminator, into the bucket
+ * selected by the jhash of the entry's name.
+ *
+ * Always returns 1.
+ */
+int v9fs_error_init(void)
+{
+        struct errormap *entry = errmap;
+        int slot;
+        /* start from empty buckets */
+        for (slot = 0; slot < ERRHASHSZ; slot++)
+                INIT_HLIST_HEAD(&hash_errmap[slot]);
+        /* hash every predefined mapping by its name */
+        while (entry->name != NULL) {
+                slot = jhash(entry->name, strlen(entry->name), 0) % ERRHASHSZ;
+                INIT_HLIST_NODE(&entry->list);
+                hlist_add_head(&entry->list, &hash_errmap[slot]);
+                entry++;
+        }
+        return 1;
+}
+/**
+ * v9fs_errstr2errno - convert error string to error number
+ * @errstr: NUL-terminated error string
+ *
+ * Looks @errstr up in the hash table filled by v9fs_error_init().
+ *
+ * Returns the negated value mapped to the string.  Strings that the table
+ * maps to 0 ("not errors") therefore yield 0.  A string that is not in the
+ * table is logged and reported as -1.
+ */
+int v9fs_errstr2errno(char *errstr)
+{
+        int err = 0;
+        int found = 0;
+        struct hlist_node *p = NULL;
+        struct errormap *c = NULL;
+        int bucket = jhash(errstr, strlen(errstr), 0) % ERRHASHSZ;
+        hlist_for_each_entry(c, p, &hash_errmap[bucket], list) {
+                if (!strcmp(c->name, errstr)) {
+                        err = c->val;
+                        found = 1;
+                        break;
+                }
+        }
+        /*
+         * Track the match separately from the value: a mapped value of 0
+         * is a legitimate result and must not be mistaken for a miss.
+         */
+        if (!found) {
+                /* TODO: if error isn't found, add it dynamically */
+                printk(KERN_ERR "%s: errstr :%s: not found\n", __FUNCTION__,
+                       errstr);
+                err = 1;
+        }
+        return -err;
+}
diff --git a/fs/9p/error.h b/fs/9p/error.h
new file mode 100644
index 000000000000..78f89acf7c9a
--- /dev/null
+++ b/fs/9p/error.h
@@ -0,0 +1,178 @@
+/*
+ * linux/fs/9p/error.h
+ *
+ * Huge Nasty Error Table
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  This table tries to
+ * match UNIX strings and Plan 9 strings to unix error numbers.  It is used
+ * to preload the dynamic error table which can also track user-specific error
+ * strings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/errno.h>
+#include <asm/errno.h>
+/* one mapping from an error string to a Unix error number */
+struct errormap {
+        char *name;             /* error string; NULL terminates errmap[] */
+        int val;                /* positive errno value, 0 for "not an error" */
+        struct hlist_node list; /* linkage into a hash_errmap bucket */
+};
+/* number of buckets in hash_errmap */
+#define ERRHASHSZ               32
+/*
+ * Hash table of errormap entries, keyed by the jhash of the error string.
+ * NOTE(review): defined static in a header, so every file that includes
+ * this header gets its own private copy.
+ */
+static struct hlist_head hash_errmap[ERRHASHSZ];
+/*
+ * Initial contents of the error hash table.  The first group uses Unix
+ * style messages, the second group the strings listed as coming from
+ * fossil, vacfs and u9fs.  The table ends with a NULL name entry.
+ */
+/* FixMe - reduce to a reasonable size */
+static struct errormap errmap[] = {
+        {"Operation not permitted", EPERM},
+        {"wstat prohibited", EPERM},
+        {"No such file or directory", ENOENT},
+        {"directory entry not found", ENOENT},
+        {"file not found", ENOENT},
+        {"Interrupted system call", EINTR},
+        {"Input/output error", EIO},
+        {"No such device or address", ENXIO},
+        {"Argument list too long", E2BIG},
+        {"Bad file descriptor", EBADF},
+        {"Resource temporarily unavailable", EAGAIN},
+        {"Cannot allocate memory", ENOMEM},
+        {"Permission denied", EACCES},
+        {"Bad address", EFAULT},
+        {"Block device required", ENOTBLK},
+        {"Device or resource busy", EBUSY},
+        {"File exists", EEXIST},
+        {"Invalid cross-device link", EXDEV},
+        {"No such device", ENODEV},
+        {"Not a directory", ENOTDIR},
+        {"Is a directory", EISDIR},
+        {"Invalid argument", EINVAL},
+        {"Too many open files in system", ENFILE},
+        {"Too many open files", EMFILE},
+        {"Text file busy", ETXTBSY},
+        {"File too large", EFBIG},
+        {"No space left on device", ENOSPC},
+        {"Illegal seek", ESPIPE},
+        {"Read-only file system", EROFS},
+        {"Too many links", EMLINK},
+        {"Broken pipe", EPIPE},
+        {"Numerical argument out of domain", EDOM},
+        {"Numerical result out of range", ERANGE},
+        {"Resource deadlock avoided", EDEADLK},
+        {"File name too long", ENAMETOOLONG},
+        {"No locks available", ENOLCK},
+        {"Function not implemented", ENOSYS},
+        {"Directory not empty", ENOTEMPTY},
+        {"Too many levels of symbolic links", ELOOP},
+        {"No message of desired type", ENOMSG},
+        {"Identifier removed", EIDRM},
+        {"No data available", ENODATA},
+        {"Machine is not on the network", ENONET},
+        {"Package not installed", ENOPKG},
+        {"Object is remote", EREMOTE},
+        {"Link has been severed", ENOLINK},
+        {"Communication error on send", ECOMM},
+        {"Protocol error", EPROTO},
+        {"Bad message", EBADMSG},
+        {"File descriptor in bad state", EBADFD},
+        {"Streams pipe error", ESTRPIPE},
+        {"Too many users", EUSERS},
+        {"Socket operation on non-socket", ENOTSOCK},
+        {"Message too long", EMSGSIZE},
+        {"Protocol not available", ENOPROTOOPT},
+        {"Protocol not supported", EPROTONOSUPPORT},
+        {"Socket type not supported", ESOCKTNOSUPPORT},
+        {"Operation not supported", EOPNOTSUPP},
+        {"Protocol family not supported", EPFNOSUPPORT},
+        {"Network is down", ENETDOWN},
+        {"Network is unreachable", ENETUNREACH},
+        {"Network dropped connection on reset", ENETRESET},
+        {"Software caused connection abort", ECONNABORTED},
+        {"Connection reset by peer", ECONNRESET},
+        {"No buffer space available", ENOBUFS},
+        {"Transport endpoint is already connected", EISCONN},
+        {"Transport endpoint is not connected", ENOTCONN},
+        {"Cannot send after transport endpoint shutdown", ESHUTDOWN},
+        {"Connection timed out", ETIMEDOUT},
+        {"Connection refused", ECONNREFUSED},
+        {"Host is down", EHOSTDOWN},
+        {"No route to host", EHOSTUNREACH},
+        {"Operation already in progress", EALREADY},
+        {"Operation now in progress", EINPROGRESS},
+        {"Is a named type file", EISNAM},
+        {"Remote I/O error", EREMOTEIO},
+        {"Disk quota exceeded", EDQUOT},
+/* errors from fossil, vacfs, and u9fs */
+        {"fid unknown or out of range", EBADF},
+        {"permission denied", EACCES},
+        {"file does not exist", ENOENT},
+        {"authentication failed", ECONNREFUSED},
+        {"bad offset in directory read", ESPIPE},
+        {"bad use of fid", EBADF},
+        {"wstat can't convert between files and directories", EPERM},
+        {"directory is not empty", ENOTEMPTY},
+        {"file exists", EEXIST},
+        {"file already exists", EEXIST},
+        {"file or directory already exists", EEXIST},
+        {"fid already in use", EBADF},
+        {"file in use", ETXTBSY},
+        {"i/o error", EIO},
+        {"file already open for I/O", ETXTBSY},
+        {"illegal mode", EINVAL},
+        {"illegal name", ENAMETOOLONG},
+        {"not a directory", ENOTDIR},
+        {"not a member of proposed group", EPERM},
+        {"not owner", EACCES},
+        {"only owner can change group in wstat", EACCES},
+        {"read only file system", EROFS},
+        {"no access to special file", EPERM},
+        {"i/o count too large", EIO},
+        {"unknown group", EINVAL},
+        {"unknown user", EINVAL},
+        {"bogus wstat buffer", EPROTO},
+        {"exclusive use file already open", EAGAIN},
+        {"corrupted directory entry", EIO},
+        {"corrupted file entry", EIO},
+        {"corrupted block label", EIO},
+        {"corrupted meta data", EIO},
+        {"illegal offset", EINVAL},
+        {"illegal path element", ENOENT},
+        {"root of file system is corrupted", EIO},
+        {"corrupted super block", EIO},
+        {"protocol botch", EPROTO},
+        {"file system is full", ENOSPC},
+        {"file is in use", EAGAIN},
+        {"directory entry is not allocated", ENOENT},
+        {"file is read only", EROFS},
+        {"file has been removed", EIDRM},
+        {"only support truncation to zero length", EPERM},
+        {"cannot remove root", EPERM},
+        {"file too big", EFBIG},
+        {"venti i/o error", EIO},
+        /* these are not errors */
+        {"u9fs rhostsauth: no authentication required", 0},
+        {"u9fs authnone: no authentication required", 0},
+        /* terminator: v9fs_error_init() stops at the first NULL name */
+        {NULL, -1}
+};
+/* fill the hash table from errmap[]; has to run before any lookup */
+extern int v9fs_error_init(void);
+/* map an error string to a negative errno value */
+extern int v9fs_errstr2errno(char *errstr);
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
new file mode 100644
index 000000000000..821c9c4d76aa
--- /dev/null
+++ b/fs/9p/fid.c
@@ -0,0 +1,241 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "transport.h"
+#include "mux.h"
+#include "conv.h"
+#include "fid.h"
+/**
+ * v9fs_fid_insert - add a fid to a dentry
+ * @fid: fid to add
+ * @dentry: dentry that it is being added to
+ *
+ * The per-dentry fid list hangs off @dentry->d_fsdata and is allocated on
+ * first use.  The fid is stamped with the uid and pid of the current task
+ * before it is linked in.
+ *
+ * Returns 0 on success or -ENOMEM when the list head cannot be allocated.
+ */
+static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
+{
+        struct list_head *head = dentry->d_fsdata;
+        dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid,
+                dentry->d_iname, dentry);
+        if (!head) {
+                /* first fid for this dentry: create the list head */
+                head = kmalloc(sizeof(struct list_head), GFP_KERNEL);
+                dentry->d_fsdata = head;
+                if (!head) {
+                        dprintk(DEBUG_ERROR, "Out of memory\n");
+                        return -ENOMEM;
+                }
+                INIT_LIST_HEAD(head);
+        }
+        /* record the owner so that lookups can match on it */
+        fid->pid = current->pid;
+        fid->uid = current->uid;
+        list_add(&fid->list, head);
+        return 0;
+}
+/**
+ * v9fs_fid_create - allocate a FID structure
+ * @dentry: dentry to link newly created fid to
+ *
+ * The new fid starts with fid number -1 and cleared open/create/clunked
+ * flags and iounit; the remaining fields are left uninitialized.
+ *
+ * Returns the new fid, ERR_PTR(-ENOMEM) when the allocation fails, or NULL
+ * when the fid could not be attached to @dentry.  Callers have to handle
+ * both failure forms.
+ */
+struct v9fs_fid *v9fs_fid_create(struct dentry *dentry)
+{
+        struct v9fs_fid *fid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
+        if (!fid) {
+                dprintk(DEBUG_ERROR, "Out of Memory\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        fid->fid = -1;
+        fid->iounit = 0;
+        fid->fidopen = 0;
+        fid->fidcreate = 0;
+        fid->fidclunked = 0;
+        if (v9fs_fid_insert(fid, dentry) != 0) {
+                dprintk(DEBUG_ERROR, "Problems inserting to dentry\n");
+                kfree(fid);
+                return NULL;
+        }
+        return fid;
+}
+/**
+ * v9fs_fid_destroy - deallocate a FID structure
+ * @fid: fid to destroy
+ *
+ * Unlinks @fid from the fid list it is on and frees it.  The list head
+ * stored in the dentry is not touched here.
+ */
+void v9fs_fid_destroy(struct v9fs_fid *fid)
+{
+        list_del(&fid->list);
+        kfree(fid);
+}
+/**
+ * v9fs_fid_lookup - retrieve the right fid from a particular dentry
+ * @dentry: dentry to look for fid in
+ * @type: intent of lookup (FID_OP for an operation, FID_WALK for traversal)
+ *
+ * Search the list of fids associated with a dentry.  A fid created by the
+ * current thread is preferred, then one created by its parent process,
+ * then one owned by the same uid.  Unless @type is FID_OP, fids that are
+ * already open are skipped.  If nothing matches and @dentry is the root,
+ * any fid on the root's list is used.  Otherwise, if @dentry is an
+ * ancestor of the current working directory, a new fid is produced by
+ * walking ".." from the cwd's fid.
+ *
+ * Returns the fid, or NULL if none could be found or created.
+ */
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type)
+{
+        struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
+        struct v9fs_fid *current_fid = NULL;
+        struct v9fs_fid *temp = NULL;
+        struct v9fs_fid *return_fid = NULL;
+        struct v9fs_fid *any_fid = NULL;
+        int found_parent = 0;
+        int found_user = 0;
+        dprintk(DEBUG_9P, " dentry: %s (%p) type %d\n", dentry->d_iname, dentry,
+                type);
+        if (fid_list && !list_empty(fid_list)) {
+                list_for_each_entry_safe(current_fid, temp, fid_list, list) {
+                        /*
+                         * Remember a real entry for the root fallback.
+                         * The loop cursor itself is not usable afterwards:
+                         * once the loop ends it refers to the list head.
+                         */
+                        any_fid = current_fid;
+                        /* weakest match: same user */
+                        if (current_fid->uid == current->uid) {
+                                if (return_fid == NULL) {
+                                        if ((type == FID_OP)
+                                            || (!current_fid->fidopen)) {
+                                                return_fid = current_fid;
+                                                found_user = 1;
+                                        }
+                                }
+                        }
+                        /* better match: fid of the parent process */
+                        if (current_fid->pid == current->real_parent->pid) {
+                                if ((return_fid == NULL) || (found_parent)
+                                    || (found_user)) {
+                                        if ((type == FID_OP)
+                                            || (!current_fid->fidopen)) {
+                                                return_fid = current_fid;
+                                                found_parent = 1;
+                                                found_user = 0;
+                                        }
+                                }
+                        }
+                        /* best match: fid of this very thread */
+                        if (current_fid->pid == current->pid) {
+                                if ((type == FID_OP) ||
+                                    (!current_fid->fidopen)) {
+                                        return_fid = current_fid;
+                                        found_parent = 0;
+                                        found_user = 0;
+                                }
+                        }
+                }
+        }
+        /* we are at the root but didn't match */
+        if ((!return_fid) && (dentry->d_parent == dentry)) {
+                /* TODO: clone attach with new uid */
+                return_fid = any_fid;
+        }
+        if (!return_fid) {
+                struct dentry *par = current->fs->pwd->d_parent;
+                /* is dentry an ancestor of the current working directory? */
+                while (par != NULL) {
+                        if (par == dentry)
+                                break;
+                        if (par == par->d_parent) {
+                                dprintk(DEBUG_ERROR,
+                                        "got to root without finding dentry\n");
+                                break;
+                        }
+                        par = par->d_parent;
+                }
+/* XXX - there may be some duplication we can get rid of */
+                if (par == dentry) {
+                        /* we need to fid_lookup the starting point */
+                        int fidnum = -1;
+                        int oldfid = -1;
+                        int result = -1;
+                        struct v9fs_session_info *v9ses =
+                            v9fs_inode2v9ses(current->fs->pwd->d_inode);
+                        current_fid =
+                            v9fs_fid_lookup(current->fs->pwd, FID_WALK);
+                        if (current_fid == NULL) {
+                                dprintk(DEBUG_ERROR,
+                                        "process cwd doesn't have a fid\n");
+                                return return_fid;
+                        }
+                        oldfid = current_fid->fid;
+                        par = current->fs->pwd;
+                        /* TODO: take advantage of multiwalk */
+                        fidnum = v9fs_get_idpool(&v9ses->fidpool);
+                        if (fidnum < 0) {
+                                dprintk(DEBUG_ERROR,
+                                        "could not get a new fid num\n");
+                                return return_fid;
+                        }
+                        /*
+                         * NOTE(review): fidnum is not handed back to the
+                         * pool on the failure paths below -- looks like a
+                         * leak, confirm against the idpool interface.
+                         */
+                        while (par != dentry) {
+                                result =
+                                    v9fs_t_walk(v9ses, oldfid, fidnum, "..",
+                                                NULL);
+                                if (result < 0) {
+                                        dprintk(DEBUG_ERROR,
+                                                "problem walking to parent\n");
+                                        break;
+                                }
+                                /* later steps walk the new fid in place */
+                                oldfid = fidnum;
+                                if (par == par->d_parent) {
+                                        dprintk(DEBUG_ERROR,
+                                                "can't find dentry\n");
+                                        break;
+                                }
+                                par = par->d_parent;
+                        }
+                        if (par == dentry) {
+                                return_fid = v9fs_fid_create(dentry);
+                                /* creation fails with NULL or an ERR_PTR */
+                                if (return_fid == NULL || IS_ERR(return_fid)) {
+                                        dprintk(DEBUG_ERROR,
+                                                "could not create fid\n");
+                                        return NULL;
+                                }
+                                return_fid->fid = fidnum;
+                        }
+                }
+        }
+        return return_fid;
+}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
new file mode 100644
index 000000000000..7db478ccca36
--- /dev/null
+++ b/fs/9p/fid.h
@@ -0,0 +1,57 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/list.h>
+/*
+ * Lookup kinds.  Presumably these are the values passed as the "type"
+ * argument of v9fs_fid_lookup() - confirm against its implementation.
+ */
+#define FID_OP   0
+#define FID_WALK 1
+/*
+ * struct v9fs_fid - bookkeeping kept for one 9P fid
+ *
+ * Groups the fid number with its open/create/clunk flags, the readdir
+ * progress, the pid/uid it is associated with, and back-pointers to the
+ * struct file and the session that own it.
+ */
+struct v9fs_fid {
+        struct list_head list;   /* list of fids associated with a dentry */
+        struct list_head active; /* XXX - debug */
+        u32 fid;                 /* the fid number itself */
+        unsigned char fidopen;    /* set when fid is opened */
+        unsigned char fidcreate;  /* set when fid was just created */
+        unsigned char fidclunked; /* set when fid has already been clunked */
+        struct v9fs_qid qid;
+        u32 iounit;
+        /* readdir stuff */
+        int rdir_fpos;
+        loff_t rdir_pos;
+        struct v9fs_fcall *rdir_fcall;
+        /* management stuff */
+        pid_t pid;              /* thread associated with this fid */
+        uid_t uid;              /* user associated with this fid */
+        /* private data */
+        struct file *filp;      /* backpointer to File struct for open files */
+        struct v9fs_session_info *v9ses;        /* session info for this FID */
+};
+/* Find a fid for @dentry; @type is presumably FID_OP or FID_WALK (confirm). */
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type);
+/* Dispose of a fid object (implementation not visible from this header). */
+void v9fs_fid_destroy(struct v9fs_fid *fid);
+/* Create a fid object for a dentry; the fid number is filled in by the caller. */
+struct v9fs_fid *v9fs_fid_create(struct dentry *);
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
new file mode 100644
index 000000000000..8835b576f744
--- /dev/null
+++ b/fs/9p/mux.c
@@ -0,0 +1,475 @@
+/*
+ * linux/fs/9p/mux.c
+ *
+ * Protocol Multiplexer
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kthread.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "transport.h"
+#include "conv.h"
+#include "mux.h"
+/**
+ * dprintcond - log the state a waiter is polling on
+ * @v9ses: session info structure
+ * @req: RPC request structure
+ *
+ * Prints the transport status and the request's reply pointer at
+ * DEBUG_MUX level.  Always returns 0, so it can be OR-ed into a wait
+ * condition without ever satisfying it.
+ */
+static inline int
+dprintcond(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+        dprintk(DEBUG_MUX, "condition: %d, %p\n",
+                v9ses->transport->status, req->rcall);
+        /* never true: this helper exists only for its logging side effect */
+        return 0;
+}
+/**
+ * xread - force read of a certain number of bytes
+ * @v9ses: session info structure
+ * @ptr: pointer to buffer
+ * @sz: number of bytes to read
+ *
+ * Calls the transport's read routine repeatedly until @sz bytes have been
+ * collected.  Returns the number of bytes read on success.  If any single
+ * read returns zero or a negative value, that value is returned as is,
+ * even when some bytes had already been received.
+ *
+ * Chuck Cranor CS-533 project1
+ */
+static int xread(struct v9fs_session_info *v9ses, void *ptr, unsigned long sz)
+{
+        int done;
+        int n = 0;
+        for (done = 0; done < sz; done += n, ptr += n) {
+                n = v9ses->transport->read(v9ses->transport, ptr, sz - done);
+                if (n <= 0) {
+                        dprintk(DEBUG_ERROR, "xread errno %d\n", n);
+                        return n;
+                }
+        }
+        return done;
+}
+/**
+ * read_message - read a full 9P2000 fcall packet
+ * @v9ses: session info structure
+ * @rcall: fcall structure to read into
+ * @rcalllen: size of fcall buffer
+ *
+ * Reads the 4-byte little-endian size field, then the rest of the packet,
+ * and passes the bytes to v9fs_deserialize_fcall().
+ *
+ * Returns 0 on success or a negative errno: the transport's own error,
+ * -EIO for a truncated or malformed packet, -E2BIG when the body exceeds
+ * v9ses->maxdata, -ENOMEM, or the deserializer's error.
+ */
+static int
+read_message(struct v9fs_session_info *v9ses,
+             struct v9fs_fcall *rcall, int rcalllen)
+{
+        unsigned char buf[4];
+        void *data;
+        int size = 0;
+        int res = 0;
+        res = xread(v9ses, buf, sizeof(buf));
+        if (res < 0) {
+                dprintk(DEBUG_ERROR,
+                        "Reading of count field failed returned: %d\n", res);
+                return res;
+        }
+        if (res < 4) {
+                dprintk(DEBUG_ERROR,
+                        "Reading of count field failed returned: %d\n", res);
+                return -EIO;
+        }
+        /* widen the top byte first so the shift cannot overflow an int */
+        size = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((u32) buf[3] << 24);
+        dprintk(DEBUG_MUX, "got a packet count: %d\n", size);
+        /*
+         * The count includes its own four bytes.  Anything smaller (or a
+         * value that wrapped negative) is malformed and must not reach the
+         * allocator.
+         */
+        if (size < 4) {
+                dprintk(DEBUG_ERROR, "packet too small: %d\n", size);
+                return -EIO;
+        }
+        /* adjust for the four bytes of size */
+        size -= 4;
+        if (size > v9ses->maxdata) {
+                dprintk(DEBUG_ERROR, "packet too big: %d\n", size);
+                return -E2BIG;
+        }
+        data = kmalloc(size, GFP_KERNEL);
+        if (!data) {
+                eprintk(KERN_WARNING, "out of memory\n");
+                return -ENOMEM;
+        }
+        res = xread(v9ses, data, size);
+        if (res < size) {
+                dprintk(DEBUG_ERROR, "Reading of fcall failed returned: %d\n",
+                        res);
+                kfree(data);
+                /* xread() yields 0 at end of stream; that is not success */
+                return res < 0 ? res : -EIO;
+        }
+        /* we now have an in-memory string that is the reply.
+         * deserialize it. There is very little to go wrong at this point
+         * save for v9fs_alloc errors.
+         */
+        res = v9fs_deserialize_fcall(v9ses, size, data, v9ses->maxdata,
+                                     rcall, rcalllen);
+        kfree(data);
+        if (res < 0)
+                return res;
+        return 0;
+}
+/**
+ * v9fs_recv - receive an RPC response for a particular tag
+ * @v9ses: session info structure
+ * @req: RPC request structure
+ *
+ * Sleeps interruptibly on the session's read_wait queue until the
+ * transport is no longer Connected, a reply has been attached to @req, or
+ * an error has been recorded in @req.  The request is unlinked from the
+ * session's pending list in every case.
+ *
+ * Returns the recorded request error if there is one, -ECONNRESET if the
+ * transport is Disconnected, and otherwise whatever
+ * wait_event_interruptible() returned.
+ */
+static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+        int waited;
+        dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag);
+        /* dprintcond() is always false; it only logs each failed poll */
+        waited = wait_event_interruptible(v9ses->read_wait,
+                        v9ses->transport->status != Connected ||
+                        req->rcall != NULL ||
+                        req->err < 0 ||
+                        dprintcond(v9ses, req));
+        dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall);
+        spin_lock(&v9ses->muxlock);
+        list_del(&req->next);
+        spin_unlock(&v9ses->muxlock);
+        if (req->err < 0)
+                return req->err;
+        return (v9ses->transport->status == Disconnected) ? -ECONNRESET : waited;
+}
+/**
+ * v9fs_send - send a 9P request
+ * @v9ses: session info structure
+ * @req: RPC request to send
+ *
+ * Serializes req->tcall into a temporary buffer, links @req on the
+ * session's pending list and writes the buffer to the transport.  The
+ * request is queued before the write so that a reply can be matched as
+ * soon as it arrives.
+ *
+ * Returns 0 on success.  On failure the request is unlinked again and a
+ * negative errno is returned: -ENOMEM, the serializer's error, the
+ * transport's write error, or -EREMOTEIO for a short write.
+ */
+static int v9fs_send(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+        int ret = -1;
+        void *data = NULL;
+        struct v9fs_fcall *tcall = req->tcall;
+        data = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL);
+        if (!data)
+                return -ENOMEM;
+        tcall->size = 0;        /* enforce size recalculation */
+        ret =
+            v9fs_serialize_fcall(v9ses, tcall, data,
+                                 v9ses->maxdata + V9FS_IOHDRSZ);
+        if (ret < 0)
+                goto free_data;
+        spin_lock(&v9ses->muxlock);
+        list_add(&req->next, &v9ses->mux_fcalls);
+        spin_unlock(&v9ses->muxlock);
+        dprintk(DEBUG_MUX, "sending message: tag %d size %d\n", tcall->tag,
+                tcall->size);
+        ret = v9ses->transport->write(v9ses->transport, data, tcall->size);
+        if (ret != tcall->size) {
+                spin_lock(&v9ses->muxlock);
+                list_del(&req->next);
+                kfree(req->rcall);
+                /* do not leave a dangling reply pointer in the request */
+                req->rcall = NULL;
+                spin_unlock(&v9ses->muxlock);
+                if (ret >= 0)
+                        ret = -EREMOTEIO;
+        } else
+                ret = 0;
+      free_data:
+        kfree(data);
+        return ret;
+}
+/**
+ * v9fs_mux_rpc - send a request, receive a response
+ * @v9ses: session info structure
+ * @tcall: fcall to send
+ * @rcall: buffer to place response into
+ *
+ * Assigns a tag to @tcall, sends it and waits for the matching reply.
+ * If the wait is interrupted, a flush for the tag is issued (unless the
+ * transport is disconnected or the request was itself a TFLUSH).
+ *
+ * When @rcall is non-NULL it receives whatever reply was obtained, also
+ * on error paths taken after a successful send; otherwise the reply is
+ * freed here.  Returns 0 on success or a negative errno.
+ */
+long
+v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall,
+             struct v9fs_fcall **rcall)
+{
+        struct v9fs_rpcreq req;
+        struct v9fs_fcall *reply = NULL;
+        int tag = -1;
+        int err = -1;
+        if (!v9ses)
+                return -EINVAL;
+        if (!v9ses->transport || v9ses->transport->status != Connected)
+                return -EIO;
+        if (rcall)
+                *rcall = NULL;
+        /* TVERSION keeps tag -1; every other request takes one from the pool */
+        if (tcall->id != TVERSION) {
+                tag = v9fs_get_idpool(&v9ses->tidpool);
+                if (tag < 0)
+                        return -ENOMEM;
+        }
+        tcall->tag = tag;
+        req.rcall = NULL;
+        req.err = 0;
+        req.tcall = tcall;
+        err = v9fs_send(v9ses, &req);
+        if (err < 0) {
+                if (tcall->id != TVERSION)
+                        v9fs_put_idpool(tag, &v9ses->tidpool);
+                dprintk(DEBUG_MUX, "error %d\n", err);
+                return err;
+        }
+        err = v9fs_recv(v9ses, &req);
+        reply = req.rcall;
+        dprintk(DEBUG_MUX, "received: tag=%x, ret=%d\n", tcall->tag, err);
+        if (err < 0) {
+                if (err == -ERESTARTSYS
+                    && v9ses->transport->status != Disconnected
+                    && tcall->id != TFLUSH) {
+                        unsigned long flags;
+                        dprintk(DEBUG_MUX, "flushing the tag: %d\n",
+                                tcall->tag);
+                        /*
+                         * Hide the pending signal while the flush RPC runs,
+                         * then recompute the signal state afterwards.
+                         */
+                        clear_thread_flag(TIF_SIGPENDING);
+                        v9fs_t_flush(v9ses, tcall->tag);
+                        spin_lock_irqsave(&current->sighand->siglock, flags);
+                        recalc_sigpending();
+                        spin_unlock_irqrestore(&current->sighand->siglock,
+                                               flags);
+                        dprintk(DEBUG_MUX, "flushing done\n");
+                }
+                goto release_req;
+        }
+        if (!reply) {
+                err = -EIO;
+        } else if (reply->id == RERROR) {
+                err = v9fs_errstr2errno(reply->params.rerror.error);
+                if (err == 0) { /* string match failed */
+                        if (reply->params.rerror.errno)
+                                err = -(reply->params.rerror.errno);
+                        else
+                                err = -ESERVERFAULT;
+                }
+        } else if (reply->id != tcall->id + 1) {
+                /* a reply id is expected to be the request id plus one */
+                dprintk(DEBUG_ERROR,
+                        "fcall mismatch: expected %d, got %d\n",
+                        tcall->id + 1, reply->id);
+                err = -EIO;
+        }
+      release_req:
+        if (tcall->id != TVERSION)
+                v9fs_put_idpool(tag, &v9ses->tidpool);
+        if (rcall)
+                *rcall = reply;
+        else
+                kfree(reply);
+        return err;
+}
+/**
+ * v9fs_mux_cancel_requests - cancels all pending requests
+ *
+ * @v9ses: session info structure
+ * @err: error code to return to the requests
+ *
+ * Stores @err in every request currently on the session's pending list
+ * and wakes all waiters on read_wait.  Nothing is unlinked here.
+ */
+void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err)
+{
+        struct v9fs_rpcreq *pending;
+        dprintk(DEBUG_MUX, " %d\n", err);
+        spin_lock(&v9ses->muxlock);
+        list_for_each_entry(pending, &v9ses->mux_fcalls, next)
+                pending->err = err;
+        spin_unlock(&v9ses->muxlock);
+        wake_up_all(&v9ses->read_wait);
+}
+/**
+ * v9fs_recvproc - kproc to handle demultiplexing responses
+ * @data: session info structure
+ *
+ * Loops reading one message at a time from the transport and attaches it
+ * to the pending request with the same tag.  A read error is copied into
+ * every pending request and ends the loop.  On exit the transport is
+ * closed and all waiters are woken.
+ *
+ * Returns 1 if the loop ended without a read error, 0 otherwise.
+ */
+static int v9fs_recvproc(void *data)
+{
+        struct v9fs_session_info *v9ses = (struct v9fs_session_info *)data;
+        struct v9fs_fcall *rcall = NULL;
+        struct v9fs_rpcreq *rptr;
+        struct v9fs_rpcreq *req;
+        struct v9fs_rpcreq *rreq;
+        int err = 0;
+        allow_signal(SIGKILL);
+        set_current_state(TASK_INTERRUPTIBLE);
+        /* tell v9fs_mux_init() that the thread is up */
+        complete(&v9ses->proccmpl);
+        while (!kthread_should_stop() && err >= 0) {
+                req = rptr = rreq = NULL;
+                rcall = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL);
+                if (!rcall) {
+                        eprintk(KERN_ERR, "no memory for buffers\n");
+                        break;
+                }
+                err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ);
+                spin_lock(&v9ses->muxlock);
+                if (err < 0) {
+                        /* fail every outstanding request with the read error */
+                        list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+                                rreq->err = err;
+                        }
+                        if(err != -ERESTARTSYS)
+                                eprintk(KERN_ERR,
+                                        "Transport error while reading message %d\n", err);
+                } else {
+                        /* hand the reply to the request carrying the same tag */
+                        list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+                                if (rreq->tcall->tag == rcall->tag) {
+                                        req = rreq;
+                                        req->rcall = rcall;
+                                        break;
+                                }
+                        }
+                }
+                if (req && (req->tcall->id == TFLUSH)) {
+                        struct v9fs_rpcreq *treq = NULL;
+                        list_for_each_entry_safe(treq, rptr, &v9ses->mux_fcalls, next) {
+                                if (treq->tcall->tag ==
+                                    req->tcall->params.tflush.oldtag) {
+                                        /*
+                                         * Unlink the flushed request itself,
+                                         * not the iterator's look-ahead entry
+                                         * (which may be the list head).  The
+                                         * _init variant keeps the node safe
+                                         * for the list_del() its owner still
+                                         * performs in v9fs_recv().
+                                         */
+                                        list_del_init(&treq->next);
+                                        kfree(treq->rcall);
+                                        treq->rcall = NULL;
+                                        /*
+                                         * NOTE(review): the owner of treq is
+                                         * not given an error here - confirm
+                                         * whether it needs one to stop waiting.
+                                         */
+                                        break;
+                                }
+                        }
+                }
+                spin_unlock(&v9ses->muxlock);
+                if (!req) {
+                        /* nobody took ownership of the buffer */
+                        if (err >= 0)
+                                dprintk(DEBUG_ERROR,
+                                        "unexpected response: id %d tag %d\n",
+                                        rcall->id, rcall->tag);
+                        kfree(rcall);
+                }
+                wake_up_all(&v9ses->read_wait);
+                set_current_state(TASK_INTERRUPTIBLE);
+        }
+        v9ses->transport->close(v9ses->transport);
+        /* Inform all pending processes about the failure */
+        wake_up_all(&v9ses->read_wait);
+        if (signal_pending(current))
+                complete(&v9ses->proccmpl);
+        dprintk(DEBUG_MUX, "recvproc: end\n");
+        v9ses->recvproc = NULL;
+        return err >= 0;
+}
+/**
+ * v9fs_mux_init - initialize multiplexer (spawn kproc)
+ * @v9ses: session info structure
+ * @dev_name: mount device information (to create unique kproc)
+ *
+ * Initializes the session's wait queue, completions, lock and pending
+ * list, then starts the receive thread and waits until it signals that
+ * it is running.
+ *
+ * Returns 0 on success.  If the thread cannot be created the session is
+ * closed and -ECONNABORTED is returned.
+ */
+int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name)
+{
+        char procname[60];
+        struct task_struct *task;
+        /* strncpy() does not terminate on truncation, so do it by hand */
+        strncpy(procname, dev_name, sizeof(procname));
+        procname[sizeof(procname) - 1] = 0;
+        init_waitqueue_head(&v9ses->read_wait);
+        init_completion(&v9ses->fcread);
+        init_completion(&v9ses->proccmpl);
+        spin_lock_init(&v9ses->muxlock);
+        INIT_LIST_HEAD(&v9ses->mux_fcalls);
+        v9ses->recvproc = NULL;
+        v9ses->curfcall = NULL;
+        task = kthread_create(v9fs_recvproc, v9ses,
+                              "v9fs_recvproc %s", procname);
+        if (IS_ERR(task)) {
+                /*
+                 * recvproc stays NULL: an ERR_PTR must never be visible to
+                 * code that treats the field as NULL-or-valid.
+                 */
+                eprintk(KERN_ERR, "cannot create receiving thread\n");
+                v9fs_session_close(v9ses);
+                return -ECONNABORTED;
+        }
+        v9ses->recvproc = task;
+        wake_up_process(task);
+        wait_for_completion(&v9ses->proccmpl);
+        return 0;
+}
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
new file mode 100644
index 000000000000..4994cb10badf
--- /dev/null
+++ b/fs/9p/mux.h
@@ -0,0 +1,41 @@
+/*
+ * linux/fs/9p/mux.h
+ *
+ * Multiplexer Definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+/*
+ * structure to manage each RPC transaction
+ *
+ * v9fs_mux_rpc() keeps one of these on its stack per call.  While the
+ * request is outstanding it is linked on the session's mux_fcalls list;
+ * the receive thread stores the matching reply in rcall or a failure
+ * code in err.
+ */
+struct v9fs_rpcreq {
+        struct v9fs_fcall *tcall;       /* request being sent */
+        struct v9fs_fcall *rcall;       /* reply, NULL until one is matched */
+        int err;        /* error code if response failed */
+        /* XXX - could we put scatter/gather buffers here? */
+        struct list_head next;          /* link in v9ses->mux_fcalls */
+};
+/* Set up the multiplexer state of a session and start its receive thread. */
+int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name);
+/* Send tcall and wait for its reply; returns 0 or a negative errno. */
+long v9fs_mux_rpc(struct v9fs_session_info *v9ses,
+                  struct v9fs_fcall *tcall, struct v9fs_fcall **rcall);
+/* Mark every pending request with err and wake all waiters. */
+void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err);
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
new file mode 100644
index 000000000000..63b58ce98ff4
--- /dev/null
+++ b/fs/9p/trans_fd.c
@@ -0,0 +1,172 @@
+/*
+ * linux/fs/9p/trans_fd.c
+ *
+ * File Descriptor Transport Layer
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <asm/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "transport.h"
+/* Private state of the fd transport: one file to read from, one to write to. */
+struct v9fs_trans_fd {
+        struct file *in_file;   /* obtained with fget() from v9ses->rfdno */
+        struct file *out_file;  /* obtained with fget() from v9ses->wfdno */
+};
+/**
+ * v9fs_fd_recv - receive from the transport's input file
+ * @trans: transport to read from
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ * Returns the result of kernel_read() on the input file, or -EIO when the
+ * transport is missing, not Connected, or has no private state.
+ */
+static int v9fs_fd_recv(struct v9fs_transport *trans, void *v, int len)
+{
+        struct v9fs_trans_fd *ts;
+        if (!trans)
+                return -EIO;
+        ts = trans->priv;
+        if (trans->status != Connected || !ts)
+                return -EIO;
+        return kernel_read(ts->in_file, ts->in_file->f_pos, v, len);
+}
+/**
+ * v9fs_fd_send - send through the transport's output file
+ * @trans: transport to write to
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ * Returns the result of vfs_write() on the output file, or -EIO when the
+ * transport is missing, not Connected, or has no private state.
+ */
+static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len)
+{
+        struct v9fs_trans_fd *ts;
+        mm_segment_t saved_fs;
+        int written;
+        if (!trans)
+                return -EIO;
+        ts = trans->priv;
+        if (trans->status != Connected || !ts)
+                return -EIO;
+        saved_fs = get_fs();
+        set_fs(get_ds());
+        /* The cast to a user pointer is valid due to the set_fs() */
+        written = vfs_write(ts->out_file, (void __user *)v, len,
+                            &ts->out_file->f_pos);
+        set_fs(saved_fs);
+        return written;
+}
+/**
+ * v9fs_fd_init - initialize file descriptor transport
+ * @v9ses: session information
+ * @addr: address of server to mount (not used by this transport)
+ * @data: mount options (not used by this transport)
+ *
+ * Takes references on the files behind v9ses->rfdno and v9ses->wfdno and
+ * marks the transport Connected.
+ *
+ * Returns 0 on success, -ENOPROTOOPT when either descriptor option was
+ * left at ~0, -ENOMEM, or -EIO when a descriptor cannot be resolved.
+ */
+static int
+v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
+{
+        struct v9fs_transport *trans = v9ses->transport;
+        struct v9fs_trans_fd *ts;
+        if ((v9ses->rfdno == ~0) || (v9ses->wfdno == ~0)) {
+                printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n");
+                return -ENOPROTOOPT;
+        }
+        sema_init(&trans->writelock, 1);
+        sema_init(&trans->readlock, 1);
+        ts = kmalloc(sizeof(*ts), GFP_KERNEL);
+        if (!ts)
+                return -ENOMEM;
+        ts->in_file = fget(v9ses->rfdno);
+        ts->out_file = fget(v9ses->wfdno);
+        if (ts->in_file && ts->out_file) {
+                trans->priv = ts;
+                trans->status = Connected;
+                return 0;
+        }
+        /* at most one of the two references was obtained; drop it */
+        if (ts->in_file)
+                fput(ts->in_file);
+        if (ts->out_file)
+                fput(ts->out_file);
+        kfree(ts);
+        return -EIO;
+}
+/**
+ * v9fs_fd_close - shutdown file descriptor
+ * @trans: transport to shut down
+ *
+ * Marks the transport Disconnected, drops the references on both files
+ * and frees the private state.  Safe to call more than once: the private
+ * pointer is cleared before anything is released, so a repeated call
+ * finds nothing left to free.
+ */
+static void v9fs_fd_close(struct v9fs_transport *trans)
+{
+        struct v9fs_trans_fd *ts;
+        if (!trans)
+                return;
+        trans->status = Disconnected;
+        ts = trans->priv;
+        if (!ts)
+                return;
+        /* detach first so a second close cannot fput/kfree again */
+        trans->priv = NULL;
+        if (ts->in_file)
+                fput(ts->in_file);
+        if (ts->out_file)
+                fput(ts->out_file);
+        kfree(ts);
+}
+/* Transport operations backed by a pair of already-open file descriptors. */
+struct v9fs_transport v9fs_trans_fd = {
+        .init = v9fs_fd_init,
+        .write = v9fs_fd_send,
+        .read = v9fs_fd_recv,
+        .close = v9fs_fd_close,
+};
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
new file mode 100644
index 000000000000..01e26f0013ac
--- /dev/null
+++ b/fs/9p/trans_sock.c
@@ -0,0 +1,290 @@
+/*
+ * linux/fs/9p/trans_sock.c
+ *
+ * Socket Transport Layer
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ *  Copyright (C) 1995, 1996 by Olaf Kirch <okir@monad.swb.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <asm/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "transport.h"
+/*
+ * NOTE(review): V9FS_PORT is not referenced by the functions visible in
+ * this file; v9fs_tcp_init() connects to v9ses->port.  Presumably this is
+ * the default for that field - confirm where it is set.
+ */
+#define V9FS_PORT 564
+/* Private state of the socket transports: the connected socket. */
+struct v9fs_trans_sock {
+        struct socket *s;
+};
+/**
+ * v9fs_sock_recv - receive from a socket
+ * @trans: transport to read from
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ * Returns the result of kernel_recvmsg().  A result of zero or an error
+ * other than -ERESTARTSYS marks the transport Disconnected.  Returns
+ * -EREMOTEIO without touching the socket when the transport is missing,
+ * already Disconnected, or has no socket attached.
+ */
+static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len)
+{
+        /* designated initializers leave every unnamed field zeroed */
+        struct msghdr msg = {
+                .msg_iovlen = 1,
+                .msg_flags = MSG_NOSIGNAL,
+        };
+        struct kvec iov = {
+                .iov_base = v,
+                .iov_len = len,
+        };
+        int result;
+        mm_segment_t oldfs;
+        struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+        if (!trans || trans->status == Disconnected)
+                return -EREMOTEIO;
+        /* close() clears priv; never dereference a missing socket */
+        if (!ts || !ts->s)
+                return -EREMOTEIO;
+        oldfs = get_fs();
+        set_fs(get_ds());
+        result = kernel_recvmsg(ts->s, &msg, &iov, 1, len, 0);
+        dprintk(DEBUG_TRANS, "socket state %d\n", ts->s->state);
+        set_fs(oldfs);
+        if (result <= 0) {
+                if (result != -ERESTARTSYS)
+                        trans->status = Disconnected;
+        }
+        return result;
+}
+/**
+ * v9fs_sock_send - send to a socket
+ * @trans: transport to write to
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ * Sends the buffer with kernel_sendmsg() while holding the transport's
+ * writelock and returns its result.  An error other than -ERESTARTSYS
+ * marks the transport Disconnected.  Returns -EREMOTEIO without touching
+ * the socket when the transport or its socket is missing.
+ */
+static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len)
+{
+        /* designated initializers leave every unnamed field zeroed */
+        struct kvec iov = {
+                .iov_base = v,
+                .iov_len = len,
+        };
+        struct msghdr msg = {
+                .msg_iovlen = 1,
+                .msg_flags = MSG_NOSIGNAL,
+        };
+        int result = -1;
+        mm_segment_t oldfs;
+        struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+        /* close() clears priv; never dereference a missing socket */
+        if (!trans || !ts || !ts->s)
+                return -EREMOTEIO;
+        dprintk(DEBUG_TRANS, "Sending packet size %d (%x)\n", len, len);
+        dump_data(v, len);
+        down(&trans->writelock);
+        oldfs = get_fs();
+        set_fs(get_ds());
+        result = kernel_sendmsg(ts->s, &msg, &iov, 1, len);
+        set_fs(oldfs);
+        if (result < 0) {
+                if (result != -ERESTARTSYS)
+                        trans->status = Disconnected;
+        }
+        up(&trans->writelock);
+        return result;
+}
+/**
+ * v9fs_tcp_init - initialize TCP socket
+ * @v9ses: session information
+ * @addr: address of server to mount
+ * @data: mount options (not used here)
+ *
+ * Creates a TCP socket, connects it to @addr on port v9ses->port and
+ * attaches it to the session's transport.
+ *
+ * Returns 0 on success with the transport marked Connected.  On failure
+ * returns -EINVAL (no address), -ENOMEM, or the error from socket
+ * creation or connect; nothing is left allocated and trans->priv is NULL.
+ */
+static int
+v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
+{
+        struct socket *csocket = NULL;
+        struct sockaddr_in sin_server;
+        int rc = 0;
+        struct v9fs_trans_sock *ts = NULL;
+        struct v9fs_transport *trans = v9ses->transport;
+        sema_init(&trans->writelock, 1);
+        sema_init(&trans->readlock, 1);
+        /* priv is only published once the socket is connected */
+        trans->priv = NULL;
+        if (!addr)
+                return -EINVAL;
+        ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL);
+        if (!ts)
+                return -ENOMEM;
+        ts->s = NULL;
+        dprintk(DEBUG_TRANS, "Connecting to %s\n", addr);
+        sin_server.sin_family = AF_INET;
+        sin_server.sin_addr.s_addr = in_aton(addr);
+        sin_server.sin_port = htons(v9ses->port);
+        rc = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket);
+        if (rc < 0) {
+                eprintk(KERN_ERR,
+                        "v9fs_trans_tcp: problem creating socket: %d\n", rc);
+                goto free_ts;
+        }
+        rc = csocket->ops->connect(csocket,
+                                   (struct sockaddr *)&sin_server,
+                                   sizeof(struct sockaddr_in), 0);
+        if (rc < 0) {
+                eprintk(KERN_ERR,
+                        "v9fs_trans_tcp: problem connecting socket to %s\n",
+                        addr);
+                goto release_sock;
+        }
+        csocket->sk->sk_allocation = GFP_NOIO;
+        ts->s = csocket;
+        trans->priv = ts;
+        trans->status = Connected;
+        return 0;
+      release_sock:
+        sock_release(csocket);
+      free_ts:
+        kfree(ts);
+        return rc;
+}
+/**
+ * v9fs_unix_init - initialize UNIX domain socket
+ * @v9ses: session information
+ * @dev_name: path to named pipe
+ * @data: mount options (not used here)
+ *
+ * Creates a UNIX stream socket, connects it to the path in @dev_name and
+ * attaches it to the session's transport.
+ *
+ * Returns 0 on success with the transport marked Connected.  On failure
+ * returns -EINVAL (no path), -ENOMEM (path too long or allocation
+ * failure), or the error from socket creation or connect; nothing is
+ * left allocated and trans->priv is NULL.
+ */
+static int
+v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
+               char *data)
+{
+        int rc;
+        struct socket *csocket;
+        struct sockaddr_un sun_server;
+        struct v9fs_transport *trans;
+        struct v9fs_trans_sock *ts;
+        rc = 0;
+        csocket = NULL;
+        trans = v9ses->transport;
+        /* priv is only published once the socket is connected */
+        trans->priv = NULL;
+        if (!dev_name)
+                return -EINVAL;
+        /* ">=": strcpy() below also writes the terminating NUL */
+        if (strlen(dev_name) >= UNIX_PATH_MAX) {
+                eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n",
+                        dev_name);
+                return -ENOMEM;
+        }
+        ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL);
+        if (!ts)
+                return -ENOMEM;
+        ts->s = NULL;
+        sema_init(&trans->writelock, 1);
+        sema_init(&trans->readlock, 1);
+        sun_server.sun_family = PF_UNIX;
+        strcpy(sun_server.sun_path, dev_name);
+        rc = sock_create_kern(PF_UNIX, SOCK_STREAM, 0, &csocket);
+        if (rc < 0) {
+                eprintk(KERN_ERR,
+                        "v9fs_trans_unix: problem creating socket: %d\n", rc);
+                goto free_ts;
+        }
+        rc = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server,
+                sizeof(struct sockaddr_un) - 1, 0);     /* -1 *is* important */
+        if (rc < 0) {
+                eprintk(KERN_ERR,
+                        "v9fs_trans_unix: problem connecting socket: %s: %d\n",
+                        dev_name, rc);
+                goto release_sock;
+        }
+        csocket->sk->sk_allocation = GFP_NOIO;
+        ts->s = csocket;
+        trans->priv = ts;
+        trans->status = Connected;
+        return 0;
+      release_sock:
+        sock_release(csocket);
+      free_ts:
+        kfree(ts);
+        return rc;
+}
+/**
+ * v9fs_sock_close - shutdown socket
+ * @trans: transport whose socket is to be released
+ *
+ * Releases the socket if one is attached (marking the transport
+ * Disconnected in that case), frees the private state and clears
+ * trans->priv.
+ */
+static void v9fs_sock_close(struct v9fs_transport *trans)
+{
+        struct v9fs_trans_sock *ts;
+        if (!trans)
+                return;
+        ts = trans->priv;
+        if (ts != NULL) {
+                if (ts->s != NULL) {
+                        dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s);
+                        sock_release(ts->s);
+                        ts->s = NULL;
+                        trans->status = Disconnected;
+                        dprintk(DEBUG_TRANS, "socket closed\n");
+                }
+                kfree(ts);
+        }
+        trans->priv = NULL;
+}
+/* Transport operations for a TCP connection; shares send/recv/close with unix. */
+struct v9fs_transport v9fs_trans_tcp = {
+        .init = v9fs_tcp_init,
+        .write = v9fs_sock_send,
+        .read = v9fs_sock_recv,
+        .close = v9fs_sock_close,
+};
+/* Transport operations for a UNIX domain stream socket. */
+struct v9fs_transport v9fs_trans_unix = {
+        .init = v9fs_unix_init,
+        .write = v9fs_sock_send,
+        .read = v9fs_sock_recv,
+        .close = v9fs_sock_close,
+};
diff --git a/fs/9p/transport.h b/fs/9p/transport.h
new file mode 100644
index 000000000000..9e9cd418efd5
--- /dev/null
+++ b/fs/9p/transport.h
@@ -0,0 +1,46 @@
+/*
+ * linux/fs/9p/transport.h
+ *
+ * Transport Definition
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+/*
+ * Connection state stored in v9fs_transport.status.
+ */
+enum v9fs_transport_status {
+        Connected,      /* set once a transport init hook has connected */
+        Disconnected,   /* set on socket close and by v9fs_session_cancel */
+        Hung,           /* NOTE(review): no setter visible here - confirm use */
+};
+/*
+ * A transport: connection state plus the operations used to drive it.
+ * v9fs_session_init selects one of the v9fs_trans_* objects and calls
+ * its init hook.
+ */
+struct v9fs_transport {
+        enum v9fs_transport_status status;      /* current connection state */
+        /* semaphores initialised to 1 by the transport init hook;
+         * presumably serialise writers and readers - TODO confirm */
+        struct semaphore writelock;
+        struct semaphore readlock;
+        /* transport-private state; the socket transports keep their
+         * struct v9fs_trans_sock here */
+        void *priv;
+        /* called as init(session, dev_name, mount option string); a negative
+         * return makes v9fs_session_init fail with that value */
+        int (*init) (struct v9fs_session_info *, const char *, char *);
+        /* NOTE(review): presumably (transport, buffer, length) - confirm
+         * the return convention against the implementations */
+        int (*write) (struct v9fs_transport *, void *, int);
+        int (*read) (struct v9fs_transport *, void *, int);
+        /* releases whatever init set up; called from v9fs_session_close */
+        void (*close) (struct v9fs_transport *);
+};
+/* transports selectable through the proto= mount option */
+extern struct v9fs_transport v9fs_trans_tcp;
+extern struct v9fs_transport v9fs_trans_unix;
+extern struct v9fs_transport v9fs_trans_fd;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
new file mode 100644
index 000000000000..13bdbbab4387
--- /dev/null
+++ b/fs/9p/v9fs.c
@@ -0,0 +1,452 @@
+/*
+ *  linux/fs/9p/v9fs.c
+ *
+ *  This file contains functions assisting in mapping VFS to 9P2000
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/parser.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "transport.h"
+#include "mux.h"
+#include "conv.h"
+/* TODO: sysfs or debugfs interface */
+/*
+ * Module-wide debug level.  v9fs_session_init overwrites it with the
+ * debug= value of whichever session is being initialised, so it is not
+ * per-mount.
+ */
+int v9fs_debug_level = 0;       /* feature-rific global debug level  */
+/*
+  * Option Parsing (code inspired by NFS code)
+  *
+  */
+/*
+ * Mount option tokens.  The order is significant: v9fs_parse_options
+ * treats every token that compares less than Opt_name as carrying an
+ * integer argument, so integer options must stay ahead of Opt_name.
+ */
+enum {
+        /* Options that take integer arguments */
+        Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, Opt_debug,
+        Opt_rfdno, Opt_wfdno,
+        /* String options */
+        Opt_name, Opt_remotename,
+        /* Options that take no arguments */
+        Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd,
+        /* Error token */
+        Opt_err
+};
+/*
+ * Pattern table handed to match_token().  The transport can be chosen
+ * either as "proto=<name>" or with the bare word ("tcp", "unix", "fd");
+ * both spellings map to the same token.  The Opt_err/NULL entry
+ * terminates the table.
+ */
+static match_table_t tokens = {
+        {Opt_port, "port=%u"},
+        {Opt_msize, "msize=%u"},
+        {Opt_uid, "uid=%u"},
+        {Opt_gid, "gid=%u"},
+        {Opt_afid, "afid=%u"},
+        {Opt_rfdno, "rfdno=%u"},
+        {Opt_wfdno, "wfdno=%u"},
+        {Opt_debug, "debug=%u"},
+        {Opt_name, "name=%s"},
+        {Opt_remotename, "aname=%s"},
+        {Opt_unix, "proto=unix"},
+        {Opt_tcp, "proto=tcp"},
+        {Opt_fd, "proto=fd"},
+        {Opt_tcp, "tcp"},
+        {Opt_unix, "unix"},
+        {Opt_fd, "fd"},
+        {Opt_legacy, "noextend"},
+        {Opt_nodevmap, "nodevmap"},
+        {Opt_err, NULL}
+};
+/*
+ *  Parse option string.
+ */
+/**
+ * v9fs_parse_options - fill in session settings from the mount options
+ * @options: comma separated option string passed from mount (may be NULL)
+ * @v9ses: session whose option fields are to be set
+ *
+ * Defaults are stored first, so they stay in effect for anything the
+ * option string does not mention.  Unrecognised options and integer
+ * options with an unparsable value are skipped.  @options is consumed
+ * destructively by strsep().
+ *
+ */
+static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
+{
+        substring_t args[MAX_OPT_ARGS];
+        char *opt;
+        int value;
+        /* defaults */
+        v9ses->port = V9FS_PORT;
+        v9ses->maxdata = 9000;
+        v9ses->proto = PROTO_TCP;
+        v9ses->extended = 1;
+        v9ses->afid = ~0;
+        v9ses->debug = 0;
+        v9ses->rfdno = ~0;
+        v9ses->wfdno = ~0;
+        if (options == NULL)
+                return;
+        for (opt = strsep(&options, ","); opt != NULL;
+             opt = strsep(&options, ",")) {
+                int token;
+                if (*opt == '\0')
+                        continue;
+                token = match_token(opt, tokens, args);
+                /* every token ordered before Opt_name carries an integer */
+                if (token < Opt_name && match_int(&args[0], &value) < 0) {
+                        dprintk(DEBUG_ERROR,
+                                "integer field, but no integer?\n");
+                        continue;
+                }
+                switch (token) {
+                /* integer valued options */
+                case Opt_port:
+                        v9ses->port = value;
+                        break;
+                case Opt_msize:
+                        v9ses->maxdata = value;
+                        break;
+                case Opt_uid:
+                        v9ses->uid = value;
+                        break;
+                case Opt_gid:
+                        v9ses->gid = value;
+                        break;
+                case Opt_afid:
+                        v9ses->afid = value;
+                        break;
+                case Opt_rfdno:
+                        v9ses->rfdno = value;
+                        break;
+                case Opt_wfdno:
+                        v9ses->wfdno = value;
+                        break;
+                case Opt_debug:
+                        v9ses->debug = value;
+                        break;
+                /* string valued options */
+                case Opt_name:
+                        match_strcpy(v9ses->name, &args[0]);
+                        break;
+                case Opt_remotename:
+                        match_strcpy(v9ses->remotename, &args[0]);
+                        break;
+                /* transport selection */
+                case Opt_tcp:
+                        v9ses->proto = PROTO_TCP;
+                        break;
+                case Opt_unix:
+                        v9ses->proto = PROTO_UNIX;
+                        break;
+                case Opt_fd:
+                        v9ses->proto = PROTO_FD;
+                        break;
+                /* flags */
+                case Opt_legacy:
+                        v9ses->extended = 0;
+                        break;
+                case Opt_nodevmap:
+                        v9ses->nodev = 1;
+                        break;
+                default:
+                        /* unknown option: ignore it */
+                        break;
+                }
+        }
+}
+/**
+ * v9fs_inode2v9ses - fetch the v9fs session attached to an inode's super block
+ * @inode: inode to extract information from
+ *
+ * Returns the s_fs_info pointer of the inode's super block.  No
+ * validation is performed: @inode and its super block must be valid.
+ *
+ */
+struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
+{
+        struct super_block *sb = inode->i_sb;
+        return sb->s_fs_info;
+}
+/**
+ * v9fs_get_idpool - allocate numeric id from pool
+ * @p: pool to allocate from
+ *
+ * Returns the allocated id on success and -1 on failure (no memory for
+ * the idr, interrupted while waiting for the pool lock, or an idr
+ * error).  Callers test the result with "< 0".
+ *
+ * XXX - This seems to be an awful generic function, should it be in idr.c with
+ *            the lock included in struct idr?
+ */
+int v9fs_get_idpool(struct v9fs_idpool *p)
+{
+        int i = 0;
+        int error;
+retry:
+        /* idr_pre_get() returns 0 when it cannot preallocate memory.  That
+         * must be reported as an error: 0 would look like a valid id. */
+        if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
+                return -1;
+        if (down_interruptible(&p->lock) == -EINTR) {
+                eprintk(KERN_WARNING, "Interrupted while locking\n");
+                return -1;
+        }
+        error = idr_get_new(&p->pool, NULL, &i);
+        up(&p->lock);
+        /* -EAGAIN: the preallocated memory was used up, preload again */
+        if (error == -EAGAIN)
+                goto retry;
+        else if (error)
+                return -1;
+        return i;
+}
+/**
+ * v9fs_put_idpool - release numeric id from pool
+ * @id: id to release
+ * @p: pool the id was allocated from
+ *
+ * XXX - This seems to be an awful generic function, should it be in idr.c with
+ *            the lock included in struct idr?
+ */
+void v9fs_put_idpool(int id, struct v9fs_idpool *p)
+{
+        /* This function cannot report failure, so an interruptible wait that
+         * bails out on a pending signal would silently leak the id.  Wait
+         * for the lock uninterruptibly instead. */
+        down(&p->lock);
+        idr_remove(&p->pool, id);
+        up(&p->lock);
+}
+/**
+ * v9fs_session_init - initialize session
+ * @v9ses: session information structure
+ * @dev_name: device being mounted
+ * @data: options
+ *
+ * Allocates the name buffers, parses the mount options, brings up the
+ * selected transport and the mux, negotiates the protocol version
+ * (only when no afid was given) and attaches to the server.
+ *
+ * Returns the fid number used for the attach on success, or a negative
+ * errno.  Once both name buffers exist, every failure path goes through
+ * v9fs_session_close(), which also releases those buffers.
+ *
+ */
+int
+v9fs_session_init(struct v9fs_session_info *v9ses,
+                  const char *dev_name, char *data)
+{
+        struct v9fs_fcall *fcall = NULL;
+        struct v9fs_transport *trans_proto;
+        int n = 0;
+        int newfid = -1;
+        int retval = -EINVAL;
+        v9ses->name = __getname();
+        if (!v9ses->name)
+                return -ENOMEM;
+        v9ses->remotename = __getname();
+        if (!v9ses->remotename) {
+                putname(v9ses->name);
+                return -ENOMEM;
+        }
+        /* defaults; name= and aname= options overwrite them during parsing */
+        strcpy(v9ses->name, V9FS_DEFUSER);
+        strcpy(v9ses->remotename, V9FS_DEFANAME);
+        v9fs_parse_options(data, v9ses);
+        /* set global debug level */
+        v9fs_debug_level = v9ses->debug;
+        /* id pools that are session-dependent: FIDs and TIDs */
+        idr_init(&v9ses->fidpool.pool);
+        init_MUTEX(&v9ses->fidpool.lock);
+        idr_init(&v9ses->tidpool.pool);
+        init_MUTEX(&v9ses->tidpool.lock);
+        /* NOTE(review): trans_proto points at a file-scope v9fs_trans_*
+         * object, so its status/priv fields look shared between all sessions
+         * using the same protocol - confirm */
+        switch (v9ses->proto) {
+        case PROTO_TCP:
+                trans_proto = &v9fs_trans_tcp;
+                break;
+        case PROTO_UNIX:
+                trans_proto = &v9fs_trans_unix;
+                /* any aname is discarded for this transport */
+                *v9ses->remotename = 0;
+                break;
+        case PROTO_FD:
+                trans_proto = &v9fs_trans_fd;
+                /* any aname is discarded for this transport */
+                *v9ses->remotename = 0;
+                break;
+        default:
+                printk(KERN_ERR "v9fs: Bad mount protocol %d\n", v9ses->proto);
+                retval = -ENOPROTOOPT;
+                goto SessCleanUp;
+        };
+        v9ses->transport = trans_proto;
+        if ((retval = v9ses->transport->init(v9ses, dev_name, data)) < 0) {
+                eprintk(KERN_ERR, "problem initializing transport\n");
+                goto SessCleanUp;
+        }
+        v9ses->inprogress = 0;
+        v9ses->shutdown = 0;
+        v9ses->session_hung = 0;
+        if ((retval = v9fs_mux_init(v9ses, dev_name)) < 0) {
+                dprintk(DEBUG_ERROR, "problem initializing mux\n");
+                goto SessCleanUp;
+        }
+        /* version negotiation is only done when no afid option was given */
+        if (v9ses->afid == ~0) {
+                if (v9ses->extended)
+                        retval =
+                            v9fs_t_version(v9ses, v9ses->maxdata, "9P2000.u",
+                                           &fcall);
+                else
+                        retval = v9fs_t_version(v9ses, v9ses->maxdata, "9P2000",
+                                                &fcall);
+                if (retval < 0) {
+                        dprintk(DEBUG_ERROR, "v9fs_t_version failed\n");
+                        goto FreeFcall;
+                }
+                /* Really should check for 9P1 and report error */
+                /* anything other than "9P2000.u" from the server is treated
+                 * as plain 9P2000 */
+                if (!strcmp(fcall->params.rversion.version, "9P2000.u")) {
+                        dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n");
+                        v9ses->extended = 1;
+                } else {
+                        dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n");
+                        v9ses->extended = 0;
+                }
+                n = fcall->params.rversion.msize;
+                kfree(fcall);
+                /* never exceed the message size the server answered with;
+                 * n is int and maxdata unsigned, so this compares unsigned */
+                if (n < v9ses->maxdata)
+                        v9ses->maxdata = n;
+        }
+        newfid = v9fs_get_idpool(&v9ses->fidpool);
+        if (newfid < 0) {
+                eprintk(KERN_WARNING, "couldn't allocate FID\n");
+                retval = -ENOMEM;
+                goto SessCleanUp;
+        }
+        /* it is a little bit ugly, but we have to prevent newfid */
+        /* being the same as afid, so if it is, get a new fid     */
+        /* (the colliding id is not handed back to the pool)      */
+        if (v9ses->afid != ~0 && newfid == v9ses->afid) {
+                newfid = v9fs_get_idpool(&v9ses->fidpool);
+                if (newfid < 0) {
+                        eprintk(KERN_WARNING, "couldn't allocate FID\n");
+                        retval = -ENOMEM;
+                        goto SessCleanUp;
+                }
+        }
+        if ((retval =
+             v9fs_t_attach(v9ses, v9ses->name, v9ses->remotename, newfid,
+                           v9ses->afid, NULL))
+            < 0) {
+                dprintk(DEBUG_ERROR, "cannot attach\n");
+                goto SessCleanUp;
+        }
+        /* after the attach the afid is clunked; a failure is only logged */
+        if (v9ses->afid != ~0) {
+                if (v9fs_t_clunk(v9ses, v9ses->afid, NULL))
+                        dprintk(DEBUG_ERROR, "clunk failed\n");
+        }
+        return newfid;
+        /* only reached from the version failure above; falls through to the
+         * session teardown */
+      FreeFcall:
+        kfree(fcall);
+      SessCleanUp:
+        v9fs_session_close(v9ses);
+        return retval;
+}
+/**
+ * v9fs_session_close - tear down a session
+ * @v9ses: session information structure
+ *
+ * If a receive process is recorded it is sent SIGKILL and waited for;
+ * then the transport (if any) is closed and both name buffers are
+ * released.
+ *
+ */
+void v9fs_session_close(struct v9fs_session_info *v9ses)
+{
+        struct task_struct *reader = v9ses->recvproc;
+        struct v9fs_transport *trans;
+        if (reader != NULL) {
+                send_sig(SIGKILL, reader, 1);
+                wait_for_completion(&v9ses->proccmpl);
+        }
+        trans = v9ses->transport;
+        if (trans != NULL)
+                trans->close(trans);
+        putname(v9ses->name);
+        putname(v9ses->remotename);
+}
+/**
+ * v9fs_session_cancel - mark transport as disconnected
+ *      and cancel all pending requests.
+ * @v9ses: session whose requests are cancelled with -EIO
+ *
+ */
+void v9fs_session_cancel(struct v9fs_session_info *v9ses)
+{
+        struct v9fs_transport *trans = v9ses->transport;
+        trans->status = Disconnected;
+        v9fs_mux_cancel_requests(v9ses, -EIO);
+}
+extern int v9fs_error_init(void);
+/**
+ * init_v9fs - Initialize module
+ *
+ * Runs v9fs_error_init() (its return value is not examined), announces
+ * the file system and registers it.  Returns the result of
+ * register_filesystem().
+ *
+ */
+static int __init init_v9fs(void)
+{
+        int err;
+        v9fs_error_init();
+        printk(KERN_INFO "Installing v9fs 9P2000 file system support\n");
+        err = register_filesystem(&v9fs_fs_type);
+        return err;
+}
+/**
+ * exit_v9fs - shutdown module
+ *
+ * Unregisters the v9fs file system type registered by init_v9fs().
+ *
+ */
+static void __exit exit_v9fs(void)
+{
+        unregister_filesystem(&v9fs_fs_type);
+}
+/* module entry/exit hooks and metadata */
+module_init(init_v9fs)
+module_exit(exit_v9fs)
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
new file mode 100644
index 000000000000..45dcef42bdd6
--- /dev/null
+++ b/fs/9p/v9fs.h
@@ -0,0 +1,103 @@
+/*
+ * V9FS definitions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+/*
+  * Idpool structure provides lock and id management
+  *
+  */
+struct v9fs_idpool {
+        /* taken around idr_get_new()/idr_remove() on the pool */
+        struct semaphore lock;
+        /* idr the numeric ids are allocated from */
+        struct idr pool;
+};
+/*
+  * Session structure provides information for an opened session
+  *
+  */
+struct v9fs_session_info {
+        /* options */
+        /* msize= option (default 9000); lowered to the server's msize when
+         * the version exchange answers with a smaller value */
+        unsigned int maxdata;
+        unsigned char extended; /* set to 1 if we are using UNIX extensions */
+        unsigned char nodev;    /* set to 1 to disable device mapping (nodevmap) */
+        unsigned short port;    /* port to connect to */
+        unsigned short debug;   /* debug level */
+        unsigned short proto;   /* protocol to use (PROTO_* value) */
+        unsigned int afid;      /* authentication fid, ~0 when not given */
+        unsigned int rfdno;     /* read file descriptor number, ~0 when not given */
+        unsigned int wfdno;     /* write file descriptor number, ~0 when not given */
+        char *name;             /* user name to mount as */
+        char *remotename;       /* name of remote hierarchy being mounted */
+        unsigned int uid;       /* default uid/muid for legacy support */
+        unsigned int gid;       /* default gid for legacy support */
+        /* book keeping */
+        struct v9fs_idpool fidpool;     /* The FID pool for file descriptors */
+        struct v9fs_idpool tidpool;     /* The TID pool for transactions ids */
+        /* transport information */
+        struct v9fs_transport *transport;       /* selected from ->proto */
+        int inprogress;         /* session in progress => true */
+        int shutdown;           /* session shutting down. no more attaches. */
+        unsigned char session_hung;     /* cleared by v9fs_session_init */
+        /* mux private data */
+        /* NOTE(review): these fields are managed by the mux code, which is
+         * not visible here; only recvproc/proccmpl are used in v9fs.c */
+        struct v9fs_fcall *curfcall;
+        wait_queue_head_t read_wait;
+        struct completion fcread;
+        /* waited on by v9fs_session_close after killing recvproc */
+        struct completion proccmpl;
+        /* receive process; sent SIGKILL by v9fs_session_close when set */
+        struct task_struct *recvproc;
+        spinlock_t muxlock;
+        struct list_head mux_fcalls;
+};
+/* possible values of ->proto */
+enum {
+        PROTO_TCP,      /* "tcp" / "proto=tcp", also the default */
+        PROTO_UNIX,     /* "unix" / "proto=unix" */
+        PROTO_FD,       /* "fd" / "proto=fd" */
+};
+/* session setup: returns the attached fid number or a negative errno */
+int v9fs_session_init(struct v9fs_session_info *, const char *, char *);
+/* returns the session stored in the inode's super block */
+struct v9fs_session_info *v9fs_inode2v9ses(struct inode *);
+void v9fs_session_close(struct v9fs_session_info *v9ses);
+/* id allocation: get returns a negative value when no id could be had */
+int v9fs_get_idpool(struct v9fs_idpool *p);
+void v9fs_put_idpool(int id, struct v9fs_idpool *p);
+/* marks the transport Disconnected and cancels pending requests */
+void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+#define V9FS_MAGIC 0x01021997
+/* other default globals */
+/* default port, user name and aname used when the mount options omit them */
+#define V9FS_PORT               564
+#define V9FS_DEFUSER    "nobody"
+#define V9FS_DEFANAME   ""
+/* initial pool sizes for fids and tags */
+#define V9FS_START_FIDS 8192
+#define V9FS_START_TIDS 256
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
new file mode 100644
index 000000000000..2f2cea7ee3e7
--- /dev/null
+++ b/fs/9p/v9fs_vfs.h
@@ -0,0 +1,53 @@
+/*
+ * V9FS VFS extensions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+/* plan9 semantics are that created files are implicitly opened.
+ * But linux semantics are that you call create, then open.
+ * the plan9 approach is superior as it provides an atomic
+ * open.
+ * we track the create fid here. When the file is opened, if fidopen is
+ * non-zero, we use the fid and can skip some steps.
+ * there may be a better way to do this, but I don't know it.
+ * one BAD way is to clunk the fid on create, then open it again:
+ * you lose the atomicity of file open
+ */
+/* special case:
+ * unlink calls remove, which is an implicit clunk. So we have to track
+ * that kind of thing so that we don't try to clunk a dead fid.
+ */
+/* file system type registered and unregistered by the module init/exit code */
+extern struct file_system_type v9fs_fs_type;
+/* operation tables */
+extern struct file_operations v9fs_file_operations;
+extern struct file_operations v9fs_dir_operations;
+extern struct dentry_operations v9fs_dentry_operations;
+struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+/* maps a qid to the inode number reported by readdir */
+ino_t v9fs_qid2ino(struct v9fs_qid *qid);
+/* NOTE(review): presumably converts between 9P stat data and inodes -
+ * the implementations are not visible here, confirm */
+void v9fs_mistat2inode(struct v9fs_stat *, struct inode *,
+                       struct super_block *);
+int v9fs_dir_release(struct inode *inode, struct file *filp);
+int v9fs_file_open(struct inode *inode, struct file *file);
+void v9fs_inode2mistat(struct inode *inode, struct v9fs_stat *mistat);
+void v9fs_dentry_release(struct dentry *);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
new file mode 100644
index 000000000000..306c96741f81
--- /dev/null
+++ b/fs/9p/vfs_dentry.c
@@ -0,0 +1,126 @@
+/*
+ *  linux/fs/9p/vfs_dentry.c
+ *
+ * This file contains vfs dentry ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+/**
+ * v9fs_dentry_validate - VFS dcache hook to validate cache
+ * @dentry:  dentry that is being validated
+ * @nd: path data (unused)
+ *
+ * Returns 1 (valid) when a fid can be looked up for @dentry, or when
+ * @dentry is the current working directory of the calling task or one
+ * of its ancestors; returns 0 (invalid) otherwise.
+ *
+ * dcache really shouldn't be used for 9P2000 at all due to
+ * potential attached semantics to directory traversal (walk).
+ *
+ * FUTURE: look into how to use dcache to allow multi-stage
+ * walks in Plan 9 & potential for better dcache operation which
+ * would remain valid for Plan 9 semantics.  Older versions
+ * had validation via stat for those interested.  However, since
+ * stat has the same approximate overhead as walk there really
+ * is no difference.  The only improvement would be from a
+ * time-decay cache like NFS has and that undermines the
+ * synchronous nature of 9P2000.
+ *
+ */
+static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd)
+{
+        struct dentry *cursor = current->fs->pwd;
+        int valid = 0;
+        dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry);
+        if (v9fs_fid_lookup(dentry, FID_OP) != NULL)
+                valid = 1;
+        if (!valid) {
+                /* walk from the cwd up to the root looking for the dentry */
+                for (; cursor != NULL; cursor = cursor->d_parent) {
+                        if (cursor == dentry) {
+                                valid = 1;
+                                break;
+                        }
+                        /* the root is its own parent: stop there */
+                        if (cursor == cursor->d_parent)
+                                break;
+                }
+        }
+        if (valid) {
+                dprintk(DEBUG_VFS, "VALID\n");
+                return 1;
+        }
+        dprintk(DEBUG_VFS, "INVALID\n");
+        return 0;
+}
+/**
+ * v9fs_dentry_release - called when dentry is going to be freed
+ * @dentry:  dentry that is being released
+ *
+ * Clunks every fid on the dentry's fid list, returns its number to the
+ * owning session's fid pool and destroys it, then frees the list head
+ * stored in d_fsdata.  A failed clunk is only logged.
+ *
+ */
+void v9fs_dentry_release(struct dentry *dentry)
+{
+        dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+        if (dentry->d_fsdata != NULL) {
+                struct list_head *fid_list = dentry->d_fsdata;
+                struct v9fs_fid *temp = NULL;
+                struct v9fs_fid *current_fid = NULL;
+                struct v9fs_fcall *fcall = NULL;
+                /* _safe variant: the current entry is destroyed in the body */
+                list_for_each_entry_safe(current_fid, temp, fid_list, list) {
+                        if (v9fs_t_clunk
+                            (current_fid->v9ses, current_fid->fid, &fcall))
+                                dprintk(DEBUG_ERROR, "clunk failed: %s\n",
+                                        FCALL_ERROR(fcall));
+                        v9fs_put_idpool(current_fid->fid,
+                                        &current_fid->v9ses->fidpool);
+                        kfree(fcall);
+                        /* do not carry a freed response into the next
+                         * iteration: a clunk that fails without writing the
+                         * pointer would otherwise have it read and freed
+                         * a second time */
+                        fcall = NULL;
+                        v9fs_fid_destroy(current_fid);
+                }
+                kfree(dentry->d_fsdata);        /* free the list_head */
+                dentry->d_fsdata = NULL;
+        }
+}
+/* dentry hooks: cache revalidation and fid cleanup when a dentry is freed */
+struct dentry_operations v9fs_dentry_operations = {
+        .d_revalidate = v9fs_dentry_validate,
+        .d_release = v9fs_dentry_release,
+};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
new file mode 100644
index 000000000000..c478a7384186
--- /dev/null
+++ b/fs/9p/vfs_dir.c
@@ -0,0 +1,226 @@
+/*
+ * linux/fs/9p/vfs_dir.c
+ *
+ * This file contains vfs directory ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+/**
+ * dt_type - map a 9P stat mode to a directory entry type
+ * @mistat: mistat structure
+ *
+ * Returns DT_LNK if the symlink bit is set, otherwise DT_DIR if the
+ * directory bit is set, otherwise DT_REG.
+ *
+ */
+static inline int dt_type(struct v9fs_stat *mistat)
+{
+        unsigned long mode = mistat->mode;
+        /* the symlink bit wins when both bits are set */
+        if (mode & V9FS_DMSYMLINK)
+                return DT_LNK;
+        if (mode & V9FS_DMDIR)
+                return DT_DIR;
+        return DT_REG;
+}
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filp: opened directory
+ * @dirent: opaque buffer handed through to @filldir
+ * @filldir: callback that receives one directory entry per call
+ *
+ * Entries are parsed out of Rread responses and passed to @filldir.
+ * When @filldir signals that it cannot take more, the partly consumed
+ * response is parked in the fid (rdir_fcall/rdir_fpos/rdir_pos) and
+ * resumed on the next call, provided f_pos still matches rdir_pos.
+ *
+ * Returns a negative errno on failure.  Otherwise the value of the last
+ * read is returned (0 at end of directory, or 0 if no read was needed).
+ *
+ */
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+        struct v9fs_fcall *fcall = NULL;
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+        struct v9fs_fid *file = filp->private_data;
+        unsigned int i, n;
+        int fid = -1;
+        int ret = 0;
+        struct v9fs_stat *mi = NULL;
+        int over = 0;
+        dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name);
+        fid = file->fid;
+        /* scratch buffer each directory entry is deserialized into */
+        mi = kmalloc(v9ses->maxdata, GFP_KERNEL);
+        if (!mi)
+                return -ENOMEM;
+        /* a parked response is only usable if the caller did not seek */
+        if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) {
+                kfree(file->rdir_fcall);
+                file->rdir_fcall = NULL;
+        }
+        /* first drain what is left of a parked response */
+        if (file->rdir_fcall) {
+                n = file->rdir_fcall->params.rread.count;
+                i = file->rdir_fpos;
+                while (i < n) {
+                        int s = v9fs_deserialize_stat(v9ses,
+                                  file->rdir_fcall->params.rread.data + i,
+                                  n - i, mi, v9ses->maxdata);
+                        if (s == 0) {
+                                dprintk(DEBUG_ERROR,
+                                        "error while deserializing mistat\n");
+                                ret = -EIO;
+                                goto FreeStructs;
+                        }
+                        over = filldir(dirent, mi->name, strlen(mi->name),
+                                    filp->f_pos, v9fs_qid2ino(&mi->qid),
+                                    dt_type(mi));
+                        if (over) {
+                                file->rdir_fpos = i;
+                                file->rdir_pos = filp->f_pos;
+                                break;
+                        }
+                        i += s;
+                        filp->f_pos += s;
+                }
+                if (!over) {
+                        kfree(file->rdir_fcall);
+                        file->rdir_fcall = NULL;
+                }
+        }
+        /* then keep reading from the server until filldir is full or the
+         * directory is exhausted */
+        while (!over) {
+                ret = v9fs_t_read(v9ses, fid, filp->f_pos,
+                                            v9ses->maxdata-V9FS_IOHDRSZ, &fcall);
+                if (ret < 0) {
+                        dprintk(DEBUG_ERROR, "error while reading: %d: %p\n",
+                                ret, fcall);
+                        goto FreeStructs;
+                } else if (ret == 0)
+                        break;
+                n = ret;
+                i = 0;
+                while (i < n) {
+                        int s = v9fs_deserialize_stat(v9ses,
+                                  fcall->params.rread.data + i, n - i, mi,
+                                  v9ses->maxdata);
+                        if (s == 0) {
+                                dprintk(DEBUG_ERROR,
+                                        "error while deserializing mistat\n");
+                                /* leave through the common exit so that the
+                                 * response and the scratch buffer are freed */
+                                ret = -EIO;
+                                goto FreeStructs;
+                        }
+                        over = filldir(dirent, mi->name, strlen(mi->name),
+                                    filp->f_pos, v9fs_qid2ino(&mi->qid),
+                                    dt_type(mi));
+                        if (over) {
+                                /* park the response; the fid owns it now */
+                                file->rdir_fcall = fcall;
+                                file->rdir_fpos = i;
+                                file->rdir_pos = filp->f_pos;
+                                fcall = NULL;
+                                break;
+                        }
+                        i += s;
+                        filp->f_pos += s;
+                }
+                kfree(fcall);
+                /* never reach the next read or the exit path with a pointer
+                 * to the response that was just freed */
+                fcall = NULL;
+        }
+      FreeStructs:
+        kfree(fcall);
+        kfree(mi);
+        return ret;
+}
+/**
+ * v9fs_dir_release - close a directory
+ * @inode: inode of the directory
+ * @filp: file pointer to a directory
+ *
+ * Writes back the inode's mapping and, for a non-negative fid number,
+ * drops one open reference on the fid (clunking it and returning its
+ * number to the pool when the count reaches zero), frees any parked
+ * readdir response and destroys the fid.  The dentry is dropped in all
+ * cases.  Always returns 0.
+ *
+ */
+int v9fs_dir_release(struct inode *inode, struct file *filp)
+{
+        struct v9fs_fid *dirfid = filp->private_data;
+        struct v9fs_session_info *session = v9fs_inode2v9ses(inode);
+        int num;
+        dprintk(DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp,
+                dirfid->fid);
+        num = dirfid->fid;
+        filemap_fdatawrite(inode->i_mapping);
+        filemap_fdatawait(inode->i_mapping);
+        if (num < 0)
+                goto drop;
+        dirfid->fidopen--;
+        dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", dirfid->fidopen,
+                dirfid->fid);
+        if (dirfid->fidopen == 0) {
+                /* last opener: give the fid back to server and pool */
+                if (v9fs_t_clunk(session, num, NULL))
+                        dprintk(DEBUG_ERROR, "clunk failed\n");
+                v9fs_put_idpool(dirfid->fid, &session->fidpool);
+        }
+        kfree(dirfid->rdir_fcall);
+        filp->private_data = NULL;
+        v9fs_fid_destroy(dirfid);
+drop:
+        d_drop(filp->f_dentry);
+        return 0;
+}
+/*
+ * File operations for directories.  Opening goes through the same
+ * v9fs_file_open as regular files; read() is generic_read_dir.
+ */
+struct file_operations v9fs_dir_operations = {
+        .read = generic_read_dir,
+        .readdir = v9fs_dir_readdir,
+        .open = v9fs_file_open,
+        .release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
new file mode 100644
index 000000000000..1f8ae7d580ab
--- /dev/null
+++ b/fs/9p/vfs_file.c
@@ -0,0 +1,401 @@
+/*
+ *  linux/fs/9p/vfs_file.c
+ *
+ * This file contains vfs file ops for 9P2000.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/version.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+/*
+ * v9fs_file_drop_fid - clunk a fid the server holds and free its number
+ */
+static void v9fs_file_drop_fid(struct v9fs_session_info *v9ses, int fidnum)
+{
+        if (v9fs_t_clunk(v9ses, fidnum, NULL))
+                dprintk(DEBUG_ERROR, "clunk failed\n");
+        v9fs_put_idpool(fidnum, &v9ses->fidpool);
+}
+/**
+ * v9fs_file_open - open a file (or directory)
+ * @inode: inode to be opened
+ * @file: file being opened
+ *
+ * Finds a fid for the dentry, re-walking from the parent directory when
+ * none is attached.  A fid flagged fidcreate is reused as-is; otherwise a
+ * 9P open is issued.  Directories are opened on a clone of the dentry's
+ * fid (a walk with no name), regular files on the fid itself.
+ *
+ * On success the fid is stored in file->private_data and 0 is returned.
+ * On failure a negative errno is returned, and every fid number allocated
+ * by this call is released again (clunked first if the server knows it).
+ */
+int v9fs_file_open(struct inode *inode, struct file *file)
+{
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+        struct v9fs_fid *v9fid = v9fs_fid_lookup(file->f_dentry, FID_WALK);
+        struct v9fs_fid *v9newfid = NULL;
+        struct v9fs_fcall *fcall = NULL;
+        int open_mode = 0;
+        unsigned int iounit = 0;
+        int newfid = -1;
+        long result = -1;
+        dprintk(DEBUG_VFS, "inode: %p file: %p v9fid= %p\n", inode, file,
+                v9fid);
+        if (!v9fid) {
+                struct dentry *dentry = file->f_dentry;
+                dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n");
+                /* XXX - some duplication from lookup, generalize later */
+                /* basically vfs_lookup is too heavy weight */
+                v9fid = v9fs_fid_lookup(file->f_dentry, FID_OP);
+                if (!v9fid)
+                        return -EBADF;
+                v9fid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
+                if (!v9fid)
+                        return -EBADF;
+                newfid = v9fs_get_idpool(&v9ses->fidpool);
+                if (newfid < 0) {
+                        eprintk(KERN_WARNING, "newfid fails!\n");
+                        return -ENOSPC;
+                }
+                result =
+                    v9fs_t_walk(v9ses, v9fid->fid, newfid,
+                                (char *)file->f_dentry->d_name.name, NULL);
+                if (result < 0) {
+                        v9fs_put_idpool(newfid, &v9ses->fidpool);
+                        dprintk(DEBUG_ERROR, "rewalk didn't work\n");
+                        return -EBADF;
+                }
+                v9fid = v9fs_fid_create(dentry);
+                if (v9fid == NULL) {
+                        dprintk(DEBUG_ERROR, "couldn't insert\n");
+                        /* the walk succeeded, so the server holds newfid */
+                        v9fs_file_drop_fid(v9ses, newfid);
+                        return -ENOMEM;
+                }
+                v9fid->fid = newfid;
+        }
+        if (v9fid->fidcreate) {
+                /* create case */
+                newfid = v9fid->fid;
+                iounit = v9fid->iounit;
+                v9fid->fidcreate = 0;
+        } else {
+                if (!S_ISDIR(inode->i_mode))
+                        newfid = v9fid->fid;
+                else {
+                        newfid = v9fs_get_idpool(&v9ses->fidpool);
+                        if (newfid < 0) {
+                                eprintk(KERN_WARNING, "allocation failed\n");
+                                return -ENOSPC;
+                        }
+                        /* This would be a somewhat critical clone */
+                        result =
+                            v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL,
+                                        &fcall);
+                        if (result < 0) {
+                                dprintk(DEBUG_ERROR, "clone error: %s\n",
+                                        FCALL_ERROR(fcall));
+                                kfree(fcall);
+                                /* clone failed: only the number is ours */
+                                v9fs_put_idpool(newfid, &v9ses->fidpool);
+                                return result;
+                        }
+                        kfree(fcall);
+                        fcall = NULL;
+                        v9newfid = v9fs_fid_create(file->f_dentry);
+                        if (v9newfid == NULL) {
+                                dprintk(DEBUG_ERROR, "couldn't insert\n");
+                                v9fs_file_drop_fid(v9ses, newfid);
+                                return -ENOMEM;
+                        }
+                        v9newfid->fid = newfid;
+                        v9newfid->qid = v9fid->qid;
+                        v9newfid->iounit = v9fid->iounit;
+                        v9newfid->fidopen = 0;
+                        v9newfid->fidclunked = 0;
+                        v9newfid->v9ses = v9ses;
+                        v9fid = v9newfid;
+                }
+                /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
+                /* translate open mode appropriately */
+                open_mode = file->f_flags & 0x3;
+                if (file->f_flags & O_EXCL)
+                        open_mode |= V9FS_OEXCL;
+                if (v9ses->extended) {
+                        if (file->f_flags & O_TRUNC)
+                                open_mode |= V9FS_OTRUNC;
+                        if (file->f_flags & O_APPEND)
+                                open_mode |= V9FS_OAPPEND;
+                }
+                result = v9fs_t_open(v9ses, newfid, open_mode, &fcall);
+                if (result < 0) {
+                        dprintk(DEBUG_ERROR,
+                                "open failed, open_mode 0x%x: %s\n", open_mode,
+                                FCALL_ERROR(fcall));
+                        kfree(fcall);
+                        if (v9newfid) {
+                                /* undo the directory clone made above */
+                                v9fs_file_drop_fid(v9ses, newfid);
+                                v9fs_fid_destroy(v9newfid);
+                        }
+                        return result;
+                }
+                iounit = fcall->params.ropen.iounit;
+                kfree(fcall);
+        }
+        file->private_data = v9fid;
+        v9fid->rdir_pos = 0;
+        v9fid->rdir_fcall = NULL;
+        v9fid->fidopen = 1;
+        v9fid->filp = file;
+        v9fid->iounit = iounit;
+        return 0;
+}
+/**
+ * v9fs_file_lock - lock a file (or directory)
+ * @filp: file the lock request applies to
+ * @cmd: lock command
+ * @fl: lock being requested
+ *
+ * Returns -ENOLCK for inodes marked for mandatory locking, 0 otherwise.
+ * When a lock other than F_UNLCK is being set, the inode's pages are
+ * written back and invalidated first.
+ *
+ * XXX - this looks like a local only lock, we should extend into 9P
+ *       by using open exclusive
+ */
+static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        int setting_lock;
+        dprintk(DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+        /* No mandatory locks: setgid set while group-execute is clear */
+        if ((inode->i_mode & S_ISGID) && !(inode->i_mode & S_IXGRP))
+                return -ENOLCK;
+        setting_lock = IS_SETLK(cmd) || IS_SETLKW(cmd);
+        if (!setting_lock || fl->fl_type == F_UNLCK)
+                return 0;
+        filemap_fdatawrite(inode->i_mapping);
+        filemap_fdatawait(inode->i_mapping);
+        invalidate_inode_pages(&inode->i_data);
+        return 0;
+}
+/**
+ * v9fs_read - read from a file (internal)
+ * @filp: file pointer to read
+ * @buffer: kernel buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data; advanced by the bytes read
+ *
+ * Issues 9P reads until @count bytes have been read or the server returns
+ * fewer bytes than asked for.  Each request is limited to the session's
+ * maxdata minus V9FS_IOHDRSZ, and to the fid's iounit when that is set.
+ *
+ * Returns the number of bytes placed in @buffer.  If a read fails after
+ * some data was already read, that partial count is returned; if nothing
+ * was read, the negative error is returned instead of a misleading 0.
+ */
+static ssize_t
+v9fs_read(struct file *filp, char *buffer, size_t count, loff_t * offset)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+        struct v9fs_fid *v9f = filp->private_data;
+        struct v9fs_fcall *fcall = NULL;
+        int fid = v9f->fid;
+        int rsize = 0;
+        int result = 0;
+        int total = 0;
+        dprintk(DEBUG_VFS, "\n");
+        rsize = v9ses->maxdata - V9FS_IOHDRSZ;
+        if (v9f->iounit != 0 && rsize > v9f->iounit)
+                rsize = v9f->iounit;
+        do {
+                if (count < rsize)
+                        rsize = count;
+                result = v9fs_t_read(v9ses, fid, *offset, rsize, &fcall);
+                if (result < 0) {
+                        printk(KERN_ERR "9P2000: v9fs_t_read returned %d\n",
+                               result);
+                        kfree(fcall);
+                        /* report data already read, else the error */
+                        return total ? total : result;
+                }
+                if (result > rsize) {
+                        /* a reply larger than the request would overrun buffer */
+                        printk(KERN_ERR
+                               "9P2000: v9fs_t_read returned %d, asked for %d\n",
+                               result, rsize);
+                        kfree(fcall);
+                        return total ? total : -EIO;
+                }
+                *offset += result;
+                /* XXX - extra copy */
+                memcpy(buffer, fcall->params.rread.data, result);
+                count -= result;
+                buffer += result;
+                total += result;
+                kfree(fcall);
+                fcall = NULL;
+                /* short read: nothing more to fetch for now */
+                if (result < rsize)
+                        break;
+        } while (count);
+        return total;
+}
+/**
+ * v9fs_file_read - read from a file
+ * @filp: file pointer to read
+ * @data: user buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ * Reads into a temporary kernel buffer via v9fs_read() and copies the
+ * result out to user space.  Returns the number of bytes read, -ENOMEM if
+ * the bounce buffer cannot be allocated, -EFAULT if the copy to user space
+ * fails, or whatever non-positive value v9fs_read() returned.
+ */
+static ssize_t
+v9fs_file_read(struct file *filp, char __user * data, size_t count,
+               loff_t * offset)
+{
+        int retval = -1;
+        int ret = 0;
+        char *buffer;
+        buffer = kmalloc(count, GFP_KERNEL);
+        if (!buffer)
+                return -ENOMEM;
+        retval = v9fs_read(filp, buffer, count, offset);
+        if (retval > 0) {
+                if ((ret = copy_to_user(data, buffer, retval)) != 0) {
+                        dprintk(DEBUG_ERROR, "Problem copying to user %d\n",
+                                ret);
+                        /*
+                         * copy_to_user() returns the number of bytes it
+                         * could not copy; handing that back would look
+                         * like a successful short read.
+                         */
+                        retval = -EFAULT;
+                }
+        }
+        kfree(buffer);
+        return retval;
+}
+/**
+ * v9fs_write - write to a file
+ * @filp: file pointer to write
+ * @buffer: kernel buffer holding the data to write
+ * @count: size of buffer
+ * @offset: offset at which to write data; advanced by the bytes written
+ *
+ * Issues 9P writes until @count bytes have been written or the server
+ * accepts fewer bytes than offered.  Each request is limited to the
+ * session's maxdata minus V9FS_IOHDRSZ, and to the fid's iounit when set.
+ *
+ * Returns the number of bytes written, including those of a final short
+ * write, or the negative error from a failed request.
+ */
+static ssize_t
+v9fs_write(struct file *filp, char *buffer, size_t count, loff_t * offset)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+        struct v9fs_fid *v9fid = filp->private_data;
+        /* NULL so the error path never frees or prints garbage */
+        struct v9fs_fcall *fcall = NULL;
+        int fid = v9fid->fid;
+        int result = -EIO;
+        int rsize = 0;
+        int total = 0;
+        dprintk(DEBUG_VFS, "data %p count %d offset %x\n", buffer, (int)count,
+                (int)*offset);
+        rsize = v9ses->maxdata - V9FS_IOHDRSZ;
+        if (v9fid->iounit != 0 && rsize > v9fid->iounit)
+                rsize = v9fid->iounit;
+        dump_data(buffer, count);
+        do {
+                if (count < rsize)
+                        rsize = count;
+                result =
+                    v9fs_t_write(v9ses, fid, *offset, rsize, buffer, &fcall);
+                if (result < 0) {
+                        eprintk(KERN_ERR, "error while writing: %s(%d)\n",
+                                FCALL_ERROR(fcall), result);
+                        kfree(fcall);
+                        return result;
+                }
+                *offset += result;
+                kfree(fcall);
+                fcall = NULL;
+                /*
+                 * Count the bytes before checking for a short write: the
+                 * offset has already moved past them, so the return value
+                 * must include them too.
+                 */
+                total += result;
+                if (result != rsize) {
+                        eprintk(KERN_ERR,
+                                "short write: v9fs_t_write returned %d\n",
+                                result);
+                        break;
+                }
+                count -= result;
+                buffer += result;
+        } while (count);
+        return total;
+}
+/**
+ * v9fs_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: user buffer holding the data to write
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ * Copies the user data into a temporary kernel buffer and hands it to
+ * v9fs_write().  Returns v9fs_write()'s result, -ENOMEM if the buffer
+ * cannot be allocated, or -EFAULT if the user data cannot be copied in.
+ */
+static ssize_t
+v9fs_file_write(struct file *filp, const char __user * data,
+                size_t count, loff_t * offset)
+{
+        int status;
+        char *kbuf = kmalloc(count, GFP_KERNEL);
+        if (!kbuf)
+                return -ENOMEM;
+        if (copy_from_user(kbuf, data, count) != 0) {
+                dprintk(DEBUG_ERROR, "Problem copying from user\n");
+                status = -EFAULT;
+                goto free_buffer;
+        }
+        status = v9fs_write(filp, kbuf, count, offset);
+      free_buffer:
+        kfree(kbuf);
+        return status;
+}
+/*
+ * File operations table for v9fs regular files.  Release is shared with
+ * directories.
+ */
+struct file_operations v9fs_file_operations = {
+        .open = v9fs_file_open,
+        .release = v9fs_dir_release,
+        .read = v9fs_file_read,
+        .write = v9fs_file_write,
+        .llseek = generic_file_llseek,
+        .lock = v9fs_file_lock,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
new file mode 100644
index 000000000000..0c13fc600049
--- /dev/null
+++ b/fs/9p/vfs_inode.c
@@ -0,0 +1,1338 @@
+/*
+ *  linux/fs/9p/vfs_inode.c
+ *
+ * This file contains vfs inode ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+/* Declared up front so that v9fs_get_inode() can reference them. */
+static struct inode_operations v9fs_file_inode_operations;
+static struct inode_operations v9fs_symlink_inode_operations;
+static struct inode_operations v9fs_dir_inode_operations;
+static struct inode_operations v9fs_dir_inode_operations_ext;
+/**
+ * unixmode2p9mode - convert unix mode bits to plan 9
+ * @v9ses: v9fs session information
+ * @mode: mode to convert
+ *
+ * The low nine permission bits are copied and directories get V9FS_DMDIR.
+ * The remaining bits are only translated for extended sessions; socket,
+ * pipe and device types additionally require that nodev is not set.
+ * Returns the plan 9 mode.
+ */
+static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
+{
+        int p9mode = mode & 0777;
+        if (S_ISDIR(mode))
+                p9mode |= V9FS_DMDIR;
+        /* everything below needs the extended protocol */
+        if (!v9ses->extended)
+                return p9mode;
+        if (S_ISLNK(mode))
+                p9mode |= V9FS_DMSYMLINK;
+        if (v9ses->nodev == 0) {
+                switch (mode & S_IFMT) {
+                case S_IFSOCK:
+                        p9mode |= V9FS_DMSOCKET;
+                        break;
+                case S_IFIFO:
+                        p9mode |= V9FS_DMNAMEDPIPE;
+                        break;
+                case S_IFBLK:
+                case S_IFCHR:
+                        p9mode |= V9FS_DMDEVICE;
+                        break;
+                default:
+                        break;
+                }
+        }
+        if (mode & S_ISUID)
+                p9mode |= V9FS_DMSETUID;
+        if (mode & S_ISGID)
+                p9mode |= V9FS_DMSETGID;
+        /* a V9FS_DMLINK flag already present in @mode is passed through */
+        if (mode & V9FS_DMLINK)
+                p9mode |= V9FS_DMLINK;
+        return p9mode;
+}
+/**
+ * p9mode2unixmode - convert plan9 mode bits to unix mode bits
+ * @v9ses: v9fs session information
+ * @mode: mode to convert
+ *
+ * The low nine permission bits are copied and exactly one file type is
+ * chosen.  Anything that is not a directory falls back to a regular file
+ * unless the session is extended; socket, pipe and device types also
+ * require that nodev is not set.  Every V9FS_DMDEVICE file is reported as
+ * a block device here.  Returns the unix mode.
+ */
+static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+{
+        int extended = (v9ses->extended != 0);
+        int specials = extended && (v9ses->nodev == 0);
+        int type = S_IFREG;
+        int unixmode;
+        if ((mode & V9FS_DMDIR) == V9FS_DMDIR)
+                type = S_IFDIR;
+        else if (extended && (mode & V9FS_DMSYMLINK))
+                type = S_IFLNK;
+        else if (specials && (mode & V9FS_DMSOCKET))
+                type = S_IFSOCK;
+        else if (specials && (mode & V9FS_DMNAMEDPIPE))
+                type = S_IFIFO;
+        else if (specials && (mode & V9FS_DMDEVICE))
+                type = S_IFBLK;
+        unixmode = (mode & 0777) | type;
+        if (extended) {
+                if ((mode & V9FS_DMSETUID) == V9FS_DMSETUID)
+                        unixmode |= S_ISUID;
+                if ((mode & V9FS_DMSETGID) == V9FS_DMSETGID)
+                        unixmode |= S_ISGID;
+        }
+        return unixmode;
+}
+/**
+ * v9fs_blank_mistat - helper function to setup a 9P stat structure
+ * @v9ses: 9P session info (for determining extended mode)
+ * @mistat: structure to initialize
+ *
+ * Sets every numeric field to all-ones and points every string field at
+ * the structure's data buffer, which is made an empty string.  The
+ * extended fields are only touched for extended sessions.
+ */
+static void
+v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat)
+{
+        /* string fields all share the (empty) data buffer */
+        *mistat->data = 0;
+        mistat->name = mistat->data;
+        mistat->uid = mistat->data;
+        mistat->gid = mistat->data;
+        mistat->muid = mistat->data;
+        /* numeric fields are filled with all-ones */
+        mistat->type = ~0;
+        mistat->dev = ~0;
+        mistat->mode = ~0;
+        mistat->atime = ~0;
+        mistat->mtime = ~0;
+        mistat->length = ~0;
+        mistat->qid.type = ~0;
+        mistat->qid.version = ~0;
+        *((long long *)&mistat->qid.path) = ~0;
+        if (!v9ses->extended)
+                return;
+        mistat->extension = mistat->data;
+        mistat->n_uid = ~0;
+        mistat->n_gid = ~0;
+        mistat->n_muid = ~0;
+}
+/**
+ * v9fs_mistat2unix - convert mistat to unix stat
+ * @mistat: Plan 9 metadata (mistat) structure
+ * @buf: unix metadata (stat) structure to populate
+ * @sb: superblock
+ *
+ * Fills in link count, times, ownership, mode, device number, size and
+ * block information.  Ownership comes from the hexadecimal uid/gid strings
+ * on extended sessions and otherwise falls back to the session's uid/gid.
+ * Device files are reported as block devices by p9mode2unixmode(); the
+ * extension string then selects character devices and supplies
+ * major/minor.
+ *
+ * NOTE(review): despite the NULL test on @sb below, both @sb and the
+ * session are dereferenced unconditionally further down, so callers must
+ * pass a valid superblock.
+ */
+static void
+v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf,
+                 struct super_block *sb)
+{
+        struct v9fs_session_info *v9ses = sb ? sb->s_fs_info : NULL;
+        unsigned int id;
+        buf->st_nlink = 1;
+        buf->st_atime = mistat->atime;
+        buf->st_mtime = mistat->mtime;
+        buf->st_ctime = mistat->mtime;
+        buf->st_uid = (unsigned short)-1;
+        buf->st_gid = (unsigned short)-1;
+        if (v9ses && v9ses->extended) {
+                /* TODO: string to uid mapping via user-space daemon */
+                /*
+                 * Parse into a full-width temporary.  st_uid/st_gid may be
+                 * narrower than unsigned int, and scanning straight into
+                 * them through a cast would overwrite the fields next to
+                 * them.
+                 */
+                if (mistat->n_uid != -1 &&
+                    sscanf(mistat->uid, "%x", &id) == 1)
+                        buf->st_uid = id;
+                if (mistat->n_gid != -1 &&
+                    sscanf(mistat->gid, "%x", &id) == 1)
+                        buf->st_gid = id;
+        }
+        if (buf->st_uid == (unsigned short)-1)
+                buf->st_uid = v9ses->uid;
+        if (buf->st_gid == (unsigned short)-1)
+                buf->st_gid = v9ses->gid;
+        buf->st_mode = p9mode2unixmode(v9ses, mistat->mode);
+        if ((S_ISBLK(buf->st_mode)) || (S_ISCHR(buf->st_mode))) {
+                char type = 0;
+                int major = -1;
+                int minor = -1;
+                sscanf(mistat->extension, "%c %u %u", &type, &major, &minor);
+                switch (type) {
+                case 'c':
+                        /* swap the provisional block type for char */
+                        buf->st_mode &= ~S_IFBLK;
+                        buf->st_mode |= S_IFCHR;
+                        break;
+                case 'b':
+                        break;
+                default:
+                        dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n",
+                                type, mistat->extension);
+                };
+                buf->st_rdev = MKDEV(major, minor);
+        } else
+                buf->st_rdev = 0;
+        buf->st_size = mistat->length;
+        buf->st_blksize = sb->s_blocksize;
+        /* size rounded up to whole blocks */
+        buf->st_blocks =
+            (buf->st_size + buf->st_blksize - 1) >> sb->s_blocksize_bits;
+}
+/**
+ * v9fs_get_inode - helper function to setup an inode
+ * @sb: superblock
+ * @mode: mode to setup inode with
+ *
+ * Allocates a new inode, fills in ownership, times and block size, and
+ * installs the operation tables matching the file type in @mode.
+ *
+ * Returns the inode, ERR_PTR(-ENOMEM) if allocation fails, or
+ * ERR_PTR(-EINVAL) when the type is unknown or needs the extended protocol
+ * on a session that lacks it.  The inode is released again on the -EINVAL
+ * paths, so nothing is leaked.
+ */
+struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+{
+        struct inode *inode = NULL;
+        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
+        inode = new_inode(sb);
+        if (!inode) {
+                eprintk(KERN_WARNING, "Problem allocating inode\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        inode->i_mode = mode;
+        inode->i_uid = current->fsuid;
+        inode->i_gid = current->fsgid;
+        inode->i_blksize = sb->s_blocksize;
+        inode->i_blocks = 0;
+        inode->i_rdev = 0;
+        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        switch (mode & S_IFMT) {
+        case S_IFIFO:
+        case S_IFBLK:
+        case S_IFCHR:
+        case S_IFSOCK:
+                if(!v9ses->extended) {
+                        dprintk(DEBUG_ERROR, "special files without extended mode\n");
+                        goto BadMode;
+                }
+                init_special_inode(inode, inode->i_mode,
+                                   inode->i_rdev);
+                break;
+        case S_IFREG:
+                inode->i_op = &v9fs_file_inode_operations;
+                inode->i_fop = &v9fs_file_operations;
+                break;
+        case S_IFLNK:
+                if(!v9ses->extended) {
+                        dprintk(DEBUG_ERROR, "extended modes used w/o 9P2000.u\n");
+                        goto BadMode;
+                }
+                inode->i_op = &v9fs_symlink_inode_operations;
+                break;
+        case S_IFDIR:
+                inode->i_nlink++;
+                if(v9ses->extended)
+                        inode->i_op = &v9fs_dir_inode_operations_ext;
+                else
+                        inode->i_op = &v9fs_dir_inode_operations;
+                inode->i_fop = &v9fs_dir_operations;
+                break;
+        default:
+                dprintk(DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+                        mode, mode & S_IFMT);
+                goto BadMode;
+        }
+        return inode;
+      BadMode:
+        /* drop the reference from new_inode() so the inode is freed */
+        iput(inode);
+        return ERR_PTR(-EINVAL);
+}
+/**
+ * v9fs_create - helper function to create files and directories
+ * @dir: directory inode file is being created in
+ * @file_dentry: dentry of the file being created
+ * @perm: unix permissions/mode the file is being created with
+ * @open_mode: resulting open mode for file
+ *
+ * Clones the parent directory's fid, issues a 9P create on the clone and
+ * records the resulting fid on @file_dentry with fidcreate set.  For
+ * symlink, link, named pipe, socket and device permissions the function
+ * returns 0 at that point without instantiating an inode.  Otherwise the
+ * new file is stat'ed and an inode is instantiated on the dentry; for
+ * directories the fid is clunked again and the dentry dropped.
+ *
+ * Returns 0 on success or a negative errno.  On failure the fid number,
+ * the server-side fid and any fid structure created here are released.
+ */
+static int
+v9fs_create(struct inode *dir,
+            struct dentry *file_dentry,
+            unsigned int perm, unsigned int open_mode)
+{
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+        struct super_block *sb = dir->i_sb;
+        struct v9fs_fid *dirfid =
+            v9fs_fid_lookup(file_dentry->d_parent, FID_WALK);
+        struct v9fs_fid *fid = NULL;
+        struct inode *file_inode = NULL;
+        struct v9fs_fcall *fcall = NULL;
+        struct v9fs_qid qid;
+        struct stat newstat;
+        int dirfidnum = -1;
+        long newfid = -1;
+        int result = 0;
+        unsigned int iounit = 0;
+        perm = unixmode2p9mode(v9ses, perm);
+        dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir,
+                file_dentry, perm, open_mode);
+        if (!dirfid)
+                return -EBADF;
+        dirfidnum = dirfid->fid;
+        if (dirfidnum < 0) {
+                dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n",
+                        dir->i_ino);
+                return -EBADF;
+        }
+        if (file_dentry->d_inode) {
+                dprintk(DEBUG_ERROR,
+                        "Odd. There is an inode for dir %lu, name :%s:\n",
+                        dir->i_ino, file_dentry->d_name.name);
+                return -EEXIST;
+        }
+        newfid = v9fs_get_idpool(&v9ses->fidpool);
+        if (newfid < 0) {
+                eprintk(KERN_WARNING, "no free fids available\n");
+                return -ENOSPC;
+        }
+        result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall);
+        if (result < 0) {
+                dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall));
+                v9fs_put_idpool(newfid, &v9ses->fidpool);
+                /* nothing to clunk: -1 because 0 is a legal fid number */
+                newfid = -1;
+                goto CleanUpFid;
+        }
+        kfree(fcall);
+        fcall = NULL;
+        result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name,
+                               perm, open_mode, &fcall);
+        if (result < 0) {
+                dprintk(DEBUG_ERROR, "create fails: %s(%d)\n",
+                        FCALL_ERROR(fcall), result);
+                goto CleanUpFid;
+        }
+        iounit = fcall->params.rcreate.iounit;
+        qid = fcall->params.rcreate.qid;
+        kfree(fcall);
+        fcall = NULL;
+        fid = v9fs_fid_create(file_dentry);
+        if (!fid) {
+                result = -ENOMEM;
+                goto CleanUpFid;
+        }
+        fid->fid = newfid;
+        fid->fidopen = 0;
+        fid->fidcreate = 1;
+        fid->qid = qid;
+        fid->iounit = iounit;
+        fid->rdir_pos = 0;
+        fid->rdir_fcall = NULL;
+        fid->v9ses = v9ses;
+        if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) ||
+            (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) ||
+            (perm & V9FS_DMDEVICE))
+                return 0;
+        result = v9fs_t_stat(v9ses, newfid, &fcall);
+        if (result < 0) {
+                dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall),
+                        result);
+                goto CleanUpFid;
+        }
+        v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb);
+        file_inode = v9fs_get_inode(sb, newstat.st_mode);
+        if ((!file_inode) || IS_ERR(file_inode)) {
+                dprintk(DEBUG_ERROR, "create inode failed\n");
+                result = -EBADF;
+                goto CleanUpFid;
+        }
+        v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb);
+        kfree(fcall);
+        fcall = NULL;
+        d_instantiate(file_dentry, file_inode);
+        if (perm & V9FS_DMDIR) {
+                if (v9fs_t_clunk(v9ses, newfid, &fcall))
+                        dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n",
+                                FCALL_ERROR(fcall));
+                v9fs_put_idpool(newfid, &v9ses->fidpool);
+                kfree(fcall);
+                fid->fidopen = 0;
+                fid->fidcreate = 0;
+                d_drop(file_dentry);
+        }
+        return 0;
+      CleanUpFid:
+        kfree(fcall);
+        /* reset so the clunk below cannot print or free a stale reply */
+        fcall = NULL;
+        /* the fid structure would otherwise keep a clunked fid number */
+        if (fid)
+                v9fs_fid_destroy(fid);
+        if (newfid >= 0) {
+                if (v9fs_t_clunk(v9ses, newfid, &fcall))
+                        dprintk(DEBUG_ERROR, "clunk failed: %s\n",
+                                FCALL_ERROR(fcall));
+                v9fs_put_idpool(newfid, &v9ses->fidpool);
+                kfree(fcall);
+        }
+        return result;
+}
+/**
+ * v9fs_remove - helper function to remove files and directories
+ * @dir: inode of the parent directory (only logged)
+ * @file: dentry that is being deleted
+ * @rmdir: non-zero when removing a directory (only logged)
+ *
+ * Looks up the dentry's fid and issues a 9P remove on it.  On success the
+ * fid number goes back to the pool and the fid structure is destroyed.
+ * Returns the result of the remove, or -EBADF when no usable fid exists.
+ *
+ * NOTE(review): 9P's remove also clunks the fid when the remove itself
+ * fails; the fid is kept here on failure - confirm this is intended.
+ */
+static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
+{
+        struct v9fs_session_info *v9ses;
+        struct v9fs_fcall *fcall = NULL;
+        struct inode *file_inode;
+        struct v9fs_fid *v9fid;
+        int fidnum;
+        int result;
+        dprintk(DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
+                rmdir);
+        file_inode = file->d_inode;
+        v9ses = v9fs_inode2v9ses(file_inode);
+        v9fid = v9fs_fid_lookup(file, FID_OP);
+        if (v9fid == NULL) {
+                dprintk(DEBUG_ERROR,
+                        "no v9fs_fid\n");
+                return -EBADF;
+        }
+        fidnum = v9fid->fid;
+        if (fidnum < 0) {
+                dprintk(DEBUG_ERROR, "inode #%lu, no fid!\n",
+                        file_inode->i_ino);
+                return -EBADF;
+        }
+        result = v9fs_t_remove(v9ses, fidnum, &fcall);
+        if (result >= 0) {
+                v9fs_put_idpool(fidnum, &v9ses->fidpool);
+                v9fs_fid_destroy(v9fid);
+        } else
+                dprintk(DEBUG_ERROR, "remove of file fails: %s(%d)\n",
+                        FCALL_ERROR(fcall), result);
+        kfree(fcall);
+        return result;
+}
+/**
+ * v9fs_vfs_create - VFS hook to create files
+ * @inode: directory inode the file is being created in
+ * @dentry: dentry of the file being created
+ * @perm: create permissions
+ * @nd: path information (unused)
+ *
+ * Hands off to v9fs_create() with an open mode of O_RDWR and returns its
+ * result.
+ */
+static int
+v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
+                struct nameidata *nd)
+{
+        const unsigned int open_mode = O_RDWR;
+        return v9fs_create(inode, dentry, perm, open_mode);
+}
+/**
+ * v9fs_vfs_mkdir - VFS mkdir hook to create a directory
+ * @inode: inode of the parent directory
+ * @dentry: dentry of the directory being created
+ * @mode: mode for new directory
+ *
+ * Hands off to v9fs_create() with S_IFDIR added to @mode and an open mode
+ * of O_RDONLY, and returns its result.
+ */
+static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode)
+{
+        int dir_mode = mode | S_IFDIR;
+        return v9fs_create(inode, dentry, dir_mode, O_RDONLY);
+}
+/**
+ * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
+ * @dir: inode that is being walked from
+ * @dentry: dentry that is being walked to
+ * @nameidata: path data
+ *
+ * Walks a fresh fid from the parent directory's fid to the name in
+ * @dentry, stats it, and attaches a new inode and fid to the dentry.  A
+ * walk that fails with -ENOENT adds a negative dentry instead.
+ *
+ * Returns NULL on success (including the negative-dentry case) and an
+ * ERR_PTR() on failure.  On failure the walked fid is clunked, its number
+ * is returned to the pool and a half-built inode is released.
+ */
+static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+                                      struct nameidata *nameidata)
+{
+        struct super_block *sb;
+        struct v9fs_session_info *v9ses;
+        struct v9fs_fid *dirfid;
+        struct v9fs_fid *fid;
+        struct inode *inode;
+        struct v9fs_fcall *fcall = NULL;
+        struct stat newstat;
+        int dirfidnum = -1;
+        int newfid = -1;
+        int result = 0;
+        dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
+                dir, dentry->d_iname, dentry, nameidata);
+        sb = dir->i_sb;
+        v9ses = v9fs_inode2v9ses(dir);
+        dirfid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
+        if (!dirfid) {
+                dprintk(DEBUG_ERROR, "no dirfid\n");
+                return ERR_PTR(-EINVAL);
+        }
+        dirfidnum = dirfid->fid;
+        if (dirfidnum < 0) {
+                dprintk(DEBUG_ERROR, "no dirfid for inode %p, #%lu\n",
+                        dir, dir->i_ino);
+                return ERR_PTR(-EBADF);
+        }
+        newfid = v9fs_get_idpool(&v9ses->fidpool);
+        if (newfid < 0) {
+                eprintk(KERN_WARNING, "newfid fails!\n");
+                return ERR_PTR(-ENOSPC);
+        }
+        result =
+            v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name,
+                        NULL);
+        if (result < 0) {
+                /* the walk failed, so only the number needs returning */
+                v9fs_put_idpool(newfid, &v9ses->fidpool);
+                if (result == -ENOENT) {
+                        d_add(dentry, NULL);
+                        dprintk(DEBUG_ERROR,
+                                "Return negative dentry %p count %d\n",
+                                dentry, atomic_read(&dentry->d_count));
+                        return NULL;
+                }
+                dprintk(DEBUG_ERROR, "walk error:%d\n", result);
+                goto FreeFcall;
+        }
+        result = v9fs_t_stat(v9ses, newfid, &fcall);
+        if (result < 0) {
+                dprintk(DEBUG_ERROR, "stat error\n");
+                goto ClunkFid;
+        }
+        v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb);
+        inode = v9fs_get_inode(sb, newstat.st_mode);
+        /* any ERR_PTR must be caught before inode is dereferenced below */
+        if (IS_ERR(inode)) {
+                eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n",
+                        PTR_ERR(inode));
+                result = PTR_ERR(inode);
+                goto ClunkFid;
+        }
+        inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid);
+        fid = v9fs_fid_create(dentry);
+        if (fid == NULL) {
+                dprintk(DEBUG_ERROR, "couldn't insert\n");
+                iput(inode);
+                result = -ENOMEM;
+                goto ClunkFid;
+        }
+        fid->fid = newfid;
+        fid->fidopen = 0;
+        fid->v9ses = v9ses;
+        fid->qid = fcall->params.rstat.stat->qid;
+        dentry->d_op = &v9fs_dentry_operations;
+        v9fs_mistat2inode(fcall->params.rstat.stat, inode, inode->i_sb);
+        d_add(dentry, inode);
+        kfree(fcall);
+        return NULL;
+      ClunkFid:
+        /* the walk succeeded, so the server holds newfid */
+        if (v9fs_t_clunk(v9ses, newfid, NULL))
+                dprintk(DEBUG_ERROR, "clunk failed\n");
+        v9fs_put_idpool(newfid, &v9ses->fidpool);
+      FreeFcall:
+        kfree(fcall);
+        return ERR_PTR(result);
+}
+/**
+ * v9fs_vfs_unlink - VFS unlink hook
+ * @i: inode handed in by the VFS for the unlink
+ * @d: dentry that is being unlinked
+ *
+ * Forwards to v9fs_remove() with a third argument of 0
+ * (v9fs_vfs_rmdir passes 1) and returns its result unchanged.
+ */
+static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+{
+        const int rmdir_flag = 0;
+        return v9fs_remove(i, d, rmdir_flag);
+}
+/**
+ * v9fs_vfs_rmdir - VFS rmdir hook
+ * @i: inode handed in by the VFS for the rmdir
+ * @d: dentry of the directory that is being removed
+ *
+ * Forwards to v9fs_remove() with a third argument of 1
+ * (v9fs_vfs_unlink passes 0) and returns its result unchanged.
+ */
+static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+{
+        const int rmdir_flag = 1;
+        return v9fs_remove(i, d, rmdir_flag);
+}
+/**
+ * v9fs_vfs_rename - VFS hook to rename an inode
+ * @old_dir:  old dir inode
+ * @old_dentry: old dentry
+ * @new_dir: new dir inode
+ * @new_dentry: new dentry
+ *
+ * Builds a blank stat structure carrying only the new name and sends it
+ * with v9fs_t_wstat() on the fid of @old_dentry.  The request is refused
+ * with -EPERM when the qids of the old and new parent directories differ.
+ *
+ * Returns the value of v9fs_t_wstat(), or -ENOMEM, -EBADF, -EPERM or
+ * -ENAMETOOLONG when the request is rejected locally.
+ */
+static int
+v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                struct inode *new_dir, struct dentry *new_dentry)
+{
+        struct inode *old_inode = old_dentry->d_inode;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(old_inode);
+        struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_WALK);
+        struct v9fs_fid *olddirfid =
+            v9fs_fid_lookup(old_dentry->d_parent, FID_WALK);
+        struct v9fs_fid *newdirfid =
+            v9fs_fid_lookup(new_dentry->d_parent, FID_WALK);
+        struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+        struct v9fs_fcall *fcall = NULL;
+        int fid = -1;
+        int retval = 0;
+        dprintk(DEBUG_VFS, "\n");
+        if (!mistat)
+                return -ENOMEM;
+        if ((!oldfid) || (!olddirfid) || (!newdirfid)) {
+                dprintk(DEBUG_ERROR, "problem with arguments\n");
+                /* mistat is already allocated here; do not leak it */
+                kfree(mistat);
+                return -EBADF;
+        }
+        /* 9P can only handle file rename in the same directory */
+        if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
+                dprintk(DEBUG_ERROR, "old dir and new dir are different\n");
+                retval = -EPERM;
+                goto FreeFcallnBail;
+        }
+        fid = oldfid->fid;
+        if (fid < 0) {
+                dprintk(DEBUG_ERROR, "no fid for old file #%lu\n",
+                        old_inode->i_ino);
+                retval = -EBADF;
+                goto FreeFcallnBail;
+        }
+        v9fs_blank_mistat(v9ses, mistat);
+        strcpy(mistat->data + 1, v9ses->name);
+        mistat->name = mistat->data + 1 + strlen(v9ses->name);
+        if (new_dentry->d_name.len >
+            (v9ses->maxdata - strlen(v9ses->name) - sizeof(struct v9fs_stat))) {
+                dprintk(DEBUG_ERROR, "new name too long\n");
+                /* report the failure instead of returning 0 (success) */
+                retval = -ENAMETOOLONG;
+                goto FreeFcallnBail;
+        }
+        strcpy(mistat->name, new_dentry->d_name.name);
+        retval = v9fs_t_wstat(v9ses, fid, mistat, &fcall);
+      FreeFcallnBail:
+        kfree(mistat);
+        if (retval < 0)
+                dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+                        FCALL_ERROR(fcall));
+        kfree(fcall);
+        return retval;
+}
+/**
+ * v9fs_vfs_getattr - retrieve file metadata
+ * @mnt: mount information
+ * @dentry: file to get attributes on
+ * @stat: metadata structure to populate
+ *
+ * Issues a stat request on the fid found for @dentry, refreshes the
+ * dentry's inode from the reply and then fills @stat from that inode.
+ * Returns -EBADF when no fid is found, otherwise the v9fs_t_stat() result.
+ */
+static int
+v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                 struct kstat *stat)
+{
+        struct inode *inode = dentry->d_inode;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+        struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+        struct v9fs_fcall *fcall = NULL;
+        int err;
+        dprintk(DEBUG_VFS, "dentry: %p\n", dentry);
+        if (fid == NULL) {
+                dprintk(DEBUG_ERROR,
+                        "couldn't find fid associated with dentry\n");
+                return -EBADF;
+        }
+        err = v9fs_t_stat(v9ses, fid->fid, &fcall);
+        if (err >= 0) {
+                /* reply is valid: sync the inode, then report from it */
+                v9fs_mistat2inode(fcall->params.rstat.stat, inode,
+                                  inode->i_sb);
+                generic_fillattr(inode, stat);
+        } else {
+                dprintk(DEBUG_ERROR, "stat error\n");
+        }
+        kfree(fcall);
+        return err;
+}
+/**
+ * v9fs_vfs_setattr - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ * Translates the fields flagged in @iattr->ia_valid (mode, mtime, atime,
+ * size and, on extended sessions only, uid/gid) into a blank stat
+ * structure and sends it with v9fs_t_wstat().  When the wstat succeeds
+ * the local inode is updated through inode_setattr().
+ *
+ * Returns -ENOMEM or -EBADF on local failure, the negative wstat result
+ * on protocol failure, otherwise the inode_setattr() result.
+ */
+static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+        struct v9fs_fcall *fcall = NULL;
+        struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+        int res = -EPERM;
+        dprintk(DEBUG_VFS, "\n");
+        if (!mistat)
+                return -ENOMEM;
+        if (!fid) {
+                dprintk(DEBUG_ERROR,
+                        "Couldn't find fid associated with dentry\n");
+                /* mistat is already allocated here; do not leak it */
+                kfree(mistat);
+                return -EBADF;
+        }
+        v9fs_blank_mistat(v9ses, mistat);
+        if (iattr->ia_valid & ATTR_MODE)
+                mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode);
+        if (iattr->ia_valid & ATTR_MTIME)
+                mistat->mtime = iattr->ia_mtime.tv_sec;
+        if (iattr->ia_valid & ATTR_ATIME)
+                mistat->atime = iattr->ia_atime.tv_sec;
+        if (iattr->ia_valid & ATTR_SIZE)
+                mistat->length = iattr->ia_size;
+        if (v9ses->extended) {
+                /* uid/gid strings are packed back to back after data[0] */
+                char *ptr = mistat->data+1;
+                if (iattr->ia_valid & ATTR_UID) {
+                        mistat->uid = ptr;
+                        ptr += 1+sprintf(ptr, "%08x", iattr->ia_uid);
+                        mistat->n_uid = iattr->ia_uid;
+                }
+                if (iattr->ia_valid & ATTR_GID) {
+                        mistat->gid = ptr;
+                        ptr += 1+sprintf(ptr, "%08x", iattr->ia_gid);
+                        mistat->n_gid = iattr->ia_gid;
+                }
+        }
+        res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall);
+        if (res < 0)
+                dprintk(DEBUG_ERROR, "wstat error: %s\n", FCALL_ERROR(fcall));
+        kfree(mistat);
+        kfree(fcall);
+        if (res >= 0)
+                res = inode_setattr(dentry->d_inode, iattr);
+        return res;
+}
+/**
+ * v9fs_mistat2inode - populate an inode structure with mistat info
+ * @mistat: Plan 9 metadata (mistat) structure
+ * @inode: inode to populate
+ * @sb: superblock of filesystem
+ *
+ * Copies times, ownership, mode, device number and size from @mistat
+ * into @inode.  Ownership that is still -1 after the (extended-only)
+ * numeric/string lookup falls back to the session's uid/gid.
+ */
+void
+v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode,
+                  struct super_block *sb)
+{
+        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        inode->i_nlink = 1;
+        inode->i_atime.tv_sec = mistat->atime;
+        inode->i_mtime.tv_sec = mistat->mtime;
+        /* ctime is taken from the server's mtime as well */
+        inode->i_ctime.tv_sec = mistat->mtime;
+        if (v9ses->extended) {
+                /* TODO: string to uid mapping via user-space daemon */
+                inode->i_uid = mistat->n_uid;
+                inode->i_gid = mistat->n_gid;
+                /* no numeric id supplied: parse the hex string form */
+                if (mistat->n_uid == -1)
+                        sscanf(mistat->uid, "%x", &inode->i_uid);
+                if (mistat->n_gid == -1)
+                        sscanf(mistat->gid, "%x", &inode->i_gid);
+        } else {
+                inode->i_uid = -1;
+                inode->i_gid = -1;
+        }
+        if (inode->i_uid == -1)
+                inode->i_uid = v9ses->uid;
+        if (inode->i_gid == -1)
+                inode->i_gid = v9ses->gid;
+        inode->i_mode = p9mode2unixmode(v9ses, mistat->mode);
+        if (!(S_ISBLK(inode->i_mode)) && !(S_ISCHR(inode->i_mode))) {
+                inode->i_rdev = 0;
+        } else {
+                /* extension string is parsed as "<type> <major> <minor>" */
+                char devtype = 0;
+                int dev_major = -1;
+                int dev_minor = -1;
+                sscanf(mistat->extension, "%c %u %u", &devtype, &dev_major,
+                       &dev_minor);
+                if (devtype == 'c') {
+                        inode->i_mode &= ~S_IFBLK;
+                        inode->i_mode |= S_IFCHR;
+                } else if (devtype != 'b') {
+                        dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n",
+                                devtype, mistat->extension);
+                }
+                inode->i_rdev = MKDEV(dev_major, dev_minor);
+        }
+        inode->i_size = mistat->length;
+        inode->i_blksize = sb->s_blocksize;
+        /* size rounded up to whole blocks */
+        inode->i_blocks =
+            (inode->i_size + inode->i_blksize - 1) >> sb->s_blocksize_bits;
+}
+/**
+ * v9fs_qid2ino - convert qid into inode number
+ * @qid: qid to hash
+ *
+ * Uses qid->path + 2 directly when ino_t is as wide as the 64-bit path,
+ * otherwise folds the upper 32 bits into the lower 32 bits with XOR.
+ *
+ * BUG: potential for inode number collisions?
+ */
+ino_t v9fs_qid2ino(struct v9fs_qid *qid)
+{
+        const u64 path = qid->path + 2;
+        ino_t ino;
+        if (sizeof(ino) != sizeof(path))
+                return (ino_t) (path ^ (path >> 32));
+        ino = 0;
+        memcpy(&ino, &path, sizeof(ino));
+        return ino;
+}
+/**
+ * v9fs_vfs_symlink - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * Only available on extended sessions.  Creates the file with
+ * v9fs_create(), stores @symname in the stat extension field with a
+ * wstat, clunks the new fid and drops @dentry.
+ *
+ * Returns 0 on success; -ENOMEM, -EPERM (not extended), -ENAMETOOLONG,
+ * -EINVAL (no fid after create) or the v9fs_create()/v9fs_t_wstat()
+ * error otherwise.  A failed clunk is only logged.
+ *
+ * See 9P2000.u RFC for more information
+ *
+ */
+static int
+v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+        int retval = -EPERM;
+        struct v9fs_fid *newfid;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+        struct v9fs_fcall *fcall = NULL;
+        struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+        dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
+                symname);
+        if (!mistat)
+                return -ENOMEM;
+        if (!v9ses->extended) {
+                dprintk(DEBUG_ERROR, "not extended\n");
+                goto FreeFcall;
+        }
+        /*
+         * symname is copied to mistat->data + 1 below, inside a buffer of
+         * maxdata bytes; refuse targets that cannot fit (same style of
+         * bound as v9fs_vfs_rename applies to names).
+         */
+        if (v9ses->maxdata < sizeof(struct v9fs_stat) + 2 ||
+            strlen(symname) > v9ses->maxdata - sizeof(struct v9fs_stat) - 2) {
+                dprintk(DEBUG_ERROR, "symlink target too long\n");
+                retval = -ENAMETOOLONG;
+                goto FreeFcall;
+        }
+        /* issue a create */
+        retval = v9fs_create(dir, dentry, S_IFLNK, 0);
+        if (retval != 0)
+                goto FreeFcall;
+        newfid = v9fs_fid_lookup(dentry, FID_OP);
+        if (!newfid) {
+                dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
+                retval = -EINVAL;
+                goto FreeFcall;
+        }
+        /* issue a twstat */
+        v9fs_blank_mistat(v9ses, mistat);
+        strcpy(mistat->data + 1, symname);
+        mistat->extension = mistat->data + 1;
+        retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+        if (retval < 0) {
+                dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+                        FCALL_ERROR(fcall));
+                goto FreeFcall;
+        }
+        kfree(fcall);
+        fcall = NULL;   /* never free the wstat reply twice */
+        if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+                dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+                        FCALL_ERROR(fcall));
+                goto FreeFcall;
+        }
+        d_drop(dentry);         /* FID - will this also clunk? */
+      FreeFcall:
+        kfree(mistat);
+        kfree(fcall);
+        return retval;
+}
+/**
+ * v9fs_readlink - read a symlink's location (internal version)
+ * @dentry: dentry for symlink
+ * @buffer: buffer to load symlink location into
+ * @buflen: length of buffer
+ *
+ * Stats the fid of @dentry and copies the stat extension string into
+ * @buffer.  At most @buflen bytes are written; the terminating NUL is
+ * included only when the whole string plus NUL fits.
+ *
+ * Returns the number of link bytes copied (NUL not counted), -EBADF when
+ * no fid is found or the session is not extended, -EINVAL when the file
+ * is not a symlink or @buflen is negative, -EIO when no reply was
+ * returned, or the v9fs_t_stat() error.
+ */
+static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+        int retval = -EPERM;
+        size_t extlen;
+        const char *ext;
+        struct v9fs_fcall *fcall = NULL;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+        if (!fid) {
+                dprintk(DEBUG_ERROR, "could not resolve fid from dentry\n");
+                retval = -EBADF;
+                goto FreeFcall;
+        }
+        if (!v9ses->extended) {
+                retval = -EBADF;
+                dprintk(DEBUG_ERROR, "not extended\n");
+                goto FreeFcall;
+        }
+        dprintk(DEBUG_VFS, " %s\n", dentry->d_name.name);
+        retval = v9fs_t_stat(v9ses, fid->fid, &fcall);
+        if (retval < 0) {
+                dprintk(DEBUG_ERROR, "stat error\n");
+                goto FreeFcall;
+        }
+        if (!fcall)
+                return -EIO;
+        if (!(fcall->params.rstat.stat->mode & V9FS_DMSYMLINK)) {
+                retval = -EINVAL;
+                goto FreeFcall;
+        }
+        /* a negative length would turn into a huge unsigned bound below */
+        if (buflen < 0) {
+                retval = -EINVAL;
+                goto FreeFcall;
+        }
+        /* copy extension buffer into buffer */
+        ext = fcall->params.rstat.stat->extension;
+        extlen = strlen(ext);
+        if (extlen < (size_t) buflen) {
+                /* whole link and its NUL fit */
+                memcpy(buffer, ext, extlen + 1);
+                retval = extlen;
+        } else {
+                /* truncated: fill the buffer exactly, never write past it */
+                memcpy(buffer, ext, buflen);
+                retval = buflen;
+        }
+      FreeFcall:
+        kfree(fcall);
+        return retval;
+}
+/**
+ * v9fs_vfs_readlink - read a symlink's location
+ * @dentry: dentry for symlink
+ * @buffer: user-space buffer to load symlink location into
+ * @buflen: length of buffer
+ *
+ * Reads the link target into a temporary __getname() buffer through
+ * v9fs_readlink() and copies it out to user space.
+ *
+ * Returns the number of bytes copied, -ENOMEM when no temporary buffer
+ * is available, -EFAULT when the copy to user space fails, or the
+ * v9fs_readlink() error.
+ */
+static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
+                             int buflen)
+{
+        int retval;
+        int ret;
+        char *link = __getname();
+        dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+        if (!link)
+                return -ENOMEM;
+        /*
+         * The temporary buffer holds PATH_MAX bytes and is uninitialised,
+         * so its size (not strlen() of its contents) bounds the read;
+         * one byte is kept free for a terminating NUL.
+         */
+        if (buflen > PATH_MAX - 1)
+                buflen = PATH_MAX - 1;
+        retval = v9fs_readlink(dentry, link, buflen);
+        if (retval > 0) {
+                /* copy_to_user() returns the number of bytes NOT copied */
+                if ((ret = copy_to_user(buffer, link, retval)) != 0) {
+                        dprintk(DEBUG_ERROR, "problem copying to user: %d\n",
+                                ret);
+                        retval = -EFAULT;
+                }
+        }
+        /* buffers from __getname() are released with __putname() */
+        __putname(link);
+        return retval;
+}
+/**
+ * v9fs_vfs_follow_link - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ * Reads the link target into a __getname() buffer and hands it to the
+ * VFS with nd_set_link().  On failure an ERR_PTR is stored instead.
+ * Always returns NULL; the buffer is released in v9fs_vfs_put_link().
+ */
+static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+        int len = 0;
+        char *link = __getname();
+        dprintk(DEBUG_VFS, "%s\n", dentry->d_name.name);
+        if (!link)
+                link = ERR_PTR(-ENOMEM);
+        else {
+                /*
+                 * The buffer is PATH_MAX bytes and uninitialised: pass its
+                 * size minus one so link[len] below stays in bounds.
+                 */
+                len = v9fs_readlink(dentry, link, PATH_MAX - 1);
+                if (len < 0) {
+                        __putname(link);
+                        link = ERR_PTR(len);
+                } else
+                        link[len] = 0;
+        }
+        nd_set_link(nd, link);
+        return NULL;
+}
+/**
+ * v9fs_vfs_put_link - release a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ * @p: cookie from follow_link (unused here)
+ *
+ * Frees the buffer that v9fs_vfs_follow_link() stored with nd_set_link().
+ * When follow_link stored an ERR_PTR there is nothing to free, and the
+ * pointer must not be printed as a string.
+ */
+static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+        char *s = nd_get_link(nd);
+        if (IS_ERR(s)) {
+                dprintk(DEBUG_VFS, " %s (error %ld)\n", dentry->d_name.name,
+                        PTR_ERR(s));
+                return;
+        }
+        dprintk(DEBUG_VFS, " %s %s\n", dentry->d_name.name, s);
+        /* the buffer came from __getname(), so pair it with __putname() */
+        __putname(s);
+}
+/**
+ * v9fs_vfs_link - create a hardlink
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ * Only available on extended sessions.  Creates a V9FS_DMLINK file and
+ * writes "hardlink(<fid of old_dentry>)\n" into its stat extension with
+ * a wstat, then clunks the new fid and drops @dentry.
+ *
+ * Returns 0 on success; -ENOMEM, -EPERM (not extended), -EBADF (no fid
+ * for @old_dentry), -EINVAL (no fid after create) or the
+ * v9fs_create()/v9fs_t_wstat() error otherwise.  A failed clunk is only
+ * logged.
+ */
+/* XXX - lots of code dup'd from symlink and creates,
+ * figure out a better reuse strategy
+ */
+static int
+v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
+              struct dentry *dentry)
+{
+        int retval = -EPERM;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+        struct v9fs_fcall *fcall = NULL;
+        struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+        struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_OP);
+        struct v9fs_fid *newfid = NULL;
+        char *symname = __getname();
+        dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
+                old_dentry->d_name.name);
+        if (!mistat || !symname) {
+                retval = -ENOMEM;
+                goto FreeMem;
+        }
+        if (!v9ses->extended) {
+                dprintk(DEBUG_ERROR, "not extended\n");
+                goto FreeMem;
+        }
+        if (!oldfid) {
+                dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
+                retval = -EBADF;
+                goto FreeMem;
+        }
+        /* get fid of old_dentry */
+        sprintf(symname, "hardlink(%d)\n", oldfid->fid);
+        /* issue a create */
+        retval = v9fs_create(dir, dentry, V9FS_DMLINK, 0);
+        if (retval != 0)
+                goto FreeMem;
+        newfid = v9fs_fid_lookup(dentry, FID_OP);
+        if (!newfid) {
+                dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
+                /* retval is 0 from the create; do not report success */
+                retval = -EINVAL;
+                goto FreeMem;
+        }
+        /* issue a twstat */
+        v9fs_blank_mistat(v9ses, mistat);
+        strcpy(mistat->data + 1, symname);
+        mistat->extension = mistat->data + 1;
+        retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+        if (retval < 0) {
+                dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+                        FCALL_ERROR(fcall));
+                goto FreeMem;
+        }
+        kfree(fcall);
+        fcall = NULL;   /* never free the wstat reply twice */
+        if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+                dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+                        FCALL_ERROR(fcall));
+                goto FreeMem;
+        }
+        d_drop(dentry);         /* FID - will this also clunk? */
+        kfree(fcall);
+        fcall = NULL;
+      FreeMem:
+        kfree(mistat);
+        kfree(fcall);
+        /* the buffer came from __getname(); it may be NULL on -ENOMEM */
+        if (symname)
+                __putname(symname);
+        return retval;
+}
+/**
+ * v9fs_vfs_mknod - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @rdev: device associated with special file
+ *
+ * Only available on extended sessions.  Creates the file with
+ * v9fs_create(); for block and character devices the string
+ * "b <major> <minor>" or "c <major> <minor>" is then written into the
+ * stat extension with a wstat (FIFOs need no extension).  Finally the
+ * new fid is clunked and @dentry dropped.
+ *
+ * Returns 0 on success; -ENOMEM, -EINVAL (bad @rdev, unsupported @mode
+ * or no fid after create), -EPERM (not extended) or the
+ * v9fs_create()/v9fs_t_wstat() error otherwise.  A failed clunk is only
+ * logged.
+ */
+static int
+v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+{
+        int retval = -EPERM;
+        struct v9fs_fid *newfid;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+        struct v9fs_fcall *fcall = NULL;
+        struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+        char *symname = __getname();
+        dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+                dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+        if (!mistat || !symname) {
+                /* leave through FreeMem so whichever buffer exists is freed */
+                retval = -ENOMEM;
+                goto FreeMem;
+        }
+        if (!new_valid_dev(rdev)) {
+                retval = -EINVAL;
+                goto FreeMem;
+        }
+        if (!v9ses->extended) {
+                dprintk(DEBUG_ERROR, "not extended\n");
+                goto FreeMem;
+        }
+        /* issue a create */
+        retval = v9fs_create(dir, dentry, mode, 0);
+        if (retval != 0)
+                goto FreeMem;
+        newfid = v9fs_fid_lookup(dentry, FID_OP);
+        if (!newfid) {
+                dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n");
+                retval = -EINVAL;
+                goto FreeMem;
+        }
+        /* build extension */
+        if (S_ISBLK(mode))
+                sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev));
+        else if (S_ISCHR(mode))
+                sprintf(symname, "c %u %u", MAJOR(rdev), MINOR(rdev));
+        else if (S_ISFIFO(mode))
+                ;       /* DO NOTHING */
+        else {
+                retval = -EINVAL;
+                goto FreeMem;
+        }
+        if (!S_ISFIFO(mode)) {
+                /* issue a twstat */
+                v9fs_blank_mistat(v9ses, mistat);
+                strcpy(mistat->data + 1, symname);
+                mistat->extension = mistat->data + 1;
+                retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+                if (retval < 0) {
+                        dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+                                FCALL_ERROR(fcall));
+                        goto FreeMem;
+                }
+        }
+        /* need to update dcache so we show up */
+        kfree(fcall);
+        fcall = NULL;   /* never free the wstat reply twice */
+        if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+                dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+                        FCALL_ERROR(fcall));
+                goto FreeMem;
+        }
+        d_drop(dentry);         /* FID - will this also clunk? */
+      FreeMem:
+        kfree(mistat);
+        kfree(fcall);
+        /* the buffer came from __getname(); it may be NULL on -ENOMEM */
+        if (symname)
+                __putname(symname);
+        return retval;
+}
+/*
+ * Directory inode operations including the symlink, link and readlink
+ * hooks that v9fs_dir_inode_operations leaves out.
+ * NOTE(review): presumably selected for extended (9P2000.u) sessions,
+ * since those hooks bail out when v9ses->extended is clear -- confirm
+ * where the table is assigned.
+ */
+static struct inode_operations v9fs_dir_inode_operations_ext = {
+        .create = v9fs_vfs_create,
+        .lookup = v9fs_vfs_lookup,
+        .symlink = v9fs_vfs_symlink,
+        .link = v9fs_vfs_link,
+        .unlink = v9fs_vfs_unlink,
+        .mkdir = v9fs_vfs_mkdir,
+        .rmdir = v9fs_vfs_rmdir,
+        .mknod = v9fs_vfs_mknod,
+        .rename = v9fs_vfs_rename,
+        .readlink = v9fs_vfs_readlink,
+        .getattr = v9fs_vfs_getattr,
+        .setattr = v9fs_vfs_setattr,
+};
+/*
+ * Directory inode operations without symlink, link or readlink hooks
+ * (compare v9fs_dir_inode_operations_ext, which adds them).
+ */
+static struct inode_operations v9fs_dir_inode_operations = {
+        .create = v9fs_vfs_create,
+        .lookup = v9fs_vfs_lookup,
+        .unlink = v9fs_vfs_unlink,
+        .mkdir = v9fs_vfs_mkdir,
+        .rmdir = v9fs_vfs_rmdir,
+        .mknod = v9fs_vfs_mknod,
+        .rename = v9fs_vfs_rename,
+        .getattr = v9fs_vfs_getattr,
+        .setattr = v9fs_vfs_setattr,
+};
+/* Inode operations providing only attribute get/set hooks. */
+static struct inode_operations v9fs_file_inode_operations = {
+        .getattr = v9fs_vfs_getattr,
+        .setattr = v9fs_vfs_setattr,
+};
+/*
+ * Symlink inode operations: follow_link allocates the target buffer and
+ * put_link releases it; readlink copies the target to user space.
+ */
+static struct inode_operations v9fs_symlink_inode_operations = {
+        .readlink = v9fs_vfs_readlink,
+        .follow_link = v9fs_vfs_follow_link,
+        .put_link = v9fs_vfs_put_link,
+        .getattr = v9fs_vfs_getattr,
+        .setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
new file mode 100644
index 000000000000..868f350b2c5f
--- /dev/null
+++ b/fs/9p/vfs_super.c
@@ -0,0 +1,280 @@
+/*
+ *  linux/fs/9p/vfs_super.c
+ *
+ * This file contains superblock ops for 9P2000. It is intended that
+ * you mount this file system on directories.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/idr.h>
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+static void v9fs_clear_inode(struct inode *);
+static struct super_operations v9fs_super_ops;
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ * Starts writeback on the inode's page-cache mapping via
+ * filemap_fdatawrite(); the return value is not examined.
+ */
+static void v9fs_clear_inode(struct inode *inode)
+{
+        struct address_space *mapping = inode->i_mapping;
+        filemap_fdatawrite(mapping);
+}
+/**
+ * v9fs_set_super - set the superblock
+ * @s: super block
+ * @data: file system specific data
+ *
+ * Stores @data in @s->s_fs_info (v9fs_get_sb passes the session info as
+ * @data through sget()) and returns the set_anon_super() result.
+ */
+static int v9fs_set_super(struct super_block *s, void *data)
+{
+        int err;
+        s->s_fs_info = data;
+        err = set_anon_super(s, data);
+        return err;
+}
+/**
+ * v9fs_fill_super - populate superblock with info
+ * @sb: superblock
+ * @v9ses: session information
+ * @flags: mount flags, OR-ed into the superblock flags
+ *
+ * The block size is the smallest power of two that is >= the session's
+ * maxdata (fls(maxdata - 1) bits).
+ */
+static void
+v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
+                int flags)
+{
+        /* flags this filesystem always sets, whatever the caller passed */
+        const int forced = MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
+            MS_NODIRATIME | MS_NOATIME;
+        sb->s_magic = V9FS_MAGIC;
+        sb->s_op = &v9fs_super_ops;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
+        sb->s_blocksize = 1 << sb->s_blocksize_bits;
+        sb->s_flags = flags | forced;
+}
+/**
+ * v9fs_get_sb - mount a superblock
+ * @fs_type: file system type
+ * @flags: mount flags
+ * @dev_name: device name that was mounted
+ * @data: mount options
+ *
+ * Allocates and initialises a session, obtains a superblock bound to it,
+ * builds the root inode/dentry and stats the root fid returned by
+ * v9fs_session_init() to fill in the root inode.
+ *
+ * Returns the superblock, or an ERR_PTR on failure.
+ *
+ * Ownership on failure: until sget() succeeds the session is torn down
+ * here.  Afterwards the superblock owns it: deactivate_super() ends up in
+ * v9fs_kill_super(), which releases s_root, closes the session and frees
+ * v9ses, so none of those may be released a second time in this function.
+ */
+static struct super_block *v9fs_get_sb(struct file_system_type
+                                       *fs_type, int flags,
+                                       const char *dev_name, void *data)
+{
+        struct super_block *sb = NULL;
+        struct v9fs_fcall *fcall = NULL;
+        struct inode *inode = NULL;
+        struct dentry *root = NULL;
+        struct v9fs_session_info *v9ses = NULL;
+        struct v9fs_fid *root_fid = NULL;
+        int mode = S_IRWXUGO | S_ISVTX;
+        uid_t uid = current->fsuid;
+        gid_t gid = current->fsgid;
+        int stat_result = 0;
+        int newfid = 0;
+        int retval = 0;
+        dprintk(DEBUG_VFS, " \n");
+        v9ses = kcalloc(1, sizeof(struct v9fs_session_info), GFP_KERNEL);
+        if (!v9ses)
+                return ERR_PTR(-ENOMEM);
+        if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
+                dprintk(DEBUG_ERROR, "problem initiating session\n");
+                retval = newfid;
+                goto free_session;
+        }
+        sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
+        if (IS_ERR(sb)) {
+                /* no superblock owns the session yet: tear it down here */
+                retval = PTR_ERR(sb);
+                v9fs_session_close(v9ses);
+                goto free_session;
+        }
+        v9fs_fill_super(sb, v9ses, flags);
+        inode = v9fs_get_inode(sb, S_IFDIR | mode);
+        if (IS_ERR(inode)) {
+                retval = PTR_ERR(inode);
+                goto put_back_sb;
+        }
+        inode->i_uid = uid;
+        inode->i_gid = gid;
+        root = d_alloc_root(inode);
+        if (!root) {
+                /* the inode never got attached to a dentry: drop it here */
+                iput(inode);
+                retval = -ENOMEM;
+                goto put_back_sb;
+        }
+        /* from here on the root dentry (and its inode) belong to sb */
+        sb->s_root = root;
+        /* Setup the Root Inode */
+        root_fid = v9fs_fid_create(root);
+        if (root_fid == NULL) {
+                retval = -ENOMEM;
+                goto put_back_sb;
+        }
+        root_fid->fidopen = 0;
+        root_fid->v9ses = v9ses;
+        stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
+        if (stat_result < 0) {
+                dprintk(DEBUG_ERROR, "stat error\n");
+                v9fs_t_clunk(v9ses, newfid, NULL);
+                v9fs_put_idpool(newfid, &v9ses->fidpool);
+        } else {
+                root_fid->fid = newfid;
+                root_fid->qid = fcall->params.rstat.stat->qid;
+                root->d_inode->i_ino =
+                    v9fs_qid2ino(&fcall->params.rstat.stat->qid);
+                v9fs_mistat2inode(fcall->params.rstat.stat, root->d_inode, sb);
+        }
+        kfree(fcall);
+        if (stat_result < 0) {
+                retval = stat_result;
+                goto put_back_sb;
+        }
+        return sb;
+      put_back_sb:
+        /*
+         * deactivate_super() drops the only active reference, which runs
+         * v9fs_kill_super(): that releases sb->s_root, closes the session
+         * and frees v9ses.  Doing any of it again here would be a double
+         * release.
+         */
+        up_write(&sb->s_umount);
+        deactivate_super(sb);
+        return ERR_PTR(retval);
+      free_session:
+        kfree(v9ses);
+        return ERR_PTR(retval);
+}
+/**
+ * v9fs_kill_super - Kill Superblock
+ * @s: superblock
+ *
+ * Releases the root dentry's fid, shuts the anonymous superblock down,
+ * then closes and frees the session stored in @s->s_fs_info.
+ *
+ * v9fs_get_sb() deactivates the superblock on failures that occur
+ * before s_root has been assigned, so a NULL root must be tolerated.
+ */
+static void v9fs_kill_super(struct super_block *s)
+{
+        struct v9fs_session_info *v9ses = s->s_fs_info;
+        dprintk(DEBUG_VFS, " %p\n", s);
+        if (s->s_root)
+                v9fs_dentry_release(s->s_root); /* clunk root */
+        kill_anon_super(s);
+        v9fs_session_close(v9ses);
+        kfree(v9ses);
+        dprintk(DEBUG_VFS, "exiting kill_super\n");
+}
+/**
+ * v9fs_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ *
+ * Options whose value equals the value compared against below are
+ * omitted; name, aname, uid and gid are always printed.  Always
+ * returns 0.
+ */
+static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+        struct super_block *sb = mnt->mnt_sb;
+        struct v9fs_session_info *ses = sb->s_fs_info;
+        /* conditional options */
+        if (ses->debug)
+                seq_printf(m, ",debug=%u", ses->debug);
+        if (ses->port != V9FS_PORT)
+                seq_printf(m, ",port=%u", ses->port);
+        if (ses->maxdata != 9000)
+                seq_printf(m, ",msize=%u", ses->maxdata);
+        if (ses->afid != ~0)
+                seq_printf(m, ",afid=%u", ses->afid);
+        if (ses->proto == PROTO_UNIX)
+                seq_puts(m, ",proto=unix");
+        if (!ses->extended)
+                seq_puts(m, ",noextend");
+        if (ses->nodev == 1)
+                seq_puts(m, ",nodevmap");
+        /* unconditional options */
+        seq_printf(m, ",name=%s", ses->name);
+        seq_printf(m, ",aname=%s", ses->remotename);
+        seq_printf(m, ",uid=%u", ses->uid);
+        seq_printf(m, ",gid=%u", ses->gid);
+        return 0;
+}
+/**
+ * v9fs_umount_begin - umount_begin hook
+ * @sb: superblock being unmounted
+ *
+ * Passes the session stored in @sb->s_fs_info to v9fs_session_cancel().
+ */
+static void
+v9fs_umount_begin(struct super_block *sb)
+{
+        v9fs_session_cancel(sb->s_fs_info);
+}
+/*
+ * Superblock operations installed by v9fs_fill_super(); statfs uses the
+ * generic simple_statfs helper.
+ */
+static struct super_operations v9fs_super_ops = {
+        .statfs = simple_statfs,
+        .clear_inode = v9fs_clear_inode,
+        .show_options = v9fs_show_options,
+        .umount_begin = v9fs_umount_begin,
+};
+/*
+ * File system type descriptor: filesystem name "9P", mounted through
+ * v9fs_get_sb() and torn down through v9fs_kill_super().
+ */
+struct file_system_type v9fs_fs_type = {
+        .name = "9P",
+        .get_sb = v9fs_get_sb,
+        .kill_sb = v9fs_kill_super,
+        .owner = THIS_MODULE,
+};
diff --git a/fs/Kconfig b/fs/Kconfig
index 5e817902cb3b..068ccea2f184 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -462,6 +462,19 @@ config AUTOFS4_FS
          local network, you probably do not need an automounter, and can say
          N here.
+config FUSE_FS
+        tristate "Filesystem in Userspace support"
+        help
+          With FUSE it is possible to implement a fully functional filesystem
+          in a userspace program.
+          There's also a companion library: libfuse.  This library along with
+          utilities is available from the FUSE homepage:
+          <http://fuse.sourceforge.net/>
+          If you want to develop a userspace FS, or if you want to use
+          a filesystem based on FUSE, answer Y or M.
 menu "CD-ROM/DVD Filesystems"
 config ISO9660_FS
@@ -1703,6 +1716,17 @@ config AFS_FS
 config RXRPC
        tristate
+config 9P_FS
+        tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
+        depends on INET && EXPERIMENTAL
+        help
+          If you say Y here, you will get experimental support for
+          Plan 9 resource sharing via the 9P2000 protocol.
+          See <http://v9fs.sf.net> for more information.
+          If unsure, say N.
 endmenu
 menu "Partition Types"
diff --git a/fs/Makefile b/fs/Makefile
index 15158309dee4..1972da186272 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -89,11 +89,13 @@ obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
 obj-$(CONFIG_AUTOFS_FS)         += autofs/
 obj-$(CONFIG_AUTOFS4_FS)        += autofs4/
 obj-$(CONFIG_ADFS_FS)           += adfs/
+obj-$(CONFIG_FUSE_FS)           += fuse/
 obj-$(CONFIG_UDF_FS)            += udf/
 obj-$(CONFIG_RELAYFS_FS)        += relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)    += openpromfs/
 obj-$(CONFIG_JFS_FS)            += jfs/
 obj-$(CONFIG_XFS_FS)            += xfs/
+obj-$(CONFIG_9P_FS)             += 9p/
 obj-$(CONFIG_AFS_FS)            += afs/
 obj-$(CONFIG_BEFS_FS)           += befs/
 obj-$(CONFIG_HOSTFS)            += hostfs/
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 7aa6f2004536..9ebe881c6786 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -255,6 +255,7 @@ void
 affs_delete_inode(struct inode *inode)
 {
        pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+        truncate_inode_pages(&inode->i_data, 0);
        inode->i_size = 0;
        if (S_ISREG(inode->i_mode))
                affs_truncate(inode);
diff --git a/fs/aio.c b/fs/aio.c
index 4f641abac3c0..38f62680fd63 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
+#include <linux/rcuref.h>
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -499,7 +500,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
        /* Must be done under the lock to serialise against cancellation.
         * Call this aio_fput as it duplicates fput via the fput_work.
         */
-        if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
+        if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) {
                get_ioctx(ctx);
                spin_lock(&fput_lock);
                list_add(&req->ki_list, &fput_head);
@@ -546,6 +547,24 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id)
        return ioctx;
 }
+static int lock_kiocb_action(void *param)
+{
+        schedule();     /* action handed to wait_on_bit_lock() by lock_kiocb(): yield the CPU */
+        return 0;       /* param is unused; the result is always 0 */
+}
+static inline void lock_kiocb(struct kiocb *iocb)
+{
+        wait_on_bit_lock(&iocb->ki_flags, KIF_LOCKED, lock_kiocb_action, /* bit lock on KIF_LOCKED in ki_flags */
+                         TASK_UNINTERRUPTIBLE); /* waits in uninterruptible state; return value is ignored */
+}
+static inline void unlock_kiocb(struct kiocb *iocb)
+{
+        kiocbClearLocked(iocb); /* presumably clears KIF_LOCKED in ki_flags; helper not visible here, confirm */
+        wake_up_bit(&iocb->ki_flags, KIF_LOCKED); /* wake anyone waiting on that bit in lock_kiocb() */
+}
 /*
 * use_mm
 *      Makes the calling kernel thread take on the specified
@@ -786,7 +805,9 @@ static int __aio_run_iocbs(struct kioctx *ctx)
                 * Hold an extra reference while retrying i/o.
                 */
                iocb->ki_users++;       /* grab extra reference */
+                lock_kiocb(iocb);
                aio_run_iocb(iocb);
+                unlock_kiocb(iocb);
                if (__aio_put_req(ctx, iocb))  /* drop extra ref */
                        put_ioctx(ctx);
        }
@@ -1527,10 +1548,9 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                goto out_put_req;
        spin_lock_irq(&ctx->ctx_lock);
-        if (likely(list_empty(&ctx->run_list))) {
+        aio_run_iocb(req);
-                aio_run_iocb(req);
+        unlock_kiocb(req);
-        } else {
+        if (!list_empty(&ctx->run_list)) {
-                list_add_tail(&req->ki_run_list, &ctx->run_list);
                /* drain the run list */
                while (__aio_run_iocbs(ctx))
                        ;
@@ -1661,6 +1681,7 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
        if (NULL != cancel) {
                struct io_event tmp;
                pr_debug("calling cancel\n");
+                lock_kiocb(kiocb);
                memset(&tmp, 0, sizeof(tmp));
                tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
                tmp.data = kiocb->ki_user_data;
@@ -1672,8 +1693,9 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
                        if (copy_to_user(result, &tmp, sizeof(tmp)))
                                ret = -EFAULT;
                }
+                unlock_kiocb(kiocb);
        } else
-                printk(KERN_DEBUG "iocb has no cancel operation\n");
+                ret = -EINVAL;
        put_ioctx(ctx);
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 6171431272dc..990c28da5aec 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_sb_info {
        struct file *pipe;
        pid_t oz_pgrp;
        int catatonic;
+        struct super_block *sb;
        unsigned long exp_timeout;
        ino_t next_dir_ino;
        struct autofs_wait_queue *queues; /* Wait queue pointer */
@@ -134,7 +135,7 @@ void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
 void autofs_hash_delete(struct autofs_dir_ent *);
 struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
 void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_dirhash *);
+void autofs_hash_nuke(struct autofs_sb_info *);
 /* Expiration-handling functions */
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 448143fd0796..5ccfcf26310d 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -232,13 +232,13 @@ void autofs_hash_dputall(struct autofs_dirhash *dh)
 /* Delete everything.  This is used on filesystem destruction, so we
   make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_dirhash *dh)
+void autofs_hash_nuke(struct autofs_sb_info *sbi)
 {
        int i;
        struct autofs_dir_ent *ent, *nent;
        for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-                for ( ent = dh->h[i] ; ent ; ent = nent ) {
+                for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
                        nent = ent->next;
                        if ( ent->dentry )
                                dput(ent->dentry);
@@ -246,4 +246,5 @@ void autofs_hash_nuke(struct autofs_dirhash *dh)
                        kfree(ent);
                }
        }
+        shrink_dcache_sb(sbi->sb);
 }
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 4888c1fabbf7..65e5ed42190e 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -27,7 +27,7 @@ static void autofs_put_super(struct super_block *sb)
        if ( !sbi->catatonic )
                autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
-        autofs_hash_nuke(&sbi->dirhash);
+        autofs_hash_nuke(sbi);
        for ( n = 0 ; n < AUTOFS_MAX_SYMLINKS ; n++ ) {
                if ( test_bit(n, sbi->symlink_bitmap) )
                        kfree(sbi->symlink[n].data);
@@ -148,6 +148,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
        s->s_magic = AUTOFS_SUPER_MAGIC;
        s->s_op = &autofs_sops;
        s->s_time_gran = 1;
+        sbi->sb = s;
        root_inode = iget(s, AUTOFS_ROOT_INO);
        root = d_alloc_root(root_inode);
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 1020dbc88bec..1fbc53f14aba 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -20,7 +20,6 @@ struct bfs_sb_info {
        unsigned long si_lasti;
        unsigned long * si_imap;
        struct buffer_head * si_sbh;            /* buffer header w/superblock */
-        struct bfs_super_block * si_bfs_sb;     /* superblock in si_sbh->b_data */
 };
 /*
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 5a1e5ce057ff..e240c335eb23 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,6 +2,7 @@
 *      fs/bfs/dir.c
 *      BFS directory operations.
 *      Copyright (C) 1999,2000  Tigran Aivazian <tigran@veritas.com>
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
 */
 #include <linux/time.h>
@@ -20,9 +21,9 @@
 #define dprintf(x...)
 #endif
-static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int ino);
+static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino);
 static struct buffer_head * bfs_find_entry(struct inode * dir, 
-        const char * name, int namelen, struct bfs_dirent ** res_dir);
+        const unsigned char * name, int namelen, struct bfs_dirent ** res_dir);
 static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 {
@@ -53,7 +54,7 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
                        de = (struct bfs_dirent *)(bh->b_data + offset);
                        if (de->ino) {
                                int size = strnlen(de->name, BFS_NAMELEN);
-                                if (filldir(dirent, de->name, size, f->f_pos, de->ino, DT_UNKNOWN) < 0) {
+                                if (filldir(dirent, de->name, size, f->f_pos, le16_to_cpu(de->ino), DT_UNKNOWN) < 0) {
                                        brelse(bh);
                                        unlock_kernel();
                                        return 0;
@@ -107,7 +108,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
        inode->i_mapping->a_ops = &bfs_aops;
        inode->i_mode = mode;
        inode->i_ino = ino;
-        BFS_I(inode)->i_dsk_ino = ino;
+        BFS_I(inode)->i_dsk_ino = ino; /* in-core copy is CPU byte order, as stored by bfs_read_inode() */
        BFS_I(inode)->i_sblock = 0;
        BFS_I(inode)->i_eblock = 0;
        insert_inode_hash(inode);
@@ -139,7 +140,7 @@ static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, st
        lock_kernel();
        bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
        if (bh) {
-                unsigned long ino = le32_to_cpu(de->ino);
+                unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
                brelse(bh);
                inode = iget(dir->i_sb, ino);
                if (!inode) {
@@ -183,7 +184,7 @@ static int bfs_unlink(struct inode * dir, struct dentry * dentry)
        inode = dentry->d_inode;
        lock_kernel();
        bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
-        if (!bh || de->ino != inode->i_ino) 
+        if (!bh || le16_to_cpu(de->ino) != inode->i_ino)
                goto out_brelse;
        if (!inode->i_nlink) {
@@ -224,7 +225,7 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
                                old_dentry->d_name.name, 
                                old_dentry->d_name.len, &old_de);
-        if (!old_bh || old_de->ino != old_inode->i_ino)
+        if (!old_bh || le16_to_cpu(old_de->ino) != old_inode->i_ino)
                goto end_rename;
        error = -EPERM;
@@ -270,7 +271,7 @@ struct inode_operations bfs_dir_inops = {
        .rename                 = bfs_rename,
 };
-static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int ino)
+static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino)
 {
        struct buffer_head * bh;
        struct bfs_dirent * de;
@@ -304,7 +305,7 @@ static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int
                                }
                                dir->i_mtime = CURRENT_TIME_SEC;
                                mark_inode_dirty(dir);
-                                de->ino = ino;
+                                de->ino = cpu_to_le16((u16)ino);
                                for (i=0; i<BFS_NAMELEN; i++)
                                        de->name[i] = (i < namelen) ? name[i] : 0;
                                mark_buffer_dirty(bh);
@@ -317,7 +318,7 @@ static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int
        return -ENOSPC;
 }
-static inline int bfs_namecmp(int len, const char * name, const char * buffer)
+static inline int bfs_namecmp(int len, const unsigned char * name, const char * buffer)
 {
        if (len < BFS_NAMELEN && buffer[len])
                return 0;
@@ -325,7 +326,7 @@ static inline int bfs_namecmp(int len, const char * name, const char * buffer)
 }
 static struct buffer_head * bfs_find_entry(struct inode * dir, 
-        const char * name, int namelen, struct bfs_dirent ** res_dir)
+        const unsigned char * name, int namelen, struct bfs_dirent ** res_dir)
 {
        unsigned long block, offset;
        struct buffer_head * bh;
@@ -346,7 +347,7 @@ static struct buffer_head * bfs_find_entry(struct inode * dir,
                }
                de = (struct bfs_dirent *)(bh->b_data + offset);
                offset += BFS_DIRENT_SIZE;
-                if (de->ino && bfs_namecmp(namelen, name, de->name)) {
+                if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) {
                        *res_dir = de;
                        return bh;
                }
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 747fd1ea55e0..807723b65daf 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -40,8 +40,8 @@ static int bfs_move_block(unsigned long from, unsigned long to, struct super_blo
        return 0;
 }
-static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned long end, 
+static int bfs_move_blocks(struct super_block *sb, unsigned long start,
-                                unsigned long where)
+                           unsigned long end, unsigned long where)
 {
        unsigned long i;
@@ -57,20 +57,21 @@ static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned
 static int bfs_get_block(struct inode * inode, sector_t block, 
        struct buffer_head * bh_result, int create)
 {
-        long phys;
+        unsigned long phys;
        int err;
        struct super_block *sb = inode->i_sb;
        struct bfs_sb_info *info = BFS_SB(sb);
        struct bfs_inode_info *bi = BFS_I(inode);
        struct buffer_head *sbh = info->si_sbh;
-        if (block < 0 || block > info->si_blocks)
+        if (block > info->si_blocks)
                return -EIO;
        phys = bi->i_sblock + block;
        if (!create) {
                if (phys <= bi->i_eblock) {
-                        dprintf("c=%d, b=%08lx, phys=%08lx (granted)\n", create, block, phys);
+                        dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n",
+                                create, (unsigned long)block, phys);
                        map_bh(bh_result, sb, phys);
                }
                return 0;
@@ -80,7 +81,7 @@ static int bfs_get_block(struct inode * inode, sector_t block,
           of blocks allocated for this file, we can grant it */
        if (inode->i_size && phys <= bi->i_eblock) {
                dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", 
-                                create, block, phys);
+                                create, (unsigned long)block, phys);
                map_bh(bh_result, sb, phys);
                return 0;
        }
@@ -88,11 +89,12 @@ static int bfs_get_block(struct inode * inode, sector_t block,
        /* the rest has to be protected against itself */
        lock_kernel();
-        /* if the last data block for this file is the last allocated block, we can
+        /* if the last data block for this file is the last allocated
-           extend the file trivially, without moving it anywhere */
+           block, we can extend the file trivially, without moving it
+           anywhere */
        if (bi->i_eblock == info->si_lf_eblk) {
                dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", 
-                                create, block, phys);
+                                create, (unsigned long)block, phys);
                map_bh(bh_result, sb, phys);
                info->si_freeb -= phys - bi->i_eblock;
                info->si_lf_eblk = bi->i_eblock = phys;
@@ -114,7 +116,8 @@ static int bfs_get_block(struct inode * inode, sector_t block,
        } else
                err = 0;
-        dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", create, block, phys);
+        dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n",
+                create, (unsigned long)block, phys);
        bi->i_sblock = phys;
        phys += block;
        info->si_lf_eblk = bi->i_eblock = phys;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 64e0fb33fc0c..c7b39aa279d7 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -3,6 +3,8 @@
 *      BFS superblock and inode operations.
 *      Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
 *      From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
+ *
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
 */
 #include <linux/module.h>
@@ -54,46 +56,50 @@ static void bfs_read_inode(struct inode * inode)
        off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
        di = (struct bfs_inode *)bh->b_data + off;
-        inode->i_mode = 0x0000FFFF & di->i_mode;
+        inode->i_mode = 0x0000FFFF &  le32_to_cpu(di->i_mode);
-        if (di->i_vtype == BFS_VDIR) {
+        if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
                inode->i_mode |= S_IFDIR;
                inode->i_op = &bfs_dir_inops;
                inode->i_fop = &bfs_dir_operations;
-        } else if (di->i_vtype == BFS_VREG) {
+        } else if (le32_to_cpu(di->i_vtype) == BFS_VREG) {
                inode->i_mode |= S_IFREG;
                inode->i_op = &bfs_file_inops;
                inode->i_fop = &bfs_file_operations;
                inode->i_mapping->a_ops = &bfs_aops;
        }
-        inode->i_uid = di->i_uid;
+        BFS_I(inode)->i_sblock =  le32_to_cpu(di->i_sblock);
-        inode->i_gid = di->i_gid;
+        BFS_I(inode)->i_eblock =  le32_to_cpu(di->i_eblock);
-        inode->i_nlink = di->i_nlink;
+        inode->i_uid =  le32_to_cpu(di->i_uid);
+        inode->i_gid =  le32_to_cpu(di->i_gid);
+        inode->i_nlink =  le32_to_cpu(di->i_nlink);
        inode->i_size = BFS_FILESIZE(di);
        inode->i_blocks = BFS_FILEBLOCKS(di);
+        if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
        inode->i_blksize = PAGE_SIZE;
-        inode->i_atime.tv_sec = di->i_atime;
+        inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
-        inode->i_mtime.tv_sec = di->i_mtime;
+        inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
-        inode->i_ctime.tv_sec = di->i_ctime;
+        inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
        inode->i_atime.tv_nsec = 0;
        inode->i_mtime.tv_nsec = 0;
        inode->i_ctime.tv_nsec = 0;
-        BFS_I(inode)->i_dsk_ino = di->i_ino; /* can be 0 so we store a copy */
+        BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); /* can be 0 so we store a copy */
-        BFS_I(inode)->i_sblock = di->i_sblock;
-        BFS_I(inode)->i_eblock = di->i_eblock;
        brelse(bh);
 }
 static int bfs_write_inode(struct inode * inode, int unused)
 {
-        unsigned long ino = inode->i_ino;
+        unsigned int ino = (u16)inode->i_ino;
+        unsigned long i_sblock;
        struct bfs_inode * di;
        struct buffer_head * bh;
        int block, off;
+        dprintf("ino=%08x\n", ino);
        if (ino < BFS_ROOT_INO || ino > BFS_SB(inode->i_sb)->si_lasti) {
-                printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino);
+                printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino);
                return -EIO;
        }
@@ -101,7 +107,7 @@ static int bfs_write_inode(struct inode * inode, int unused)
        block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
        bh = sb_bread(inode->i_sb, block);
        if (!bh) {
-                printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino);
+                printf("Unable to read inode %s:%08x\n", inode->i_sb->s_id, ino);
                unlock_kernel();
                return -EIO;
        }
@@ -109,24 +115,26 @@ static int bfs_write_inode(struct inode * inode, int unused)
        off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
        di = (struct bfs_inode *)bh->b_data + off;
-        if (inode->i_ino == BFS_ROOT_INO)
+        if (ino == BFS_ROOT_INO)
-                di->i_vtype = BFS_VDIR;
+                di->i_vtype = cpu_to_le32(BFS_VDIR);
        else
-                di->i_vtype = BFS_VREG;
+                di->i_vtype = cpu_to_le32(BFS_VREG);
-        di->i_ino = inode->i_ino;
+        di->i_ino = cpu_to_le16(ino);
-        di->i_mode = inode->i_mode;
+        di->i_mode = cpu_to_le32(inode->i_mode);
-        di->i_uid = inode->i_uid;
+        di->i_uid = cpu_to_le32(inode->i_uid);
-        di->i_gid = inode->i_gid;
+        di->i_gid = cpu_to_le32(inode->i_gid);
-        di->i_nlink = inode->i_nlink;
+        di->i_nlink = cpu_to_le32(inode->i_nlink);
-        di->i_atime = inode->i_atime.tv_sec;
+        di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
-        di->i_mtime = inode->i_mtime.tv_sec;
+        di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
-        di->i_ctime = inode->i_ctime.tv_sec;
+        di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-        di->i_sblock = BFS_I(inode)->i_sblock;
+        i_sblock = BFS_I(inode)->i_sblock;
-        di->i_eblock = BFS_I(inode)->i_eblock;
+        di->i_sblock = cpu_to_le32(i_sblock);
-        di->i_eoffset = di->i_sblock * BFS_BSIZE + inode->i_size - 1;
+        di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
+        di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
        mark_buffer_dirty(bh);
+        dprintf("Written ino=%d into %d:%d\n",le16_to_cpu(di->i_ino),block,off);
        brelse(bh);
        unlock_kernel();
        return 0;
@@ -140,11 +148,14 @@ static void bfs_delete_inode(struct inode * inode)
        int block, off;
        struct super_block * s = inode->i_sb;
        struct bfs_sb_info * info = BFS_SB(s);
+        struct bfs_inode_info * bi = BFS_I(inode);
-        dprintf("ino=%08lx\n", inode->i_ino);
+        dprintf("ino=%08lx\n", ino);
-        if (inode->i_ino < BFS_ROOT_INO || inode->i_ino > info->si_lasti) {
+        truncate_inode_pages(&inode->i_data, 0);
-                printf("invalid ino=%08lx\n", inode->i_ino);
+        if (ino < BFS_ROOT_INO || ino > info->si_lasti) {
+                printf("invalid ino=%08lx\n", ino);
                return;
        }
        
@@ -160,13 +171,13 @@ static void bfs_delete_inode(struct inode * inode)
                return;
        }
        off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
-        di = (struct bfs_inode *)bh->b_data + off;
+        di = (struct bfs_inode *) bh->b_data + off;
-        if (di->i_ino) {
+        if (bi->i_dsk_ino) {
-                info->si_freeb += BFS_FILEBLOCKS(di);
+                /* i_sblock == 0: the file never got data blocks, nothing to give back */
+                if (bi->i_sblock)
+                        info->si_freeb += 1 + bi->i_eblock - bi->i_sblock;
                info->si_freei++;
-                clear_bit(di->i_ino, info->si_imap);
+                clear_bit(ino, info->si_imap);
                dump_imap("delete_inode", s);
-        }
+        }
        di->i_ino = 0;
        di->i_sblock = 0;
        mark_buffer_dirty(bh);
@@ -272,14 +283,14 @@ static struct super_operations bfs_sops = {
 void dump_imap(const char *prefix, struct super_block * s)
 {
-#if 0
+#ifdef DEBUG
        int i;
        char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL);
        if (!tmpbuf)
                return;
        for (i=BFS_SB(s)->si_lasti; i>=0; i--) {
-                if (i>PAGE_SIZE-100) break;
+                if (i > PAGE_SIZE-100) break;
                if (test_bit(i, BFS_SB(s)->si_imap))
                        strcat(tmpbuf, "1");
                else
@@ -295,7 +306,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
        struct buffer_head * bh;
        struct bfs_super_block * bfs_sb;
        struct inode * inode;
-        int i, imap_len;
+        unsigned i, imap_len;
        struct bfs_sb_info * info;
        info = kmalloc(sizeof(*info), GFP_KERNEL);
@@ -310,19 +321,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
        if(!bh)
                goto out;
        bfs_sb = (struct bfs_super_block *)bh->b_data;
-        if (bfs_sb->s_magic != BFS_MAGIC) {
+        if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
                if (!silent)
                        printf("No BFS filesystem on %s (magic=%08x)\n", 
-                                s->s_id, bfs_sb->s_magic);
+                                s->s_id,  le32_to_cpu(bfs_sb->s_magic));
                goto out;
        }
        if (BFS_UNCLEAN(bfs_sb, s) && !silent)
                printf("%s is unclean, continuing\n", s->s_id);
        s->s_magic = BFS_MAGIC;
-        info->si_bfs_sb = bfs_sb;
        info->si_sbh = bh;
-        info->si_lasti = (bfs_sb->s_start - BFS_BSIZE)/sizeof(struct bfs_inode) 
+        info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE)/sizeof(struct bfs_inode)
                        + BFS_ROOT_INO - 1;
        imap_len = info->si_lasti/8 + 1;
@@ -346,8 +356,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
                goto out;
        }
-        info->si_blocks = (bfs_sb->s_end + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
+        info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
-        info->si_freeb = (bfs_sb->s_end + 1 - bfs_sb->s_start)>>BFS_BSIZE_BITS;
+        info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start))>>BFS_BSIZE_BITS;
        info->si_freei = 0;
        info->si_lf_eblk = 0;
        info->si_lf_sblk = 0;
diff --git a/fs/bio.c b/fs/bio.c
index a7d4fd3a3299..83a349574567 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -683,7 +683,7 @@ struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
 {
        struct sg_iovec iov;
-        iov.iov_base = (__user void *)uaddr;
+        iov.iov_base = (void __user *)uaddr;
        iov.iov_len = len;
        return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
diff --git a/fs/compat.c b/fs/compat.c
index 8c665705c6a0..ac3fb9ed8eea 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1619,6 +1619,7 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
        char *bits;
        long timeout;
        int size, max_fdset, ret = -EINVAL;
+        struct fdtable *fdt;
        timeout = MAX_SCHEDULE_TIMEOUT;
        if (tvp) {
@@ -1644,7 +1645,10 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
                goto out_nofds;
        /* max_fdset can increase, so grab it once to avoid race */
-        max_fdset = current->files->max_fdset;
+        rcu_read_lock();
+        fdt = files_fdtable(current->files);
+        max_fdset = fdt->max_fdset;
+        rcu_read_unlock();
        if (n > max_fdset)
                n = max_fdset;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 155e612635f1..e28a74203f3b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -798,13 +798,16 @@ static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
                r = (void *) &r4;
        }
-        if (ret)
+        if (ret) {
-                return -EFAULT;
+                ret = -EFAULT;
+                goto out;
+        }
        set_fs (KERNEL_DS);
        ret = sys_ioctl (fd, cmd, (unsigned long) r);
        set_fs (old_fs);
+out:
        if (mysock)
                sockfd_put(mysock);
diff --git a/fs/exec.c b/fs/exec.c
index 222ab1c572d8..14dd03907ccb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -798,6 +798,7 @@ no_thread_group:
 static inline void flush_old_files(struct files_struct * files)
 {
        long j = -1;
+        struct fdtable *fdt;
        spin_lock(&files->file_lock);
        for (;;) {
@@ -805,12 +806,13 @@ static inline void flush_old_files(struct files_struct * files)
                j++;
                i = j * __NFDBITS;
-                if (i >= files->max_fds || i >= files->max_fdset)
+                fdt = files_fdtable(files);
+                if (i >= fdt->max_fds || i >= fdt->max_fdset)
                        break;
-                set = files->close_on_exec->fds_bits[j];
+                set = fdt->close_on_exec->fds_bits[j];
                if (!set)
                        continue;
-                files->close_on_exec->fds_bits[j] = 0;
+                fdt->close_on_exec->fds_bits[j] = 0;
                spin_unlock(&files->file_lock);
                for ( ; set ; i++,set >>= 1) {
                        if (set & 1) {
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 161f156d98c8..c8d07030c897 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -615,6 +615,11 @@ got:
                DQUOT_DROP(inode);
                goto fail2;
        }
+        err = ext2_init_security(inode,dir);
+        if (err) {
+                DQUOT_FREE_INODE(inode);
+                goto fail2;
+        }
        mark_inode_dirty(inode);
        ext2_debug("allocating inode %lu\n", inode->i_ino);
        ext2_preread_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 53dceb0c6593..fdba4d1d3c60 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -71,6 +71,8 @@ void ext2_put_inode(struct inode *inode)
 */
 void ext2_delete_inode (struct inode * inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        if (is_bad_inode(inode))
                goto no_delete;
        EXT2_I(inode)->i_dtime  = get_seconds();
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 5f3bfde3b810..67cfeb66e897 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,3 +116,11 @@ exit_ext2_xattr(void)
 # endif  /* CONFIG_EXT2_FS_XATTR */
+#ifdef CONFIG_EXT2_FS_SECURITY
+extern int ext2_init_security(struct inode *inode, struct inode *dir);
+#else /* !CONFIG_EXT2_FS_SECURITY: inline stub that does nothing and returns 0 */
+static inline int ext2_init_security(struct inode *inode, struct inode *dir)
+{
+        return 0;
+}
+#endif /* CONFIG_EXT2_FS_SECURITY */
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 6a6c59fbe599..a26612798471 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/smp_lock.h>
 #include <linux/ext2_fs.h>
+#include <linux/security.h>
 #include "xattr.h"
 static size_t
@@ -45,6 +46,27 @@ ext2_xattr_security_set(struct inode *inode, const char *name,
                              value, size, flags);
 }
+/*
+ * ext2_init_security - store the initial security xattr of a new inode
+ * @inode: inode being created
+ * @dir:   directory the inode is created in
+ *
+ * Asks security_inode_init_security() for an attribute name/value pair
+ * and writes it into the EXT2_XATTR_INDEX_SECURITY namespace.
+ *
+ * Returns 0 on success or when the hook answers -EOPNOTSUPP, otherwise
+ * the error from the hook or from ext2_xattr_set().
+ */
+int
+ext2_init_security(struct inode *inode, struct inode *dir)
+{
+        char *xattr_name;
+        void *xattr_value;
+        size_t xattr_len;
+        int rc;
+        rc = security_inode_init_security(inode, dir, &xattr_name,
+                                          &xattr_value, &xattr_len);
+        /* -EOPNOTSUPP is deliberately treated as "nothing to store" */
+        if (rc == -EOPNOTSUPP)
+                return 0;
+        if (rc)
+                return rc;
+        rc = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
+                            xattr_name, xattr_value, xattr_len, 0);
+        /* both buffers are released whether or not the xattr was stored */
+        kfree(xattr_name);
+        kfree(xattr_value);
+        return rc;
+}
 struct xattr_handler ext2_xattr_security_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .list   = ext2_xattr_security_list,
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 6981bd014ede..96552769d039 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -607,6 +607,11 @@ got:
                DQUOT_DROP(inode);
                goto fail2;
        }
+        err = ext3_init_security(handle,inode, dir);
+        if (err) {
+                DQUOT_FREE_INODE(inode);
+                goto fail2;
+        }
        err = ext3_mark_inode_dirty(handle, inode);
        if (err) {
                ext3_std_error(sb, err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 9989fdcf4d5a..b5177c90d6f1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -187,6 +187,8 @@ void ext3_delete_inode (struct inode * inode)
 {
        handle_t *handle;
+        truncate_inode_pages(&inode->i_data, 0);
        if (is_bad_inode(inode))
                goto no_delete;
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index eb31a69e82dc..2ceae38f3d49 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -133,3 +133,14 @@ exit_ext3_xattr(void)
 #define ext3_xattr_handlers     NULL
 # endif  /* CONFIG_EXT3_FS_XATTR */
+#ifdef CONFIG_EXT3_FS_SECURITY
+extern int ext3_init_security(handle_t *handle, struct inode *inode,
+                                struct inode *dir);
+#else /* !CONFIG_EXT3_FS_SECURITY: inline stub that does nothing and returns 0 */
+static inline int ext3_init_security(handle_t *handle, struct inode *inode,
+                                struct inode *dir)
+{
+        return 0;
+}
+#endif /* CONFIG_EXT3_FS_SECURITY */
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index ddc1c41750e1..b9c40c15647b 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -9,6 +9,7 @@
 #include <linux/smp_lock.h>
 #include <linux/ext3_jbd.h>
 #include <linux/ext3_fs.h>
+#include <linux/security.h>
 #include "xattr.h"
 static size_t
@@ -47,6 +48,27 @@ ext3_xattr_security_set(struct inode *inode, const char *name,
                              value, size, flags);
 }
+/*
+ * ext3_init_security - store the initial security xattr of a new inode
+ * @handle: journal handle passed through to ext3_xattr_set_handle()
+ * @inode:  inode being created
+ * @dir:    directory the inode is created in
+ *
+ * Asks security_inode_init_security() for an attribute name/value pair
+ * and writes it into the EXT3_XATTR_INDEX_SECURITY namespace.
+ *
+ * Returns 0 on success or when the hook answers -EOPNOTSUPP, otherwise
+ * the error from the hook or from ext3_xattr_set_handle().
+ */
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+        char *xattr_name;
+        void *xattr_value;
+        size_t xattr_len;
+        int rc;
+        rc = security_inode_init_security(inode, dir, &xattr_name,
+                                          &xattr_value, &xattr_len);
+        /* -EOPNOTSUPP is deliberately treated as "nothing to store" */
+        if (rc == -EOPNOTSUPP)
+                return 0;
+        if (rc)
+                return rc;
+        rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
+                                   xattr_name, xattr_value, xattr_len, 0);
+        /* both buffers are released whether or not the xattr was stored */
+        kfree(xattr_name);
+        kfree(xattr_value);
+        return rc;
+}
 struct xattr_handler ext3_xattr_security_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .list   = ext3_xattr_security_list,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 96ae85b67eba..a7cbe68e2259 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -335,6 +335,8 @@ EXPORT_SYMBOL(fat_build_inode);
 static void fat_delete_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        if (!is_bad_inode(inode)) {
                inode->i_size = 0;
                fat_truncate(inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6fbc9d8fcc36..863b46e0d78a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
+#include <linux/rcupdate.h>
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -24,21 +25,25 @@
 void fastcall set_close_on_exec(unsigned int fd, int flag)
 {
        struct files_struct *files = current->files;
+        struct fdtable *fdt;
        spin_lock(&files->file_lock);
+        fdt = files_fdtable(files);
        if (flag)
-                FD_SET(fd, files->close_on_exec);
+                FD_SET(fd, fdt->close_on_exec);
        else
-                FD_CLR(fd, files->close_on_exec);
+                FD_CLR(fd, fdt->close_on_exec);
        spin_unlock(&files->file_lock);
 }
 static inline int get_close_on_exec(unsigned int fd)
 {
        struct files_struct *files = current->files;
+        struct fdtable *fdt;
        int res;
-        spin_lock(&files->file_lock);
+        rcu_read_lock();
-        res = FD_ISSET(fd, files->close_on_exec);
+        fdt = files_fdtable(files);
-        spin_unlock(&files->file_lock);
+        res = FD_ISSET(fd, fdt->close_on_exec);
+        rcu_read_unlock();
        return res;
 }
@@ -54,24 +59,26 @@ static int locate_fd(struct files_struct *files,
        unsigned int newfd;
        unsigned int start;
        int error;
+        struct fdtable *fdt;
        error = -EINVAL;
        if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
                goto out;
 repeat:
+        fdt = files_fdtable(files);
        /*
         * Someone might have closed fd's in the range
-         * orig_start..files->next_fd
+         * orig_start..fdt->next_fd
         */
        start = orig_start;
-        if (start < files->next_fd)
+        if (start < fdt->next_fd)
-                start = files->next_fd;
+                start = fdt->next_fd;
        newfd = start;
-        if (start < files->max_fdset) {
+        if (start < fdt->max_fdset) {
-                newfd = find_next_zero_bit(files->open_fds->fds_bits,
+                newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
-                        files->max_fdset, start);
+                        fdt->max_fdset, start);
        }
        
        error = -EMFILE;
@@ -89,9 +96,15 @@ repeat:
        if (error)
                goto repeat;
-        if (start <= files->next_fd)
+        /*
-                files->next_fd = newfd + 1;
+         * We reacquired file_lock, so we are safe as long as
-        
+         * we reacquire the fdtable pointer and use it while holding
+         * the lock, no one can free it during that time.
+         */
+        fdt = files_fdtable(files);
+        if (start <= fdt->next_fd)
+                fdt->next_fd = newfd + 1;
        error = newfd;
        
 out:
@@ -101,13 +114,16 @@ out:
 static int dupfd(struct file *file, unsigned int start)
 {
        struct files_struct * files = current->files;
+        struct fdtable *fdt;
        int fd;
        spin_lock(&files->file_lock);
        fd = locate_fd(files, file, start);
        if (fd >= 0) {
-                FD_SET(fd, files->open_fds);
+                /* locate_fd() may have expanded fdtable, load the ptr */
-                FD_CLR(fd, files->close_on_exec);
+                fdt = files_fdtable(files);
+                FD_SET(fd, fdt->open_fds);
+                FD_CLR(fd, fdt->close_on_exec);
                spin_unlock(&files->file_lock);
                fd_install(fd, file);
        } else {
@@ -123,6 +139,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
        int err = -EBADF;
        struct file * file, *tofree;
        struct files_struct * files = current->files;
+        struct fdtable *fdt;
        spin_lock(&files->file_lock);
        if (!(file = fcheck(oldfd)))
@@ -148,13 +165,14 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
        /* Yes. It's a race. In user space. Nothing sane to do */
        err = -EBUSY;
-        tofree = files->fd[newfd];
+        fdt = files_fdtable(files);
-        if (!tofree && FD_ISSET(newfd, files->open_fds))
+        tofree = fdt->fd[newfd];
+        if (!tofree && FD_ISSET(newfd, fdt->open_fds))
                goto out_fput;
-        files->fd[newfd] = file;
+        rcu_assign_pointer(fdt->fd[newfd], file);
-        FD_SET(newfd, files->open_fds);
+        FD_SET(newfd, fdt->open_fds);
-        FD_CLR(newfd, files->close_on_exec);
+        FD_CLR(newfd, fdt->close_on_exec);
        spin_unlock(&files->file_lock);
        if (tofree)
diff --git a/fs/file.c b/fs/file.c
index 92b5f25985d2..2127a7b9dc3a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -13,6 +13,25 @@
 #include <linux/vmalloc.h>
 #include <linux/file.h>
 #include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+struct fdtable_defer {
+        spinlock_t lock;
+        struct work_struct wq;
+        struct timer_list timer;
+        struct fdtable *next;
+};
+/*
+ * We use this list to defer freeing of fdtables that have vmalloced
+ * sets/arrays. By keeping a per-cpu list, we avoid having to embed
+ * the work_struct in fdtable itself, which avoids a 64 byte (i386) increase
+ * in this per-task structure.
+ */
+static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
 /*
@@ -48,82 +67,143 @@ void free_fd_array(struct file **array, int num)
                vfree(array);
 }
-/*
+static void __free_fdtable(struct fdtable *fdt)
- * Expand the fd array in the files_struct.  Called with the files
+{
- * spinlock held for write.
+        int fdset_size, fdarray_size;
- */
-static int expand_fd_array(struct files_struct *files, int nr)
+        fdset_size = fdt->max_fdset / 8;
-        __releases(files->file_lock)
+        fdarray_size = fdt->max_fds * sizeof(struct file *);
-        __acquires(files->file_lock)
+        free_fdset(fdt->open_fds, fdset_size);
+        free_fdset(fdt->close_on_exec, fdset_size);
+        free_fd_array(fdt->fd, fdarray_size);
+        kfree(fdt);
+}
+static void fdtable_timer(unsigned long data)
 {
-        struct file **new_fds;
+        struct fdtable_defer *fddef = (struct fdtable_defer *)data;
-        int error, nfds;
-        
+        spin_lock(&fddef->lock);
-        error = -EMFILE;
+        /*
-        if (files->max_fds >= NR_OPEN || nr >= NR_OPEN)
+         * If someone already emptied the queue, there is nothing to retry.
+         */
+        if (!fddef->next)
                 goto out;
+        if (!schedule_work(&fddef->wq))
+                mod_timer(&fddef->timer, jiffies + 5);  /* expiry is absolute */
+out:
+        spin_unlock(&fddef->lock);
+}
-        nfds = files->max_fds;
+static void free_fdtable_work(struct fdtable_defer *f)
-        spin_unlock(&files->file_lock);
+{
+        struct fdtable *fdt;
-        /* 
+        spin_lock_bh(&f->lock);
-         * Expand to the max in easy steps, and keep expanding it until
+        fdt = f->next;
-         * we have enough for the requested fd array size. 
+        f->next = NULL;
-         */
+        spin_unlock_bh(&f->lock);
+        while(fdt) {
+                struct fdtable *next = fdt->next;
+                __free_fdtable(fdt);
+                fdt = next;
+        }
+}
-        do {
+static void free_fdtable_rcu(struct rcu_head *rcu)
-#if NR_OPEN_DEFAULT < 256
+{
-                if (nfds < 256)
+        struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
-                        nfds = 256;
+        int fdset_size, fdarray_size;
-                else 
+        struct fdtable_defer *fddef;
-#endif
-                if (nfds < (PAGE_SIZE / sizeof(struct file *)))
-                        nfds = PAGE_SIZE / sizeof(struct file *);
-                else {
-                        nfds = nfds * 2;
-                        if (nfds > NR_OPEN)
-                                nfds = NR_OPEN;
-                }
-        } while (nfds <= nr);
-        error = -ENOMEM;
+        BUG_ON(!fdt);
-        new_fds = alloc_fd_array(nfds);
+        fdset_size = fdt->max_fdset / 8;
-        spin_lock(&files->file_lock);
+        fdarray_size = fdt->max_fds * sizeof(struct file *);
-        if (!new_fds)
-                goto out;
-        /* Copy the existing array and install the new pointer */
+        if (fdt->free_files) {
+                /*
-        if (nfds > files->max_fds) {
+                 * This fdtable was embedded in the files structure
-                struct file **old_fds;
+                 * and the files structure itself was getting destroyed.
-                int i;
+                 * It is now safe to free the files structure.
-                
+                 */
-                old_fds = xchg(&files->fd, new_fds);
+                kmem_cache_free(files_cachep, fdt->free_files);
-                i = xchg(&files->max_fds, nfds);
+                return;
+        }
-                /* Don't copy/clear the array if we are creating a new
+        if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
-                   fd array for fork() */
+                /*
-                if (i) {
+                 * The fdtable was embedded
-                        memcpy(new_fds, old_fds, i * sizeof(struct file *));
+                 */
-                        /* clear the remainder of the array */
+                return;
-                        memset(&new_fds[i], 0,
+        }
-                               (nfds-i) * sizeof(struct file *)); 
+        if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
+                kfree(fdt->open_fds);
-                        spin_unlock(&files->file_lock);
+                kfree(fdt->close_on_exec);
-                        free_fd_array(old_fds, i);
+                kfree(fdt->fd);
-                        spin_lock(&files->file_lock);
+                kfree(fdt);
-                }
         } else {
-                /* Somebody expanded the array while we slept ... */
+                fddef = &get_cpu_var(fdtable_defer_list);
-                spin_unlock(&files->file_lock);
+                spin_lock(&fddef->lock);
-                free_fd_array(new_fds, nfds);
+                fdt->next = fddef->next;
-                spin_lock(&files->file_lock);
+                fddef->next = fdt;
+                /*
+                 * vmallocs are handled from the workqueue context.
+                 * If the per-cpu workqueue is running, then we
+                 * defer work scheduling through a timer.
+                 */
+                if (!schedule_work(&fddef->wq))
+                        mod_timer(&fddef->timer, jiffies + 5);  /* expiry is absolute */
+                spin_unlock(&fddef->lock);
+                put_cpu_var(fdtable_defer_list);
         }
-        error = 0;
+}
-out:
-        return error;
+void free_fdtable(struct fdtable *fdt)
+{
+        if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
+                                        fdt->max_fds > NR_OPEN_DEFAULT)
+                call_rcu(&fdt->rcu, free_fdtable_rcu);
+}
+/*
+ * Copy the fd sets and the fd array of an existing fdtable into a newly
+ * allocated, larger one.  Called with the files spinlock held.
+ */
+static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
+{
+        int i;
+        int count;
+        BUG_ON(nfdt->max_fdset < fdt->max_fdset);
+        BUG_ON(nfdt->max_fds < fdt->max_fds);
+        /* Copy the existing tables and install the new pointers */
+        i = fdt->max_fdset / (sizeof(unsigned long) * 8);
+        count = (nfdt->max_fdset - fdt->max_fdset) / 8;
+        /*
+         * Don't copy the entire array if the current fdset is
+         * not yet initialised.
+         */
+        if (i) {
+                memcpy (nfdt->open_fds, fdt->open_fds,
+                                                fdt->max_fdset/8);
+                memcpy (nfdt->close_on_exec, fdt->close_on_exec,
+                                                fdt->max_fdset/8);
+                memset (&nfdt->open_fds->fds_bits[i], 0, count);
+                memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
+        }
+        /* Don't copy/clear the array if we are creating a new
+           fd array for fork() */
+        if (fdt->max_fds) {
+                memcpy(nfdt->fd, fdt->fd,
+                        fdt->max_fds * sizeof(struct file *));
+                /* clear the remainder of the array */
+                memset(&nfdt->fd[fdt->max_fds], 0,
+                       (nfdt->max_fds - fdt->max_fds) *
+                                        sizeof(struct file *));
+        }
+        nfdt->next_fd = fdt->next_fd;
 }
 /*
@@ -154,26 +234,21 @@ void free_fdset(fd_set *array, int num)
                vfree(array);
 }
-/*
+static struct fdtable *alloc_fdtable(int nr)
- * Expand the fdset in the files_struct.  Called with the files spinlock
- * held for write.
- */
-static int expand_fdset(struct files_struct *files, int nr)
-        __releases(file->file_lock)
-        __acquires(file->file_lock)
 {
-        fd_set *new_openset = NULL, *new_execset = NULL;
+        struct fdtable *fdt = NULL;
-        int error, nfds = 0;
+        int nfds = 0;
+        fd_set *new_openset = NULL, *new_execset = NULL;
-        error = -EMFILE;
+        struct file **new_fds;
-        if (files->max_fdset >= NR_OPEN || nr >= NR_OPEN)
-                goto out;
-        nfds = files->max_fdset;
+        fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
-        spin_unlock(&files->file_lock);
+        if (!fdt)
+                goto out;
+        memset(fdt, 0, sizeof(*fdt));
-        /* Expand to the max in easy steps */
+        nfds = __FD_SETSIZE;
-        do {
+        /* Expand to the max in easy steps */
+        do {
                if (nfds < (PAGE_SIZE * 8))
                        nfds = PAGE_SIZE * 8;
                else {
@@ -183,49 +258,88 @@ static int expand_fdset(struct files_struct *files, int nr)
                }
        } while (nfds <= nr);
-        error = -ENOMEM;
+        new_openset = alloc_fdset(nfds);
-        new_openset = alloc_fdset(nfds);
+        new_execset = alloc_fdset(nfds);
-        new_execset = alloc_fdset(nfds);
+        if (!new_openset || !new_execset)
-        spin_lock(&files->file_lock);
+                goto out;
-        if (!new_openset || !new_execset)
+        fdt->open_fds = new_openset;
+        fdt->close_on_exec = new_execset;
+        fdt->max_fdset = nfds;
+        nfds = NR_OPEN_DEFAULT;
+        /*
+         * Expand to the max in easy steps, and keep expanding it until
+         * we have enough for the requested fd array size.
+         */
+        do {
+#if NR_OPEN_DEFAULT < 256
+                if (nfds < 256)
+                        nfds = 256;
+                else
+#endif
+                if (nfds < (PAGE_SIZE / sizeof(struct file *)))
+                        nfds = PAGE_SIZE / sizeof(struct file *);
+                else {
+                        nfds = nfds * 2;
+                        if (nfds > NR_OPEN)
+                                nfds = NR_OPEN;
+                }
+        } while (nfds <= nr);
+        new_fds = alloc_fd_array(nfds);
+        if (!new_fds)
+                goto out;
+        fdt->fd = new_fds;
+        fdt->max_fds = nfds;
+        fdt->free_files = NULL;
+        return fdt;
+out:
+        if (new_openset)
+                free_fdset(new_openset, nfds);
+        if (new_execset)
+                free_fdset(new_execset, nfds);
+        kfree(fdt);
+        return NULL;
+}
+/*
+ * Expands the file descriptor table - it will allocate a new fdtable and
+ * both fd array and fdset. It is expected to be called with the
+ * files_lock held.
+ */
+static int expand_fdtable(struct files_struct *files, int nr)
+        __releases(files->file_lock)
+        __acquires(files->file_lock)
+{
+        int error = 0;
+        struct fdtable *fdt;
+        struct fdtable *nfdt = NULL;
+        spin_unlock(&files->file_lock);
+        nfdt = alloc_fdtable(nr);
+        if (!nfdt) {
+                error = -ENOMEM;
+                spin_lock(&files->file_lock);
                goto out;
+        }
-        error = 0;
+        spin_lock(&files->file_lock);
-        
+        fdt = files_fdtable(files);
-        /* Copy the existing tables and install the new pointers */
+        /*
-        if (nfds > files->max_fdset) {
+         * Check again since another task may have expanded the
-                int i = files->max_fdset / (sizeof(unsigned long) * 8);
+         * fd table while we dropped the lock
-                int count = (nfds - files->max_fdset) / 8;
+         */
-                
+        if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
-                /* 
+                copy_fdtable(nfdt, fdt);
-                 * Don't copy the entire array if the current fdset is
+        } else {
-                 * not yet initialised.  
+                /* Somebody expanded while we dropped file_lock */
-                 */
-                if (i) {
-                        memcpy (new_openset, files->open_fds, files->max_fdset/8);
-                        memcpy (new_execset, files->close_on_exec, files->max_fdset/8);
-                        memset (&new_openset->fds_bits[i], 0, count);
-                        memset (&new_execset->fds_bits[i], 0, count);
-                }
-                
-                nfds = xchg(&files->max_fdset, nfds);
-                new_openset = xchg(&files->open_fds, new_openset);
-                new_execset = xchg(&files->close_on_exec, new_execset);
                spin_unlock(&files->file_lock);
-                free_fdset (new_openset, nfds);
+                __free_fdtable(nfdt);
-                free_fdset (new_execset, nfds);
                spin_lock(&files->file_lock);
-                return 0;
+                goto out;
-        } 
+        }
-        /* Somebody expanded the array while we slept ... */
+        rcu_assign_pointer(files->fdt, nfdt);
+        free_fdtable(fdt);
 out:
-        spin_unlock(&files->file_lock);
-        if (new_openset)
-                free_fdset(new_openset, nfds);
-        if (new_execset)
-                free_fdset(new_execset, nfds);
-        spin_lock(&files->file_lock);
        return error;
 }
@@ -237,18 +351,39 @@ out:
 int expand_files(struct files_struct *files, int nr)
 {
        int err, expand = 0;
+        struct fdtable *fdt;
-        if (nr >= files->max_fdset) {
+        fdt = files_fdtable(files);
-                expand = 1;
+        if (nr >= fdt->max_fdset || nr >= fdt->max_fds) {
-                if ((err = expand_fdset(files, nr)))
+                if (fdt->max_fdset >= NR_OPEN ||
+                        fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) {
+                        err = -EMFILE;
                        goto out;
-        }
+                }
-        if (nr >= files->max_fds) {
                expand = 1;
-                if ((err = expand_fd_array(files, nr)))
+                if ((err = expand_fdtable(files, nr)))
                        goto out;
        }
        err = expand;
 out:
        return err;
 }
+static void __devinit fdtable_defer_list_init(int cpu)
+{
+        struct fdtable_defer *defer = &per_cpu(fdtable_defer_list, cpu);
+        defer->next = NULL;             /* deferred-free list starts empty */
+        spin_lock_init(&defer->lock);
+        init_timer(&defer->timer);
+        defer->timer.function = fdtable_timer;
+        defer->timer.data = (unsigned long)defer;
+        INIT_WORK(&defer->wq, (void (*)(void *))free_fdtable_work, defer);
+}
+void __init files_defer_init(void)
+{
+        int cpu = 0;
+        /* Runs too early for for_each_cpu, so walk all NR_CPUS slots */
+        while (cpu < NR_CPUS)
+                fdtable_defer_list_init(cpu++);
+}
diff --git a/fs/file_table.c b/fs/file_table.c
index 43e9e1737de2..86ec8ae985b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/security.h>
 #include <linux/eventpoll.h>
+#include <linux/rcupdate.h>
 #include <linux/mount.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
@@ -53,11 +54,17 @@ void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
        spin_unlock_irqrestore(&filp_count_lock, flags);
 }
-static inline void file_free(struct file *f)
+static inline void file_free_rcu(struct rcu_head *head)
 {
+        struct file *f =  container_of(head, struct file, f_rcuhead);
        kmem_cache_free(filp_cachep, f);
 }
+static inline void file_free(struct file *f)
+{
+        call_rcu(&f->f_rcuhead, file_free_rcu);
+}
 /* Find an unused file structure and return a pointer to it.
 * Returns NULL, if there are no more free file structures or
 * we run out of memory.
@@ -110,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp);
 void fastcall fput(struct file *file)
 {
-        if (atomic_dec_and_test(&file->f_count))
+        if (rcuref_dec_and_test(&file->f_count))
                __fput(file);
 }
@@ -156,11 +163,17 @@ struct file fastcall *fget(unsigned int fd)
        struct file *file;
        struct files_struct *files = current->files;
-        spin_lock(&files->file_lock);
+        rcu_read_lock();
        file = fcheck_files(files, fd);
-        if (file)
+        if (file) {
-                get_file(file);
+                if (!rcuref_inc_lf(&file->f_count)) {
-        spin_unlock(&files->file_lock);
+                        /* File object ref couldn't be taken */
+                        rcu_read_unlock();
+                        return NULL;
+                }
+        }
+        rcu_read_unlock();
        return file;
 }
@@ -182,21 +195,25 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
        if (likely((atomic_read(&files->count) == 1))) {
                file = fcheck_files(files, fd);
        } else {
-                spin_lock(&files->file_lock);
+                rcu_read_lock();
                file = fcheck_files(files, fd);
                if (file) {
-                        get_file(file);
+                        if (rcuref_inc_lf(&file->f_count))
-                        *fput_needed = 1;
+                                *fput_needed = 1;
+                        else
+                                /* Didn't get the reference, someone's freed */
+                                file = NULL;
                }
-                spin_unlock(&files->file_lock);
+                rcu_read_unlock();
        }
        return file;
 }
 void put_filp(struct file *file)
 {
-        if (atomic_dec_and_test(&file->f_count)) {
+        if (rcuref_dec_and_test(&file->f_count)) {
                security_file_free(file);
                file_kill(file);
                file_free(file);
@@ -257,4 +274,5 @@ void __init files_init(unsigned long mempages)
        files_stat.max_files = n; 
        if (files_stat.max_files < NR_FILE)
                files_stat.max_files = NR_FILE;
+        files_defer_init();
 } 
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
new file mode 100644
index 000000000000..c3e1f760cac9
--- /dev/null
+++ b/fs/fuse/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the FUSE filesystem.
+#
+obj-$(CONFIG_FUSE_FS) += fuse.o
+fuse-objs := dev.o dir.o file.o inode.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
new file mode 100644
index 000000000000..d4c869c6d01b
--- /dev/null
+++ b/fs/fuse/dev.c
@@ -0,0 +1,877 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+#include "fuse_i.h"
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/uio.h>
+#include <linux/miscdevice.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+static kmem_cache_t *fuse_req_cachep;
+static inline struct fuse_conn *fuse_get_conn(struct file *file)
+{
+        struct fuse_conn *conn;
+        spin_lock(&fuse_lock);
+        conn = file->private_data;
+        /* A connection that is not mounted is reported as absent */
+        conn = (conn && conn->mounted) ? conn : NULL;
+        spin_unlock(&fuse_lock);
+        return conn;
+}
+static inline void fuse_request_init(struct fuse_req *req)
+{
+        memset(req, 0, sizeof(struct fuse_req));        /* start all-zero */
+        atomic_set(&req->count, 1);                     /* one reference */
+        init_waitqueue_head(&req->waitq);
+        INIT_LIST_HEAD(&req->list);
+}
+struct fuse_req *fuse_request_alloc(void)
+{
+        struct fuse_req *obj = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
+        if (obj != NULL)        /* only initialise a successful allocation */
+                fuse_request_init(obj);
+        return obj;
+}
+void fuse_request_free(struct fuse_req *req)
+{
+        kmem_cache_free(fuse_req_cachep, req);  /* hand req back to fuse_req_cachep */
+}
+static inline void block_sigs(sigset_t *oldset)
+{
+        sigset_t all_but_kill;
+        siginitsetinv(&all_but_kill, sigmask(SIGKILL)); /* all except SIGKILL */
+        sigprocmask(SIG_BLOCK, &all_but_kill, oldset);  /* old mask -> *oldset */
+}
+static inline void restore_sigs(sigset_t *oldset)
+{
+        sigprocmask(SIG_SETMASK, oldset, NULL); /* reinstate the mask in *oldset */
+}
+void fuse_reset_request(struct fuse_req *req)
+{
+        const int keep_prealloc = req->preallocated;    /* survives the re-init */
+        BUG_ON(atomic_read(&req->count) != 1);
+        fuse_request_init(req);
+        req->preallocated = keep_prealloc;
+}
+static void __fuse_get_request(struct fuse_req *req)
+{
+        atomic_inc(&req->count);        /* take one more reference on req */
+}
+/* Drop a reference that is not the last one: must be called with > 1 refcount */
+static void __fuse_put_request(struct fuse_req *req)
+{
+        BUG_ON(atomic_read(&req->count) < 2);   /* would drop the final reference */
+        atomic_dec(&req->count);
+}
+static struct fuse_req *do_get_request(struct fuse_conn *fc)
+{
+        struct fuse_req *first;
+        spin_lock(&fuse_lock);
+        BUG_ON(list_empty(&fc->unused_list));
+        first = list_entry(fc->unused_list.next, struct fuse_req, list);
+        list_del_init(&first->list);
+        spin_unlock(&fuse_lock);
+        fuse_request_init(first);       /* reset every field of the request */
+        first->in.h.pid = current->pid;
+        first->in.h.uid = current->fsuid;
+        first->in.h.gid = current->fsgid;
+        first->preallocated = 1;
+        return first;
+}
+/* Returns NULL only when the wait on outstanding_sem is cut short by SIGKILL */
+struct fuse_req *fuse_get_request(struct fuse_conn *fc)
+{
+        sigset_t saved_mask;
+        int interrupted;
+        block_sigs(&saved_mask);
+        interrupted = down_interruptible(&fc->outstanding_sem);
+        restore_sigs(&saved_mask);
+        return interrupted == 0 ? do_get_request(fc) : NULL;
+}
+static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+        spin_lock(&fuse_lock);
+        if (!req->preallocated)
+                fuse_request_free(req);
+        else
+                list_add(&req->list, &fc->unused_list);
+        /* Pay off any debt before giving a slot back to outstanding_sem */
+        if (fc->outstanding_debt == 0)
+                up(&fc->outstanding_sem);
+        else
+                fc->outstanding_debt--;
+        spin_unlock(&fuse_lock);
+}
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+        if (atomic_dec_and_test(&req->count))   /* count reached zero */
+                fuse_putback_request(fc, req);  /* recycle or free the request */
+}
+void fuse_release_background(struct fuse_req *req)
+{
+        iput(req->inode);               /* drop both inode references ... */
+        iput(req->inode2);
+        if (req->file)
+                fput(req->file);        /* ... and the file reference, if set */
+        spin_lock(&fuse_lock);
+        list_del(&req->bg_entry);       /* unlink from the background list */
+        spin_unlock(&fuse_lock);
+}
+/*
+ * This function is called when a request is finished.  Either a reply
+ * has arrived or it was interrupted (and not yet sent) or some error
+ * occurred during communication with userspace, or the device file was
+ * closed.  It decreases the reference count for the request.  In case
+ * of a background request the references to the stored objects are
+ * released.  The requester thread is woken up (if still waiting), and
+ * finally the request is either freed or put on the unused_list
+ *
+ * Called with fuse_lock, unlocks it
+ */
+static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+        int putback;    /* set when this call dropped the count to zero */
+        req->finished = 1;
+        putback = atomic_dec_and_test(&req->count);
+        spin_unlock(&fuse_lock);
+        if (req->background) {
+                down_read(&fc->sbput_sem);
+                if (fc->mounted)
+                        fuse_release_background(req);
+                up_read(&fc->sbput_sem);
+        }
+        wake_up(&req->waitq);
+        if (req->in.h.opcode == FUSE_INIT) {
+                int i;
+                if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
+                        fc->conn_error = 1;     /* major version mismatch */
+                /* After INIT reply is received other requests can go
+                   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
+                   up()s on outstanding_sem.  The last up() is done in
+                   fuse_putback_request() */
+                for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
+                        up(&fc->outstanding_sem);
+        }
+        if (putback)
+                fuse_putback_request(fc, req);
+}
+/*
+ * Unfortunately request interruption does not just solve the deadlock
+ * problem, it causes problems too.  These stem from the fact that an
+ * interrupted request continues to be processed in userspace,
+ * while all the locks and object references (inode and file) held
+ * during the operation are released.
+ *
+ * To release the locks is exactly why there's a need to interrupt the
+ * request, so there's not a lot that can be done about this, except
+ * introduce additional locking in userspace.
+ *
+ * More important is to keep inode and file references until userspace
+ * has replied, otherwise FORGET and RELEASE could be sent while the
+ * inode/file is still used by the filesystem.
+ *
+ * For this reason the concept of "background" request is introduced.
+ * An interrupted request is backgrounded if it has been already sent
+ * to userspace.  Backgrounding involves getting an extra reference to
+ * inode(s) or file used in the request, and adding the request to
+ * fc->background list.  When a reply is received for a background
+ * request, the object references are released, and the request is
+ * removed from the list.  If the filesystem is unmounted while there
+ * are still background requests, the list is walked and references
+ * are released as if a reply was received.
+ *
+ * There's one more use for a background request.  The RELEASE message is
+ * always sent as background, since it doesn't return an error or
+ * data.
+ */
+static void background_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+        req->background = 1;
+        list_add(&req->bg_entry, &fc->background);
+        if (req->inode)
+                req->inode = igrab(req->inode);         /* keep igrab()'s result */
+        if (req->inode2)
+                req->inode2 = igrab(req->inode2);
+        if (req->file)
+                get_file(req->file);                    /* extra file reference */
+}
+/* Called with fuse_lock held.  Releases, and then reacquires it. */
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+{
+        sigset_t oldset;
+        spin_unlock(&fuse_lock);
+        block_sigs(&oldset);
+        wait_event_interruptible(req->waitq, req->finished);
+        restore_sigs(&oldset);
+        spin_lock(&fuse_lock);
+        if (req->finished)      /* the request completed while we waited */
+                return;
+        req->out.h.error = -EINTR;      /* the wait ended before completion */
+        req->interrupted = 1;
+        if (req->locked) {
+                /* This is uninterruptible sleep, because data is
+                   being copied to/from the buffers of req.  During
+                   locked state, there mustn't be any filesystem
+                   operation (e.g. page fault), since that could lead
+                   to deadlock */
+                spin_unlock(&fuse_lock);
+                wait_event(req->waitq, !req->locked);
+                spin_lock(&fuse_lock);
+        }
+        if (!req->sent && !list_empty(&req->list)) {    /* not sent, still queued */
+                list_del(&req->list);
+                __fuse_put_request(req);
+        } else if (!req->finished && req->sent)         /* sent, not yet answered */
+                background_request(fc, req);
+}
+/* Return the sum of the sizes of the first numargs entries of args */
+static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+{
+        unsigned total = 0;
+        /* The order of summation does not matter, so walk the array
+           from the back and use numargs itself as the index */
+        while (numargs--)
+                total += args[numargs].size;
+        return total;
+}
+/*
+ * Give the request a unique ID, fill in the total length of its input
+ * and append it to the pending list, then wake up fc->waitq.
+ */
+static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+        struct fuse_arg *inargs = (struct fuse_arg *) req->in.args;
+        /* zero is special, so step over it when the counter wraps */
+        if (++fc->reqctr == 0)
+                fc->reqctr = 1;
+        req->in.h.unique = fc->reqctr;
+        req->in.h.len = len_args(req->in.numargs, inargs) +
+                sizeof(struct fuse_in_header);
+        /* If request is not preallocated (either FORGET or RELEASE),
+           then still decrease outstanding_sem, so user can't open
+           infinite number of files while not processing the RELEASE
+           requests.  However for efficiency do it without blocking,
+           so if down() would block, just increase the debt instead */
+        if (!req->preallocated && down_trylock(&fc->outstanding_sem))
+                fc->outstanding_debt++;
+        list_add_tail(&req->list, &fc->pending);
+        wake_up(&fc->waitq);
+}
+/*
+ * Send a request and wait for its answer.  If the connection is not
+ * usable the request is not queued and only its error is set:
+ * -ENOTCONN when not connected, -ECONNREFUSED on a connection error.
+ *
+ * This can only be interrupted by a SIGKILL
+ */
+void request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+        req->isreply = 1;
+        spin_lock(&fuse_lock);
+        if (fc->connected && !fc->conn_error) {
+                queue_request(fc, req);
+                /* acquire extra reference, since request is still needed
+                   after request_end() */
+                __fuse_get_request(req);
+                request_wait_answer(fc, req);
+        } else if (!fc->connected) {
+                req->out.h.error = -ENOTCONN;
+        } else {
+                req->out.h.error = -ECONNREFUSED;
+        }
+        spin_unlock(&fuse_lock);
+}
+/*
+ * Queue a request without waiting for the answer.  If the connection
+ * is gone the request is ended right away with error -ENOTCONN.
+ */
+static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+{
+        spin_lock(&fuse_lock);
+        if (!fc->connected) {
+                req->out.h.error = -ENOTCONN;
+                /* NOTE(review): request_end() is entered with fuse_lock
+                   held and there is no unlock afterwards; presumably it
+                   releases the lock itself - confirm at its definition */
+                request_end(fc, req);
+                return;
+        }
+        queue_request(fc, req);
+        spin_unlock(&fuse_lock);
+}
+/* Send a request for which no reply is expected (isreply = 0),
+   without waiting */
+void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+{
+        req->isreply = 0;
+        request_send_nowait(fc, req);
+}
+/*
+ * Send a request that expects a reply, but do not wait for it: the
+ * request is first put into background state under fuse_lock, then
+ * queued with request_send_nowait().
+ */
+void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+{
+        req->isreply = 1;
+        spin_lock(&fuse_lock);
+        background_request(fc, req);
+        spin_unlock(&fuse_lock);
+        request_send_nowait(fc, req);
+}
+/*
+ * Send the FUSE_INIT message carrying the kernel's protocol version.
+ * The request's own init_in_out buffer serves as both the single
+ * input and the single output argument.  The request is sent in the
+ * background.
+ */
+void fuse_send_init(struct fuse_conn *fc)
+{
+        /* This is called from fuse_read_super() so there's guaranteed
+           to be a request available */
+        struct fuse_req *req = do_get_request(fc);
+        struct fuse_init_in_out *arg = &req->misc.init_in_out;
+        req->in.h.opcode = FUSE_INIT;
+        /* payload: major and minor protocol version */
+        arg->minor = FUSE_KERNEL_MINOR_VERSION;
+        arg->major = FUSE_KERNEL_VERSION;
+        /* one argument in ... */
+        req->in.args[0].value = arg;
+        req->in.args[0].size = sizeof(*arg);
+        req->in.numargs = 1;
+        /* ... and one out, sharing the same buffer */
+        req->out.args[0].value = arg;
+        req->out.args[0].size = sizeof(*arg);
+        req->out.numargs = 1;
+        request_send_background(fc, req);
+}
+/*
+ * Mark the request as locked.  From here until the matching
+ * unlock_request() nothing may be done that could cause a page-fault.
+ * A request that has already been interrupted is not locked and
+ * -ENOENT is returned instead.  A NULL request is accepted and
+ * yields 0.
+ */
+static inline int lock_request(struct fuse_req *req)
+{
+        int err;
+        if (!req)
+                return 0;
+        spin_lock(&fuse_lock);
+        err = req->interrupted ? -ENOENT : 0;
+        if (!err)
+                req->locked = 1;
+        spin_unlock(&fuse_lock);
+        return err;
+}
+/*
+ * Clear the locked state of the request.  If the request got
+ * interrupted while it was locked, the requester thread is waiting
+ * for the unlock, so wake it up.  A NULL request is ignored.
+ */
+static inline void unlock_request(struct fuse_req *req)
+{
+        if (!req)
+                return;
+        spin_lock(&fuse_lock);
+        req->locked = 0;
+        if (req->interrupted)
+                wake_up(&req->waitq);
+        spin_unlock(&fuse_lock);
+}
+/* State of a copy between a request and the userspace buffer */
+struct fuse_copy_state {
+        /* nonzero: copy into the userspace buffer; zero: copy from it */
+        int write;
+        /* request being copied; may be NULL */
+        struct fuse_req *req;
+        /* next iovec segment to use, and number of segments left */
+        const struct iovec *iov;
+        unsigned long nr_segs;
+        /* bytes left in the current segment, and its next user address */
+        unsigned long seglen;
+        unsigned long addr;
+        /* currently held page of the user buffer and its kernel mapping */
+        struct page *pg;
+        void *mapaddr;
+        /* current position inside the mapping, and bytes usable from it */
+        void *buf;
+        unsigned len;
+};
+/* Reset a copy state and record direction, request and user iovec */
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
+                           struct fuse_req *req, const struct iovec *iov,
+                           unsigned long nr_segs)
+{
+        /* Every field not assigned below starts out as zero/NULL */
+        memset(cs, 0, sizeof(struct fuse_copy_state));
+        cs->iov = iov;
+        cs->nr_segs = nr_segs;
+        cs->req = req;
+        cs->write = write;
+}
+/*
+ * Release the currently mapped page of the userspace buffer, if any:
+ * unmap it, flush and dirty it when it was written to, and put it.
+ */
+static inline void fuse_copy_finish(struct fuse_copy_state *cs)
+{
+        /* Nothing mapped, nothing to release */
+        if (!cs->mapaddr)
+                return;
+        kunmap_atomic(cs->mapaddr, KM_USER0);
+        if (cs->write) {
+                flush_dcache_page(cs->pg);
+                set_page_dirty_lock(cs->pg);
+        }
+        put_page(cs->pg);
+        cs->mapaddr = NULL;
+}
+/*
+ * Get another pagefull of userspace buffer, and map it to kernel
+ * address space, and lock request
+ *
+ * Returns a negative error from get_user_pages(), otherwise the
+ * result of lock_request() (0, or -ENOENT if the request was
+ * interrupted).
+ */
+static int fuse_copy_fill(struct fuse_copy_state *cs)
+{
+        unsigned long offset;
+        int err;
+        /* Unlock the request and release the previous page first */
+        unlock_request(cs->req);
+        fuse_copy_finish(cs);
+        /* Current segment is used up: move on to the next iovec entry */
+        if (!cs->seglen) {
+                BUG_ON(!cs->nr_segs);
+                cs->seglen = cs->iov[0].iov_len;
+                cs->addr = (unsigned long) cs->iov[0].iov_base;
+                cs->iov ++;
+                cs->nr_segs --;
+        }
+        /* Get the single user page that contains cs->addr */
+        down_read(&current->mm->mmap_sem);
+        err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
+                             &cs->pg, NULL);
+        up_read(&current->mm->mmap_sem);
+        if (err < 0)
+                return err;
+        BUG_ON(err != 1);
+        /* Map the page and point cs->buf at the offset of cs->addr in it */
+        offset = cs->addr % PAGE_SIZE;
+        cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+        cs->buf = cs->mapaddr + offset;
+        /* Usable bytes: up to the end of the page or of the segment,
+           whichever comes first */
+        cs->len = min(PAGE_SIZE - offset, cs->seglen);
+        cs->seglen -= cs->len;
+        cs->addr += cs->len;
+        return lock_request(cs->req);
+}
+/*
+ * Copy as much as possible between *val and the mapped part of the
+ * userspace buffer: the smaller of *size and cs->len bytes.  *val (if
+ * given), *size and the copy state are advanced by that amount.  With
+ * a NULL val nothing is copied and the bytes are just consumed.
+ * Returns the number of bytes handled.
+ */
+static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
+                               unsigned *size)
+{
+        unsigned ncpy = *size < cs->len ? *size : cs->len;
+        if (val) {
+                /* Direction decides which side is the destination */
+                void *dst = cs->write ? cs->buf : *val;
+                void *src = cs->write ? *val : cs->buf;
+                memcpy(dst, src, ncpy);
+                *val += ncpy;
+        }
+        cs->buf += ncpy;
+        cs->len -= ncpy;
+        *size -= ncpy;
+        return ncpy;
+}
+/*
+ * Copy a page in the request to/from the userspace buffer.  Must be
+ * done atomically
+ *
+ * page may be NULL, in which case count bytes of the userspace buffer
+ * are consumed without copying.  With zeroing set and count less than
+ * PAGE_SIZE the whole page is cleared before the copy.  Returns 0, or
+ * the error from fuse_copy_fill().
+ */
+static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+                                 unsigned offset, unsigned count, int zeroing)
+{
+        if (page && zeroing && count < PAGE_SIZE) {
+                void *mapaddr = kmap_atomic(page, KM_USER1);
+                memset(mapaddr, 0, PAGE_SIZE);
+                kunmap_atomic(mapaddr, KM_USER1);
+        }
+        while (count) {
+                int err;
+                /* Mapped part of the user buffer is used up: get more */
+                if (!cs->len && (err = fuse_copy_fill(cs)))
+                        return err;
+                if (page) {
+                        /* The request page is mapped only around the copy */
+                        void *mapaddr = kmap_atomic(page, KM_USER1);
+                        void *buf = mapaddr + offset;
+                        offset += fuse_copy_do(cs, &buf, &count);
+                        kunmap_atomic(mapaddr, KM_USER1);
+                } else
+                        offset += fuse_copy_do(cs, NULL, &count);
+        }
+        /* The request page was written to when copying from userspace */
+        if (page && !cs->write)
+                flush_dcache_page(page);
+        return 0;
+}
+/*
+ * Copy nbytes between the pages of the request and the userspace
+ * buffer.  The first page is used from req->page_offset on, all
+ * further pages from their start.  With zeroing set the remaining
+ * pages are still visited after nbytes has run out.  Returns 0 or
+ * the first error from fuse_copy_page().
+ */
+static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
+                           int zeroing)
+{
+        struct fuse_req *req = cs->req;
+        unsigned offset = req->page_offset;
+        unsigned i;
+        for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
+                /* Room left in this page, limited by what is left to copy */
+                unsigned room = (unsigned) PAGE_SIZE - offset;
+                unsigned count = min(nbytes, room);
+                int err = fuse_copy_page(cs, req->pages[i], offset, count,
+                                         zeroing);
+                if (err)
+                        return err;
+                nbytes -= count;
+                offset = 0;
+        }
+        return 0;
+}
+/*
+ * Copy one argument of size bytes between val and the userspace
+ * buffer, fetching further pages of the buffer as needed.  Returns 0
+ * or the error from fuse_copy_fill().
+ */
+static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
+{
+        int err = 0;
+        while (size && !err) {
+                if (!cs->len)
+                        err = fuse_copy_fill(cs);
+                if (!err)
+                        fuse_copy_do(cs, &val, &size);
+        }
+        return err;
+}
+/*
+ * Copy all arguments of a request to/from the userspace buffer.  When
+ * argpages is set the last argument lives in the request's pages and
+ * is copied with fuse_copy_pages(); every other argument is copied
+ * from/to its value pointer.  Stops at and returns the first error.
+ */
+static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+                          unsigned argpages, struct fuse_arg *args,
+                          int zeroing)
+{
+        unsigned i;
+        for (i = 0; i < numargs; i++) {
+                struct fuse_arg *arg = args + i;
+                int in_pages = argpages && i == numargs - 1;
+                int err;
+                if (in_pages)
+                        err = fuse_copy_pages(cs, arg->size, zeroing);
+                else
+                        err = fuse_copy_one(cs, arg->value, arg->size);
+                if (err)
+                        return err;
+        }
+        return 0;
+}
+/*
+ * Wait until a request is available on the pending list
+ *
+ * fuse_lock is dropped around schedule() and taken again afterwards.
+ * The wait also ends when fc->mounted becomes zero or a signal is
+ * pending, so the caller has to check the state again on return.
+ */
+static void request_wait(struct fuse_conn *fc)
+{
+        DECLARE_WAITQUEUE(wait, current);
+        add_wait_queue_exclusive(&fc->waitq, &wait);
+        while (fc->mounted && list_empty(&fc->pending)) {
+                /* Task state is set before the signal check and before
+                   the lock is dropped */
+                set_current_state(TASK_INTERRUPTIBLE);
+                if (signal_pending(current))
+                        break;
+                spin_unlock(&fuse_lock);
+                schedule();
+                spin_lock(&fuse_lock);
+        }
+        set_current_state(TASK_RUNNING);
+        remove_wait_queue(&fc->waitq, &wait);
+}
+/*
+ * Read a single request into the userspace filesystem's buffer.  This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies request data to userspace buffer.  If
+ * no reply is needed (FORGET) or request has been interrupted or
+ * there was an error during the copying then it's finished by calling
+ * request_end().  Otherwise add it to the processing list, and set
+ * the 'sent' flag.
+ *
+ * Returns the length of the request on success.  Errors: -EPERM if
+ * the file has no connection, -ENODEV if the filesystem is not
+ * mounted, -ERESTARTSYS if the wait ended with nothing pending,
+ * -EINVAL if the buffer is smaller than the request, -ENOENT if the
+ * request was interrupted, or an error from the copy.
+ */
+static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
+                              unsigned long nr_segs, loff_t *off)
+{
+        int err;
+        struct fuse_conn *fc;
+        struct fuse_req *req;
+        struct fuse_in *in;
+        struct fuse_copy_state cs;
+        unsigned reqsize;
+        spin_lock(&fuse_lock);
+        fc = file->private_data;
+        err = -EPERM;
+        if (!fc)
+                goto err_unlock;
+        request_wait(fc);
+        err = -ENODEV;
+        if (!fc->mounted)
+                goto err_unlock;
+        err = -ERESTARTSYS;
+        if (list_empty(&fc->pending))
+                goto err_unlock;
+        /* Take the first pending request off the list */
+        req = list_entry(fc->pending.next, struct fuse_req, list);
+        list_del_init(&req->list);
+        spin_unlock(&fuse_lock);
+        in = &req->in;
+        reqsize = req->in.h.len;
+        fuse_copy_init(&cs, 1, req, iov, nr_segs);
+        /* Copy header and arguments only if the whole request fits
+           into the buffer */
+        err = -EINVAL;
+        if (iov_length(iov, nr_segs) >= reqsize) {
+                err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+                if (!err)
+                        err = fuse_copy_args(&cs, in->numargs, in->argpages,
+                                             (struct fuse_arg *) in->args, 0);
+        }
+        fuse_copy_finish(&cs);
+        spin_lock(&fuse_lock);
+        req->locked = 0;
+        if (!err && req->interrupted)
+                err = -ENOENT;
+        if (err) {
+                /* A copy failure is reported to the requester as -EIO;
+                   an interrupted request keeps the error it has */
+                if (!req->interrupted)
+                        req->out.h.error = -EIO;
+                /* No unlock after request_end() on any path here */
+                request_end(fc, req);
+                return err;
+        }
+        if (!req->isreply)
+                request_end(fc, req);
+        else {
+                req->sent = 1;
+                list_add_tail(&req->list, &fc->processing);
+                spin_unlock(&fuse_lock);
+        }
+        return reqsize;
+ err_unlock:
+        spin_unlock(&fuse_lock);
+        return err;
+}
+/* read() on the device: wrap the buffer in a one-element iovec and
+   hand it to fuse_dev_readv() */
+static ssize_t fuse_dev_read(struct file *file, char __user *buf,
+                             size_t nbytes, loff_t *off)
+{
+        struct iovec single = {
+                .iov_base = buf,
+                .iov_len = nbytes,
+        };
+        return fuse_dev_readv(file, &single, 1, off);
+}
+/*
+ * Search the processing list for the request whose unique ID equals
+ * the given one.  Returns the request, or NULL if there is none.
+ */
+static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+{
+        struct list_head *head = &fc->processing;
+        struct list_head *pos;
+        for (pos = head->next; pos != head; pos = pos->next) {
+                struct fuse_req *cand = list_entry(pos, struct fuse_req, list);
+                if (cand->in.h.unique == unique)
+                        return cand;
+        }
+        return NULL;
+}
+/*
+ * Copy the reply arguments from the write buffer into the request.
+ *
+ * nbytes is the total length of the reply, header included.  An error
+ * reply must consist of the header only.  Otherwise the reply must
+ * have exactly the expected size, except that with out->argvar set it
+ * may be shorter; the size of the last argument is then reduced by
+ * the difference.  Returns 0, -EINVAL on a size mismatch, or the
+ * error from fuse_copy_args().
+ */
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
+                         unsigned nbytes)
+{
+        unsigned reqsize = sizeof(struct fuse_out_header);
+        if (out->h.error)
+                return nbytes != reqsize ? -EINVAL : 0;
+        reqsize += len_args(out->numargs, out->args);
+        /* Too long is never accepted; too short only with argvar */
+        if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
+                return -EINVAL;
+        else if (reqsize > nbytes) {
+                struct fuse_arg *lastarg = &out->args[out->numargs-1];
+                unsigned diffsize = reqsize - nbytes;
+                /* The shortfall must fit into the last argument */
+                if (diffsize > lastarg->size)
+                        return -EINVAL;
+                lastarg->size -= diffsize;
+        }
+        return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
+                              out->page_zeroing);
+}
+/*
+ * Write a single reply to a request.  First the header is copied from
+ * the write buffer.  The request is then searched on the processing
+ * list by the unique ID found in the header.  If found, then remove
+ * it from the list and copy the rest of the buffer to the request.
+ * The request is finished by calling request_end()
+ *
+ * Returns the number of bytes written on success.  Errors: -ENODEV if
+ * the file has no connection, -EINVAL for a malformed header or an
+ * unknown unique ID, -ENOENT if the request was interrupted, or an
+ * error from the copy.
+ */
+static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
+                               unsigned long nr_segs, loff_t *off)
+{
+        int err;
+        unsigned nbytes = iov_length(iov, nr_segs);
+        struct fuse_req *req;
+        struct fuse_out_header oh;
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc = fuse_get_conn(file);
+        if (!fc)
+                return -ENODEV;
+        /* No request is known yet, so the copy state starts without one */
+        fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+        if (nbytes < sizeof(struct fuse_out_header))
+                return -EINVAL;
+        err = fuse_copy_one(&cs, &oh, sizeof(oh));
+        if (err)
+                goto err_finish;
+        /* Sanity check the header: nonzero ID, error in (-1000, 0],
+           length equal to what was actually written */
+        err = -EINVAL;
+        if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
+            oh.len != nbytes)
+                goto err_finish;
+        spin_lock(&fuse_lock);
+        req = request_find(fc, oh.unique);
+        err = -EINVAL;
+        if (!req)
+                goto err_unlock;
+        list_del_init(&req->list);
+        if (req->interrupted) {
+                /* No unlock after request_end() on any path here */
+                request_end(fc, req);
+                fuse_copy_finish(&cs);
+                return -ENOENT;
+        }
+        req->out.h = oh;
+        req->locked = 1;
+        cs.req = req;
+        spin_unlock(&fuse_lock);
+        err = copy_out_args(&cs, &req->out, nbytes);
+        fuse_copy_finish(&cs);
+        spin_lock(&fuse_lock);
+        req->locked = 0;
+        /* A copy failure is reported to the requester as -EIO; an
+           interrupted request keeps the error it has */
+        if (!err) {
+                if (req->interrupted)
+                        err = -ENOENT;
+        } else if (!req->interrupted)
+                req->out.h.error = -EIO;
+        request_end(fc, req);
+        return err ? err : nbytes;
+ err_unlock:
+        spin_unlock(&fuse_lock);
+ err_finish:
+        fuse_copy_finish(&cs);
+        return err;
+}
+/* write() on the device: wrap the buffer in a one-element iovec and
+   hand it to fuse_dev_writev() */
+static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
+                              size_t nbytes, loff_t *off)
+{
+        struct iovec single = {
+                .iov_base = (char __user *) buf,
+                .iov_len = nbytes,
+        };
+        return fuse_dev_writev(file, &single, 1, off);
+}
+/*
+ * Poll the FUSE device.  The device is always reported as writable;
+ * it is reported as readable when the pending list is not empty.
+ *
+ * Returns a poll event mask.  A file without a connection yields
+ * POLLERR: the return value is a bit mask, so a negative errno such
+ * as -ENODEV would be read by the caller as a set of event bits.
+ */
+static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
+{
+        struct fuse_conn *fc = fuse_get_conn(file);
+        unsigned mask = POLLOUT | POLLWRNORM;
+        if (!fc)
+                return POLLERR;
+        poll_wait(file, &fc->waitq, wait);
+        /* The pending list is only looked at under fuse_lock */
+        spin_lock(&fuse_lock);
+        if (!list_empty(&fc->pending))
+                mask |= POLLIN | POLLRDNORM;
+        spin_unlock(&fuse_lock);
+        return mask;
+}
+/*
+ * Abort all requests on the given list (pending or processing)
+ *
+ * Each request is unlinked and ended with error -ECONNABORTED.
+ * fuse_lock is taken again after every request_end() call.
+ * NOTE(review): this implies request_end() returns with the lock
+ * released - confirm at its definition.
+ */
+static void end_requests(struct fuse_conn *fc, struct list_head *head)
+{
+        while (!list_empty(head)) {
+                struct fuse_req *req;
+                req = list_entry(head->next, struct fuse_req, list);
+                list_del_init(&req->list);
+                req->out.h.error = -ECONNABORTED;
+                request_end(fc, req);
+                spin_lock(&fuse_lock);
+        }
+}
+/*
+ * Release of the device file.  If the file has a connection, mark it
+ * as disconnected, abort all pending and processing requests and
+ * release the connection.  Always returns 0.
+ */
+static int fuse_dev_release(struct inode *inode, struct file *file)
+{
+        struct fuse_conn *fc;
+        spin_lock(&fuse_lock);
+        fc = file->private_data;
+        if (!fc)
+                goto out;
+        fc->connected = 0;
+        end_requests(fc, &fc->pending);
+        end_requests(fc, &fc->processing);
+        fuse_release_conn(fc);
+ out:
+        spin_unlock(&fuse_lock);
+        return 0;
+}
+/* File operations of the FUSE device; seeking is not supported */
+struct file_operations fuse_dev_operations = {
+        .owner          = THIS_MODULE,
+        .llseek         = no_llseek,
+        .read           = fuse_dev_read,
+        .readv          = fuse_dev_readv,
+        .write          = fuse_dev_write,
+        .writev         = fuse_dev_writev,
+        .poll           = fuse_dev_poll,
+        .release        = fuse_dev_release,
+};
+/* The "fuse" misc device, registered by fuse_dev_init() */
+static struct miscdevice fuse_miscdevice = {
+        .minor = FUSE_MINOR,
+        .name  = "fuse",
+        .fops = &fuse_dev_operations,
+};
+/*
+ * Create the request cache and register the misc device.  Returns 0
+ * on success, -ENOMEM if the cache cannot be created, or the error
+ * from misc_register(), in which case the cache is destroyed again.
+ */
+int __init fuse_dev_init(void)
+{
+        int err;
+        fuse_req_cachep = kmem_cache_create("fuse_request",
+                                            sizeof(struct fuse_req),
+                                            0, 0, NULL, NULL);
+        if (!fuse_req_cachep)
+                return -ENOMEM;
+        err = misc_register(&fuse_miscdevice);
+        /* Registration failed: undo the cache creation */
+        if (err)
+                kmem_cache_destroy(fuse_req_cachep);
+        return err;
+}
+/* Undo fuse_dev_init(): deregister the misc device, then destroy the
+   request cache */
+void fuse_dev_cleanup(void)
+{
+        misc_deregister(&fuse_miscdevice);
+        kmem_cache_destroy(fuse_req_cachep);
+}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
new file mode 100644
index 000000000000..e79e49b3eec7
--- /dev/null
+++ b/fs/fuse/dir.c
@@ -0,0 +1,982 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+#include "fuse_i.h"
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+/* Convert a relative time given as seconds and nanoseconds into the
+   jiffies value that lies that far after the current jiffies */
+static inline unsigned long time_to_jiffies(unsigned long sec,
+                                            unsigned long nsec)
+{
+        struct timespec span;
+        span.tv_sec = sec;
+        span.tv_nsec = nsec;
+        return timespec_to_jiffies(&span) + jiffies;
+}
+/*
+ * Fill in a request as a FUSE_LOOKUP of entry's name in directory
+ * dir.  The reply is to be stored in outarg.
+ */
+static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
+                             struct dentry *entry,
+                             struct fuse_entry_out *outarg)
+{
+        /* header: operation and the directory it is applied to */
+        req->inode = dir;
+        req->in.h.nodeid = get_node_id(dir);
+        req->in.h.opcode = FUSE_LOOKUP;
+        /* input: the name, len + 1 bytes */
+        req->in.args[0].value = entry->d_name.name;
+        req->in.args[0].size = entry->d_name.len + 1;
+        req->in.numargs = 1;
+        /* output: one struct fuse_entry_out */
+        req->out.args[0].value = outarg;
+        req->out.args[0].size = sizeof(struct fuse_entry_out);
+        req->out.numargs = 1;
+}
+/*
+ * Check whether a dentry is still valid.  Returns 1 if it is and 0 if
+ * it is not.
+ *
+ * A dentry without an inode or with a bad inode is always invalid.
+ * Once jiffies has passed entry->d_time, the name is looked up again.
+ * The dentry stays valid only if the lookup succeeds and returns the
+ * same node ID and the same file type; the attributes and both
+ * timeouts are then refreshed from the reply.
+ */
+static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
+{
+        if (!entry->d_inode || is_bad_inode(entry->d_inode))
+                return 0;
+        else if (time_after(jiffies, entry->d_time)) {
+                int err;
+                struct fuse_entry_out outarg;
+                struct inode *inode = entry->d_inode;
+                struct fuse_inode *fi = get_fuse_inode(inode);
+                struct fuse_conn *fc = get_fuse_conn(inode);
+                struct fuse_req *req = fuse_get_request(fc);
+                if (!req)
+                        return 0;
+                fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
+                request_send(fc, req);
+                err = req->out.h.error;
+                if (!err) {
+                        /* The name now refers to a different node: send
+                           a forget for the node just looked up.  req is
+                           handed to fuse_send_forget() and is not put
+                           here */
+                        if (outarg.nodeid != get_node_id(inode)) {
+                                fuse_send_forget(fc, req, outarg.nodeid, 1);
+                                return 0;
+                        }
+                        fi->nlookup ++;
+                }
+                fuse_put_request(fc, req);
+                /* Lookup failed, or the file type has changed */
+                if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+                        return 0;
+                fuse_change_attributes(inode, &outarg.attr);
+                entry->d_time = time_to_jiffies(outarg.entry_valid,
+                                                outarg.entry_valid_nsec);
+                fi->i_time = time_to_jiffies(outarg.attr_valid,
+                                             outarg.attr_valid_nsec);
+        }
+        return 1;
+}
+/* Dentry operations; set on dentries by fuse_lookup_iget() */
+static struct dentry_operations fuse_dentry_operations = {
+        .d_revalidate   = fuse_dentry_revalidate,
+};
+/*
+ * Look up entry's name in directory dir and get the inode for it.
+ *
+ * On success 0 is returned and *inodep is set to the inode, or to
+ * NULL if the lookup reported -ENOENT.  Errors: -ENAMETOOLONG if the
+ * name is longer than FUSE_NAME_MAX, -EINTR if no request could be
+ * obtained, -ENOMEM if fuse_iget() fails, or the error of the lookup
+ * request.
+ */
+static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
+                            struct inode **inodep)
+{
+        int err;
+        struct fuse_entry_out outarg;
+        struct inode *inode = NULL;
+        struct fuse_conn *fc = get_fuse_conn(dir);
+        struct fuse_req *req;
+        if (entry->d_name.len > FUSE_NAME_MAX)
+                return -ENAMETOOLONG;
+        req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        fuse_lookup_init(req, dir, entry, &outarg);
+        request_send(fc, req);
+        err = req->out.h.error;
+        if (!err) {
+                inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+                                  &outarg.attr);
+                if (!inode) {
+                        /* req is handed to fuse_send_forget() and is
+                           not put here */
+                        fuse_send_forget(fc, req, outarg.nodeid, 1);
+                        return -ENOMEM;
+                }
+        }
+        fuse_put_request(fc, req);
+        /* -ENOENT is not a failure of this function: inode stays NULL */
+        if (err && err != -ENOENT)
+                return err;
+        if (inode) {
+                struct fuse_inode *fi = get_fuse_inode(inode);
+                entry->d_time = time_to_jiffies(outarg.entry_valid,
+                                                outarg.entry_valid_nsec);
+                fi->i_time = time_to_jiffies(outarg.attr_valid,
+                                             outarg.attr_valid_nsec);
+        }
+        entry->d_op = &fuse_dentry_operations;
+        *inodep = inode;
+        return 0;
+}
+/* Set the attribute timeout of the inode to one jiffy in the past */
+void fuse_invalidate_attr(struct inode *inode)
+{
+        struct fuse_inode *fi = get_fuse_inode(inode);
+        fi->i_time = jiffies - 1;
+}
+/* Invalidate the dentry with d_invalidate() and set its timeout to
+   one jiffy in the past */
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+        d_invalidate(entry);
+        entry->d_time = jiffies - 1;
+}
+/*
+ * Common part of the operations that create a new entry in dir.  The
+ * caller has filled in the opcode and the input arguments of req;
+ * this function adds the directory and the output argument, sends
+ * the request and instantiates the dentry from the reply.
+ *
+ * mode is compared against the file type of the returned inode only.
+ * The request is consumed on every path.  Returns 0, the error of
+ * the request, -ENOMEM if fuse_iget() fails, or -EIO if the returned
+ * inode has a different file type than mode.
+ */
+static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
+                            struct inode *dir, struct dentry *entry,
+                            int mode)
+{
+        struct fuse_entry_out outarg;
+        struct inode *inode;
+        struct fuse_inode *fi;
+        int err;
+        req->in.h.nodeid = get_node_id(dir);
+        req->inode = dir;
+        req->out.numargs = 1;
+        req->out.args[0].size = sizeof(outarg);
+        req->out.args[0].value = &outarg;
+        request_send(fc, req);
+        err = req->out.h.error;
+        if (err) {
+                fuse_put_request(fc, req);
+                return err;
+        }
+        inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+                          &outarg.attr);
+        if (!inode) {
+                /* req is handed to fuse_send_forget() and is not put
+                   here */
+                fuse_send_forget(fc, req, outarg.nodeid, 1);
+                return -ENOMEM;
+        }
+        fuse_put_request(fc, req);
+        /* Don't allow userspace to do really stupid things... */
+        if ((inode->i_mode ^ mode) & S_IFMT) {
+                iput(inode);
+                return -EIO;
+        }
+        entry->d_time = time_to_jiffies(outarg.entry_valid,
+                                        outarg.entry_valid_nsec);
+        fi = get_fuse_inode(inode);
+        fi->i_time = time_to_jiffies(outarg.attr_valid,
+                                     outarg.attr_valid_nsec);
+        d_instantiate(entry, inode);
+        /* The directory has changed, so its attributes are stale */
+        fuse_invalidate_attr(dir);
+        return 0;
+}
+/*
+ * Create a node: send FUSE_MKNOD with the mode and the encoded device
+ * number, followed by the name.  Returns -EINTR if no request could
+ * be obtained, otherwise the result of create_new_entry().
+ */
+static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
+                      dev_t rdev)
+{
+        struct fuse_conn *fc = get_fuse_conn(dir);
+        struct fuse_req *req = fuse_get_request(fc);
+        struct fuse_mknod_in inarg;
+        if (req == NULL)
+                return -EINTR;
+        memset(&inarg, 0, sizeof(inarg));
+        inarg.rdev = new_encode_dev(rdev);
+        inarg.mode = mode;
+        req->in.h.opcode = FUSE_MKNOD;
+        /* first argument: the fixed-size parameters */
+        req->in.args[0].value = &inarg;
+        req->in.args[0].size = sizeof(inarg);
+        /* second argument: the name, len + 1 bytes */
+        req->in.args[1].value = entry->d_name.name;
+        req->in.args[1].size = entry->d_name.len + 1;
+        req->in.numargs = 2;
+        return create_new_entry(fc, req, dir, entry, mode);
+}
+/* Create a file: done as fuse_mknod() with device number 0; nd is
+   not used */
+static int fuse_create(struct inode *dir, struct dentry *entry, int mode,
+                       struct nameidata *nd)
+{
+        return fuse_mknod(dir, entry, mode, 0);
+}
+/*
+ * Create a directory: send FUSE_MKDIR with the mode, followed by the
+ * name.  The new inode is required to be a directory.  Returns -EINTR
+ * if no request could be obtained, otherwise the result of
+ * create_new_entry().
+ */
+static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
+{
+        struct fuse_conn *fc = get_fuse_conn(dir);
+        struct fuse_req *req = fuse_get_request(fc);
+        struct fuse_mkdir_in inarg;
+        if (req == NULL)
+                return -EINTR;
+        memset(&inarg, 0, sizeof(inarg));
+        inarg.mode = mode;
+        req->in.h.opcode = FUSE_MKDIR;
+        /* first argument: the fixed-size parameters */
+        req->in.args[0].value = &inarg;
+        req->in.args[0].size = sizeof(inarg);
+        /* second argument: the name, len + 1 bytes */
+        req->in.args[1].value = entry->d_name.name;
+        req->in.args[1].size = entry->d_name.len + 1;
+        req->in.numargs = 2;
+        return create_new_entry(fc, req, dir, entry, S_IFDIR);
+}
+/*
+ * Create a symbolic link: send FUSE_SYMLINK with the name of the new
+ * entry followed by the link target.  Returns -ENAMETOOLONG if the
+ * target including its terminator is longer than FUSE_SYMLINK_MAX,
+ * -EINTR if no request could be obtained, otherwise the result of
+ * create_new_entry().
+ */
+static int fuse_symlink(struct inode *dir, struct dentry *entry,
+                        const char *link)
+{
+        struct fuse_conn *fc = get_fuse_conn(dir);
+        /* length of the target including the terminating NUL */
+        unsigned len = strlen(link) + 1;
+        struct fuse_req *req;
+        if (len > FUSE_SYMLINK_MAX)
+                return -ENAMETOOLONG;
+        req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        req->in.h.opcode = FUSE_SYMLINK;
+        req->in.numargs = 2;
+        req->in.args[0].size = entry->d_name.len + 1;
+        req->in.args[0].value = entry->d_name.name;
+        req->in.args[1].size = len;
+        req->in.args[1].value = link;
+        return create_new_entry(fc, req, dir, entry, S_IFLNK);
+}
+/*
+ * Remove a file: send FUSE_UNLINK with the name of the entry.
+ * Returns -EINTR if no request could be obtained, otherwise the
+ * error of the request (0 on success).
+ */
+static int fuse_unlink(struct inode *dir, struct dentry *entry)
+{
+        int err;
+        struct fuse_conn *fc = get_fuse_conn(dir);
+        struct fuse_req *req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        req->in.h.opcode = FUSE_UNLINK;
+        req->in.h.nodeid = get_node_id(dir);
+        req->inode = dir;
+        req->in.numargs = 1;
+        req->in.args[0].size = entry->d_name.len + 1;
+        req->in.args[0].value = entry->d_name.name;
+        request_send(fc, req);
+        err = req->out.h.error;
+        fuse_put_request(fc, req);
+        if (!err) {
+                struct inode *inode = entry->d_inode;
+                /* Set nlink to zero so the inode can be cleared, if
+                   the inode does have more links this will be
+                   discovered at the next lookup/getattr */
+                inode->i_nlink = 0;
+                fuse_invalidate_attr(inode);
+                fuse_invalidate_attr(dir);
+        } else if (err == -EINTR)
+                /* The outcome of an interrupted request is unknown,
+                   so do not keep the dentry */
+                fuse_invalidate_entry(entry);
+        return err;
+}
+/*
+ * Remove a directory: send FUSE_RMDIR with the name of the entry.
+ * Returns -EINTR if no request could be obtained, otherwise the
+ * error of the request (0 on success).
+ */
+static int fuse_rmdir(struct inode *dir, struct dentry *entry)
+{
+        int err;
+        struct fuse_conn *fc = get_fuse_conn(dir);
+        struct fuse_req *req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        req->in.h.opcode = FUSE_RMDIR;
+        req->in.h.nodeid = get_node_id(dir);
+        req->inode = dir;
+        req->in.numargs = 1;
+        req->in.args[0].size = entry->d_name.len + 1;
+        req->in.args[0].value = entry->d_name.name;
+        request_send(fc, req);
+        err = req->out.h.error;
+        fuse_put_request(fc, req);
+        if (!err) {
+                /* The removed directory has no links left; the parent
+                   has changed, so its attributes are stale */
+                entry->d_inode->i_nlink = 0;
+                fuse_invalidate_attr(dir);
+        } else if (err == -EINTR)
+                /* The outcome of an interrupted request is unknown,
+                   so do not keep the dentry */
+                fuse_invalidate_entry(entry);
+        return err;
+}
+/*
+ * Rename an entry: send FUSE_RENAME on the old directory, with the
+ * node ID of the new directory followed by the old and the new name.
+ * Returns -EINTR if no request could be obtained, otherwise the
+ * error of the request (0 on success).
+ */
+static int fuse_rename(struct inode *olddir, struct dentry *oldent,
+                       struct inode *newdir, struct dentry *newent)
+{
+        int err;
+        struct fuse_rename_in inarg;
+        struct fuse_conn *fc = get_fuse_conn(olddir);
+        struct fuse_req *req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        memset(&inarg, 0, sizeof(inarg));
+        inarg.newdir = get_node_id(newdir);
+        req->in.h.opcode = FUSE_RENAME;
+        req->in.h.nodeid = get_node_id(olddir);
+        /* Both directories are recorded in the request */
+        req->inode = olddir;
+        req->inode2 = newdir;
+        req->in.numargs = 3;
+        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].value = &inarg;
+        req->in.args[1].size = oldent->d_name.len + 1;
+        req->in.args[1].value = oldent->d_name.name;
+        req->in.args[2].size = newent->d_name.len + 1;
+        req->in.args[2].value = newent->d_name.name;
+        request_send(fc, req);
+        err = req->out.h.error;
+        fuse_put_request(fc, req);
+        if (!err) {
+                /* Both directories have changed */
+                fuse_invalidate_attr(olddir);
+                if (olddir != newdir)
+                        fuse_invalidate_attr(newdir);
+        } else if (err == -EINTR) {
+                /* If request was interrupted, DEITY only knows if the
+                   rename actually took place.  If the invalidation
+                   fails (e.g. some process has CWD under the renamed
+                   directory), then there can be inconsistency between
+                   the dcache and the real filesystem.  Tough luck. */
+                fuse_invalidate_entry(oldent);
+                if (newent->d_inode)
+                        fuse_invalidate_entry(newent);
+        }
+        return err;
+}
+/*
+ * Create a hard link: send FUSE_LINK with the node ID of the existing
+ * inode followed by the new name.  The new entry is required to have
+ * the file type of the existing inode.  Returns -EINTR if no request
+ * could be obtained, otherwise the result of create_new_entry().
+ */
+static int fuse_link(struct dentry *entry, struct inode *newdir,
+                     struct dentry *newent)
+{
+        int err;
+        struct fuse_link_in inarg;
+        struct inode *inode = entry->d_inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_req *req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        memset(&inarg, 0, sizeof(inarg));
+        inarg.oldnodeid = get_node_id(inode);
+        req->in.h.opcode = FUSE_LINK;
+        /* req->inode is set to newdir by create_new_entry(); the
+           existing inode is recorded as the second one */
+        req->inode2 = inode;
+        req->in.numargs = 2;
+        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].value = &inarg;
+        req->in.args[1].size = newent->d_name.len + 1;
+        req->in.args[1].value = newent->d_name.name;
+        err = create_new_entry(fc, req, newdir, newent, inode->i_mode);
+        /* Contrary to "normal" filesystems it can happen that link
+           makes two "logical" inodes point to the same "physical"
+           inode.  We invalidate the attributes of the old one, so it
+           will reflect changes in the backing inode (link count,
+           etc.)
+        */
+        if (!err || err == -EINTR)
+                fuse_invalidate_attr(inode);
+        return err;
+}
+/*
+ * Fetch fresh attributes for 'inode' with a FUSE_GETATTR request.
+ *
+ * On success the reply is applied with fuse_change_attributes() and
+ * fi->i_time is set from the attr_valid/attr_valid_nsec reply fields.
+ * If the file type bits (S_IFMT) in the reply differ from the cached
+ * i_mode, the inode is marked bad and -EIO is returned.
+ *
+ * Returns 0 on success, the error from the reply header, -EIO as
+ * described above, or -EINTR when no request could be obtained.
+ */
+int fuse_do_getattr(struct inode *inode)
+{
+        struct fuse_attr_out reply;
+        struct fuse_inode *fi;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_req *request = fuse_get_request(conn);
+        int error;
+        if (!request)
+                return -EINTR;
+        request->in.h.opcode = FUSE_GETATTR;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        request->out.numargs = 1;
+        request->out.args[0].value = &reply;
+        request->out.args[0].size = sizeof(reply);
+        request_send(conn, request);
+        error = request->out.h.error;
+        fuse_put_request(conn, request);
+        if (error)
+                return error;
+        if ((inode->i_mode ^ reply.attr.mode) & S_IFMT) {
+                /* The object changed type underneath us */
+                make_bad_inode(inode);
+                return -EIO;
+        }
+        fi = get_fuse_inode(inode);
+        fuse_change_attributes(inode, &reply.attr);
+        fi->i_time = time_to_jiffies(reply.attr_valid,
+                                     reply.attr_valid_nsec);
+        return 0;
+}
+/*
+ * Calling into a user-controlled filesystem gives the filesystem
+ * daemon ptrace-like capabilities over the requester process.  This
+ * means that the filesystem daemon is able to record the exact
+ * filesystem operations performed, and can also control the behavior
+ * of the requester process in otherwise impossible ways.  For example
+ * it can delay the operation for an arbitrary length of time, allowing
+ * DoS against the requester.
+ *
+ * For this reason only those processes can call into the filesystem,
+ * for which the owner of the mount has ptrace privilege.  This
+ * excludes processes started by other users, suid or sgid processes.
+ *
+ * Returns 1 if 'task' may use the filesystem, 0 otherwise.  Access is
+ * granted unconditionally when FUSE_ALLOW_OTHER is set; otherwise all
+ * of the task's uid/euid/suid must equal fc->user_id and all of its
+ * gid/egid/sgid must equal fc->group_id.
+ */
+static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
+{
+        if (fc->flags & FUSE_ALLOW_OTHER)
+                return 1;
+        if (task->euid != fc->user_id ||
+            task->suid != fc->user_id ||
+            task->uid != fc->user_id ||
+            task->egid != fc->group_id ||
+            task->sgid != fc->group_id ||
+            task->gid != fc->group_id)
+                return 0;
+        return 1;
+}
+/*
+ * Make sure the cached attributes of the inode behind 'entry' are
+ * usable.
+ *
+ * Returns -EACCES if the current task fails fuse_allow_task().  For
+ * any node other than the root whose fi->i_time has not yet passed,
+ * returns 0 without contacting the filesystem.  Otherwise returns the
+ * result of fuse_do_getattr().
+ */
+static int fuse_revalidate(struct dentry *entry)
+{
+        struct inode *inode = entry->d_inode;
+        struct fuse_inode *fi = get_fuse_inode(inode);
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        if (!fuse_allow_task(conn, current))
+                return -EACCES;
+        /* The root node is always refreshed; others only once expired */
+        if (get_node_id(inode) == FUSE_ROOT_ID ||
+            !time_before_eq(jiffies, fi->i_time))
+                return fuse_do_getattr(inode);
+        return 0;
+}
+/*
+ * Permission check for 'inode' against the access 'mask'.
+ *
+ * Tasks rejected by fuse_allow_task() always get -EACCES.  With
+ * FUSE_DEFAULT_PERMISSIONS the decision is made by
+ * generic_permission(), retried once after refreshing the attributes
+ * if it first returns -EACCES.  Without that flag only two checks are
+ * done here: writes to a read-only regular file, directory or symlink
+ * give -EROFS, and executing a non-directory with no execute bit set
+ * gives -EACCES; everything else is allowed.
+ */
+static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        int mode;
+        if (!fuse_allow_task(conn, current))
+                return -EACCES;
+        if (conn->flags & FUSE_DEFAULT_PERMISSIONS) {
+                int result = generic_permission(inode, mask, NULL);
+                /* FIXME: Need some mechanism to revoke permissions:
+                   currently if the filesystem suddenly changes the
+                   file mode, we will not be informed about it, and
+                   continue to allow access to the file/directory.
+                   This is actually not so grave, since the user can
+                   simply keep access to the file/directory anyway by
+                   keeping it open... */
+                if (result != -EACCES)
+                        return result;
+                /* If permission is denied, try to refresh file
+                   attributes.  This is also needed, because the root
+                   node will at first have no permissions */
+                result = fuse_do_getattr(inode);
+                if (result)
+                        return result;
+                return generic_permission(inode, mask, NULL);
+        }
+        mode = inode->i_mode;
+        if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
+            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+                return -EROFS;
+        if ((mask & MAY_EXEC) && !S_ISDIR(mode) && !(mode & S_IXUGO))
+                return -EACCES;
+        return 0;
+}
+/*
+ * Walk the fuse_dirent records packed in 'buf' ('nbytes' long) and
+ * feed each one to 'filldir'.
+ *
+ * Parsing stops quietly when fewer than FUSE_NAME_OFFSET bytes remain,
+ * when a record does not fit in the remaining bytes, or when filldir
+ * returns non-zero.  file->f_pos is advanced to the record's 'off'
+ * field after each record that filldir accepted.
+ *
+ * Returns 0, or -EIO if a record has a zero name length or one larger
+ * than FUSE_NAME_MAX.
+ */
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+                         void *dstbuf, filldir_t filldir)
+{
+        char *cursor = buf;
+        size_t remaining = nbytes;
+        while (remaining >= FUSE_NAME_OFFSET) {
+                struct fuse_dirent *ent = (struct fuse_dirent *) cursor;
+                size_t entsize = FUSE_DIRENT_SIZE(ent);
+                if (ent->namelen == 0 || ent->namelen > FUSE_NAME_MAX)
+                        return -EIO;
+                /* Truncated trailing record: ignore it */
+                if (entsize > remaining)
+                        break;
+                if (filldir(dstbuf, ent->name, ent->namelen,
+                            file->f_pos, ent->ino, ent->type))
+                        break;
+                cursor += entsize;
+                remaining -= entsize;
+                file->f_pos = ent->off;
+        }
+        return 0;
+}
+/*
+ * Directory flavour of fuse_send_read_common(): forwards all arguments
+ * with isdir set to 1 and returns its result.
+ */
+static inline size_t fuse_send_readdir(struct fuse_req *req, struct file *file,
+                                       struct inode *inode, loff_t pos,
+                                       size_t count)
+{
+        const int isdir = 1;
+        return fuse_send_read_common(req, file, inode, pos, count, isdir);
+}
+/*
+ * readdir file operation: read up to one page of directory entries
+ * starting at file->f_pos and pass them to 'filldir' through
+ * parse_dirfile().
+ *
+ * Returns 0 on success, the error from the reply header or from
+ * parse_dirfile(), -ENOMEM if the buffer page cannot be allocated, or
+ * -EINTR when no request could be obtained.
+ */
+static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+{
+        struct inode *dir = file->f_dentry->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(dir);
+        struct fuse_req *request = fuse_get_request(conn);
+        struct page *buf_page;
+        size_t got;
+        int error;
+        if (!request)
+                return -EINTR;
+        buf_page = alloc_page(GFP_KERNEL);
+        if (!buf_page) {
+                fuse_put_request(conn, request);
+                return -ENOMEM;
+        }
+        request->pages[0] = buf_page;
+        request->num_pages = 1;
+        got = fuse_send_readdir(request, file, dir, file->f_pos, PAGE_SIZE);
+        error = request->out.h.error;
+        fuse_put_request(conn, request);
+        if (error == 0)
+                error = parse_dirfile(page_address(buf_page), got, file,
+                                      dstbuf, filldir);
+        __free_page(buf_page);
+        fuse_invalidate_attr(dir); /* atime changed */
+        return error;
+}
+/*
+ * Read the target of the symlink behind 'dentry' with a FUSE_READLINK
+ * request.
+ *
+ * On success returns a NUL-terminated string stored in a freshly
+ * allocated page; release it with free_link().  At most PAGE_SIZE - 1
+ * bytes are accepted from the reply so the terminator always fits.
+ * On failure returns an ERR_PTR(): -EINTR when no request could be
+ * obtained, -ENOMEM when the page allocation fails, or the error from
+ * the reply header.
+ */
+static char *read_link(struct dentry *dentry)
+{
+        struct inode *inode = dentry->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_req *request = fuse_get_request(conn);
+        char *target;
+        if (!request)
+                return ERR_PTR(-EINTR);
+        target = (char *) __get_free_page(GFP_KERNEL);
+        if (target) {
+                request->in.h.opcode = FUSE_READLINK;
+                request->in.h.nodeid = get_node_id(inode);
+                request->inode = inode;
+                request->out.argvar = 1;
+                request->out.numargs = 1;
+                request->out.args[0].value = target;
+                request->out.args[0].size = PAGE_SIZE - 1;
+                request_send(conn, request);
+                if (!request->out.h.error)
+                        target[request->out.args[0].size] = '\0';
+                else {
+                        free_page((unsigned long) target);
+                        target = ERR_PTR(request->out.h.error);
+                }
+        } else
+                target = ERR_PTR(-ENOMEM);
+        fuse_put_request(conn, request);
+        fuse_invalidate_attr(inode); /* atime changed */
+        return target;
+}
+/*
+ * Release a value returned by read_link().  ERR_PTR() values carry no
+ * page and are ignored.
+ */
+static void free_link(char *link)
+{
+        if (IS_ERR(link))
+                return;
+        free_page((unsigned long) link);
+}
+/*
+ * follow_link inode operation: store the result of read_link() (a
+ * string or an ERR_PTR) in 'nd' via nd_set_link().  Always returns
+ * NULL.
+ */
+static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+        char *target = read_link(dentry);
+        nd_set_link(nd, target);
+        return NULL;
+}
+/*
+ * put_link inode operation: release the link value stored in 'nd' by
+ * fuse_follow_link().  'dentry' and 'c' are unused.
+ */
+static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+        char *target = nd_get_link(nd);
+        free_link(target);
+}
+/* open file operation for directories: fuse_open_common() with isdir = 1 */
+static int fuse_dir_open(struct inode *inode, struct file *file)
+{
+        const int isdir = 1;
+        return fuse_open_common(inode, file, isdir);
+}
+/* release file operation for directories: fuse_release_common() with isdir = 1 */
+static int fuse_dir_release(struct inode *inode, struct file *file)
+{
+        const int isdir = 1;
+        return fuse_release_common(inode, file, isdir);
+}
+/*
+ * fsync file operation for directories.  Returns 0 without doing
+ * anything when 'file' is NULL, otherwise the result of
+ * fuse_fsync_common() with isdir = 1.
+ */
+static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+{
+        /* nfsd can call this with no file */
+        if (!file)
+                return 0;
+        return fuse_fsync_common(file, de, datasync, 1);
+}
+/*
+ * Translate a VFS 'iattr' change request into a fuse_attr.
+ *
+ * 'fattr' is zeroed first, then each field selected in
+ * iattr->ia_valid (mode, uid, gid, size) is copied over.  Access and
+ * modification times are copied only when both ATTR_ATIME and
+ * ATTR_MTIME are set, and only their tv_sec part is used.
+ *
+ * Returns the FATTR_* mask describing which fields of 'fattr' are set.
+ */
+static unsigned iattr_to_fattr(struct iattr *iattr, struct fuse_attr *fattr)
+{
+        const unsigned both_times = ATTR_ATIME | ATTR_MTIME;
+        unsigned requested = iattr->ia_valid;
+        unsigned result = 0;
+        memset(fattr, 0, sizeof(*fattr));
+        if (requested & ATTR_MODE) {
+                result |= FATTR_MODE;
+                fattr->mode = iattr->ia_mode;
+        }
+        if (requested & ATTR_UID) {
+                result |= FATTR_UID;
+                fattr->uid = iattr->ia_uid;
+        }
+        if (requested & ATTR_GID) {
+                result |= FATTR_GID;
+                fattr->gid = iattr->ia_gid;
+        }
+        if (requested & ATTR_SIZE) {
+                result |= FATTR_SIZE;
+                fattr->size = iattr->ia_size;
+        }
+        /* You can only _set_ these together (they may change by themselves) */
+        if ((requested & both_times) == both_times) {
+                result |= FATTR_ATIME | FATTR_MTIME;
+                fattr->atime = iattr->ia_atime.tv_sec;
+                fattr->mtime = iattr->ia_mtime.tv_sec;
+        }
+        return result;
+}
+/*
+ * setattr inode operation: apply the attribute changes in 'attr' to
+ * the inode behind 'entry' with a FUSE_SETATTR request.
+ *
+ * With FUSE_DEFAULT_PERMISSIONS the change is first vetted by
+ * inode_change_ok().  A size change beyond the task's RLIMIT_FSIZE
+ * raises SIGXFSZ and fails with -EFBIG before anything is sent.
+ *
+ * On success the attributes returned by the filesystem are installed
+ * and fi->i_time is refreshed; if the reply's file type (S_IFMT bits)
+ * differs from the cached i_mode the inode is marked bad and -EIO is
+ * returned.  If the request fails with -EINTR the cached attributes
+ * are invalidated.
+ *
+ * Returns 0 on success, otherwise a negative error: one of the cases
+ * above, the error from the reply header, or -EINTR when no request
+ * could be obtained.
+ */
+static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+{
+        struct inode *inode = entry->d_inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_inode *fi = get_fuse_inode(inode);
+        struct fuse_req *req;
+        struct fuse_setattr_in inarg;
+        struct fuse_attr_out outarg;
+        int err;
+        int is_truncate = 0;
+        if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+                err = inode_change_ok(inode, attr);
+                if (err)
+                        return err;
+        }
+        if (attr->ia_valid & ATTR_SIZE) {
+                unsigned long limit;
+                is_truncate = 1;
+                /* Enforce the caller's file size resource limit */
+                limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+                if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
+                        send_sig(SIGXFSZ, current, 0);
+                        return -EFBIG;
+                }
+        }
+        req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        memset(&inarg, 0, sizeof(inarg));
+        inarg.valid = iattr_to_fattr(attr, &inarg.attr);
+        req->in.h.opcode = FUSE_SETATTR;
+        req->in.h.nodeid = get_node_id(inode);
+        req->inode = inode;
+        req->in.numargs = 1;
+        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].value = &inarg;
+        req->out.numargs = 1;
+        req->out.args[0].size = sizeof(outarg);
+        req->out.args[0].value = &outarg;
+        request_send(fc, req);
+        err = req->out.h.error;
+        fuse_put_request(fc, req);
+        if (!err) {
+                /* A change of file type means this is a different object */
+                if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+                        make_bad_inode(inode);
+                        err = -EIO;
+                } else {
+                        if (is_truncate) {
+                                /* Use the size the filesystem reported, and
+                                   call vmtruncate() only if the file shrank */
+                                loff_t origsize = i_size_read(inode);
+                                i_size_write(inode, outarg.attr.size);
+                                if (origsize > outarg.attr.size)
+                                        vmtruncate(inode, outarg.attr.size);
+                        }
+                        fuse_change_attributes(inode, &outarg.attr);
+                        fi->i_time = time_to_jiffies(outarg.attr_valid,
+                                                     outarg.attr_valid_nsec);
+                }
+        } else if (err == -EINTR)
+                /* Outcome unknown: make the next lookup refetch attributes */
+                fuse_invalidate_attr(inode);
+        return err;
+}
+/*
+ * getattr inode operation: revalidate the inode behind 'entry' and,
+ * if that succeeds, fill 'stat' with generic_fillattr().  'mnt' is
+ * unused.  Returns the result of fuse_revalidate().
+ */
+static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
+                        struct kstat *stat)
+{
+        struct inode *inode = entry->d_inode;
+        int error = fuse_revalidate(entry);
+        if (error)
+                return error;
+        generic_fillattr(inode, stat);
+        return 0;
+}
+/*
+ * lookup inode operation: resolve 'entry' in 'dir' through
+ * fuse_lookup_iget() and attach the resulting inode (possibly NULL)
+ * with d_splice_alias().  'nd' is unused.
+ *
+ * A directory inode that already has a connected dentry is refused
+ * with -EIO so that no second alias to a directory is created.
+ *
+ * Returns the result of d_splice_alias(), or an ERR_PTR() carrying
+ * the error from fuse_lookup_iget() or -EIO.
+ */
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+                                  struct nameidata *nd)
+{
+        struct inode *inode;
+        int err = fuse_lookup_iget(dir, entry, &inode);
+        if (err)
+                return ERR_PTR(err);
+        if (inode && S_ISDIR(inode->i_mode)) {
+                /* Don't allow creating an alias to a directory  */
+                struct dentry *alias = d_find_alias(inode);
+                if (alias) {
+                        int connected =
+                                !(alias->d_flags & DCACHE_DISCONNECTED);
+                        /* d_find_alias() took a reference; drop it on
+                           every path, not only when refusing the lookup,
+                           otherwise a disconnected alias is leaked */
+                        dput(alias);
+                        if (connected) {
+                                iput(inode);
+                                return ERR_PTR(-EIO);
+                        }
+                }
+        }
+        return d_splice_alias(inode, entry);
+}
+/*
+ * setxattr inode operation: set extended attribute 'name' to the
+ * 'size' bytes at 'value' with a FUSE_SETXATTR request.
+ *
+ * Returns 0 or the error from the reply header; -E2BIG if 'size'
+ * exceeds FUSE_XATTR_SIZE_MAX; -EINTR when no request could be
+ * obtained; -EOPNOTSUPP if the filesystem answered -ENOSYS now or on
+ * an earlier call (remembered in fc->no_setxattr).
+ */
+static int fuse_setxattr(struct dentry *entry, const char *name,
+                         const void *value, size_t size, int flags)
+{
+        struct inode *inode = entry->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_setxattr_in header;
+        struct fuse_req *request;
+        int error;
+        if (size > FUSE_XATTR_SIZE_MAX)
+                return -E2BIG;
+        if (conn->no_setxattr)
+                return -EOPNOTSUPP;
+        request = fuse_get_request(conn);
+        if (!request)
+                return -EINTR;
+        memset(&header, 0, sizeof(header));
+        header.size = size;
+        header.flags = flags;
+        request->in.h.opcode = FUSE_SETXATTR;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        /* Arguments: header, attribute name, attribute value */
+        request->in.numargs = 3;
+        request->in.args[0].value = &header;
+        request->in.args[0].size = sizeof(header);
+        request->in.args[1].value = name;
+        request->in.args[1].size = strlen(name) + 1;
+        request->in.args[2].value = value;
+        request->in.args[2].size = size;
+        request_send(conn, request);
+        error = request->out.h.error;
+        fuse_put_request(conn, request);
+        if (error != -ENOSYS)
+                return error;
+        conn->no_setxattr = 1;
+        return -EOPNOTSUPP;
+}
+/*
+ * getxattr inode operation: query extended attribute 'name' with a
+ * FUSE_GETXATTR request.
+ *
+ * With a non-zero 'size' the reply is written into 'value' and the
+ * number of bytes received is returned.  With 'size' zero the reply is
+ * a fuse_getxattr_out and its 'size' field is returned instead.
+ *
+ * On failure returns the error from the reply header; -EINTR when no
+ * request could be obtained; -EOPNOTSUPP if the filesystem answered
+ * -ENOSYS now or on an earlier call (remembered in fc->no_getxattr).
+ */
+static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
+                             void *value, size_t size)
+{
+        struct inode *inode = entry->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_getxattr_in query;
+        struct fuse_getxattr_out length_reply;
+        struct fuse_req *request;
+        ssize_t result;
+        if (conn->no_getxattr)
+                return -EOPNOTSUPP;
+        request = fuse_get_request(conn);
+        if (!request)
+                return -EINTR;
+        memset(&query, 0, sizeof(query));
+        query.size = size;
+        request->in.h.opcode = FUSE_GETXATTR;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        request->in.numargs = 2;
+        request->in.args[0].value = &query;
+        request->in.args[0].size = sizeof(query);
+        request->in.args[1].value = name;
+        request->in.args[1].size = strlen(name) + 1;
+        /* This is really two different operations rolled into one */
+        request->out.numargs = 1;
+        if (size == 0) {
+                request->out.args[0].value = &length_reply;
+                request->out.args[0].size = sizeof(length_reply);
+        } else {
+                request->out.argvar = 1;
+                request->out.args[0].value = value;
+                request->out.args[0].size = size;
+        }
+        request_send(conn, request);
+        result = request->out.h.error;
+        if (result == -ENOSYS) {
+                conn->no_getxattr = 1;
+                result = -EOPNOTSUPP;
+        } else if (result == 0) {
+                if (size)
+                        result = request->out.args[0].size;
+                else
+                        result = length_reply.size;
+        }
+        fuse_put_request(conn, request);
+        return result;
+}
+/*
+ * listxattr inode operation: list extended attribute names with a
+ * FUSE_LISTXATTR request.
+ *
+ * With a non-zero 'size' the reply is written into 'list' and the
+ * number of bytes received is returned.  With 'size' zero the reply is
+ * a fuse_getxattr_out and its 'size' field is returned instead.
+ *
+ * On failure returns the error from the reply header; -EINTR when no
+ * request could be obtained; -EOPNOTSUPP if the filesystem answered
+ * -ENOSYS now or on an earlier call (remembered in fc->no_listxattr).
+ */
+static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+        struct inode *inode = entry->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_getxattr_in query;
+        struct fuse_getxattr_out length_reply;
+        struct fuse_req *request;
+        ssize_t result;
+        if (conn->no_listxattr)
+                return -EOPNOTSUPP;
+        request = fuse_get_request(conn);
+        if (!request)
+                return -EINTR;
+        memset(&query, 0, sizeof(query));
+        query.size = size;
+        request->in.h.opcode = FUSE_LISTXATTR;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        request->in.numargs = 1;
+        request->in.args[0].value = &query;
+        request->in.args[0].size = sizeof(query);
+        /* This is really two different operations rolled into one */
+        request->out.numargs = 1;
+        if (size == 0) {
+                request->out.args[0].value = &length_reply;
+                request->out.args[0].size = sizeof(length_reply);
+        } else {
+                request->out.argvar = 1;
+                request->out.args[0].value = list;
+                request->out.args[0].size = size;
+        }
+        request_send(conn, request);
+        result = request->out.h.error;
+        if (result == -ENOSYS) {
+                conn->no_listxattr = 1;
+                result = -EOPNOTSUPP;
+        } else if (result == 0) {
+                if (size)
+                        result = request->out.args[0].size;
+                else
+                        result = length_reply.size;
+        }
+        fuse_put_request(conn, request);
+        return result;
+}
+/*
+ * removexattr inode operation: remove extended attribute 'name' with
+ * a FUSE_REMOVEXATTR request.
+ *
+ * Returns 0 or the error from the reply header; -EINTR when no
+ * request could be obtained; -EOPNOTSUPP if the filesystem answered
+ * -ENOSYS now or on an earlier call (remembered in
+ * fc->no_removexattr).
+ */
+static int fuse_removexattr(struct dentry *entry, const char *name)
+{
+        struct inode *inode = entry->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_req *request;
+        int error;
+        if (conn->no_removexattr)
+                return -EOPNOTSUPP;
+        request = fuse_get_request(conn);
+        if (!request)
+                return -EINTR;
+        request->in.h.opcode = FUSE_REMOVEXATTR;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        request->in.numargs = 1;
+        request->in.args[0].value = name;
+        request->in.args[0].size = strlen(name) + 1;
+        request_send(conn, request);
+        error = request->out.h.error;
+        fuse_put_request(conn, request);
+        if (error != -ENOSYS)
+                return error;
+        conn->no_removexattr = 1;
+        return -EOPNOTSUPP;
+}
+/* Inode operations installed on directory inodes by fuse_init_dir() */
+static struct inode_operations fuse_dir_inode_operations = {
+        .lookup         = fuse_lookup,
+        .mkdir          = fuse_mkdir,
+        .symlink        = fuse_symlink,
+        .unlink         = fuse_unlink,
+        .rmdir          = fuse_rmdir,
+        .rename         = fuse_rename,
+        .link           = fuse_link,
+        .setattr        = fuse_setattr,
+        .create         = fuse_create,
+        .mknod          = fuse_mknod,
+        .permission     = fuse_permission,
+        .getattr        = fuse_getattr,
+        .setxattr       = fuse_setxattr,
+        .getxattr       = fuse_getxattr,
+        .listxattr      = fuse_listxattr,
+        .removexattr    = fuse_removexattr,
+};
+/* File operations installed on directory inodes by fuse_init_dir() */
+static struct file_operations fuse_dir_operations = {
+        .llseek         = generic_file_llseek,
+        .read           = generic_read_dir,
+        .readdir        = fuse_readdir,
+        .open           = fuse_dir_open,
+        .release        = fuse_dir_release,
+        .fsync          = fuse_dir_fsync,
+};
+/* Inode operations installed by fuse_init_common() */
+static struct inode_operations fuse_common_inode_operations = {
+        .setattr        = fuse_setattr,
+        .permission     = fuse_permission,
+        .getattr        = fuse_getattr,
+        .setxattr       = fuse_setxattr,
+        .getxattr       = fuse_getxattr,
+        .listxattr      = fuse_listxattr,
+        .removexattr    = fuse_removexattr,
+};
+/*
+ * Inode operations installed on symlink inodes by fuse_init_symlink().
+ * Unlike the other tables there is no .permission entry here.
+ */
+static struct inode_operations fuse_symlink_inode_operations = {
+        .setattr        = fuse_setattr,
+        .follow_link    = fuse_follow_link,
+        .put_link       = fuse_put_link,
+        .readlink       = generic_readlink,
+        .getattr        = fuse_getattr,
+        .setxattr       = fuse_setxattr,
+        .getxattr       = fuse_getxattr,
+        .listxattr      = fuse_listxattr,
+        .removexattr    = fuse_removexattr,
+};
+/* Install the common inode operations on 'inode'; i_fop is not touched */
+void fuse_init_common(struct inode *inode)
+{
+        inode->i_op = &fuse_common_inode_operations;
+}
+/* Install the directory inode and file operations on 'inode' */
+void fuse_init_dir(struct inode *inode)
+{
+        inode->i_op = &fuse_dir_inode_operations;
+        inode->i_fop = &fuse_dir_operations;
+}
+/* Install the symlink inode operations on 'inode'; i_fop is not touched */
+void fuse_init_symlink(struct inode *inode)
+{
+        inode->i_op = &fuse_symlink_inode_operations;
+}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
new file mode 100644
index 000000000000..6454022b0536
--- /dev/null
+++ b/fs/fuse/file.c
@@ -0,0 +1,555 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+#include "fuse_i.h"
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+/*
+ * Forward declaration: fuse_open_common() switches file->f_op to this
+ * table when the filesystem replies with FOPEN_DIRECT_IO.  The
+ * initialised definition is presumably further down in this file.
+ */
+static struct file_operations fuse_direct_io_file_operations;
+/*
+ * Common open path for regular files (isdir == 0, FUSE_OPEN) and
+ * directories (isdir != 0, FUSE_OPENDIR).
+ *
+ * Besides sending the open request this allocates the per-file
+ * fuse_file together with a spare request (ff->release_req) that
+ * fuse_release_common() later uses to send the release message.  On
+ * success the file handle from the reply is stored in ff->fh and 'ff'
+ * is attached as file->private_data.
+ *
+ * Reply flags: FOPEN_DIRECT_IO (regular files only) switches the file
+ * to fuse_direct_io_file_operations; unless FOPEN_KEEP_CACHE is set,
+ * the inode's cached pages are invalidated.
+ *
+ * Returns 0 on success, the error from generic_file_open(),
+ * fuse_do_getattr() or the reply header, -ENOMEM if an allocation
+ * fails, or -EINTR when no request could be obtained.
+ */
+int fuse_open_common(struct inode *inode, struct file *file, int isdir)
+{
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_req *req;
+        struct fuse_open_in inarg;
+        struct fuse_open_out outarg;
+        struct fuse_file *ff;
+        int err;
+        err = generic_file_open(inode, file);
+        if (err)
+                return err;
+        /* If opening the root node, no lookup has been performed on
+           it, so the attributes must be refreshed */
+        if (get_node_id(inode) == FUSE_ROOT_ID) {
+                /* Reuse the function-level 'err' instead of shadowing it */
+                err = fuse_do_getattr(inode);
+                if (err)
+                        return err;
+        }
+        req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        err = -ENOMEM;
+        ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
+        if (!ff)
+                goto out_put_request;
+        ff->release_req = fuse_request_alloc();
+        if (!ff->release_req) {
+                kfree(ff);
+                goto out_put_request;
+        }
+        memset(&inarg, 0, sizeof(inarg));
+        /* These creation-time flags are not passed on in the request */
+        inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+        req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+        req->in.h.nodeid = get_node_id(inode);
+        req->inode = inode;
+        req->in.numargs = 1;
+        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].value = &inarg;
+        req->out.numargs = 1;
+        req->out.args[0].size = sizeof(outarg);
+        req->out.args[0].value = &outarg;
+        request_send(fc, req);
+        err = req->out.h.error;
+        if (err) {
+                /* Open failed: nothing will call release, so free both */
+                fuse_request_free(ff->release_req);
+                kfree(ff);
+        } else {
+                if (!isdir && (outarg.open_flags & FOPEN_DIRECT_IO))
+                        file->f_op = &fuse_direct_io_file_operations;
+                if (!(outarg.open_flags & FOPEN_KEEP_CACHE))
+                        invalidate_inode_pages(inode->i_mapping);
+                ff->fh = outarg.fh;
+                file->private_data = ff;
+        }
+ out_put_request:
+        fuse_put_request(fc, req);
+        return err;
+}
+/*
+ * Common release path for regular files (isdir == 0, FUSE_RELEASE)
+ * and directories (isdir != 0, FUSE_RELEASEDIR).
+ *
+ * Uses the request stored in ff->release_req, so nothing is allocated
+ * here.  The request is queued with request_send_background() and the
+ * fuse_file is freed.  Always returns 0.
+ */
+int fuse_release_common(struct inode *inode, struct file *file, int isdir)
+{
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_req *request = ff->release_req;
+        /* The argument lives inside the request itself */
+        struct fuse_release_in *release_arg = &request->misc.release_in;
+        release_arg->fh = ff->fh;
+        release_arg->flags = file->f_flags & ~O_EXCL;
+        request->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        request->in.numargs = 1;
+        request->in.args[0].value = release_arg;
+        request->in.args[0].size = sizeof(struct fuse_release_in);
+        request_send_background(conn, request);
+        kfree(ff);
+        /* Return value is ignored by VFS */
+        return 0;
+}
+/* Open a regular file: fuse_open_common() with isdir = 0 */
+static int fuse_open(struct inode *inode, struct file *file)
+{
+        const int isdir = 0;
+        return fuse_open_common(inode, file, isdir);
+}
+/* Release a regular file: fuse_release_common() with isdir = 0 */
+static int fuse_release(struct inode *inode, struct file *file)
+{
+        const int isdir = 0;
+        return fuse_release_common(inode, file, isdir);
+}
+/*
+ * Send a FUSE_FLUSH request for the open file 'file'.
+ *
+ * Returns 0 on success or when the filesystem does not implement
+ * flush (it answered -ENOSYS now or earlier, remembered in
+ * fc->no_flush); otherwise the error from the reply header, or -EINTR
+ * when no request could be obtained.
+ */
+static int fuse_flush(struct file *file)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_flush_in flush_arg;
+        struct fuse_req *request;
+        int error;
+        if (conn->no_flush)
+                return 0;
+        request = fuse_get_request(conn);
+        if (!request)
+                return -EINTR;
+        memset(&flush_arg, 0, sizeof(flush_arg));
+        flush_arg.fh = ff->fh;
+        request->in.h.opcode = FUSE_FLUSH;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        request->file = file;
+        request->in.numargs = 1;
+        request->in.args[0].value = &flush_arg;
+        request->in.args[0].size = sizeof(flush_arg);
+        request_send(conn, request);
+        error = request->out.h.error;
+        fuse_put_request(conn, request);
+        if (error != -ENOSYS)
+                return error;
+        /* Not implemented: remember that and treat it as success */
+        conn->no_flush = 1;
+        return 0;
+}
+/*
+ * Common fsync path for regular files (isdir == 0, FUSE_FSYNC) and
+ * directories (isdir != 0, FUSE_FSYNCDIR).
+ *
+ * The request's fsync_flags field is 1 when 'datasync' is non-zero
+ * and 0 otherwise.
+ *
+ * Returns 0 on success or when the filesystem does not implement the
+ * operation (it answered -ENOSYS now or earlier, remembered in
+ * fc->no_fsync / fc->no_fsyncdir); otherwise the error from the reply
+ * header, or -EINTR when no request could be obtained.
+ */
+int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+                      int isdir)
+{
+        struct inode *inode = de->d_inode;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_fsync_in sync_arg;
+        struct fuse_req *request;
+        int error;
+        if (isdir ? conn->no_fsyncdir : conn->no_fsync)
+                return 0;
+        request = fuse_get_request(conn);
+        if (!request)
+                return -EINTR;
+        memset(&sync_arg, 0, sizeof(sync_arg));
+        sync_arg.fh = ff->fh;
+        sync_arg.fsync_flags = datasync ? 1 : 0;
+        request->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
+        request->in.h.nodeid = get_node_id(inode);
+        request->inode = inode;
+        request->file = file;
+        request->in.numargs = 1;
+        request->in.args[0].value = &sync_arg;
+        request->in.args[0].size = sizeof(sync_arg);
+        request_send(conn, request);
+        error = request->out.h.error;
+        fuse_put_request(conn, request);
+        if (error != -ENOSYS)
+                return error;
+        /* Not implemented: remember that and treat it as success */
+        if (isdir)
+                conn->no_fsyncdir = 1;
+        else
+                conn->no_fsync = 1;
+        return 0;
+}
+/* fsync for regular files: fuse_fsync_common() with isdir = 0 */
+static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+{
+        const int isdir = 0;
+        return fuse_fsync_common(file, de, datasync, isdir);
+}
+/*
+ * Fill in and send a read request on the caller-supplied 'req':
+ * FUSE_READ for regular files (isdir == 0) or FUSE_READDIR for
+ * directories, for 'count' bytes at offset 'pos'.
+ *
+ * The reply data goes to the pages the caller attached to 'req'
+ * (out.argpages) and may be shorter than requested (out.argvar).
+ *
+ * Returns req->out.args[0].size as it stands after the request was
+ * sent.  The request is not released here and the error is not
+ * checked; callers read req->out.h.error themselves.
+ */
+size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
+                             struct inode *inode, loff_t pos, size_t count,
+                             int isdir)
+{
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_read_in read_arg;
+        memset(&read_arg, 0, sizeof(read_arg));
+        read_arg.fh = ff->fh;
+        read_arg.offset = pos;
+        read_arg.size = count;
+        req->in.h.opcode = isdir ? FUSE_READDIR : FUSE_READ;
+        req->in.h.nodeid = get_node_id(inode);
+        req->inode = inode;
+        req->file = file;
+        req->in.numargs = 1;
+        req->in.args[0].value = &read_arg;
+        req->in.args[0].size = sizeof(read_arg);
+        req->out.argpages = 1;
+        req->out.argvar = 1;
+        req->out.numargs = 1;
+        req->out.args[0].size = count;
+        request_send(conn, req);
+        return req->out.args[0].size;
+}
+/*
+ * Regular-file flavour of fuse_send_read_common(): forwards all
+ * arguments with isdir set to 0 and returns its result.
+ */
+static inline size_t fuse_send_read(struct fuse_req *req, struct file *file,
+                                    struct inode *inode, loff_t pos,
+                                    size_t count)
+{
+        const int isdir = 0;
+        return fuse_send_read_common(req, file, inode, pos, count, isdir);
+}
+/*
+ * readpage address-space operation: read the single page 'page' from
+ * the filesystem.
+ *
+ * The page is marked up to date only when the request succeeds, and
+ * is unlocked on every path.  Returns 0, the error from the reply
+ * header, or -EINTR when no request could be obtained.
+ */
+static int fuse_readpage(struct file *file, struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        loff_t offset = (loff_t) page->index << PAGE_CACHE_SHIFT;
+        struct fuse_req *request = fuse_get_request(conn);
+        int error;
+        if (!request) {
+                unlock_page(page);
+                return -EINTR;
+        }
+        request->out.page_zeroing = 1;
+        request->pages[0] = page;
+        request->num_pages = 1;
+        fuse_send_read(request, file, inode, offset, PAGE_CACHE_SIZE);
+        error = request->out.h.error;
+        fuse_put_request(conn, request);
+        if (error == 0)
+                SetPageUptodate(page);
+        fuse_invalidate_attr(inode); /* atime changed */
+        unlock_page(page);
+        return error;
+}
+/*
+ * Send one read request covering all pages currently collected in
+ * 'req', starting at the file offset of req->pages[0].
+ *
+ * Afterwards every page is unlocked, and marked up to date if the
+ * request succeeded.  Returns the error from the reply header.  The
+ * request is not released here.
+ */
+static int fuse_send_readpages(struct fuse_req *req, struct file *file,
+                               struct inode *inode)
+{
+        loff_t start = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT;
+        size_t length = req->num_pages << PAGE_CACHE_SHIFT;
+        unsigned idx = 0;
+        req->out.page_zeroing = 1;
+        fuse_send_read(req, file, inode, start, length);
+        while (idx < req->num_pages) {
+                struct page *pg = req->pages[idx];
+                if (req->out.h.error == 0)
+                        SetPageUptodate(pg);
+                unlock_page(pg);
+                idx++;
+        }
+        return req->out.h.error;
+}
+/*
+ * State handed from fuse_readpages() to fuse_readpages_fill() through
+ * read_cache_pages().
+ */
+struct fuse_readpages_data {
+        struct fuse_req *req;   /* request accumulating pages */
+        struct file *file;      /* file being read */
+        struct inode *inode;    /* inode of that file */
+};
+/*
+ * Callback passed to read_cache_pages(): add 'page' to the request in
+ * '_data' (a struct fuse_readpages_data).
+ *
+ * If the request already holds pages and cannot take this one, it is
+ * sent first with fuse_send_readpages() and then reset.  That happens
+ * when it is full (FUSE_MAX_PAGES_PER_REQ), when one more page would
+ * exceed fc->max_read, or when 'page' does not directly follow the
+ * last collected page.
+ *
+ * Returns 0, or the error from fuse_send_readpages(), in which case
+ * 'page' is unlocked and not added.
+ */
+static int fuse_readpages_fill(void *_data, struct page *page)
+{
+        struct fuse_readpages_data *data = _data;
+        struct fuse_req *req = data->req;
+        struct inode *inode = data->inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        /* The num_pages test must come first: it guards pages[num_pages - 1] */
+        if (req->num_pages &&
+            (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+             (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+             req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+                int err = fuse_send_readpages(req, data->file, inode);
+                if (err) {
+                        unlock_page(page);
+                        return err;
+                }
+                /* presumably clears the request for reuse, num_pages included */
+                fuse_reset_request(req);
+        }
+        req->pages[req->num_pages] = page;
+        req->num_pages ++;
+        return 0;
+}
+/*
+ * readpages address-space operation: read the pages on the 'pages'
+ * list using as few requests as fuse_readpages_fill() allows.
+ * 'nr_pages' is unused.
+ *
+ * Pages still collected in the request when read_cache_pages()
+ * returns successfully are sent with one final
+ * fuse_send_readpages().  Returns 0, the first error encountered, or
+ * -EINTR when no request could be obtained.
+ */
+static int fuse_readpages(struct file *file, struct address_space *mapping,
+                          struct list_head *pages, unsigned nr_pages)
+{
+        struct inode *inode = mapping->host;
+        struct fuse_conn *conn = get_fuse_conn(inode);
+        struct fuse_readpages_data ctx = {
+                .req    = fuse_get_request(conn),
+                .file   = file,
+                .inode  = inode,
+        };
+        int error;
+        if (!ctx.req)
+                return -EINTR;
+        error = read_cache_pages(mapping, pages, fuse_readpages_fill, &ctx);
+        /* Flush whatever the fill callback left queued */
+        if (error == 0 && ctx.req->num_pages)
+                error = fuse_send_readpages(ctx.req, file, inode);
+        fuse_put_request(conn, ctx.req);
+        fuse_invalidate_attr(inode); /* atime changed */
+        return error;
+}
+/*
+ * Send a FUSE_WRITE request for count bytes at offset pos.  The data is
+ * taken from the pages already attached to req (in.argpages is set).
+ *
+ * Returns the size field of the reply.  Callers must check
+ * req->out.h.error before trusting the value; the reply buffer is zeroed
+ * up front so that 0, rather than stack garbage, is returned when the
+ * reply is never filled in.
+ */
+static size_t fuse_send_write(struct fuse_req *req, struct file *file,
+                              struct inode *inode, loff_t pos, size_t count)
+{
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_write_in inarg;
+        struct fuse_write_out outarg;
+        memset(&inarg, 0, sizeof(struct fuse_write_in));
+        memset(&outarg, 0, sizeof(struct fuse_write_out));
+        inarg.fh = ff->fh;
+        inarg.offset = pos;
+        inarg.size = count;
+        req->in.h.opcode = FUSE_WRITE;
+        req->in.h.nodeid = get_node_id(inode);
+        req->inode = inode;
+        req->file = file;
+        req->in.argpages = 1;
+        req->in.numargs = 2;
+        req->in.args[0].size = sizeof(struct fuse_write_in);
+        req->in.args[0].value = &inarg;
+        /* Second argument is the page data itself */
+        req->in.args[1].size = count;
+        req->out.numargs = 1;
+        req->out.args[0].size = sizeof(struct fuse_write_out);
+        req->out.args[0].value = &outarg;
+        request_send(fc, req);
+        return outarg.size;
+}
+/*
+ * prepare_write address-space operation.  Nothing is prepared here; the
+ * data is sent from fuse_commit_write().  Always returns 0.
+ */
+static int fuse_prepare_write(struct file *file, struct page *page,
+                              unsigned offset, unsigned to)
+{
+        /* No op */
+        return 0;
+}
+/*
+ * commit_write address-space operation: synchronously send bytes
+ * [offset, to) of the page with a single write request.
+ *
+ * Returns -EINTR if no request is available, the request's error, or -EIO
+ * if fewer bytes than asked were written.  On success the inode size is
+ * extended if needed, and a fully written page is marked clean and
+ * up to date.  Cached attributes are invalidated in every case once a
+ * request has been sent.
+ */
+static int fuse_commit_write(struct file *file, struct page *page,
+                             unsigned offset, unsigned to)
+{
+        struct inode *inode = page->mapping->host;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        unsigned len = to - offset;
+        loff_t start = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset;
+        loff_t end = start + len;
+        struct fuse_req *req;
+        size_t written;
+        int err;
+        req = fuse_get_request(fc);
+        if (req == NULL)
+                return -EINTR;
+        req->page_offset = offset;
+        req->pages[0] = page;
+        req->num_pages = 1;
+        written = fuse_send_write(req, file, inode, start, len);
+        err = req->out.h.error;
+        fuse_put_request(fc, req);
+        if (err == 0 && written != len)
+                err = -EIO;
+        if (err)
+                goto out;
+        if (end > i_size_read(inode))
+                i_size_write(inode, end);
+        if (offset == 0 && to == PAGE_CACHE_SIZE) {
+                clear_page_dirty(page);
+                SetPageUptodate(page);
+        }
+ out:
+        fuse_invalidate_attr(inode);
+        return err;
+}
+/*
+ * Drop the references on every page attached to req.  When write is
+ * non-zero each page is marked dirty first.
+ */
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+        struct page **pagep = req->pages;
+        struct page **end = req->pages + req->num_pages;
+        while (pagep != end) {
+                if (write)
+                        set_page_dirty_lock(*pagep);
+                put_page(*pagep);
+                pagep++;
+        }
+}
+/*
+ * Pin the user pages backing buf[0..nbytes) and attach them to req.
+ *
+ * At most FUSE_MAX_PAGES_PER_REQ pages are requested; fewer may be
+ * obtained, so the caller has to derive the usable byte count from
+ * req->num_pages and req->page_offset.  write is passed through to
+ * get_user_pages() as its write flag.
+ *
+ * Returns 0 on success, -EPERM if the current task has no mm, or the
+ * negative value returned by get_user_pages().
+ */
+static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
+                               unsigned nbytes, int write)
+{
+        unsigned long start = (unsigned long) buf;
+        unsigned first_off = start & ~PAGE_MASK;
+        int want;
+        int got;
+        /* This doesn't work with nfsd */
+        if (current->mm == NULL)
+                return -EPERM;
+        nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+        want = (nbytes + first_off + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        want = min(want, FUSE_MAX_PAGES_PER_REQ);
+        down_read(&current->mm->mmap_sem);
+        got = get_user_pages(current, current->mm, start, want, write,
+                             0, req->pages, NULL);
+        up_read(&current->mm->mmap_sem);
+        if (got < 0)
+                return got;
+        req->num_pages = got;
+        req->page_offset = first_off;
+        return 0;
+}
+/*
+ * Transfer count bytes between the user buffer and the filesystem without
+ * going through the page cache, in chunks of at most fc->max_write
+ * (write != 0) or fc->max_read (write == 0) bytes.
+ *
+ * Returns the number of bytes transferred, or a negative error if nothing
+ * was transferred (-EINTR when no request is available).  A reply larger
+ * than the chunk that was asked for yields -EIO.  On progress *ppos is
+ * advanced and, for writes, the inode size is extended if needed.  Cached
+ * attributes are invalidated once the transfer loop has run.
+ */
+static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+                              size_t count, loff_t *ppos, int write)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        size_t nmax = write ? fc->max_write : fc->max_read;
+        loff_t pos = *ppos;
+        ssize_t res = 0;
+        struct fuse_req *req = fuse_get_request(fc);
+        if (!req)
+                return -EINTR;
+        while (count) {
+                size_t tmp;
+                size_t nres;
+                size_t nbytes = min(count, nmax);
+                /* A read stores into the user pages, hence !write */
+                int err = fuse_get_user_pages(req, buf, nbytes, !write);
+                if (err) {
+                        /* Keep the count of bytes already transferred,
+                           as is done for request errors below */
+                        if (!res)
+                                res = err;
+                        break;
+                }
+                /* Fewer pages than asked for may have been pinned */
+                tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+                nbytes = min(nbytes, tmp);
+                if (write)
+                        nres = fuse_send_write(req, file, inode, pos, nbytes);
+                else
+                        nres = fuse_send_read(req, file, inode, pos, nbytes);
+                fuse_release_user_pages(req, !write);
+                if (req->out.h.error) {
+                        if (!res)
+                                res = req->out.h.error;
+                        break;
+                } else if (nres > nbytes) {
+                        res = -EIO;
+                        break;
+                }
+                count -= nres;
+                res += nres;
+                pos += nres;
+                buf += nres;
+                /* Stop on a short transfer, and also when nothing at all
+                   was transferred: a zero-sized chunk (nmax == 0) would
+                   otherwise never reduce count and loop forever */
+                if (nres != nbytes || !nres)
+                        break;
+                if (count)
+                        fuse_reset_request(req);
+        }
+        fuse_put_request(fc, req);
+        if (res > 0) {
+                if (write && pos > i_size_read(inode))
+                        i_size_write(inode, pos);
+                *ppos = pos;
+        }
+        fuse_invalidate_attr(inode);
+        return res;
+}
+/* read file operation for direct I/O: fuse_direct_io() with write == 0 */
+static ssize_t fuse_direct_read(struct file *file, char __user *buf,
+                                     size_t count, loff_t *ppos)
+{
+        return fuse_direct_io(file, buf, count, ppos, 0);
+}
+/*
+ * write file operation for direct I/O: fuse_direct_io() with write == 1,
+ * run while holding the inode's i_sem.
+ */
+static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+        ssize_t nwritten;
+        struct inode *inode = file->f_dentry->d_inode;
+        /* Don't allow parallel writes to the same file */
+        down(&inode->i_sem);
+        nwritten = fuse_direct_io(file, buf, count, ppos, 1);
+        up(&inode->i_sem);
+        return nwritten;
+}
+/*
+ * mmap file operation.  Shared writable mappings are refused with
+ * -ENODEV; shared read-only mappings lose VM_MAYWRITE.  Everything else
+ * is handed to generic_file_mmap().
+ */
+static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+                return -ENODEV;
+        if (vma->vm_flags & VM_SHARED)
+                vma->vm_flags &= ~VM_MAYWRITE;
+        return generic_file_mmap(file, vma);
+}
+/*
+ * set_page_dirty address-space operation.  The message says this is not
+ * expected to be called; if it is, a message and a stack dump are logged
+ * and 0 is returned.
+ */
+static int fuse_set_page_dirty(struct page *page)
+{
+        printk("fuse_set_page_dirty: should not happen\n");
+        dump_stack();
+        return 0;
+}
+/*
+ * File operations installed on regular files by fuse_init_file_inode().
+ * llseek, read, write and sendfile use the generic_file_* helpers.
+ */
+static struct file_operations fuse_file_operations = {
+        .llseek         = generic_file_llseek,
+        .read           = generic_file_read,
+        .write          = generic_file_write,
+        .mmap           = fuse_file_mmap,
+        .open           = fuse_open,
+        .flush          = fuse_flush,
+        .release        = fuse_release,
+        .fsync          = fuse_fsync,
+        .sendfile       = generic_file_sendfile,
+};
+/*
+ * File operations variant whose read and write go through
+ * fuse_direct_io() instead of the generic_file_* helpers.
+ */
+static struct file_operations fuse_direct_io_file_operations = {
+        .llseek         = generic_file_llseek,
+        .read           = fuse_direct_read,
+        .write          = fuse_direct_write,
+        .open           = fuse_open,
+        .flush          = fuse_flush,
+        .release        = fuse_release,
+        .fsync          = fuse_fsync,
+        /* no mmap and sendfile */
+};
+/* Address-space operations installed by fuse_init_file_inode() */
+static struct address_space_operations fuse_file_aops  = {
+        .readpage       = fuse_readpage,
+        .prepare_write  = fuse_prepare_write,
+        .commit_write   = fuse_commit_write,
+        .readpages      = fuse_readpages,
+        .set_page_dirty = fuse_set_page_dirty,
+};
+/*
+ * Install the regular-file address-space and file operations on the
+ * inode.
+ */
+void fuse_init_file_inode(struct inode *inode)
+{
+        inode->i_data.a_ops = &fuse_file_aops;
+        inode->i_fop = &fuse_file_operations;
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
new file mode 100644
index 000000000000..24d761518d86
--- /dev/null
+++ b/fs/fuse/fuse_i.h
@@ -0,0 +1,451 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+#include <linux/fuse.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <asm/semaphore.h>
+/** Max number of pages that can be used in a single request (this is
+    the size of the page vector in struct fuse_req, so it applies to
+    reads and writes alike) */
+#define FUSE_MAX_PAGES_PER_REQ 32
+/** If more requests are outstanding, then the operation will block */
+#define FUSE_MAX_OUTSTANDING 10
+/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
+    module will check permissions based on the file mode.  Otherwise no
+    permission checking is done in the kernel */
+#define FUSE_DEFAULT_PERMISSIONS (1 << 0)
+/** If the FUSE_ALLOW_OTHER flag is given, then not only the user
+    doing the mount will be allowed to access the filesystem */
+#define FUSE_ALLOW_OTHER         (1 << 1)
+/** FUSE inode */
+struct fuse_inode {
+        /** Inode data (embedded, so that get_fuse_inode() can recover
+            the fuse_inode with container_of) */
+        struct inode inode;
+        /** Unique ID, which identifies the inode between userspace
+         * and kernel */
+        u64 nodeid;
+        /** Number of lookups on this inode */
+        u64 nlookup;
+        /** The request used for sending the FORGET message */
+        struct fuse_req *forget_req;
+        /** Time in jiffies until the file attributes are valid */
+        unsigned long i_time;
+};
+/** FUSE specific file data (reached through file->private_data) */
+struct fuse_file {
+        /** Request reserved for flush and release */
+        struct fuse_req *release_req;
+        /** File handle used by userspace */
+        u64 fh;
+};
+/** One input argument of a request */
+struct fuse_in_arg {
+        /** Size of the argument in bytes */
+        unsigned size;
+        /** Pointer to the argument data */
+        const void *value;
+};
+/** The request input */
+struct fuse_in {
+        /** The request header */
+        struct fuse_in_header h;
+        /** True if the data for the last argument is in req->pages */
+        unsigned argpages:1;
+        /** Number of arguments */
+        unsigned numargs;
+        /** Array of arguments (at most 3) */
+        struct fuse_in_arg args[3];
+};
+/** One output argument of a request */
+struct fuse_arg {
+        /** Size of the buffer pointed to by value, in bytes */
+        unsigned size;
+        /** Buffer that receives the argument */
+        void *value;
+};
+/** The request output */
+struct fuse_out {
+        /** Header returned from userspace */
+        struct fuse_out_header h;
+        /** Last argument is variable length (can be shorter than
+            arg->size) */
+        unsigned argvar:1;
+        /** Last argument is a list of pages to copy data to */
+        unsigned argpages:1;
+        /** Zero partially or not copied pages */
+        unsigned page_zeroing:1;
+        /** Number of arguments */
+        unsigned numargs;
+        /** Array of arguments (at most 3) */
+        struct fuse_arg args[3];
+};
+struct fuse_req;
+struct fuse_conn;
+/**
+ * A request to the client
+ */
+struct fuse_req {
+        /** This can be on either unused_list, pending or processing
+            lists in fuse_conn */
+        struct list_head list;
+        /** Entry on the background list */
+        struct list_head bg_entry;
+        /** refcount */
+        atomic_t count;
+        /** True if the request has reply */
+        unsigned isreply:1;
+        /** The request is preallocated */
+        unsigned preallocated:1;
+        /** The request was interrupted */
+        unsigned interrupted:1;
+        /** Request is sent in the background */
+        unsigned background:1;
+        /** Data is being copied to/from the request */
+        unsigned locked:1;
+        /** Request has been sent to userspace */
+        unsigned sent:1;
+        /** The request is finished */
+        unsigned finished:1;
+        /** The request input */
+        struct fuse_in in;
+        /** The request output */
+        struct fuse_out out;
+        /** Used to wake up the task waiting for completion of request */
+        wait_queue_head_t waitq;
+        /** Data for asynchronous requests */
+        union {
+                struct fuse_forget_in forget_in;
+                struct fuse_release_in release_in;
+                struct fuse_init_in_out init_in_out;
+        } misc;
+        /** page vector (FUSE_MAX_PAGES_PER_REQ entries) */
+        struct page *pages[FUSE_MAX_PAGES_PER_REQ];
+        /** number of pages in vector */
+        unsigned num_pages;
+        /** offset of data on first page */
+        unsigned page_offset;
+        /** Inode used in the request */
+        struct inode *inode;
+        /** Second inode used in the request (or NULL) */
+        struct inode *inode2;
+        /** File used in the request (or NULL) */
+        struct file *file;
+};
+/**
+ * A Fuse connection.
+ *
+ * This structure is created, when the filesystem is mounted, and is
+ * destroyed, when the client device is closed and the filesystem is
+ * unmounted.
+ */
+struct fuse_conn {
+        /** Reference count (dropped through fuse_release_conn(), which
+            must be called with fuse_lock held) */
+        int count;
+        /** The user id for this mount */
+        uid_t user_id;
+        /** The group id for this mount */
+        gid_t group_id;
+        /** The fuse mount flags for this mount (FUSE_DEFAULT_PERMISSIONS,
+            FUSE_ALLOW_OTHER) */
+        unsigned flags;
+        /** Maximum read size */
+        unsigned max_read;
+        /** Maximum write size */
+        unsigned max_write;
+        /** Readers of the connection are waiting on this */
+        wait_queue_head_t waitq;
+        /** The list of pending requests */
+        struct list_head pending;
+        /** The list of requests being processed */
+        struct list_head processing;
+        /** Requests put in the background (RELEASE or any other
+            interrupted request) */
+        struct list_head background;
+        /** Controls the maximum number of outstanding requests */
+        struct semaphore outstanding_sem;
+        /** This counts the number of outstanding requests if
+            outstanding_sem would go negative */
+        unsigned outstanding_debt;
+        /** RW semaphore for exclusion with fuse_put_super() */
+        struct rw_semaphore sbput_sem;
+        /** The list of unused requests */
+        struct list_head unused_list;
+        /** The next unique request id */
+        u64 reqctr;
+        /** Mount is active */
+        unsigned mounted : 1;
+        /** Connection established */
+        unsigned connected : 1;
+        /** Connection failed (version mismatch) */
+        unsigned conn_error : 1;
+        /** Is fsync not implemented by fs? */
+        unsigned no_fsync : 1;
+        /** Is fsyncdir not implemented by fs? */
+        unsigned no_fsyncdir : 1;
+        /** Is flush not implemented by fs? */
+        unsigned no_flush : 1;
+        /** Is setxattr not implemented by fs? */
+        unsigned no_setxattr : 1;
+        /** Is getxattr not implemented by fs? */
+        unsigned no_getxattr : 1;
+        /** Is listxattr not implemented by fs? */
+        unsigned no_listxattr : 1;
+        /** Is removexattr not implemented by fs? */
+        unsigned no_removexattr : 1;
+        /** Backing dev info */
+        struct backing_dev_info bdi;
+};
+/** Address of the super block's s_fs_info slot, typed as the place where
+    the connection pointer is stored */
+static inline struct fuse_conn **get_fuse_conn_super_p(struct super_block *sb)
+{
+        return (struct fuse_conn **) &sb->s_fs_info;
+}
+/** The connection stored in the super block's s_fs_info */
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+        return *get_fuse_conn_super_p(sb);
+}
+/** The connection of the super block the inode belongs to */
+static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
+{
+        return get_fuse_conn_super(inode->i_sb);
+}
+/** The fuse_inode that embeds the given inode */
+static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+{
+        return container_of(inode, struct fuse_inode, inode);
+}
+/** The nodeid stored in the inode's fuse_inode */
+static inline u64 get_node_id(struct inode *inode)
+{
+        return get_fuse_inode(inode)->nodeid;
+}
+/** Device operations */
+extern struct file_operations fuse_dev_operations;
+/**
+ * This is the single global spinlock which protects FUSE's structures
+ *
+ * The following data is protected by this lock:
+ *
+ *  - the private_data field of the device file
+ *  - the s_fs_info field of the super block
+ *  - unused_list, pending, processing lists in fuse_conn
+ *  - background list in fuse_conn
+ *  - the unique request ID counter reqctr in fuse_conn
+ *  - the mounted and connected flags in fuse_conn
+ *  - the reference count (count) in fuse_conn
+ */
+extern spinlock_t fuse_lock;
+/**
+ * Get a filled in inode
+ */
+struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
+                        int generation, struct fuse_attr *attr);
+/**
+ * Send FORGET command
+ */
+void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+                      unsigned long nodeid, u64 nlookup);
+/**
+ * Send READ or READDIR request
+ */
+size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
+                             struct inode *inode, loff_t pos, size_t count,
+                             int isdir);
+/**
+ * Send OPEN or OPENDIR request
+ */
+int fuse_open_common(struct inode *inode, struct file *file, int isdir);
+/**
+ * Send RELEASE or RELEASEDIR request
+ */
+int fuse_release_common(struct inode *inode, struct file *file, int isdir);
+/**
+ * Send FSYNC or FSYNCDIR request
+ */
+int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+                      int isdir);
+/**
+ * Initialise file operations on a regular file
+ */
+void fuse_init_file_inode(struct inode *inode);
+/**
+ * Initialise inode operations on regular files and special files
+ */
+void fuse_init_common(struct inode *inode);
+/**
+ * Initialise inode and file operations on a directory
+ */
+void fuse_init_dir(struct inode *inode);
+/**
+ * Initialise inode operations on a symlink
+ */
+void fuse_init_symlink(struct inode *inode);
+/**
+ * Change attributes of an inode
+ */
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr);
+/**
+ * Check if the connection can be released, and if yes, then free the
+ * connection structure
+ */
+void fuse_release_conn(struct fuse_conn *fc);
+/**
+ * Initialize the client device
+ */
+int fuse_dev_init(void);
+/**
+ * Cleanup the client device
+ */
+void fuse_dev_cleanup(void);
+/**
+ * Allocate a request
+ */
+struct fuse_req *fuse_request_alloc(void);
+/**
+ * Free a request
+ */
+void fuse_request_free(struct fuse_req *req);
+/**
+ * Reinitialize a request, the preallocated flag is left unmodified
+ */
+void fuse_reset_request(struct fuse_req *req);
+/**
+ * Reserve a preallocated request
+ */
+struct fuse_req *fuse_get_request(struct fuse_conn *fc);
+/**
+ * Decrement reference count of a request.  If count goes to zero put
+ * on unused list (preallocated) or free request (not preallocated).
+ */
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+/**
+ * Send a request (synchronous)
+ */
+void request_send(struct fuse_conn *fc, struct fuse_req *req);
+/**
+ * Send a request with no reply
+ */
+void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+/**
+ * Send a request in the background
+ */
+void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+/**
+ * Release inodes and file associated with background request
+ */
+void fuse_release_background(struct fuse_req *req);
+/**
+ * Get the attributes of a file
+ */
+int fuse_do_getattr(struct inode *inode);
+/**
+ * Invalidate inode attributes
+ */
+void fuse_invalidate_attr(struct inode *inode);
+/**
+ * Send the INIT message
+ */
+void fuse_send_init(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
new file mode 100644
index 000000000000..e69a546844d0
--- /dev/null
+++ b/fs/fuse/inode.c
@@ -0,0 +1,591 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+#include "fuse_i.h"
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Filesystem in Userspace");
+MODULE_LICENSE("GPL");
+spinlock_t fuse_lock;
+static kmem_cache_t *fuse_inode_cachep;
+#define FUSE_SUPER_MAGIC 0x65735546
+/* Mount options as filled in by parse_fuse_opt() */
+struct fuse_mount_data {
+        /* Value of the "fd=" option */
+        int fd;
+        /* Value of the "rootmode=" option (parsed as octal) */
+        unsigned rootmode;
+        /* Value of the "user_id=" option */
+        unsigned user_id;
+        /* Value of the "group_id=" option */
+        unsigned group_id;
+        /* Set when the corresponding option was given; all four are
+           required by parse_fuse_opt() */
+        unsigned fd_present : 1;
+        unsigned rootmode_present : 1;
+        unsigned user_id_present : 1;
+        unsigned group_id_present : 1;
+        /* FUSE_DEFAULT_PERMISSIONS and/or FUSE_ALLOW_OTHER */
+        unsigned flags;
+        /* Value of the "max_read=" option, ~0 when not given */
+        unsigned max_read;
+};
+/*
+ * Allocate an inode from the fuse inode cache and initialise its FUSE
+ * specific part, including the request later used for FORGET.  Returns
+ * NULL if either allocation fails.
+ */
+static struct inode *fuse_alloc_inode(struct super_block *sb)
+{
+        struct fuse_inode *fi;
+        struct inode *inode = kmem_cache_alloc(fuse_inode_cachep, SLAB_KERNEL);
+        if (inode == NULL)
+                return NULL;
+        fi = get_fuse_inode(inode);
+        fi->i_time = jiffies - 1;
+        fi->nodeid = 0;
+        fi->nlookup = 0;
+        fi->forget_req = fuse_request_alloc();
+        if (fi->forget_req != NULL)
+                return inode;
+        kmem_cache_free(fuse_inode_cachep, inode);
+        return NULL;
+}
+/*
+ * Return an inode to the fuse inode cache, freeing its forget request
+ * first if it still has one.
+ */
+static void fuse_destroy_inode(struct inode *inode)
+{
+        struct fuse_req *forget = get_fuse_inode(inode)->forget_req;
+        if (forget != NULL)
+                fuse_request_free(forget);
+        kmem_cache_free(fuse_inode_cachep, inode);
+}
+/* Intentionally empty; inodes are filled in by fuse_iget() instead */
+static void fuse_read_inode(struct inode *inode)
+{
+        /* No op */
+}
+/*
+ * Fill req with a FUSE_FORGET message for nodeid carrying nlookup and
+ * send it without waiting for a reply.  The argument lives in
+ * req->misc.forget_in.
+ */
+void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+                      unsigned long nodeid, u64 nlookup)
+{
+        req->misc.forget_in.nlookup = nlookup;
+        req->in.h.nodeid = nodeid;
+        req->in.h.opcode = FUSE_FORGET;
+        req->in.args[0].value = &req->misc.forget_in;
+        req->in.args[0].size = sizeof(struct fuse_forget_in);
+        req->in.numargs = 1;
+        request_send_noreply(fc, req);
+}
+/*
+ * If the super block is still active, send FORGET for the inode using
+ * its reserved request.  forget_req is cleared afterwards, so
+ * fuse_destroy_inode() will not free it.
+ */
+static void fuse_clear_inode(struct inode *inode)
+{
+        struct fuse_inode *fi;
+        if (!(inode->i_sb->s_flags & MS_ACTIVE))
+                return;
+        fi = get_fuse_inode(inode);
+        fuse_send_forget(get_fuse_conn(inode), fi->forget_req, fi->nodeid,
+                         fi->nlookup);
+        fi->forget_req = NULL;
+}
+/*
+ * Copy the attributes in attr into the inode.  The file type bits of
+ * i_mode are kept; only the permission bits are taken from attr.  For a
+ * regular file whose size differs from attr->size the cached pages are
+ * invalidated before the new size is stored.
+ */
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
+{
+        /* Must run before i_size is overwritten below */
+        if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
+                invalidate_inode_pages(inode->i_mapping);
+        /* Identity, ownership and mode (type and permission bits are disjoint) */
+        inode->i_ino = attr->ino;
+        inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
+        inode->i_nlink = attr->nlink;
+        inode->i_uid = attr->uid;
+        inode->i_gid = attr->gid;
+        /* Size information */
+        i_size_write(inode, attr->size);
+        inode->i_blksize = PAGE_CACHE_SIZE;
+        inode->i_blocks = attr->blocks;
+        /* Timestamps */
+        inode->i_atime.tv_sec = attr->atime;
+        inode->i_atime.tv_nsec = attr->atimensec;
+        inode->i_mtime.tv_sec = attr->mtime;
+        inode->i_mtime.tv_nsec = attr->mtimensec;
+        inode->i_ctime.tv_sec = attr->ctime;
+        inode->i_ctime.tv_nsec = attr->ctimensec;
+}
+/*
+ * Set the file type and size of a new inode from attr and install the
+ * operations matching that type.  Unknown types are turned into regular
+ * files.
+ */
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+{
+        inode->i_mode = attr->mode & S_IFMT;
+        i_size_write(inode, attr->size);
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFREG:
+                fuse_init_common(inode);
+                fuse_init_file_inode(inode);
+                break;
+        case S_IFDIR:
+                fuse_init_dir(inode);
+                break;
+        case S_IFLNK:
+                fuse_init_symlink(inode);
+                break;
+        case S_IFCHR:
+        case S_IFBLK:
+        case S_IFIFO:
+        case S_IFSOCK:
+                fuse_init_common(inode);
+                init_special_inode(inode, inode->i_mode,
+                                   new_decode_dev(attr->rdev));
+                break;
+        default:
+                /* Don't let user create weird files */
+                inode->i_mode = S_IFREG;
+                fuse_init_common(inode);
+                fuse_init_file_inode(inode);
+                break;
+        }
+}
+/*
+ * iget5_locked() test callback: 1 if the inode's nodeid equals the
+ * value _nodeidp points to, 0 otherwise.
+ * NOTE(review): the key is an unsigned long while fuse_inode.nodeid is a
+ * u64 - confirm that wider ids are not expected on 32-bit.
+ */
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+{
+        unsigned long wanted = *(unsigned long *) _nodeidp;
+        return get_node_id(inode) == wanted;
+}
+/*
+ * iget5_locked() set callback: store the nodeid _nodeidp points to in
+ * the new inode.  Always returns 0.
+ */
+static int fuse_inode_set(struct inode *inode, void *_nodeidp)
+{
+        struct fuse_inode *fi = get_fuse_inode(inode);
+        unsigned long id = *(unsigned long *) _nodeidp;
+        fi->nodeid = id;
+        return 0;
+}
+/*
+ * Look up or create the inode for nodeid, bump its lookup count and
+ * refresh its attributes from attr.
+ *
+ * A new inode is initialised from attr.  If an existing inode has a
+ * different file type than attr reports, it is marked bad and dropped,
+ * and the lookup is repeated once; a second mismatch is a BUG.
+ * Returns NULL if iget5_locked() fails.
+ */
+struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
+                        int generation, struct fuse_attr *attr)
+{
+        struct fuse_conn *fc = get_fuse_conn_super(sb);
+        struct inode *inode;
+        int retried = 0;
+        for (;;) {
+                inode = iget5_locked(sb, nodeid, fuse_inode_eq,
+                                     fuse_inode_set, &nodeid);
+                if (inode == NULL)
+                        return NULL;
+                if (inode->i_state & I_NEW) {
+                        inode->i_flags |= S_NOATIME|S_NOCMTIME;
+                        inode->i_generation = generation;
+                        inode->i_data.backing_dev_info = &fc->bdi;
+                        fuse_init_inode(inode, attr);
+                        unlock_new_inode(inode);
+                        break;
+                }
+                if (!((inode->i_mode ^ attr->mode) & S_IFMT))
+                        break;
+                BUG_ON(retried);
+                /* Inode has changed type, any I/O on the old should fail */
+                make_bad_inode(inode);
+                iput(inode);
+                retried = 1;
+        }
+        get_fuse_inode(inode)->nlookup++;
+        fuse_change_attributes(inode, attr);
+        return inode;
+}
+/*
+ * Tear down the mount side of the connection: release all background
+ * requests, mark the connection as unmounted, wake everything waiting on
+ * it and drop one reference to the connection.
+ */
+static void fuse_put_super(struct super_block *sb)
+{
+        struct fuse_conn *fc = get_fuse_conn_super(sb);
+        /* Held for writing across the teardown (see sbput_sem) */
+        down_write(&fc->sbput_sem);
+        while (!list_empty(&fc->background))
+                fuse_release_background(list_entry(fc->background.next,
+                                                   struct fuse_req, bg_entry));
+        spin_lock(&fuse_lock);
+        fc->mounted = 0;
+        fc->user_id = 0;
+        fc->group_id = 0;
+        fc->flags = 0;
+        /* Flush all readers on this fs */
+        wake_up_all(&fc->waitq);
+        up_write(&fc->sbput_sem);
+        /* May free fc; must be called with fuse_lock held */
+        fuse_release_conn(fc);
+        spin_unlock(&fuse_lock);
+}
+/*
+ * Fill a kstatfs from the statfs data in attr.  f_type is always
+ * FUSE_SUPER_MAGIC.
+ */
+static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
+{
+        /* Block counts */
+        stbuf->f_bsize = attr->bsize;
+        stbuf->f_blocks = attr->blocks;
+        stbuf->f_bfree = attr->bfree;
+        stbuf->f_bavail = attr->bavail;
+        /* File counts */
+        stbuf->f_files = attr->files;
+        stbuf->f_ffree = attr->ffree;
+        stbuf->f_namelen = attr->namelen;
+        stbuf->f_type = FUSE_SUPER_MAGIC;
+        /* fsid is left zero */
+}
+/*
+ * Send a FUSE_STATFS request and, on success, convert the reply into
+ * buf.  Returns -EINTR if no request is available, otherwise the error
+ * of the request (0 on success).
+ */
+static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+        struct fuse_conn *fc = get_fuse_conn_super(sb);
+        struct fuse_req *req = fuse_get_request(fc);
+        struct fuse_statfs_out reply;
+        int err;
+        if (req == NULL)
+                return -EINTR;
+        req->in.h.opcode = FUSE_STATFS;
+        req->in.numargs = 0;
+        req->out.args[0].value = &reply;
+        req->out.args[0].size = sizeof(reply);
+        req->out.numargs = 1;
+        request_send(fc, req);
+        err = req->out.h.error;
+        if (err == 0)
+                convert_fuse_statfs(buf, &reply.st);
+        fuse_put_request(fc, req);
+        return err;
+}
+/* Token ids for the mount options matched through the tokens table */
+enum {
+        OPT_FD,
+        OPT_ROOTMODE,
+        OPT_USER_ID,
+        OPT_GROUP_ID,
+        OPT_DEFAULT_PERMISSIONS,
+        OPT_ALLOW_OTHER,
+        OPT_MAX_READ,
+        /* Terminates the tokens table */
+        OPT_ERR
+};
+/* Mount option patterns, used by parse_fuse_opt() via match_token() */
+static match_table_t tokens = {
+        {OPT_FD,                        "fd=%u"},
+        {OPT_ROOTMODE,                  "rootmode=%o"},
+        {OPT_USER_ID,                   "user_id=%u"},
+        {OPT_GROUP_ID,                  "group_id=%u"},
+        {OPT_DEFAULT_PERMISSIONS,       "default_permissions"},
+        {OPT_ALLOW_OTHER,               "allow_other"},
+        {OPT_MAX_READ,                  "max_read=%u"},
+        {OPT_ERR,                       NULL}
+};
+/*
+ * Parse the comma separated mount option string opt into d.  The string
+ * is modified by strsep().
+ *
+ * d is cleared first and max_read defaults to ~0.  Empty options are
+ * skipped.  Returns 1 on success and 0 if an option is unknown, a value
+ * cannot be parsed, or any of fd, rootmode, user_id and group_id is
+ * missing.
+ */
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
+{
+        char *p;
+        memset(d, 0, sizeof(*d));
+        d->max_read = ~0;
+        for (p = strsep(&opt, ","); p != NULL; p = strsep(&opt, ",")) {
+                substring_t args[MAX_OPT_ARGS];
+                int value;
+                if (*p == '\0')
+                        continue;
+                switch (match_token(p, tokens, args)) {
+                case OPT_DEFAULT_PERMISSIONS:
+                        d->flags |= FUSE_DEFAULT_PERMISSIONS;
+                        break;
+                case OPT_ALLOW_OTHER:
+                        d->flags |= FUSE_ALLOW_OTHER;
+                        break;
+                case OPT_FD:
+                        if (match_int(&args[0], &value))
+                                return 0;
+                        d->fd = value;
+                        d->fd_present = 1;
+                        break;
+                case OPT_ROOTMODE:
+                        if (match_octal(&args[0], &value))
+                                return 0;
+                        d->rootmode = value;
+                        d->rootmode_present = 1;
+                        break;
+                case OPT_USER_ID:
+                        if (match_int(&args[0], &value))
+                                return 0;
+                        d->user_id = value;
+                        d->user_id_present = 1;
+                        break;
+                case OPT_GROUP_ID:
+                        if (match_int(&args[0], &value))
+                                return 0;
+                        d->group_id = value;
+                        d->group_id_present = 1;
+                        break;
+                case OPT_MAX_READ:
+                        if (match_int(&args[0], &value))
+                                return 0;
+                        d->max_read = value;
+                        break;
+                default:
+                        return 0;
+                }
+        }
+        /* All four of these options are mandatory */
+        return d->fd_present && d->rootmode_present &&
+                d->user_id_present && d->group_id_present;
+}
+/*
+ * Print the mount options of the connection into the seq_file: always
+ * user_id and group_id, the two flags when set, and max_read unless it
+ * has the ~0 default.  Always returns 0.
+ */
+static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+        struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb);
+        int default_perm;
+        int allow_other;
+        seq_printf(m, ",user_id=%u", fc->user_id);
+        seq_printf(m, ",group_id=%u", fc->group_id);
+        default_perm = (fc->flags & FUSE_DEFAULT_PERMISSIONS) != 0;
+        if (default_perm)
+                seq_puts(m, ",default_permissions");
+        allow_other = (fc->flags & FUSE_ALLOW_OTHER) != 0;
+        if (allow_other)
+                seq_puts(m, ",allow_other");
+        if (fc->max_read != ~0)
+                seq_printf(m, ",max_read=%u", fc->max_read);
+        return 0;
+}
+/*
+ * Free every request on the connection's unused list, then the
+ * connection itself.
+ */
+static void free_conn(struct fuse_conn *fc)
+{
+        struct list_head *unused = &fc->unused_list;
+        while (!list_empty(unused)) {
+                struct fuse_req *req =
+                        list_entry(unused->next, struct fuse_req, list);
+                list_del(&req->list);
+                fuse_request_free(req);
+        }
+        kfree(fc);
+}
+/*
+ * Drop one reference on the connection and free it once the count
+ * reaches zero.
+ *
+ * Must be called with the fuse lock held.
+ */
+void fuse_release_conn(struct fuse_conn *fc)
+{
+        if (--fc->count == 0)
+                free_conn(fc);
+}
+/*
+ * Allocate and initialise a new connection.
+ *
+ * The structure is zeroed, its wait queue, lists, semaphore and rwsem are
+ * initialised, and FUSE_MAX_OUTSTANDING requests are preallocated onto
+ * unused_list.
+ *
+ * Returns the new connection, or NULL if any allocation fails (in which
+ * case everything allocated so far is freed again).
+ */
+static struct fuse_conn *new_conn(void)
+{
+        struct fuse_conn *conn;
+        int n;
+        conn = kmalloc(sizeof(*conn), GFP_KERNEL);
+        if (conn == NULL)
+                return NULL;
+        memset(conn, 0, sizeof(*conn));
+        init_waitqueue_head(&conn->waitq);
+        INIT_LIST_HEAD(&conn->pending);
+        INIT_LIST_HEAD(&conn->processing);
+        INIT_LIST_HEAD(&conn->unused_list);
+        INIT_LIST_HEAD(&conn->background);
+        sema_init(&conn->outstanding_sem, 0);
+        init_rwsem(&conn->sbput_sem);
+        /* Preallocate the request pool. */
+        for (n = 0; n < FUSE_MAX_OUTSTANDING; n++) {
+                struct fuse_req *req = fuse_request_alloc();
+                if (req == NULL) {
+                        /* free_conn() also releases requests added so far */
+                        free_conn(conn);
+                        return NULL;
+                }
+                list_add(&req->list, &conn->unused_list);
+        }
+        conn->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+        conn->bdi.unplug_io_fn = default_unplug_io_fn;
+        conn->reqctr = 0;
+        return conn;
+}
+/*
+ * Create a connection and bind it to both the device file and the
+ * super block.
+ *
+ * Returns the new connection, or an ERR_PTR():
+ *   -EINVAL  @file is not a fuse device file, or it already has a
+ *            connection attached (private_data is set)
+ *   -ENOMEM  the connection could not be allocated
+ */
+static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
+{
+        struct fuse_conn *conn;
+        if (file->f_op != &fuse_dev_operations)
+                return ERR_PTR(-EINVAL);
+        /* Allocate before taking the spinlock. */
+        conn = new_conn();
+        if (conn == NULL)
+                return ERR_PTR(-ENOMEM);
+        spin_lock(&fuse_lock);
+        if (!file->private_data) {
+                file->private_data = conn;
+                *get_fuse_conn_super_p(sb) = conn;
+                conn->mounted = 1;
+                conn->connected = 1;
+                /* presumably one reference each for file and sb - confirm */
+                conn->count = 2;
+        } else {
+                /* Device file is already in use by another connection. */
+                free_conn(conn);
+                conn = ERR_PTR(-EINVAL);
+        }
+        spin_unlock(&fuse_lock);
+        return conn;
+}
+/*
+ * Look up the root inode of the filesystem via fuse_iget(), using a
+ * zeroed attribute block in which only the inode number (FUSE_ROOT_ID)
+ * and @mode are filled in.
+ */
+static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+{
+        struct fuse_attr root_attr;
+        struct inode *root;
+        memset(&root_attr, 0, sizeof(root_attr));
+        root_attr.ino = FUSE_ROOT_ID;
+        root_attr.mode = mode;
+        root = fuse_iget(sb, 1, 0, &root_attr);
+        return root;
+}
+/*
+ * Super block operations for FUSE; installed as sb->s_op by
+ * fuse_fill_super().
+ */
+static struct super_operations fuse_super_operations = {
+        .alloc_inode    = fuse_alloc_inode,
+        .destroy_inode  = fuse_destroy_inode,
+        .read_inode     = fuse_read_inode,
+        .clear_inode    = fuse_clear_inode,
+        .put_super      = fuse_put_super,
+        .statfs         = fuse_statfs,
+        .show_options   = fuse_show_options,
+};
+/*
+ * Fill in a new FUSE super block from the mount option string @data.
+ *
+ * Parses the options, sets up the basic super block fields, creates the
+ * connection for the device file named by the fd option, copies the
+ * parsed options into the connection, instantiates the root inode and
+ * dentry, and finally calls fuse_send_init().
+ *
+ * Returns 0 on success, or a negative errno:
+ *   -EINVAL  option parsing failed or the fd option does not name an
+ *            open file (get_conn() may also return -EINVAL)
+ *   -ENOMEM  connection, root inode or root dentry allocation failed
+ */
+static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct fuse_conn *fc;
+        struct inode *root;
+        struct fuse_mount_data d;
+        struct file *file;
+        int err;
+        if (!parse_fuse_opt((char *) data, &d))
+                return -EINVAL;
+        sb->s_blocksize = PAGE_CACHE_SIZE;
+        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+        sb->s_magic = FUSE_SUPER_MAGIC;
+        sb->s_op = &fuse_super_operations;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        file = fget(d.fd);
+        if (!file)
+                return -EINVAL;
+        /* The file is only needed to set up the connection; drop it after. */
+        fc = get_conn(file, sb);
+        fput(file);
+        if (IS_ERR(fc))
+                return PTR_ERR(fc);
+        fc->flags = d.flags;
+        fc->user_id = d.user_id;
+        fc->group_id = d.group_id;
+        fc->max_read = d.max_read;
+        /* Never read ahead more pages than a single read may return. */
+        if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
+                fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+        fc->max_write = FUSE_MAX_IN / 2;
+        /* Both failures below are reported as -ENOMEM. */
+        err = -ENOMEM;
+        root = get_root_inode(sb, d.rootmode);
+        if (root == NULL)
+                goto err;
+        sb->s_root = d_alloc_root(root);
+        if (!sb->s_root) {
+                /* No dentry took ownership of the inode; release it here. */
+                iput(root);
+                goto err;
+        }
+        fuse_send_init(fc);
+        return 0;
+ err:
+        /* fuse_release_conn() must be called with fuse_lock held. */
+        spin_lock(&fuse_lock);
+        fuse_release_conn(fc);
+        spin_unlock(&fuse_lock);
+        return err;
+}
+/*
+ * get_sb callback of the "fuse" filesystem type: delegates to
+ * get_sb_nodev() with fuse_fill_super() as the fill callback.  @dev_name
+ * is not used.
+ */
+static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
+                                       int flags, const char *dev_name,
+                                       void *raw_data)
+{
+        return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+}
+/*
+ * Filesystem type descriptor for "fuse"; registered by fuse_fs_init() and
+ * unregistered by fuse_fs_cleanup().
+ */
+static struct file_system_type fuse_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "fuse",
+        .get_sb         = fuse_get_sb,
+        .kill_sb        = kill_anon_super,
+};
+/*
+ * Constructor for the fuse_inode slab cache (passed to
+ * kmem_cache_create() in fuse_fs_init()).  Runs inode_init_once() on the
+ * object only when SLAB_CTOR_CONSTRUCTOR is set and SLAB_CTOR_VERIFY is
+ * clear in @flags.
+ */
+static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
+                                 unsigned long flags)
+{
+        const unsigned long ctor_mask =
+                SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR;
+        struct inode *obj = foo;
+        if ((flags & ctor_mask) != SLAB_CTOR_CONSTRUCTOR)
+                return;
+        inode_init_once(obj);
+}
+/*
+ * Register the "fuse" filesystem type and create the slab cache for
+ * struct fuse_inode objects.
+ *
+ * Returns 0 on success, the error from register_filesystem() if
+ * registration fails, or -ENOMEM if the cache cannot be created (the
+ * filesystem type is unregistered again in that case).
+ */
+static int __init fuse_fs_init(void)
+{
+        int err;
+        err = register_filesystem(&fuse_fs_type);
+        if (err) {
+                /* Explicit log level, like the other messages in this file. */
+                printk(KERN_ERR "fuse: failed to register filesystem\n");
+                return err;
+        }
+        fuse_inode_cachep = kmem_cache_create("fuse_inode",
+                                              sizeof(struct fuse_inode),
+                                              0, SLAB_HWCACHE_ALIGN,
+                                              fuse_inode_init_once, NULL);
+        if (!fuse_inode_cachep) {
+                /* Undo the registration so the caller sees no partial state. */
+                unregister_filesystem(&fuse_fs_type);
+                return -ENOMEM;
+        }
+        return 0;
+}
+/*
+ * Undo fuse_fs_init(): unregister the filesystem type, then destroy the
+ * fuse_inode slab cache.
+ */
+static void fuse_fs_cleanup(void)
+{
+        unregister_filesystem(&fuse_fs_type);
+        kmem_cache_destroy(fuse_inode_cachep);
+}
+/*
+ * Module entry point: announce the API version, initialise the global
+ * fuse lock, then bring up the filesystem side (fuse_fs_init) followed by
+ * the device side (fuse_dev_init).
+ *
+ * Returns 0 on success or the error from the failing init step; if the
+ * device init fails, the filesystem side is torn down again.
+ */
+static int __init fuse_init(void)
+{
+        int res;
+        /* Informational banner: give it an explicit log level. */
+        printk(KERN_INFO "fuse init (API version %i.%i)\n",
+               FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+        spin_lock_init(&fuse_lock);
+        res = fuse_fs_init();
+        if (res)
+                goto err;
+        res = fuse_dev_init();
+        if (res)
+                goto err_fs_cleanup;
+        return 0;
+ err_fs_cleanup:
+        fuse_fs_cleanup();
+ err:
+        return res;
+}
+/*
+ * Module exit point: log the exit, then tear down the filesystem side
+ * and the device side set up by fuse_init().
+ */
+static void __exit fuse_exit(void)
+{
+        printk(KERN_DEBUG "fuse exit\n");
+        fuse_fs_cleanup();
+        fuse_dev_cleanup();
+}
+/* Hook fuse_init()/fuse_exit() up as the module's load and unload handlers. */
+module_init(fuse_init);
+module_exit(fuse_exit);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b2d18200a003..59c5062cd63f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -284,6 +284,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 static void hostfs_delete_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        if(HOSTFS_I(inode)->fd != -1) {
                close_file(&HOSTFS_I(inode)->fd);
                HOSTFS_I(inode)->fd = -1;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 38b1741fa539..e3d17e9ea6c1 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -284,6 +284,7 @@ void hpfs_write_if_changed(struct inode *inode)
 void hpfs_delete_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        lock_kernel();
        hpfs_remove_fnode(inode->i_sb, inode->i_ino);
        unlock_kernel();
diff --git a/fs/inode.c b/fs/inode.c
index 71df1b1e8f75..f80a79ff156b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1034,19 +1034,21 @@ void generic_delete_inode(struct inode *inode)
        inodes_stat.nr_inodes--;
        spin_unlock(&inode_lock);
-        if (inode->i_data.nrpages)
-                truncate_inode_pages(&inode->i_data, 0);
        security_inode_delete(inode);
        if (op->delete_inode) {
                void (*delete)(struct inode *) = op->delete_inode;
                if (!is_bad_inode(inode))
                        DQUOT_INIT(inode);
-                /* s_op->delete_inode internally recalls clear_inode() */
+                /* Filesystems implementing their own
+                 * s_op->delete_inode are required to call
+                 * truncate_inode_pages and clear_inode()
+                 * internally */
                delete(inode);
-        } else
+        } else {
+                truncate_inode_pages(&inode->i_data, 0);
                clear_inode(inode);
+        }
        spin_lock(&inode_lock);
        hlist_del_init(&inode->i_hash);
        spin_unlock(&inode_lock);
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 777b90057b89..3dcc6d2162cb 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1744,6 +1744,7 @@ jffs_delete_inode(struct inode *inode)
        D3(printk("jffs_delete_inode(): inode->i_ino == %lu\n",
                  inode->i_ino));
+        truncate_inode_pages(&inode->i_data, 0);
        lock_kernel();
        inode->i_size = 0;
        inode->i_blocks = 0;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 767c7ecb429e..cff352f4ec18 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -132,6 +132,8 @@ void jfs_delete_inode(struct inode *inode)
            (JFS_IP(inode)->fileset != cpu_to_le32(FILESYSTEM_I)))
                        return;
+        truncate_inode_pages(&inode->i_data, 0);
        if (test_cflag(COMMIT_Freewmap, inode))
                jfs_free_zero_link(inode);
diff --git a/fs/locks.c b/fs/locks.c
index 11956b6179ff..c2c09b4798d6 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2198,21 +2198,23 @@ void steal_locks(fl_owner_t from)
 {
        struct files_struct *files = current->files;
        int i, j;
+        struct fdtable *fdt;
        if (from == files)
                return;
        lock_kernel();
        j = 0;
+        fdt = files_fdtable(files);
        for (;;) {
                unsigned long set;
                i = j * __NFDBITS;
-                if (i >= files->max_fdset || i >= files->max_fds)
+                if (i >= fdt->max_fdset || i >= fdt->max_fds)
                        break;
-                set = files->open_fds->fds_bits[j++];
+                set = fdt->open_fds->fds_bits[j++];
                while (set) {
                        if (set & 1) {
-                                struct file *file = files->fd[i];
+                                struct file *file = fdt->fd[i];
                                if (file)
                                        __steal_locks(file, from);
                        }
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 3f18c21198d7..790cc0d0e970 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -24,6 +24,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
 static void minix_delete_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        inode->i_size = 0;
        minix_truncate(inode);
        minix_free_inode(inode);
diff --git a/fs/namei.c b/fs/namei.c
index 145e852c4bd0..21d85f1ac839 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1316,10 +1316,8 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
                return error;
        DQUOT_INIT(dir);
        error = dir->i_op->create(dir, dentry, mode, nd);
-        if (!error) {
+        if (!error)
                fsnotify_create(dir, dentry->d_name.name);
-                security_inode_post_create(dir, dentry, mode);
-        }
        return error;
 }
@@ -1635,10 +1633,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        DQUOT_INIT(dir);
        error = dir->i_op->mknod(dir, dentry, mode, dev);
-        if (!error) {
+        if (!error)
                fsnotify_create(dir, dentry->d_name.name);
-                security_inode_post_mknod(dir, dentry, mode, dev);
-        }
        return error;
 }
@@ -1708,10 +1704,8 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        DQUOT_INIT(dir);
        error = dir->i_op->mkdir(dir, dentry, mode);
-        if (!error) {
+        if (!error)
                fsnotify_mkdir(dir, dentry->d_name.name);
-                security_inode_post_mkdir(dir,dentry, mode);
-        }
        return error;
 }
@@ -1947,10 +1941,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
        DQUOT_INIT(dir);
        error = dir->i_op->symlink(dir, dentry, oldname);
-        if (!error) {
+        if (!error)
                fsnotify_create(dir, dentry->d_name.name);
-                security_inode_post_symlink(dir, dentry, oldname);
-        }
        return error;
 }
@@ -2020,10 +2012,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
        DQUOT_INIT(dir);
        error = dir->i_op->link(old_dentry, dir, new_dentry);
        up(&old_dentry->d_inode->i_sem);
-        if (!error) {
+        if (!error)
                fsnotify_create(dir, new_dentry->d_name.name);
-                security_inode_post_link(old_dentry, dir, new_dentry);
-        }
        return error;
 }
@@ -2142,11 +2132,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
                        d_rehash(new_dentry);
                dput(new_dentry);
        }
-        if (!error) {
+        if (!error)
                d_move(old_dentry,new_dentry);
-                security_inode_post_rename(old_dir, old_dentry,
-                                           new_dir, new_dentry);
-        }
        return error;
 }
@@ -2172,7 +2159,6 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                /* The following d_move() should become unconditional */
                if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME))
                        d_move(old_dentry, new_dentry);
-                security_inode_post_rename(old_dir, old_dentry, new_dir, new_dentry);
        }
        if (target)
                up(&target->i_sem);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 44795d2f4b30..8c8839203cd5 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -286,6 +286,8 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 static void
 ncp_delete_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        if (S_ISDIR(inode->i_mode)) {
                DDPRINTK("ncp_delete_inode: put directory %ld\n", inode->i_ino);
        }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 541b418327c8..6922469d6fc5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -146,6 +146,8 @@ nfs_delete_inode(struct inode * inode)
 {
        dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+        truncate_inode_pages(&inode->i_data, 0);
        nfs_wb_all(inode);
        /*
         * The following should never happen...
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 9eecc9939dfe..e4fd6134244d 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -22,6 +22,76 @@ ToDo/Notes:
        - Enable the code for setting the NT4 compatibility flag when we start
          making NTFS 1.2 specific modifications.
+2.1.24 - Lots of bug fixes and support more clean journal states.
+        - Support journals ($LogFile) which have been modified by chkdsk.  This
+          means users can boot into Windows after we marked the volume dirty.
+          The Windows boot will run chkdsk and then reboot.  The user can then
+          immediately boot into Linux rather than having to do a full Windows
+          boot first before rebooting into Linux and we will recognize such a
+          journal and empty it as it is clean by definition.
+        - Support journals ($LogFile) with only one restart page as well as
+          journals with two different restart pages.  We sanity check both and
+          either use the only sane one or the more recent one of the two in the
+          case that both are valid.
+        - Modify fs/ntfs/malloc.h::ntfs_malloc_nofs() to do the kmalloc() based
+          allocations with __GFP_HIGHMEM, analogous to how the vmalloc() based
+          allocations are done.
+        - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
+          ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
+          hence cannot fail.
+        - Use ntfs_malloc_nofs_nofail() in the two critical regions in
+          fs/ntfs/runlist.c::ntfs_runlists_merge().  This means we no longer
+          need to panic() if the allocation fails as it now cannot fail.
+        - Fix two nasty runlist merging bugs that had gone unnoticed so far.
+          Thanks to Stefano Picerno for the bug report.
+        - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
+        - Fix handling of valid but empty mapping pairs array in
+          fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
+        - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
+          messages and include the inode number.  Thanks to Yura Pakhuchiy for
+          pointing this out.
+        - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
+          length is zero.
+        - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
+          specified hole into a runlist.
+        - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup().  When the returned
+          index entry is in the index root, we forgot to set the @ir pointer in
+          the index context.  Thanks to Yura Pakhuchiy for finding this bug.
+        - Remove bogus setting of PageError in ntfs_read_compressed_block().
+        - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
+        - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
+          access to the allocated size in the ntfs inode with the size lock.
+        - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
+          return LCN_ENOENT when there is no runlist and the allocated size is
+          zero.
+        - Fix load_attribute_list() to handle the case of a NULL runlist.
+        - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
+        - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
+          to ensure that these functions are never called for compressed or
+          encrypted attributes.
+        - Fix cluster (de)allocators to work when the runlist is NULL and more
+          importantly to take a locked runlist rather than them locking it
+          which leads to lock reversal.
+        - Truncate {a,c,m}time to the ntfs supported time granularity when
+          updating the times in the inode in ntfs_setattr().
+        - Fixup handling of sparse, compressed, and encrypted attributes in
+          fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
+          fs/ntfs/aops.c::ntfs_{read,write}page().
+        - Make ntfs_write_block() not instantiate sparse blocks if they contain
+          only zeroes.
+        - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
+          lock protection over the buffer submission for i/o which allows the
+          removal of the get_bh()/put_bh() pairs for each buffer.
+        - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
+          where a concurrent truncate has truncated the runlist under our feet.
+        - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
+        - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
+          in the first buffer head instead of a driver global spin lock to
+          improve scalability.
+        - Minor fix to error handling and error message display in
+          fs/ntfs/aops.c::ntfs_prepare_nonresident_write(). 
 2.1.23 - Implement extension of resident files and make writing safe as well as
         many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index f083f27d8b69..894b2b876d35 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
             index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
             unistr.o upcase.o
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.23\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24\"
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 78adad7a988d..545236414d59 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -55,9 +55,8 @@
 */
 static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
-        static DEFINE_SPINLOCK(page_uptodate_lock);
        unsigned long flags;
-        struct buffer_head *tmp;
+        struct buffer_head *first, *tmp;
        struct page *page;
        ntfs_inode *ni;
        int page_uptodate = 1;
@@ -89,11 +88,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
                }
        } else {
                clear_buffer_uptodate(bh);
+                SetPageError(page);
                ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
                                (unsigned long long)bh->b_blocknr);
-                SetPageError(page);
        }
-        spin_lock_irqsave(&page_uptodate_lock, flags);
+        first = page_buffers(page);
+        local_irq_save(flags);
+        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        tmp = bh;
@@ -108,7 +109,8 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
-        spin_unlock_irqrestore(&page_uptodate_lock, flags);
+        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+        local_irq_restore(flags);
        /*
         * If none of the buffers had errors then we can set the page uptodate,
         * but we first have to perform the post read mst fixups, if the
@@ -141,7 +143,8 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
        unlock_page(page);
        return;
 still_busy:
-        spin_unlock_irqrestore(&page_uptodate_lock, flags);
+        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+        local_irq_restore(flags);
        return;
 }
@@ -185,13 +188,15 @@ static int ntfs_read_block(struct page *page)
        blocksize_bits = VFS_I(ni)->i_blkbits;
        blocksize = 1 << blocksize_bits;
-        if (!page_has_buffers(page))
+        if (!page_has_buffers(page)) {
                create_empty_buffers(page, blocksize, 0);
-        bh = head = page_buffers(page);
+                if (unlikely(!page_has_buffers(page))) {
-        if (unlikely(!bh)) {
+                        unlock_page(page);
-                unlock_page(page);
+                        return -ENOMEM;
-                return -ENOMEM;
+                }
        }
+        bh = head = page_buffers(page);
+        BUG_ON(!bh);
        iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
        read_lock_irqsave(&ni->size_lock, flags);
@@ -204,6 +209,7 @@ static int ntfs_read_block(struct page *page)
        nr = i = 0;
        do {
                u8 *kaddr;
+                int err;
                if (unlikely(buffer_uptodate(bh)))
                        continue;
@@ -211,6 +217,7 @@ static int ntfs_read_block(struct page *page)
                        arr[nr++] = bh;
                        continue;
                }
+                err = 0;
                bh->b_bdev = vol->sb->s_bdev;
                /* Is the block within the allowed limits? */
                if (iblock < lblock) {
@@ -252,7 +259,6 @@ lock_retry_remap:
                                goto handle_hole;
                        /* If first try and runlist unmapped, map and retry. */
                        if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-                                int err;
                                is_retry = TRUE;
                                /*
                                 * Attempt to map runlist, dropping lock for
@@ -263,20 +269,30 @@ lock_retry_remap:
                                if (likely(!err))
                                        goto lock_retry_remap;
                                rl = NULL;
-                                lcn = err;
                        } else if (!rl)
                                up_read(&ni->runlist.lock);
+                        /*
+                         * If buffer is outside the runlist, treat it as a
+                         * hole.  This can happen due to concurrent truncate
+                         * for example.
+                         */
+                        if (err == -ENOENT || lcn == LCN_ENOENT) {
+                                err = 0;
+                                goto handle_hole;
+                        }
                        /* Hard error, zero out region. */
+                        if (!err)
+                                err = -EIO;
                        bh->b_blocknr = -1;
                        SetPageError(page);
                        ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
                                        "attribute type 0x%x, vcn 0x%llx, "
                                        "offset 0x%x because its location on "
                                        "disk could not be determined%s "
-                                        "(error code %lli).", ni->mft_no,
+                                        "(error code %i).", ni->mft_no,
                                        ni->type, (unsigned long long)vcn,
                                        vcn_ofs, is_retry ? " even after "
-                                        "retrying" : "", (long long)lcn);
+                                        "retrying" : "", err);
                }
                /*
                 * Either iblock was outside lblock limits or
@@ -289,9 +305,10 @@ handle_hole:
 handle_zblock:
                kaddr = kmap_atomic(page, KM_USER0);
                memset(kaddr + i * blocksize, 0, blocksize);
-                flush_dcache_page(page);
                kunmap_atomic(kaddr, KM_USER0);
-                set_buffer_uptodate(bh);
+                flush_dcache_page(page);
+                if (likely(!err))
+                        set_buffer_uptodate(bh);
        } while (i++, iblock++, (bh = bh->b_this_page) != head);
        /* Release the lock if we took it. */
@@ -367,31 +384,38 @@ retry_readpage:
                return 0;
        }
        ni = NTFS_I(page->mapping->host);
+        /*
+         * Only $DATA attributes can be encrypted and only unnamed $DATA
+         * attributes can be compressed.  Index root can have the flags set but
+         * this means to create compressed/encrypted files, not that the
+         * attribute is compressed/encrypted.
+         */
+        if (ni->type != AT_INDEX_ROOT) {
+                /* If attribute is encrypted, deny access, just like NT4. */
+                if (NInoEncrypted(ni)) {
+                        BUG_ON(ni->type != AT_DATA);
+                        err = -EACCES;
+                        goto err_out;
+                }
+                /* Compressed data streams are handled in compress.c. */
+                if (NInoNonResident(ni) && NInoCompressed(ni)) {
+                        BUG_ON(ni->type != AT_DATA);
+                        BUG_ON(ni->name_len);
+                        return ntfs_read_compressed_block(page);
+                }
+        }
        /* NInoNonResident() == NInoIndexAllocPresent() */
        if (NInoNonResident(ni)) {
-                /*
+                /* Normal, non-resident data stream. */
-                 * Only unnamed $DATA attributes can be compressed or
-                 * encrypted.
-                 */
-                if (ni->type == AT_DATA && !ni->name_len) {
-                        /* If file is encrypted, deny access, just like NT4. */
-                        if (NInoEncrypted(ni)) {
-                                err = -EACCES;
-                                goto err_out;
-                        }
-                        /* Compressed data streams are handled in compress.c. */
-                        if (NInoCompressed(ni))
-                                return ntfs_read_compressed_block(page);
-                }
-                /* Normal data stream. */
                return ntfs_read_block(page);
        }
        /*
         * Attribute is resident, implying it is not compressed or encrypted.
         * This also means the attribute is smaller than an mft record and
         * hence smaller than a page, so can simply zero out any pages with
-         * index above 0.
+         * index above 0.  Note the attribute can actually be marked compressed
+         * but if it is resident the actual data is not compressed so we are
+         * ok to ignore the compressed flag here.
         */
        if (unlikely(page->index > 0)) {
                kaddr = kmap_atomic(page, KM_USER0);
@@ -511,19 +535,21 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
                BUG_ON(!PageUptodate(page));
                create_empty_buffers(page, blocksize,
                                (1 << BH_Uptodate) | (1 << BH_Dirty));
+                if (unlikely(!page_has_buffers(page))) {
+                        ntfs_warning(vol->sb, "Error allocating page "
+                                        "buffers.  Redirtying page so we try "
+                                        "again later.");
+                        /*
+                         * Put the page back on mapping->dirty_pages, but leave
+                         * its buffers' dirty state as-is.
+                         */
+                        redirty_page_for_writepage(wbc, page);
+                        unlock_page(page);
+                        return 0;
+                }
        }
        bh = head = page_buffers(page);
-        if (unlikely(!bh)) {
+        BUG_ON(!bh);
-                ntfs_warning(vol->sb, "Error allocating page buffers. "
-                                "Redirtying page so we try again later.");
-                /*
-                 * Put the page back on mapping->dirty_pages, but leave its
-                 * buffer's dirty state as-is.
-                 */
-                redirty_page_for_writepage(wbc, page);
-                unlock_page(page);
-                return 0;
-        }
        /* NOTE: Different naming scheme to ntfs_read_block()! */
@@ -670,6 +696,27 @@ lock_retry_remap:
                }
                /* It is a hole, need to instantiate it. */
                if (lcn == LCN_HOLE) {
+                        u8 *kaddr;
+                        unsigned long *bpos, *bend;
+                        /* Check if the buffer is zero. */
+                        kaddr = kmap_atomic(page, KM_USER0);
+                        bpos = (unsigned long *)(kaddr + bh_offset(bh));
+                        bend = (unsigned long *)((u8*)bpos + blocksize);
+                        do {
+                                if (unlikely(*bpos))
+                                        break;
+                        } while (likely(++bpos < bend));
+                        kunmap_atomic(kaddr, KM_USER0);
+                        if (bpos == bend) {
+                                /*
+                                 * Buffer is zero and sparse, no need to write
+                                 * it.
+                                 */
+                                bh->b_blocknr = -1;
+                                clear_buffer_dirty(bh);
+                                continue;
+                        }
                        // TODO: Instantiate the hole.
                        // clear_buffer_new(bh);
                        // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
@@ -690,20 +737,37 @@ lock_retry_remap:
                        if (likely(!err))
                                goto lock_retry_remap;
                        rl = NULL;
-                        lcn = err;
                } else if (!rl)
                        up_read(&ni->runlist.lock);
+                /*
+                 * If buffer is outside the runlist, truncate has cut it out
+                 * of the runlist.  Just clean and clear the buffer and set it
+                 * uptodate so it can get discarded by the VM.
+                 */
+                if (err == -ENOENT || lcn == LCN_ENOENT) {
+                        u8 *kaddr;
+                        bh->b_blocknr = -1;
+                        clear_buffer_dirty(bh);
+                        kaddr = kmap_atomic(page, KM_USER0);
+                        memset(kaddr + bh_offset(bh), 0, blocksize);
+                        kunmap_atomic(kaddr, KM_USER0);
+                        flush_dcache_page(page);
+                        set_buffer_uptodate(bh);
+                        err = 0;
+                        continue;
+                }
                /* Failed to map the buffer, even after retrying. */
+                if (!err)
+                        err = -EIO;
                bh->b_blocknr = -1;
                ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
                                "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
                                "because its location on disk could not be "
-                                "determined%s (error code %lli).", ni->mft_no,
+                                "determined%s (error code %i).", ni->mft_no,
                                ni->type, (unsigned long long)vcn,
                                vcn_ofs, is_retry ? " even after "
-                                "retrying" : "", (long long)lcn);
+                                "retrying" : "", err);
-                if (!err)
-                        err = -EIO;
                break;
        } while (block++, (bh = bh->b_this_page) != head);
@@ -714,7 +778,7 @@ lock_retry_remap:
        /* For the error case, need to reset bh to the beginning. */
        bh = head;
-        /* Just an optimization, so ->readpage() isn't called later. */
+        /* Just an optimization, so ->readpage() is not called later. */
        if (unlikely(!PageUptodate(page))) {
                int uptodate = 1;
                do {
@@ -730,7 +794,6 @@ lock_retry_remap:
        /* Setup all mapped, dirty buffers for async write i/o. */
        do {
-                get_bh(bh);
                if (buffer_mapped(bh) && buffer_dirty(bh)) {
                        lock_buffer(bh);
                        if (test_clear_buffer_dirty(bh)) {
@@ -768,14 +831,8 @@ lock_retry_remap:
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);       /* Keeps try_to_free_buffers() away. */
-        unlock_page(page);
-        /*
+        /* Submit the prepared buffers for i/o. */
-         * Submit the prepared buffers for i/o. Note the page is unlocked,
-         * and the async write i/o completion handler can end_page_writeback()
-         * at any time after the *first* submit_bh(). So the buffers can then
-         * disappear...
-         */
        need_end_writeback = TRUE;
        do {
                struct buffer_head *next = bh->b_this_page;
@@ -783,9 +840,9 @@ lock_retry_remap:
                        submit_bh(WRITE, bh);
                        need_end_writeback = FALSE;
                }
-                put_bh(bh);
                bh = next;
        } while (bh != head);
+        unlock_page(page);
        /* If no i/o was started, need to end_page_writeback(). */
        if (unlikely(need_end_writeback))
@@ -860,7 +917,6 @@ static int ntfs_write_mst_block(struct page *page,
        sync = (wbc->sync_mode == WB_SYNC_ALL);
        /* Make sure we have mapped buffers. */
-        BUG_ON(!page_has_buffers(page));
        bh = head = page_buffers(page);
        BUG_ON(!bh);
@@ -1280,38 +1336,42 @@ retry_writepage:
                ntfs_debug("Write outside i_size - truncated?");
                return 0;
        }
+        /*
+         * Only $DATA attributes can be encrypted and only unnamed $DATA
+         * attributes can be compressed.  Index root can have the flags set but
+         * this means to create compressed/encrypted files, not that the
+         * attribute is compressed/encrypted.
+         */
+        if (ni->type != AT_INDEX_ROOT) {
+                /* If file is encrypted, deny access, just like NT4. */
+                if (NInoEncrypted(ni)) {
+                        unlock_page(page);
+                        BUG_ON(ni->type != AT_DATA);
+                        ntfs_debug("Denying write access to encrypted "
+                                        "file.");
+                        return -EACCES;
+                }
+                /* Compressed data streams are handled in compress.c. */
+                if (NInoNonResident(ni) && NInoCompressed(ni)) {
+                        BUG_ON(ni->type != AT_DATA);
+                        BUG_ON(ni->name_len);
+                        // TODO: Implement and replace this with
+                        // return ntfs_write_compressed_block(page);
+                        unlock_page(page);
+                        ntfs_error(vi->i_sb, "Writing to compressed files is "
+                                        "not supported yet.  Sorry.");
+                        return -EOPNOTSUPP;
+                }
+                // TODO: Implement and remove this check.
+                if (NInoNonResident(ni) && NInoSparse(ni)) {
+                        unlock_page(page);
+                        ntfs_error(vi->i_sb, "Writing to sparse files is not "
+                                        "supported yet.  Sorry.");
+                        return -EOPNOTSUPP;
+                }
+        }
        /* NInoNonResident() == NInoIndexAllocPresent() */
        if (NInoNonResident(ni)) {
-                /*
-                 * Only unnamed $DATA attributes can be compressed, encrypted,
-                 * and/or sparse.
-                 */
-                if (ni->type == AT_DATA && !ni->name_len) {
-                        /* If file is encrypted, deny access, just like NT4. */
-                        if (NInoEncrypted(ni)) {
-                                unlock_page(page);
-                                ntfs_debug("Denying write access to encrypted "
-                                                "file.");
-                                return -EACCES;
-                        }
-                        /* Compressed data streams are handled in compress.c. */
-                        if (NInoCompressed(ni)) {
-                                // TODO: Implement and replace this check with
-                                // return ntfs_write_compressed_block(page);
-                                unlock_page(page);
-                                ntfs_error(vi->i_sb, "Writing to compressed "
-                                                "files is not supported yet. "
-                                                "Sorry.");
-                                return -EOPNOTSUPP;
-                        }
-                        // TODO: Implement and remove this check.
-                        if (NInoSparse(ni)) {
-                                unlock_page(page);
-                                ntfs_error(vi->i_sb, "Writing to sparse files "
-                                                "is not supported yet. Sorry.");
-                                return -EOPNOTSUPP;
-                        }
-                }
                /* We have to zero every time due to mmap-at-end-of-file. */
                if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
                        /* The page straddles i_size. */
@@ -1324,14 +1384,16 @@ retry_writepage:
                /* Handle mst protected attributes. */
                if (NInoMstProtected(ni))
                        return ntfs_write_mst_block(page, wbc);
-                /* Normal data stream. */
+                /* Normal, non-resident data stream. */
                return ntfs_write_block(page, wbc);
        }
        /*
-         * Attribute is resident, implying it is not compressed, encrypted,
+         * Attribute is resident, implying it is not compressed, encrypted, or
-         * sparse, or mst protected.  This also means the attribute is smaller
+         * mst protected.  This also means the attribute is smaller than an mft
-         * than an mft record and hence smaller than a page, so can simply
+         * record and hence smaller than a page, so can simply return error on
-         * return error on any pages with index above 0.
+         * any pages with index above 0.  Note the attribute can actually be
+         * marked compressed but if it is resident the actual data is not
+         * compressed so we are ok to ignore the compressed flag here.
         */
        BUG_ON(page_has_buffers(page));
        BUG_ON(!PageUptodate(page));
@@ -1380,30 +1442,14 @@ retry_writepage:
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);
        unlock_page(page);
        /*
-         * Here, we don't need to zero the out of bounds area everytime because
+         * Here, we do not need to zero the out of bounds area every time
-         * the below memcpy() already takes care of the mmap-at-end-of-file
+         * because the below memcpy() already takes care of the
-         * requirements. If the file is converted to a non-resident one, then
+         * mmap-at-end-of-file requirements.  If the file is converted to a
-         * the code path use is switched to the non-resident one where the
+         * non-resident one, then the code path used is switched to the
-         * zeroing happens on each ntfs_writepage() invocation.
+         * non-resident one where the zeroing happens on each ntfs_writepage()
-         *
+         * invocation.
-         * The above also applies nicely when i_size is decreased.
-         *
-         * When i_size is increased, the memory between the old and new i_size
-         * _must_ be zeroed (or overwritten with new data). Otherwise we will
-         * expose data to userspace/disk which should never have been exposed.
-         *
-         * FIXME: Ensure that i_size increases do the zeroing/overwriting and
-         * if we cannot guarantee that, then enable the zeroing below.  If the
-         * zeroing below is enabled, we MUST move the unlock_page() from above
-         * to after the kunmap_atomic(), i.e. just before the
-         * end_page_writeback().
-         * UPDATE: ntfs_prepare/commit_write() do the zeroing on i_size
-         * increases for resident attributes so those are ok.
-         * TODO: ntfs_truncate(), others?
         */
        attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
        i_size = i_size_read(vi);
        if (unlikely(attr_len > i_size)) {
@@ -1681,27 +1727,25 @@ lock_retry_remap:
                                        if (likely(!err))
                                                goto lock_retry_remap;
                                        rl = NULL;
-                                        lcn = err;
                                } else if (!rl)
                                        up_read(&ni->runlist.lock);
                                /*
                                 * Failed to map the buffer, even after
                                 * retrying.
                                 */
+                                if (!err)
+                                        err = -EIO;
                                bh->b_blocknr = -1;
                                ntfs_error(vol->sb, "Failed to write to inode "
                                                "0x%lx, attribute type 0x%x, "
                                                "vcn 0x%llx, offset 0x%x "
                                                "because its location on disk "
                                                "could not be determined%s "
-                                                "(error code %lli).",
+                                                "(error code %i).",
                                                ni->mft_no, ni->type,
                                                (unsigned long long)vcn,
                                                vcn_ofs, is_retry ? " even "
-                                                "after retrying" : "",
+                                                "after retrying" : "", err);
-                                                (long long)lcn);
-                                if (!err)
-                                        err = -EIO;
                                goto err_out;
                        }
                        /* We now have a successful remap, i.e. lcn >= 0. */
@@ -2357,6 +2401,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
                        buffers_to_free = bh;
        }
        bh = head = page_buffers(page);
+        BUG_ON(!bh);
        do {
                bh_ofs = bh_offset(bh);
                if (bh_ofs + bh_size <= ofs)
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index cd0f9e740b14..3f9a4ff42ee5 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -43,6 +43,9 @@
 * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
 * of bounds of the runlist.
 *
+ * Note the runlist can be NULL after this function returns if @vcn is zero and
+ * the attribute has zero allocated size, i.e. there simply is no runlist.
+ *
 * Locking: - The runlist must be locked for writing.
 *          - This function modifies the runlist.
 */
@@ -54,6 +57,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
        ATTR_RECORD *a;
        ntfs_attr_search_ctx *ctx;
        runlist_element *rl;
+        unsigned long flags;
        int err = 0;
        ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
@@ -85,8 +89,11 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
         * ntfs_mapping_pairs_decompress() fails.
         */
        end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-        if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1))
+        if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1)) {
+                read_lock_irqsave(&ni->size_lock, flags);
                end_vcn = ni->allocated_size >> ni->vol->cluster_size_bits;
+                read_unlock_irqrestore(&ni->size_lock, flags);
+        }
        if (unlikely(vcn >= end_vcn)) {
                err = -ENOENT;
                goto err_out;
@@ -165,6 +172,7 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
                const BOOL write_locked)
 {
        LCN lcn;
+        unsigned long flags;
        BOOL is_retry = FALSE;
        ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
@@ -173,6 +181,14 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
        BUG_ON(!ni);
        BUG_ON(!NInoNonResident(ni));
        BUG_ON(vcn < 0);
+        if (!ni->runlist.rl) {
+                read_lock_irqsave(&ni->size_lock, flags);
+                if (!ni->allocated_size) {
+                        read_unlock_irqrestore(&ni->size_lock, flags);
+                        return LCN_ENOENT;
+                }
+                read_unlock_irqrestore(&ni->size_lock, flags);
+        }
 retry_remap:
        /* Convert vcn to lcn.  If that fails map the runlist and retry once. */
        lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
@@ -255,6 +271,7 @@ retry_remap:
 runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
                const BOOL write_locked)
 {
+        unsigned long flags;
        runlist_element *rl;
        int err = 0;
        BOOL is_retry = FALSE;
@@ -265,6 +282,14 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
        BUG_ON(!ni);
        BUG_ON(!NInoNonResident(ni));
        BUG_ON(vcn < 0);
+        if (!ni->runlist.rl) {
+                read_lock_irqsave(&ni->size_lock, flags);
+                if (!ni->allocated_size) {
+                        read_unlock_irqrestore(&ni->size_lock, flags);
+                        return ERR_PTR(-ENOENT);
+                }
+                read_unlock_irqrestore(&ni->size_lock, flags);
+        }
 retry_remap:
        rl = ni->runlist.rl;
        if (likely(rl && vcn >= rl[0].vcn)) {
@@ -528,6 +553,11 @@ int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
        block_size_bits = sb->s_blocksize_bits;
        down_read(&runlist->lock);
        rl = runlist->rl;
+        if (!rl) {
+                ntfs_error(sb, "Cannot read attribute list since runlist is "
+                                "missing.");
+                goto err_out;   
+        }
        /* Read all clusters specified by the runlist one run at a time. */
        while (rl->length) {
                lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
@@ -1247,6 +1277,46 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
 }
 /**
+ * ntfs_resident_attr_value_resize - resize the value of a resident attribute
+ * @m:          mft record containing attribute record
+ * @a:          attribute record whose value to resize
+ * @new_size:   new size in bytes to which to resize the attribute value of @a
+ *
+ * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
+ * If the value is made bigger, the newly added bytes are zeroed.
+ *
+ * Return 0 on success and -errno on error.  The only error code returned is:
+ *      -ENOSPC - Not enough space in the mft record @m to perform the resize
+ *                (any failure of ntfs_attr_record_resize() is reported so).
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make the value smaller without first copying out the data
+ *          you still need, that data may be overwritten.
+ */
+int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+                const u32 new_size)
+{
+        u32 old_size;
+        /* Resize the record to hold value_offset + @new_size bytes. */
+        if (ntfs_attr_record_resize(m, a,
+                        le16_to_cpu(a->data.resident.value_offset) + new_size))
+                return -ENOSPC;
+        /*
+         * The resize succeeded!  We only update value_length below, so it is
+         * read as the old size here; if the value grew, zero the new area.
+         */
+        old_size = le32_to_cpu(a->data.resident.value_length);
+        if (new_size > old_size)
+                memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
+                                old_size, 0, new_size - old_size);
+        /* Finally store the new value length, converted to little endian. */
+        a->data.resident.value_length = cpu_to_le32(new_size);
+        return 0;
+}
+/**
 * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
 * @ni:         ntfs inode describing the attribute to convert
 *
@@ -1302,6 +1372,12 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
                return err;
        }
        /*
+         * FIXME: Compressed and encrypted attributes are not supported when
+         * writing and we should never have gotten here for them.
+         */
+        BUG_ON(NInoCompressed(ni));
+        BUG_ON(NInoEncrypted(ni));
+        /*
         * The size needs to be aligned to a cluster boundary for allocation
         * purposes.
         */
@@ -1377,10 +1453,15 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
        BUG_ON(a->non_resident);
        /*
         * Calculate new offsets for the name and the mapping pairs array.
-         * We assume the attribute is not compressed or sparse.
         */
-        name_ofs = (offsetof(ATTR_REC,
+        if (NInoSparse(ni) || NInoCompressed(ni))
-                        data.non_resident.compressed_size) + 7) & ~7;
+                name_ofs = (offsetof(ATTR_REC,
+                                data.non_resident.compressed_size) +
+                                sizeof(a->data.non_resident.compressed_size) +
+                                7) & ~7;
+        else
+                name_ofs = (offsetof(ATTR_REC,
+                                data.non_resident.compressed_size) + 7) & ~7;
        mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
        /*
         * Determine the size of the resident part of the now non-resident
@@ -1419,24 +1500,23 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
                memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
                                a->name_length * sizeof(ntfschar));
        a->name_offset = cpu_to_le16(name_ofs);
-        /*
-         * FIXME: For now just clear all of these as we do not support them
-         * when writing.
-         */
-        a->flags &= cpu_to_le16(0xffff & ~le16_to_cpu(ATTR_IS_SPARSE |
-                        ATTR_IS_ENCRYPTED | ATTR_COMPRESSION_MASK));
        /* Setup the fields specific to non-resident attributes. */
        a->data.non_resident.lowest_vcn = 0;
        a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
                        vol->cluster_size_bits);
        a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
-        a->data.non_resident.compression_unit = 0;
        memset(&a->data.non_resident.reserved, 0,
                        sizeof(a->data.non_resident.reserved));
        a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
        a->data.non_resident.data_size =
                        a->data.non_resident.initialized_size =
                        cpu_to_sle64(attr_size);
+        if (NInoSparse(ni) || NInoCompressed(ni)) {
+                a->data.non_resident.compression_unit = 4;
+                a->data.non_resident.compressed_size =
+                                a->data.non_resident.allocated_size;
+        } else
+                a->data.non_resident.compression_unit = 0;
        /* Generate the mapping pairs array into the attribute record. */
        err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
                        arec_size - mp_ofs, rl, 0, -1, NULL);
@@ -1446,16 +1526,19 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
                goto undo_err_out;
        }
        /* Setup the in-memory attribute structure to be non-resident. */
-        /*
-         * FIXME: For now just clear all of these as we do not support them
-         * when writing.
-         */
-        NInoClearSparse(ni);
-        NInoClearEncrypted(ni);
-        NInoClearCompressed(ni);
        ni->runlist.rl = rl;
        write_lock_irqsave(&ni->size_lock, flags);
        ni->allocated_size = new_size;
+        if (NInoSparse(ni) || NInoCompressed(ni)) {
+                ni->itype.compressed.size = ni->allocated_size;
+                ni->itype.compressed.block_size = 1U <<
+                                (a->data.non_resident.compression_unit +
+                                vol->cluster_size_bits);
+                ni->itype.compressed.block_size_bits =
+                                ffs(ni->itype.compressed.block_size) - 1;
+                ni->itype.compressed.block_clusters = 1U <<
+                                a->data.non_resident.compression_unit;
+        }
        write_unlock_irqrestore(&ni->size_lock, flags);
        /*
         * This needs to be last since the address space operations ->readpage
@@ -1603,6 +1686,12 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
        BUG_ON(cnt < 0);
        if (!cnt)
                goto done;
+        /*
+         * FIXME: Compressed and encrypted attributes are not supported when
+         * writing and we should never have gotten here for them.
+         */
+        BUG_ON(NInoCompressed(ni));
+        BUG_ON(NInoEncrypted(ni));
        mapping = VFS_I(ni)->i_mapping;
        /* Work out the starting index and page offset. */
        idx = ofs >> PAGE_CACHE_SHIFT;
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 0e4ac6d3c0e7..0618ed6fd7b3 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -99,6 +99,8 @@ extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
                const ATTR_TYPE type);
 extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+                const u32 new_size);
 extern int ntfs_attr_make_non_resident(ntfs_inode *ni);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6d265cfd49aa..25d24106f893 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -539,7 +539,6 @@ int ntfs_read_compressed_block(struct page *page)
        if (unlikely(!pages || !bhs)) {
                kfree(bhs);
                kfree(pages);
-                SetPageError(page);
                unlock_page(page);
                ntfs_error(vol->sb, "Failed to allocate internal buffers.");
                return -ENOMEM;
@@ -871,9 +870,6 @@ lock_retry_remap:
                        for (; prev_cur_page < cur_page; prev_cur_page++) {
                                page = pages[prev_cur_page];
                                if (page) {
-                                        if (prev_cur_page == xpage &&
-                                                        !xpage_done)
-                                                SetPageError(page);
                                        flush_dcache_page(page);
                                        kunmap(page);
                                        unlock_page(page);
@@ -904,8 +900,6 @@ lock_retry_remap:
                                        "Terminating them with extreme "
                                        "prejudice.  Inode 0x%lx, page index "
                                        "0x%lx.", ni->mft_no, page->index);
-                        if (cur_page == xpage && !xpage_done)
-                                SetPageError(page);
                        flush_dcache_page(page);
                        kunmap(page);
                        unlock_page(page);
@@ -953,8 +947,6 @@ err_out:
        for (i = cur_page; i < max_page; i++) {
                page = pages[i];
                if (page) {
-                        if (i == xpage && !xpage_done)
-                                SetPageError(page);
                        flush_dcache_page(page);
                        kunmap(page);
                        unlock_page(page);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 46779471c542..795c3d1930f5 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1051,7 +1051,8 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
                        ie->key.file_name.file_name_length, &name,
                        NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
        if (name_len <= 0) {
-                ntfs_debug("Skipping unrepresentable file.");
+                ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
+                                (long long)MREF_LE(ie->data.dir.indexed_file));
                return 0;
        }
        if (ie->key.file_name.file_attributes &
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index e0f530ce6b99..be9fd1dd423d 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
- * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
+ * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
 *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
@@ -94,6 +94,11 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
        if (!datasync || !NInoNonResident(NTFS_I(vi)))
                ret = ntfs_write_inode(vi, 1);
        write_inode_now(vi, !datasync);
+        /*
+         * NOTE: If we were to use mapping->private_list (see ext2 and
+         * fs/buffer.c) for dirty blocks then we could optimize the below to be
+         * sync_mapping_buffers(vi->i_mapping).
+         */
        err = sync_blockdev(vi->i_sb->s_bdev);
        if (unlikely(err && !ret))
                ret = err;
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 11fd5307d780..8f2d5727546f 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -205,6 +205,7 @@ int ntfs_index_lookup(const void *key, const int key_len,
                                &ie->key, key_len)) {
 ir_done:
                        ictx->is_in_root = TRUE;
+                        ictx->ir = ir;
                        ictx->actx = actx;
                        ictx->base_ni = base_ni;
                        ictx->ia = NULL;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 886214a77f90..dc4bbe3acf5c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1013,41 +1013,50 @@ skip_large_dir_stuff:
                }
                a = ctx->attr;
                /* Setup the state. */
-                if (a->non_resident) {
+                if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
-                        NInoSetNonResident(ni);
+                        if (a->flags & ATTR_COMPRESSION_MASK) {
-                        if (a->flags & (ATTR_COMPRESSION_MASK |
+                                NInoSetCompressed(ni);
-                                        ATTR_IS_SPARSE)) {
+                                if (vol->cluster_size > 4096) {
-                                if (a->flags & ATTR_COMPRESSION_MASK) {
+                                        ntfs_error(vi->i_sb, "Found "
-                                        NInoSetCompressed(ni);
-                                        if (vol->cluster_size > 4096) {
-                                                ntfs_error(vi->i_sb, "Found "
                                                        "compressed data but "
                                                        "compression is "
                                                        "disabled due to "
                                                        "cluster size (%i) > "
                                                        "4kiB.",
                                                        vol->cluster_size);
-                                                goto unm_err_out;
+                                        goto unm_err_out;
-                                        }
+                                }
-                                        if ((a->flags & ATTR_COMPRESSION_MASK)
+                                if ((a->flags & ATTR_COMPRESSION_MASK)
-                                                        != ATTR_IS_COMPRESSED) {
+                                                != ATTR_IS_COMPRESSED) {
-                                                ntfs_error(vi->i_sb, "Found "
+                                        ntfs_error(vi->i_sb, "Found unknown "
-                                                        "unknown compression "
+                                                        "compression method "
-                                                        "method or corrupt "
+                                                        "or corrupt file.");
-                                                        "file.");
+                                        goto unm_err_out;
-                                                goto unm_err_out;
-                                        }
                                }
-                                if (a->flags & ATTR_IS_SPARSE)
+                        }
-                                        NInoSetSparse(ni);
+                        if (a->flags & ATTR_IS_SPARSE)
+                                NInoSetSparse(ni);
+                }
+                if (a->flags & ATTR_IS_ENCRYPTED) {
+                        if (NInoCompressed(ni)) {
+                                ntfs_error(vi->i_sb, "Found encrypted and "
+                                                "compressed data.");
+                                goto unm_err_out;
+                        }
+                        NInoSetEncrypted(ni);
+                }
+                if (a->non_resident) {
+                        NInoSetNonResident(ni);
+                        if (NInoCompressed(ni) || NInoSparse(ni)) {
                                if (a->data.non_resident.compression_unit !=
                                                4) {
                                        ntfs_error(vi->i_sb, "Found "
-                                                "nonstandard compression unit "
+                                                        "nonstandard "
-                                                "(%u instead of 4).  Cannot "
+                                                        "compression unit (%u "
-                                                "handle this.",
+                                                        "instead of 4).  "
-                                                a->data.non_resident.
+                                                        "Cannot handle this.",
-                                                compression_unit);
+                                                        a->data.non_resident.
+                                                        compression_unit);
                                        err = -EOPNOTSUPP;
                                        goto unm_err_out;
                                }
@@ -1065,14 +1074,6 @@ skip_large_dir_stuff:
                                                a->data.non_resident.
                                                compressed_size);
                        }
-                        if (a->flags & ATTR_IS_ENCRYPTED) {
-                                if (a->flags & ATTR_COMPRESSION_MASK) {
-                                        ntfs_error(vi->i_sb, "Found encrypted "
-                                                        "and compressed data.");
-                                        goto unm_err_out;
-                                }
-                                NInoSetEncrypted(ni);
-                        }
                        if (a->data.non_resident.lowest_vcn) {
                                ntfs_error(vi->i_sb, "First extent of $DATA "
                                                "attribute has non zero "
@@ -1212,6 +1213,75 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
        if (unlikely(err))
                goto unm_err_out;
        a = ctx->attr;
+        if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+                if (a->flags & ATTR_COMPRESSION_MASK) {
+                        NInoSetCompressed(ni);
+                        if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
+                                        ni->name_len)) {
+                                ntfs_error(vi->i_sb, "Found compressed "
+                                                "non-data or named data "
+                                                "attribute.  Please report "
+                                                "you saw this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net");
+                                goto unm_err_out;
+                        }
+                        if (vol->cluster_size > 4096) {
+                                ntfs_error(vi->i_sb, "Found compressed "
+                                                "attribute but compression is "
+                                                "disabled due to cluster size "
+                                                "(%i) > 4kiB.",
+                                                vol->cluster_size);
+                                goto unm_err_out;
+                        }
+                        if ((a->flags & ATTR_COMPRESSION_MASK) !=
+                                        ATTR_IS_COMPRESSED) {
+                                ntfs_error(vi->i_sb, "Found unknown "
+                                                "compression method.");
+                                goto unm_err_out;
+                        }
+                }
+                /*
+                 * The compressed/sparse flag set in an index root just means
+                 * to compress all files.
+                 */
+                if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+                        ntfs_error(vi->i_sb, "Found mst protected attribute "
+                                        "but the attribute is %s.  Please "
+                                        "report you saw this message to "
+                                        "linux-ntfs-dev@lists.sourceforge.net",
+                                        NInoCompressed(ni) ? "compressed" :
+                                        "sparse");
+                        goto unm_err_out;
+                }
+                if (a->flags & ATTR_IS_SPARSE)
+                        NInoSetSparse(ni);
+        }
+        if (a->flags & ATTR_IS_ENCRYPTED) {
+                if (NInoCompressed(ni)) {
+                        ntfs_error(vi->i_sb, "Found encrypted and compressed "
+                                        "data.");
+                        goto unm_err_out;
+                }
+                /*
+                 * The encryption flag set in an index root just means to
+                 * encrypt all files.
+                 */
+                if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+                        ntfs_error(vi->i_sb, "Found mst protected attribute "
+                                        "but the attribute is encrypted.  "
+                                        "Please report you saw this message "
+                                        "to linux-ntfs-dev@lists.sourceforge."
+                                        "net");
+                        goto unm_err_out;
+                }
+                if (ni->type != AT_DATA) {
+                        ntfs_error(vi->i_sb, "Found encrypted non-data "
+                                        "attribute.");
+                        goto unm_err_out;
+                }
+                NInoSetEncrypted(ni);
+        }
        if (!a->non_resident) {
                /* Ensure the attribute name is placed before the value. */
                if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
@@ -1220,11 +1290,10 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
                                        "the attribute value.");
                        goto unm_err_out;
                }
-                if (NInoMstProtected(ni) || a->flags) {
+                if (NInoMstProtected(ni)) {
                        ntfs_error(vi->i_sb, "Found mst protected attribute "
-                                        "or attribute with non-zero flags but "
+                                        "but the attribute is resident.  "
-                                        "the attribute is resident.  Please "
+                                        "Please report you saw this message to "
-                                        "report you saw this message to "
                                        "linux-ntfs-dev@lists.sourceforge.net");
                        goto unm_err_out;
                }
@@ -1250,50 +1319,8 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
                                        "the mapping pairs array.");
                        goto unm_err_out;
                }
-                if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+                if ((NInoCompressed(ni) || NInoSparse(ni)) &&
-                        if (a->flags & ATTR_COMPRESSION_MASK) {
+                                ni->type != AT_INDEX_ROOT) {
-                                NInoSetCompressed(ni);
-                                if ((ni->type != AT_DATA) || (ni->type ==
-                                                AT_DATA && ni->name_len)) {
-                                        ntfs_error(vi->i_sb, "Found compressed "
-                                                        "non-data or named "
-                                                        "data attribute.  "
-                                                        "Please report you "
-                                                        "saw this message to "
-                                                        "linux-ntfs-dev@lists."
-                                                        "sourceforge.net");
-                                        goto unm_err_out;
-                                }
-                                if (vol->cluster_size > 4096) {
-                                        ntfs_error(vi->i_sb, "Found compressed "
-                                                        "attribute but "
-                                                        "compression is "
-                                                        "disabled due to "
-                                                        "cluster size (%i) > "
-                                                        "4kiB.",
-                                                        vol->cluster_size);
-                                        goto unm_err_out;
-                                }
-                                if ((a->flags & ATTR_COMPRESSION_MASK) !=
-                                                ATTR_IS_COMPRESSED) {
-                                        ntfs_error(vi->i_sb, "Found unknown "
-                                                        "compression method.");
-                                        goto unm_err_out;
-                                }
-                        }
-                        if (NInoMstProtected(ni)) {
-                                ntfs_error(vi->i_sb, "Found mst protected "
-                                                "attribute but the attribute "
-                                                "is %s.  Please report you "
-                                                "saw this message to "
-                                                "linux-ntfs-dev@lists."
-                                                "sourceforge.net",
-                                                NInoCompressed(ni) ?
-                                                "compressed" : "sparse");
-                                goto unm_err_out;
-                        }
-                        if (a->flags & ATTR_IS_SPARSE)
-                                NInoSetSparse(ni);
                        if (a->data.non_resident.compression_unit != 4) {
                                ntfs_error(vi->i_sb, "Found nonstandard "
                                                "compression unit (%u instead "
@@ -1313,23 +1340,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
                        ni->itype.compressed.size = sle64_to_cpu(
                                        a->data.non_resident.compressed_size);
                }
-                if (a->flags & ATTR_IS_ENCRYPTED) {
-                        if (a->flags & ATTR_COMPRESSION_MASK) {
-                                ntfs_error(vi->i_sb, "Found encrypted and "
-                                                "compressed data.");
-                                goto unm_err_out;
-                        }
-                        if (NInoMstProtected(ni)) {
-                                ntfs_error(vi->i_sb, "Found mst protected "
-                                                "attribute but the attribute "
-                                                "is encrypted.  Please report "
-                                                "you saw this message to "
-                                                "linux-ntfs-dev@lists."
-                                                "sourceforge.net");
-                                goto unm_err_out;
-                        }
-                        NInoSetEncrypted(ni);
-                }
                if (a->data.non_resident.lowest_vcn) {
                        ntfs_error(vi->i_sb, "First extent of attribute has "
                                        "non-zero lowest_vcn.");
@@ -1348,12 +1358,12 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
                vi->i_mapping->a_ops = &ntfs_mst_aops;
        else
                vi->i_mapping->a_ops = &ntfs_aops;
-        if (NInoCompressed(ni) || NInoSparse(ni))
+        if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
                vi->i_blocks = ni->itype.compressed.size >> 9;
        else
                vi->i_blocks = ni->allocated_size >> 9;
        /*
-         * Make sure the base inode doesn't go away and attach it to the
+         * Make sure the base inode does not go away and attach it to the
         * attribute inode.
         */
        igrab(base_vi);
@@ -1480,7 +1490,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
                                "after the attribute value.");
                goto unm_err_out;
        }
-        /* Compressed/encrypted/sparse index root is not allowed. */
+        /*
+         * Compressed/encrypted/sparse index root is not allowed, except for
+         * directories of course but those are not dealt with here.
+         */
        if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
                        ATTR_IS_SPARSE)) {
                ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
@@ -2430,16 +2443,18 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
                         * We skipped the truncate but must still update
                         * timestamps.
                         */
-                        ia_valid |= ATTR_MTIME|ATTR_CTIME;
+                        ia_valid |= ATTR_MTIME | ATTR_CTIME;
                }
        }
        if (ia_valid & ATTR_ATIME)
-                vi->i_atime = attr->ia_atime;
+                vi->i_atime = timespec_trunc(attr->ia_atime,
+                                vi->i_sb->s_time_gran);
        if (ia_valid & ATTR_MTIME)
-                vi->i_mtime = attr->ia_mtime;
+                vi->i_mtime = timespec_trunc(attr->ia_mtime,
+                                vi->i_sb->s_time_gran);
        if (ia_valid & ATTR_CTIME)
-                vi->i_ctime = attr->ia_ctime;
+                vi->i_ctime = timespec_trunc(attr->ia_ctime,
+                                vi->i_sb->s_time_gran);
        mark_inode_dirty(vi);
 out:
        return err;
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index a4bc07616e5d..7b5934290685 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -54,6 +54,8 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
        int ret = 0;
        ntfs_debug("Entering.");
+        if (!rl)
+                return 0;
        for (; rl->length; rl++) {
                int err;
@@ -163,17 +165,9 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
        BUG_ON(zone < FIRST_ZONE);
        BUG_ON(zone > LAST_ZONE);
-        /* Return empty runlist if @count == 0 */
+        /* Return NULL if @count is zero. */
-        // FIXME: Do we want to just return NULL instead? (AIA)
+        if (!count)
-        if (!count) {
+                return NULL;
-                rl = ntfs_malloc_nofs(PAGE_SIZE);
-                if (!rl)
-                        return ERR_PTR(-ENOMEM);
-                rl[0].vcn = start_vcn;
-                rl[0].lcn = LCN_RL_NOT_MAPPED;
-                rl[0].length = 0;
-                return rl;
-        }
        /* Take the lcnbmp lock for writing. */
        down_write(&vol->lcnbmp_lock);
        /*
@@ -788,7 +782,8 @@ out:
 * @vi:         vfs inode whose runlist describes the clusters to free
 * @start_vcn:  vcn in the runlist of @vi at which to start freeing clusters
 * @count:      number of clusters to free or -1 for all clusters
- * @is_rollback:        if TRUE this is a rollback operation
+ * @write_locked:       true if the runlist is locked for writing
+ * @is_rollback:        true if this is a rollback operation
 *
 * Free @count clusters starting at the cluster @start_vcn in the runlist
 * described by the vfs inode @vi.
@@ -806,17 +801,17 @@ out:
 * Return the number of deallocated clusters (not counting sparse ones) on
 * success and -errno on error.
 *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
+ * Locking: - The runlist described by @vi must be locked on entry and is
- *            unlocked on return.
+ *            locked on return.  Note if the runlist is locked for reading the
- *          - This function takes the runlist lock of @vi for reading and
+ *            lock may be dropped and reacquired.  Note the runlist may be
- *            sometimes for writing and sometimes modifies the runlist.
+ *            modified when needed runlist fragments need to be mapped.
 *          - The volume lcn bitmap must be unlocked on entry and is unlocked
 *            on return.
 *          - This function takes the volume lcn bitmap lock for writing and
 *            modifies the bitmap contents.
 */
 s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
-                const BOOL is_rollback)
+                const BOOL write_locked, const BOOL is_rollback)
 {
        s64 delta, to_free, total_freed, real_freed;
        ntfs_inode *ni;
@@ -848,8 +843,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
        total_freed = real_freed = 0;
-        down_read(&ni->runlist.lock);
+        rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, write_locked);
-        rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, FALSE);
        if (IS_ERR(rl)) {
                if (!is_rollback)
                        ntfs_error(vol->sb, "Failed to find first runlist "
@@ -903,7 +897,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
                        /* Attempt to map runlist. */
                        vcn = rl->vcn;
-                        rl = ntfs_attr_find_vcn_nolock(ni, vcn, FALSE);
+                        rl = ntfs_attr_find_vcn_nolock(ni, vcn, write_locked);
                        if (IS_ERR(rl)) {
                                err = PTR_ERR(rl);
                                if (!is_rollback)
@@ -950,7 +944,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
                /* Update the total done clusters. */
                total_freed += to_free;
        }
-        up_read(&ni->runlist.lock);
        if (likely(!is_rollback))
                up_write(&vol->lcnbmp_lock);
@@ -960,7 +953,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
        ntfs_debug("Done.");
        return real_freed;
 err_out:
-        up_read(&ni->runlist.lock);
        if (is_rollback)
                return err;
        /* If no real clusters were freed, no need to rollback. */
@@ -973,7 +965,8 @@ err_out:
         * If rollback fails, set the volume errors flag, emit an error
         * message, and return the error code.
         */
-        delta = __ntfs_cluster_free(vi, start_vcn, total_freed, TRUE);
+        delta = __ntfs_cluster_free(vi, start_vcn, total_freed, write_locked,
+                        TRUE);
        if (delta < 0) {
                ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
                                "inconsistent metadata!  Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index 4cac1c024af6..e4d7fb98d685 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -43,13 +43,14 @@ extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
                const NTFS_CLUSTER_ALLOCATION_ZONES zone);
 extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-                s64 count, const BOOL is_rollback);
+                s64 count, const BOOL write_locked, const BOOL is_rollback);
 /**
 * ntfs_cluster_free - free clusters on an ntfs volume
 * @vi:         vfs inode whose runlist describes the clusters to free
 * @start_vcn:  vcn in the runlist of @vi at which to start freeing clusters
 * @count:      number of clusters to free or -1 for all clusters
+ * @write_locked:       true if the runlist is locked for writing
 *
 * Free @count clusters starting at the cluster @start_vcn in the runlist
 * described by the vfs inode @vi.
@@ -64,19 +65,19 @@ extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
 * Return the number of deallocated clusters (not counting sparse ones) on
 * success and -errno on error.
 *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
+ * Locking: - The runlist described by @vi must be locked on entry and is
- *            unlocked on return.
+ *            locked on return.  Note if the runlist is locked for reading the
- *          - This function takes the runlist lock of @vi for reading and
+ *            lock may be dropped and reacquired.  Note the runlist may be
- *            sometimes for writing and sometimes modifies the runlist.
+ *            modified when needed runlist fragments need to be mapped.
 *          - The volume lcn bitmap must be unlocked on entry and is unlocked
 *            on return.
 *          - This function takes the volume lcn bitmap lock for writing and
 *            modifies the bitmap contents.
 */
 static inline s64 ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-                s64 count)
+                s64 count, const BOOL write_locked)
 {
-        return __ntfs_cluster_free(vi, start_vcn, count, FALSE);
+        return __ntfs_cluster_free(vi, start_vcn, count, write_locked, FALSE);
 }
 extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
@@ -93,8 +94,10 @@ extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
 *
 * Return 0 on success and -errno on error.
 *
- * Locking: This function takes the volume lcn bitmap lock for writing and
+ * Locking: - This function takes the volume lcn bitmap lock for writing and
- *          modifies the bitmap contents.
+ *            modifies the bitmap contents.
+ *          - The caller must have locked the runlist @rl for reading or
+ *            writing.
 */
 static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
                const runlist_element *rl)
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 8edb8e20fb08..0173e95500d9 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -121,7 +121,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
         */
        if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
                ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
-                                "chkdsk but a chkdsk LSN is specified.");
+                                "by chkdsk but a chkdsk LSN is specified.");
                return FALSE;
        }
        ntfs_debug("Done.");
@@ -312,10 +312,12 @@ err_out:
 * @vi:         $LogFile inode to which the restart page belongs
 * @rp:         restart page to check
 * @pos:        position in @vi at which the restart page resides
- * @wrp:        copy of the multi sector transfer deprotected restart page
+ * @wrp:        [OUT] copy of the multi sector transfer deprotected restart page
+ * @lsn:        [OUT] set to the current logfile lsn on success
 *
- * Check the restart page @rp for consistency and return TRUE if it is
+ * Check the restart page @rp for consistency and return 0 if it is consistent
- * consistent and FALSE otherwise.
+ * and -errno otherwise.  The restart page may have been modified by chkdsk in
+ * which case its magic is CHKD instead of RSTR.
 *
 * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
 * require the full restart page.
@@ -323,25 +325,33 @@ err_out:
 * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
 * copy of the complete multi sector transfer deprotected page.  On failure,
 * *@wrp is undefined.
+ *
+ * Similarly, if @lsn is not NULL, on success *@lsn will be set to the current
+ * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
+ *
+ * The following error codes are defined:
+ *      -EINVAL - The restart page is inconsistent.
+ *      -ENOMEM - Not enough memory to load the restart page.
+ *      -EIO    - Failed to read from $LogFile.
 */
-static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
+static int ntfs_check_and_load_restart_page(struct inode *vi,
-                RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp)
+                RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
+                LSN *lsn)
 {
        RESTART_AREA *ra;
        RESTART_PAGE_HEADER *trp;
-        int size;
+        int size, err;
-        BOOL ret;
        ntfs_debug("Entering.");
        /* Check the restart page header for consistency. */
        if (!ntfs_check_restart_page_header(vi, rp, pos)) {
                /* Error output already done inside the function. */
-                return FALSE;
+                return -EINVAL;
        }
        /* Check the restart area for consistency. */
        if (!ntfs_check_restart_area(vi, rp)) {
                /* Error output already done inside the function. */
-                return FALSE;
+                return -EINVAL;
        }
        ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
        /*
@@ -352,7 +362,7 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
        if (!trp) {
                ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
                                "restart page buffer.");
-                return FALSE;
+                return -ENOMEM;
        }
        /*
         * Read the whole of the restart page into the buffer.  If it fits
@@ -379,6 +389,9 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
                        if (IS_ERR(page)) {
                                ntfs_error(vi->i_sb, "Error mapping $LogFile "
                                                "page (index %lu).", idx);
+                                err = PTR_ERR(page);
+                                if (err != -EIO && err != -ENOMEM)
+                                        err = -EIO;
                                goto err_out;
                        }
                        size = min_t(int, to_read, PAGE_CACHE_SIZE);
@@ -392,29 +405,57 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
        /* Perform the multi sector transfer deprotection on the buffer. */
        if (post_read_mst_fixup((NTFS_RECORD*)trp,
                        le32_to_cpu(rp->system_page_size))) {
-                ntfs_error(vi->i_sb, "Multi sector transfer error detected in "
+                /*
-                                "$LogFile restart page.");
+                 * A multi sector transfer error was detected.  We only need to
-                goto err_out;
+                 * abort if the restart page contents exceed the multi sector
+                 * transfer fixup of the first sector.
+                 */
+                if (le16_to_cpu(rp->restart_area_offset) +
+                                le16_to_cpu(ra->restart_area_length) >
+                                NTFS_BLOCK_SIZE - sizeof(u16)) {
+                        ntfs_error(vi->i_sb, "Multi sector transfer error "
+                                        "detected in $LogFile restart page.");
+                        err = -EINVAL;
+                        goto err_out;
+                }
+        }
+        /*
+         * If the restart page is modified by chkdsk or there are no active
+         * logfile clients, the logfile is consistent.  Otherwise, need to
+         * check the log client records for consistency, too.
+         */
+        err = 0;
+        if (ntfs_is_rstr_record(rp->magic) &&
+                        ra->client_in_use_list != LOGFILE_NO_CLIENT) {
+                if (!ntfs_check_log_client_array(vi, trp)) {
+                        err = -EINVAL;
+                        goto err_out;
+                }
+        }
+        if (lsn) {
+                if (ntfs_is_rstr_record(rp->magic))
+                        *lsn = sle64_to_cpu(ra->current_lsn);
+                else /* if (ntfs_is_chkd_record(rp->magic)) */
+                        *lsn = sle64_to_cpu(rp->chkdsk_lsn);
        }
-        /* Check the log client records for consistency. */
-        ret = ntfs_check_log_client_array(vi, trp);
-        if (ret && wrp)
-                *wrp = trp;
-        else
-                ntfs_free(trp);
        ntfs_debug("Done.");
-        return ret;
+        if (wrp)
+                *wrp = trp;
+        else {
 err_out:
-        ntfs_free(trp);
+                ntfs_free(trp);
-        return FALSE;
+        }
+        return err;
 }
 /**
 * ntfs_check_logfile - check the journal for consistency
 * @log_vi:     struct inode of loaded journal $LogFile to check
+ * @rp:         [OUT] on success this is a copy of the current restart page
 *
 * Check the $LogFile journal for consistency and return TRUE if it is
- * consistent and FALSE if not.
+ * consistent and FALSE if not.  On success, the current restart page is
+ * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
 *
 * At present we only check the two restart pages and ignore the log record
 * pages.
@@ -424,19 +465,18 @@ err_out:
 * if the $LogFile was created on a system with a different page size to ours
 * yet and mst deprotection would fail if our page size is smaller.
 */
-BOOL ntfs_check_logfile(struct inode *log_vi)
+BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 {
-        s64 size, pos, rstr1_pos, rstr2_pos;
+        s64 size, pos;
+        LSN rstr1_lsn, rstr2_lsn;
        ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
        struct address_space *mapping = log_vi->i_mapping;
        struct page *page = NULL;
        u8 *kaddr = NULL;
        RESTART_PAGE_HEADER *rstr1_ph = NULL;
        RESTART_PAGE_HEADER *rstr2_ph = NULL;
-        int log_page_size, log_page_mask, ofs;
+        int log_page_size, log_page_mask, err;
        BOOL logfile_is_empty = TRUE;
-        BOOL rstr1_found = FALSE;
-        BOOL rstr2_found = FALSE;
        u8 log_page_bits;
        ntfs_debug("Entering.");
@@ -491,7 +531,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
                        if (IS_ERR(page)) {
                                ntfs_error(vol->sb, "Error mapping $LogFile "
                                                "page (index %lu).", idx);
-                                return FALSE;
+                                goto err_out;
                        }
                }
                kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
@@ -510,99 +550,95 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
                 */
                if (ntfs_is_rcrd_recordp((le32*)kaddr))
                        break;
-                /*
+                /* If not a (modified by chkdsk) restart page, continue. */
-                 * A modified by chkdsk restart page means we cannot handle
+                if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
-                 * this log file.
+                                !ntfs_is_chkd_recordp((le32*)kaddr)) {
-                 */
-                if (ntfs_is_chkd_recordp((le32*)kaddr)) {
-                        ntfs_error(vol->sb, "$LogFile has been modified by "
-                                        "chkdsk.  Mount this volume in "
-                                        "Windows.");
-                        goto err_out;
-                }
-                /* If not a restart page, continue. */
-                if (!ntfs_is_rstr_recordp((le32*)kaddr)) {
-                        /* Skip to the minimum page size for the next one. */
                        if (!pos)
                                pos = NTFS_BLOCK_SIZE >> 1;
                        continue;
                }
-                /* We now know we have a restart page. */
-                if (!pos) {
-                        rstr1_found = TRUE;
-                        rstr1_pos = pos;
-                } else {
-                        if (rstr2_found) {
-                                ntfs_error(vol->sb, "Found more than two "
-                                                "restart pages in $LogFile.");
-                                goto err_out;
-                        }
-                        rstr2_found = TRUE;
-                        rstr2_pos = pos;
-                }
                /*
-                 * Check the restart page for consistency and get a copy of the
+                 * Check the (modified by chkdsk) restart page for consistency
-                 * complete multi sector transfer deprotected restart page.
+                 * and get a copy of the complete multi sector transfer
+                 * deprotected restart page.
                 */
-                if (!ntfs_check_and_load_restart_page(log_vi,
+                err = ntfs_check_and_load_restart_page(log_vi,
                                (RESTART_PAGE_HEADER*)kaddr, pos,
-                                !pos ? &rstr1_ph : &rstr2_ph)) {
+                                !rstr1_ph ? &rstr1_ph : &rstr2_ph,
-                        /* Error output already done inside the function. */
+                                !rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
-                        goto err_out;
+                if (!err) {
+                        /*
+                         * If we have now found the first (modified by chkdsk)
+                         * restart page, continue looking for the second one.
+                         */
+                        if (!pos) {
+                                pos = NTFS_BLOCK_SIZE >> 1;
+                                continue;
+                        }
+                        /*
+                         * We have now found the second (modified by chkdsk)
+                         * restart page, so we can stop looking.
+                         */
+                        break;
                }
                /*
-                 * We have a valid restart page.  The next one must be after
+                 * Error output already done inside the function.  Note, we do
-                 * a whole system page size as specified by the valid restart
+                 * not abort if the restart page was invalid as we might still
-                 * page.
+                 * find a valid one further in the file.
                 */
+                if (err != -EINVAL) {
+                        ntfs_unmap_page(page);
+                        goto err_out;
+                }
+                /* Continue looking. */
                if (!pos)
-                        pos = le32_to_cpu(rstr1_ph->system_page_size) >> 1;
+                        pos = NTFS_BLOCK_SIZE >> 1;
        }
-        if (page) {
+        if (page)
                ntfs_unmap_page(page);
-                page = NULL;
-        }
        if (logfile_is_empty) {
                NVolSetLogFileEmpty(vol);
 is_empty:
                ntfs_debug("Done.  ($LogFile is empty.)");
                return TRUE;
        }
-        if (!rstr1_found || !rstr2_found) {
+        if (!rstr1_ph) {
-                ntfs_error(vol->sb, "Did not find two restart pages in "
+                BUG_ON(rstr2_ph);
-                                "$LogFile.");
+                ntfs_error(vol->sb, "Did not find any restart pages in "
-                goto err_out;
+                                "$LogFile and it was not empty.");
+                return FALSE;
+        }
+        /* If both restart pages were found, use the more recent one. */
+        if (rstr2_ph) {
+                /*
+                 * If the second restart area is more recent, switch to it.
+                 * Otherwise just throw it away.
+                 */
+                if (rstr2_lsn > rstr1_lsn) {
+                        ntfs_free(rstr1_ph);
+                        rstr1_ph = rstr2_ph;
+                        /* rstr1_lsn = rstr2_lsn; */
+                } else
+                        ntfs_free(rstr2_ph);
+                rstr2_ph = NULL;
        }
-        /*
-         * The two restart areas must be identical except for the update
-         * sequence number.
-         */
-        ofs = le16_to_cpu(rstr1_ph->usa_ofs);
-        if (memcmp(rstr1_ph, rstr2_ph, ofs) || (ofs += sizeof(u16),
-                        memcmp((u8*)rstr1_ph + ofs, (u8*)rstr2_ph + ofs,
-                        le32_to_cpu(rstr1_ph->system_page_size) - ofs))) {
-                ntfs_error(vol->sb, "The two restart pages in $LogFile do not "
-                                "match.");
-                goto err_out;
-        }
-        ntfs_free(rstr1_ph);
-        ntfs_free(rstr2_ph);
        /* All consistency checks passed. */
+        if (rp)
+                *rp = rstr1_ph;
+        else
+                ntfs_free(rstr1_ph);
        ntfs_debug("Done.");
        return TRUE;
 err_out:
-        if (page)
-                ntfs_unmap_page(page);
        if (rstr1_ph)
                ntfs_free(rstr1_ph);
-        if (rstr2_ph)
-                ntfs_free(rstr2_ph);
        return FALSE;
 }
 /**
 * ntfs_is_logfile_clean - check in the journal if the volume is clean
 * @log_vi:     struct inode of loaded journal $LogFile to check
+ * @rp:         copy of the current restart page
 *
 * Analyze the $LogFile journal and return TRUE if it indicates the volume was
 * shutdown cleanly and FALSE if not.
@@ -619,11 +655,9 @@ err_out:
 * is empty this function requires that NVolLogFileEmpty() is true otherwise an
 * empty volume will be reported as dirty.
 */
-BOOL ntfs_is_logfile_clean(struct inode *log_vi)
+BOOL ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 {
        ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-        struct page *page;
-        RESTART_PAGE_HEADER *rp;
        RESTART_AREA *ra;
        ntfs_debug("Entering.");
@@ -632,24 +666,15 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi)
                ntfs_debug("Done.  ($LogFile is empty.)");
                return TRUE;
        }
-        /*
+        BUG_ON(!rp);
-         * Read the first restart page.  It will be possibly incomplete and
+        if (!ntfs_is_rstr_record(rp->magic) &&
-         * will not be multi sector transfer deprotected but we only need the
+                        !ntfs_is_chkd_record(rp->magic)) {
-         * first NTFS_BLOCK_SIZE bytes so it does not matter.
+                ntfs_error(vol->sb, "Restart page buffer is invalid.  This is "
-         */
+                                "probably a bug in that the $LogFile should "
-        page = ntfs_map_page(log_vi->i_mapping, 0);
+                                "have been consistency checked before calling "
-        if (IS_ERR(page)) {
+                                "this function.");
-                ntfs_error(vol->sb, "Error mapping $LogFile page (index 0).");
                return FALSE;
        }
-        rp = (RESTART_PAGE_HEADER*)page_address(page);
-        if (!ntfs_is_rstr_record(rp->magic)) {
-                ntfs_error(vol->sb, "No restart page found at offset zero in "
-                                "$LogFile.  This is probably a bug in that "
-                                "the $LogFile should have been consistency "
-                                "checked before calling this function.");
-                goto err_out;
-        }
        ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
        /*
         * If the $LogFile has active clients, i.e. it is open, and we do not
@@ -659,15 +684,11 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi)
        if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
                        !(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
                ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
-                goto err_out;
+                return FALSE;
        }
-        ntfs_unmap_page(page);
        /* $LogFile indicates a clean shutdown. */
        ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
        return TRUE;
-err_out:
-        ntfs_unmap_page(page);
-        return FALSE;
 }
 /**
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 4ee4378de061..42388f95ea6d 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -2,7 +2,7 @@
 * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
 *             the Linux-NTFS project.
 *
- * Copyright (c) 2000-2004 Anton Altaparmakov
+ * Copyright (c) 2000-2005 Anton Altaparmakov
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
@@ -296,9 +296,11 @@ typedef struct {
 /* sizeof() = 160 (0xa0) bytes */
 } __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
-extern BOOL ntfs_check_logfile(struct inode *log_vi);
+extern BOOL ntfs_check_logfile(struct inode *log_vi,
+                RESTART_PAGE_HEADER **rp);
-extern BOOL ntfs_is_logfile_clean(struct inode *log_vi);
+extern BOOL ntfs_is_logfile_clean(struct inode *log_vi,
+                const RESTART_PAGE_HEADER *rp);
 extern BOOL ntfs_empty_logfile(struct inode *log_vi);
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index fac5944df6d8..9994e019a3cf 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -27,27 +27,63 @@
 #include <linux/highmem.h>
 /**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * __ntfs_malloc - allocate memory in multiples of pages
- * @size        number of bytes to allocate
+ * @size:       number of bytes to allocate
+ * @gfp_mask:   extra flags for the allocator
+ *
+ * Internal function.  You probably want ntfs_malloc_nofs()...
 *
 * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
 * returns a pointer to the allocated memory.
 *
 * If there was insufficient memory to complete the request, return NULL.
+ * Depending on @gfp_mask the allocation may be guaranteed to succeed.
 */
-static inline void *ntfs_malloc_nofs(unsigned long size)
+static inline void *__ntfs_malloc(unsigned long size,
+                unsigned int __nocast gfp_mask)
 {
        if (likely(size <= PAGE_SIZE)) {
                BUG_ON(!size);
                /* kmalloc() has per-CPU caches so is faster for now. */
-                return kmalloc(PAGE_SIZE, GFP_NOFS);
+                return kmalloc(PAGE_SIZE, gfp_mask);
-                /* return (void *)__get_free_page(GFP_NOFS | __GFP_HIGHMEM); */
+                /* return (void *)__get_free_page(gfp_mask); */
        }
        if (likely(size >> PAGE_SHIFT < num_physpages))
-                return __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+                return __vmalloc(size, gfp_mask, PAGE_KERNEL);
        return NULL;
 }
+/**
+ * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * @size:       number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs(unsigned long size)
+{
+        return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
+}
+/**
+ * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
+ * @size:       number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
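+ * This function is intended to guarantee that the allocation will succeed (it
+ * passes __GFP_NOFAIL to the allocator).  It will sleep for as long as it
+ * takes to complete the allocation.
+ *
+ * NULL can still be returned if @size is larger than PAGE_SIZE and spans
+ * num_physpages pages or more, so callers must still cope with (or BUG() on)
+ * a NULL return.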
+ */
+static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
+{
+        return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+}
 static inline void ntfs_free(void *addr)
 {
        if (likely(((unsigned long)addr < VMALLOC_START) ||
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 317f7c679fd3..2c32b84385a8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -511,7 +511,6 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
                } while (bh);
                tail->b_this_page = head;
                attach_page_buffers(page, head);
-                BUG_ON(!page_has_buffers(page));
        }
        bh = head = page_buffers(page);
        BUG_ON(!bh);
@@ -692,7 +691,6 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
         */
        if (!NInoTestClearDirty(ni))
                goto done;
-        BUG_ON(!page_has_buffers(page));
        bh = head = page_buffers(page);
        BUG_ON(!bh);
        rl = NULL;
@@ -1955,7 +1953,7 @@ restore_undo_alloc:
        a = ctx->attr;
        a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
 undo_alloc:
-        if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1) < 0) {
+        if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1, TRUE) < 0) {
                ntfs_error(vol->sb, "Failed to free clusters from mft data "
                                "attribute.%s", es);
                NVolSetErrors(vol);
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 758855b0414e..f5b2ac929081 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -35,7 +35,7 @@ static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
                int size)
 {
        if (likely((dst != src) && (size > 0)))
-                memmove(base + dst, base + src, size * sizeof (*base));
+                memmove(base + dst, base + src, size * sizeof(*base));
 }
 /**
@@ -95,6 +95,51 @@ static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
 }
 /**
+ * ntfs_rl_realloc_nofail - Reallocate memory for runlists
+ * @rl:         original runlist
+ * @old_size:   number of runlist elements in the original runlist @rl
+ * @new_size:   number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required.  To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B.  If the new allocation doesn't require a different number of pages in
+ *       memory, the function will return the original pointer.
+ *
+ * Return a pointer to the newly allocated, or recycled, memory.
+ *
+ * Unlike ntfs_rl_realloc(), this function never returns an error code.  Should
+ * the underlying allocation return NULL nonetheless, this function will BUG().
+ */
+static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
+                int old_size, int new_size)
+{
+        runlist_element *new_rl;
+        old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+        new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+        if (old_size == new_size)
+                return rl;
+        new_rl = ntfs_malloc_nofs_nofail(new_size);
+        BUG_ON(!new_rl);
+        if (likely(rl != NULL)) {
+                if (unlikely(old_size > new_size))
+                        old_size = new_size;
+                memcpy(new_rl, rl, old_size);
+                ntfs_free(rl);
+        }
+        return new_rl;
+}
+/**
 * ntfs_are_rl_mergeable - test if two runlists can be joined together
 * @dst:        original runlist
 * @src:        new runlist to test for mergeability with @dst
@@ -497,6 +542,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
                        /* Scan to the end of the source runlist. */
                        for (dend = 0; likely(drl[dend].length); dend++)
                                ;
+                        dend++;
                        drl = ntfs_rl_realloc(drl, dend, dend + 1);
                        if (IS_ERR(drl))
                                return drl;
@@ -566,8 +612,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
                 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
                  (srl[send - 1].vcn + srl[send - 1].length)));
-        /* Or we'll lose an end marker */
+        /* Or we will lose an end marker. */
-        if (start && finish && (drl[dins].length == 0))
+        if (finish && !drl[dins].length)
                ss++;
        if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
                finish = FALSE;
@@ -621,11 +667,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
                        if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
                                /* Add an unmapped runlist element. */
                                if (!slots) {
-                                        /* FIXME/TODO: We need to have the
+                                        drl = ntfs_rl_realloc_nofail(drl, ds,
-                                         * extra memory already! (AIA) */
+                                                        ds + 2);
-                                        drl = ntfs_rl_realloc(drl, ds, ds + 2);
-                                        if (!drl)
-                                                goto critical_error;
                                        slots = 2;
                                }
                                ds++;
@@ -640,13 +683,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
                        drl[ds].length = marker_vcn - drl[ds].vcn;
                        /* Finally add the ENOENT terminator. */
                        ds++;
-                        if (!slots) {
+                        if (!slots)
-                                /* FIXME/TODO: We need to have the extra
+                                drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
-                                 * memory already! (AIA) */
-                                drl = ntfs_rl_realloc(drl, ds, ds + 1);
-                                if (!drl)
-                                        goto critical_error;
-                        }
                        drl[ds].vcn = marker_vcn;
                        drl[ds].lcn = LCN_ENOENT;
                        drl[ds].length = (s64)0;
@@ -659,11 +697,6 @@ finished:
        ntfs_debug("Merged runlist:");
        ntfs_debug_dump_runlist(drl);
        return drl;
-critical_error:
-        /* Critical error! We cannot afford to fail here. */
-        ntfs_error(NULL, "Critical error! Not enough memory.");
-        panic("NTFS: Cannot continue.");
 }
 /**
@@ -727,6 +760,9 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
                ntfs_error(vol->sb, "Corrupt attribute.");
                return ERR_PTR(-EIO);
        }
+        /* If the mapping pairs array is valid but empty, nothing to do. */
+        if (!vcn && !*buf)
+                return old_rl;
        /* Current position in runlist array. */
        rlpos = 0;
        /* Allocate first page and set current runlist size to one page. */
@@ -1419,6 +1455,7 @@ err_out:
 /**
 * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol:        ntfs volume (needed for error output)
 * @runlist:    runlist to truncate
 * @new_length: the new length of the runlist in VCNs
 *
@@ -1426,12 +1463,16 @@ err_out:
 * holding the runlist elements to a length of @new_length VCNs.
 *
 * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded.
+ * @new_length and above are discarded.  As a special case if @new_length is
+ * zero, the runlist is discarded and set to NULL.
 *
 * If @new_length lies beyond the runlist, a sparse runlist element is added to
 * the end of the runlist @runlist or if the last runlist element is a sparse
 * one already, this is extended.
 *
+ * Note, no checking is done for unmapped runlist elements.  It is assumed that
+ * the caller has mapped any elements that need to be mapped already.
+ *
 * Return 0 on success and -errno on error.
 *
 * Locking: The caller must hold @runlist->lock for writing.
@@ -1446,6 +1487,13 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
        BUG_ON(!runlist);
        BUG_ON(new_length < 0);
        rl = runlist->rl;
+        if (!new_length) {
+                ntfs_debug("Freeing runlist.");
+                runlist->rl = NULL;
+                if (rl)
+                        ntfs_free(rl);
+                return 0;
+        }
        if (unlikely(!rl)) {
                /*
                 * Create a runlist consisting of a sparse runlist element of
@@ -1553,4 +1601,288 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
        return 0;
 }
+/**
+ * ntfs_rl_punch_nolock - punch a hole into a runlist
+ * @vol:        ntfs volume (needed for error output)
+ * @runlist:    runlist to punch a hole into
+ * @start:      starting VCN of the hole to be created
+ * @length:     size of the hole to be created in units of clusters
+ *
+ * Punch a hole into the runlist @runlist starting at VCN @start and of size
+ * @length clusters.
+ *
+ * Return 0 on success and -errno on error, in which case @runlist has not been
+ * modified.
+ *
+ * If @start and/or @start + @length are outside the runlist return error code
+ * -ENOENT.
+ *
+ * If the runlist contains unmapped or error elements between @start and @start
+ * + @length return error code -EINVAL.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+                const VCN start, const s64 length)
+{
+        const VCN end = start + length;
+        s64 delta;
+        runlist_element *rl, *rl_end, *rl_real_end, *trl;
+        int old_size;
+        BOOL lcn_fixup = FALSE;
+        ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
+                        (long long)start, (long long)length);
+        BUG_ON(!runlist);
+        BUG_ON(start < 0);
+        BUG_ON(length < 0);
+        BUG_ON(end < 0);
+        rl = runlist->rl;
+        if (unlikely(!rl)) {
+                if (likely(!start && !length))
+                        return 0;
+                return -ENOENT;
+        }
+        /* Find @start in the runlist. */
+        while (likely(rl->length && start >= rl[1].vcn))
+                rl++;
+        rl_end = rl;
+        /* Find @end in the runlist. */
+        while (likely(rl_end->length && end >= rl_end[1].vcn)) {
+                /* Verify there are no unmapped or error elements. */
+                if (unlikely(rl_end->lcn < LCN_HOLE))
+                        return -EINVAL;
+                rl_end++;
+        }
+        /* Check the last element. */
+        if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
+                return -EINVAL;
+        /* This covers @start being out of bounds, too. */
+        if (!rl_end->length && end > rl_end->vcn)
+                return -ENOENT;
+        if (!length)
+                return 0;
+        if (!rl->length)
+                return -ENOENT;
+        rl_real_end = rl_end;
+        /* Determine the runlist size. */
+        while (likely(rl_real_end->length))
+                rl_real_end++;
+        old_size = rl_real_end - runlist->rl + 1;
+        /* If @start is in a hole simply extend the hole. */
+        if (rl->lcn == LCN_HOLE) {
+                /*
+                 * If both @start and @end are in the same sparse run, we are
+                 * done.
+                 */
+                if (end <= rl[1].vcn) {
+                        ntfs_debug("Done (requested hole is already sparse).");
+                        return 0;
+                }
+extend_hole:
+                /* Extend the hole. */
+                rl->length = end - rl->vcn;
+                /* If @end is in a hole, merge it with the current one. */
+                if (rl_end->lcn == LCN_HOLE) {
+                        rl_end++;
+                        rl->length = rl_end->vcn - rl->vcn;
+                }
+                /* We have done the hole.  Now deal with the remaining tail. */
+                rl++;
+                /* Cut out all runlist elements up to @end. */
+                if (rl < rl_end)
+                        memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+                                        sizeof(*rl));
+                /* Adjust the beginning of the tail if necessary. */
+                if (end > rl->vcn) {
+                        s64 delta = end - rl->vcn;
+                        rl->vcn = end;
+                        rl->length -= delta;
+                        /* Only adjust the lcn if it is real. */
+                        if (rl->lcn >= 0)
+                                rl->lcn += delta;
+                }
+shrink_allocation:
+                /* Reallocate memory if the allocation changed. */
+                if (rl < rl_end) {
+                        rl = ntfs_rl_realloc(runlist->rl, old_size,
+                                        old_size - (rl_end - rl));
+                        if (IS_ERR(rl))
+                                ntfs_warning(vol->sb, "Failed to shrink "
+                                                "runlist buffer.  This just "
+                                                "wastes a bit of memory "
+                                                "temporarily so we ignore it "
+                                                "and return success.");
+                        else
+                                runlist->rl = rl;
+                }
+                ntfs_debug("Done (extend hole).");
+                return 0;
+        }
+        /*
+         * If @start is at the beginning of a run things are easier as there is
+         * no need to split the first run.
+         */
+        if (start == rl->vcn) {
+                /*
+                 * @start is at the beginning of a run.
+                 *
+                 * If the previous run is sparse, extend its hole.
+                 *
+                 * If @end is not in the same run, switch the run to be sparse
+                 * and extend the newly created hole.
+                 *
+                 * Thus both of these cases reduce the problem to the above
+                 * case of "@start is in a hole".
+                 */
+                if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
+                        rl--;
+                        goto extend_hole;
+                }
+                if (end >= rl[1].vcn) {
+                        rl->lcn = LCN_HOLE;
+                        goto extend_hole;
+                }
+                /*
+                 * The final case is when @end is in the same run as @start.
+                 * For this need to split the run into two.  One run for the
+                 * sparse region between the beginning of the old run, i.e.
+                 * @start, and @end and one for the remaining non-sparse
+                 * region, i.e. between @end and the end of the old run.
+                 */
+                trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+                if (IS_ERR(trl))
+                        goto enomem_out;
+                old_size++;
+                if (runlist->rl != trl) {
+                        rl = trl + (rl - runlist->rl);
+                        rl_end = trl + (rl_end - runlist->rl);
+                        rl_real_end = trl + (rl_real_end - runlist->rl);
+                        runlist->rl = trl;
+                }
+split_end:
+                /* Shift all the runs up by one. */
+                memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+                /* Finally, setup the two split runs. */
+                rl->lcn = LCN_HOLE;
+                rl->length = length;
+                rl++;
+                rl->vcn += length;
+                /* Only adjust the lcn if it is real. */
+                if (rl->lcn >= 0 || lcn_fixup)
+                        rl->lcn += length;
+                rl->length -= length;
+                ntfs_debug("Done (split one).");
+                return 0;
+        }
+        /*
+         * @start is neither in a hole nor at the beginning of a run.
+         *
+         * If @end is in a hole, things are easier as simply truncating the run
+         * @start is in to end at @start - 1, deleting all runs after that up
+         * to @end, and finally extending the beginning of the run @end is in
+         * to be @start is all that is needed.
+         */
+        if (rl_end->lcn == LCN_HOLE) {
+                /* Truncate the run containing @start. */
+                rl->length = start - rl->vcn;
+                rl++;
+                /* Cut out all runlist elements up to @end. */
+                if (rl < rl_end)
+                        memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+                                        sizeof(*rl));
+                /* Extend the beginning of the run @end is in to be @start. */
+                rl->vcn = start;
+                rl->length = rl[1].vcn - start;
+                goto shrink_allocation;
+        }
+        /* 
+         * If @end is not in a hole there are still two cases to distinguish.
+         * Either @end is or is not in the same run as @start.
+         *
+         * The second case is easier as it can be reduced to an already solved
+         * problem by truncating the run @start is in to end at @start - 1.
+         * Then, if @end is in the next run need to split the run into a sparse
+         * run followed by a non-sparse run (already covered above) and if @end
+         * is not in the next run switching it to be sparse, again reduces the
+         * problem to the already covered case of "@start is in a hole".
+         */
+        if (end >= rl[1].vcn) {
+                /*
+                 * If @end is not in the next run, reduce the problem to the
+                 * case of "@start is in a hole".
+                 */
+                if (rl[1].length && end >= rl[2].vcn) {
+                        /* Truncate the run containing @start. */
+                        rl->length = start - rl->vcn;
+                        rl++;
+                        rl->vcn = start;
+                        rl->lcn = LCN_HOLE;
+                        goto extend_hole;
+                }
+                trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+                if (IS_ERR(trl))
+                        goto enomem_out;
+                old_size++;
+                if (runlist->rl != trl) {
+                        rl = trl + (rl - runlist->rl);
+                        rl_end = trl + (rl_end - runlist->rl);
+                        rl_real_end = trl + (rl_real_end - runlist->rl);
+                        runlist->rl = trl;
+                }
+                /* Truncate the run containing @start. */
+                rl->length = start - rl->vcn;
+                rl++;
+                /*
+                 * @end is in the next run, reduce the problem to the case
+                 * where "@start is at the beginning of a run and @end is in
+                 * the same run as @start".
+                 */
+                delta = rl->vcn - start;
+                rl->vcn = start;
+                if (rl->lcn >= 0) {
+                        rl->lcn -= delta;
+                        /* Need this in case the lcn just became negative. */
+                        lcn_fixup = TRUE;
+                }
+                rl->length += delta;
+                goto split_end;
+        }
+        /*
+         * The first case from above, i.e. @end is in the same run as @start.
+         * We need to split the run into three.  One run for the non-sparse
+         * region between the beginning of the old run and @start, one for the
+         * sparse region between @start and @end, and one for the remaining
+         * non-sparse region, i.e. between @end and the end of the old run.
+         */
+        trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
+        if (IS_ERR(trl))
+                goto enomem_out;
+        old_size += 2;
+        if (runlist->rl != trl) {
+                rl = trl + (rl - runlist->rl);
+                rl_end = trl + (rl_end - runlist->rl);
+                rl_real_end = trl + (rl_real_end - runlist->rl);
+                runlist->rl = trl;
+        }
+        /* Shift all the runs up by two. */
+        memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+        /* Finally, setup the three split runs. */
+        rl->length = start - rl->vcn;
+        rl++;
+        rl->vcn = start;
+        rl->lcn = LCN_HOLE;
+        rl->length = length;
+        rl++;
+        delta = end - rl->vcn;
+        rl->vcn = end;
+        rl->lcn += delta;
+        rl->length -= delta;
+        ntfs_debug("Done (split both).");
+        return 0;
+enomem_out:
+        ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
+        return -ENOMEM;
+}
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
index aa0ee6540e7c..47728fbb610b 100644
--- a/fs/ntfs/runlist.h
+++ b/fs/ntfs/runlist.h
@@ -94,6 +94,9 @@ extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
                runlist *const runlist, const s64 new_length);
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+                const VCN start, const s64 length);
 #endif /* NTFS_RW */
 #endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 41aa8eb6755b..b2b392961268 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1133,7 +1133,8 @@ mft_unmap_out:
 *
 * Return TRUE on success or FALSE on error.
 */
-static BOOL load_and_check_logfile(ntfs_volume *vol)
+static BOOL load_and_check_logfile(ntfs_volume *vol,
+                RESTART_PAGE_HEADER **rp)
 {
        struct inode *tmp_ino;
@@ -1145,7 +1146,7 @@ static BOOL load_and_check_logfile(ntfs_volume *vol)
                /* Caller will display error message. */
                return FALSE;
        }
-        if (!ntfs_check_logfile(tmp_ino)) {
+        if (!ntfs_check_logfile(tmp_ino, rp)) {
                iput(tmp_ino);
                /* ntfs_check_logfile() will have displayed error output. */
                return FALSE;
@@ -1689,6 +1690,7 @@ static BOOL load_system_files(ntfs_volume *vol)
        VOLUME_INFORMATION *vi;
        ntfs_attr_search_ctx *ctx;
 #ifdef NTFS_RW
+        RESTART_PAGE_HEADER *rp;
        int err;
 #endif /* NTFS_RW */
@@ -1841,8 +1843,9 @@ get_ctx_vol_failed:
         * Get the inode for the logfile, check it and determine if the volume
         * was shutdown cleanly.
         */
-        if (!load_and_check_logfile(vol) ||
+        rp = NULL;
-                        !ntfs_is_logfile_clean(vol->logfile_ino)) {
+        if (!load_and_check_logfile(vol, &rp) ||
+                        !ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
                static const char *es1a = "Failed to load $LogFile";
                static const char *es1b = "$LogFile is not clean";
                static const char *es2 = ".  Mount in Windows.";
@@ -1857,6 +1860,10 @@ get_ctx_vol_failed:
                                                "continue nor on_errors="
                                                "remount-ro was specified%s",
                                                es1, es2);
+                                if (vol->logfile_ino) {
+                                        BUG_ON(!rp);
+                                        ntfs_free(rp);
+                                }
                                goto iput_logfile_err_out;
                        }
                        sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
@@ -1867,6 +1874,7 @@ get_ctx_vol_failed:
                /* This will prevent a read-write remount. */
                NVolSetErrors(vol);
        }
+        ntfs_free(rp);
 #endif /* NTFS_RW */
        /* Get the root directory inode so we can do path lookups. */
        vol->root_ino = ntfs_iget(sb, FILE_root);
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index 19c42e231b44..a389a5a16c84 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -372,7 +372,8 @@ retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
        return -EINVAL;
 conversion_err:
        ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
-                        "converted to character set %s.", nls->charset);
+                        "converted to character set %s.  You might want to "
+                        "try to use the mount option nls=utf8.", nls->charset);
        if (ns != *outs)
                kfree(ns);
        if (wc != -ENAMETOOLONG)
diff --git a/fs/open.c b/fs/open.c
index 4ee2dcc31c28..2fac58c51910 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -24,6 +24,7 @@
 #include <linux/personality.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
+#include <linux/rcupdate.h>
 #include <asm/unistd.h>
@@ -842,14 +843,16 @@ int get_unused_fd(void)
 {
        struct files_struct * files = current->files;
        int fd, error;
+        struct fdtable *fdt;
        error = -EMFILE;
        spin_lock(&files->file_lock);
 repeat:
-        fd = find_next_zero_bit(files->open_fds->fds_bits, 
+        fdt = files_fdtable(files);
-                                files->max_fdset, 
+        fd = find_next_zero_bit(fdt->open_fds->fds_bits,
-                                files->next_fd);
+                                fdt->max_fdset,
+                                fdt->next_fd);
        /*
         * N.B. For clone tasks sharing a files structure, this test
@@ -872,14 +875,14 @@ repeat:
                goto repeat;
        }
-        FD_SET(fd, files->open_fds);
+        FD_SET(fd, fdt->open_fds);
-        FD_CLR(fd, files->close_on_exec);
+        FD_CLR(fd, fdt->close_on_exec);
-        files->next_fd = fd + 1;
+        fdt->next_fd = fd + 1;
 #if 1
        /* Sanity check */
-        if (files->fd[fd] != NULL) {
+        if (fdt->fd[fd] != NULL) {
                printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd);
-                files->fd[fd] = NULL;
+                fdt->fd[fd] = NULL;
        }
 #endif
        error = fd;
@@ -893,9 +896,10 @@ EXPORT_SYMBOL(get_unused_fd);
 static inline void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
-        __FD_CLR(fd, files->open_fds);
+        struct fdtable *fdt = files_fdtable(files);  /* Release slot @fd of @files: the bitmap and next_fd are now reached through the fdtable.  Takes no lock itself; sys_close() calls it with files->file_lock held. */
-        if (fd < files->next_fd)
+        __FD_CLR(fd, fdt->open_fds);  /* clear @fd's bit in the open-fd bitmap that get_unused_fd() scans for a zero bit */
-                files->next_fd = fd;
+        if (fd < fdt->next_fd)  /* next_fd is where get_unused_fd() starts its search, so never leave it above a freed slot */
+                fdt->next_fd = fd;
 }
 void fastcall put_unused_fd(unsigned int fd)
@@ -924,10 +928,11 @@ EXPORT_SYMBOL(put_unused_fd);
 void fastcall fd_install(unsigned int fd, struct file * file)
 {
        struct files_struct *files = current->files;
+        struct fdtable *fdt;  /* fd table of the current task; fetched below once file_lock is held */
        spin_lock(&files->file_lock);
-        if (unlikely(files->fd[fd] != NULL))
+        fdt = files_fdtable(files);  /* looked up under files->file_lock */
-                BUG();
+        BUG_ON(fdt->fd[fd] != NULL);  /* slot @fd must still be empty; installing over a live file pointer is treated as a fatal bug */
-        files->fd[fd] = file;
+        rcu_assign_pointer(fdt->fd[fd], file);  /* store @file in slot @fd; NOTE(review): presumably pairs with lookups done under rcu_read_lock() (e.g. fcheck_files() in fs/proc) - confirm */
        spin_unlock(&files->file_lock);
 }
@@ -1010,15 +1015,17 @@ asmlinkage long sys_close(unsigned int fd)
 {
        struct file * filp;
        struct files_struct *files = current->files;
+        struct fdtable *fdt;
        spin_lock(&files->file_lock);
-        if (fd >= files->max_fds)
+        fdt = files_fdtable(files);
+        if (fd >= fdt->max_fds)
                goto out_unlock;
-        filp = files->fd[fd];
+        filp = fdt->fd[fd];
        if (!filp)
                goto out_unlock;
-        files->fd[fd] = NULL;
+        rcu_assign_pointer(fdt->fd[fd], NULL);
-        FD_CLR(fd, files->close_on_exec);
+        FD_CLR(fd, fdt->close_on_exec);
        __put_unused_fd(files, fd);
        spin_unlock(&files->file_lock);
        return filp_close(filp, files);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 37668fe998ad..d88d518d30f6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -159,6 +159,7 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 {
        struct group_info *group_info;
        int g;
+        struct fdtable *fdt = NULL;
        read_lock(&tasklist_lock);
        buffer += sprintf(buffer,
@@ -179,10 +180,12 @@ static inline char * task_state(struct task_struct *p, char *buffer)
                p->gid, p->egid, p->sgid, p->fsgid);
        read_unlock(&tasklist_lock);
        task_lock(p);
+        if (p->files)
+                fdt = files_fdtable(p->files);
        buffer += sprintf(buffer,
                "FDSize:\t%d\n"
                "Groups:\t",
-                p->files ? p->files->max_fds : 0);
+                fdt ? fdt->max_fds : 0);
        group_info = p->group_info;
        get_group_info(group_info);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 84751f3f52d5..23db452ab428 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -62,6 +62,7 @@
 #include <linux/namespace.h>
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
+#include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -283,16 +284,16 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
        files = get_files_struct(task);
        if (files) {
-                spin_lock(&files->file_lock);
+                rcu_read_lock();
                file = fcheck_files(files, fd);
                if (file) {
                        *mnt = mntget(file->f_vfsmnt);
                        *dentry = dget(file->f_dentry);
-                        spin_unlock(&files->file_lock);
+                        rcu_read_unlock();
                        put_files_struct(files);
                        return 0;
                }
-                spin_unlock(&files->file_lock);
+                rcu_read_unlock();
                put_files_struct(files);
        }
        return -ENOENT;
@@ -1039,6 +1040,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
        int retval;
        char buf[NUMBUF];
        struct files_struct * files;
+        struct fdtable *fdt;
        retval = -ENOENT;
        if (!pid_alive(p))
@@ -1061,15 +1063,16 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
                        files = get_files_struct(p);
                        if (!files)
                                goto out;
-                        spin_lock(&files->file_lock);
+                        rcu_read_lock();
+                        fdt = files_fdtable(files);
                        for (fd = filp->f_pos-2;
-                             fd < files->max_fds;
+                             fd < fdt->max_fds;
                             fd++, filp->f_pos++) {
                                unsigned int i,j;
                                if (!fcheck_files(files, fd))
                                        continue;
-                                spin_unlock(&files->file_lock);
+                                rcu_read_unlock();
                                j = NUMBUF;
                                i = fd;
@@ -1081,12 +1084,12 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
                                ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
                                if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
-                                        spin_lock(&files->file_lock);
+                                        rcu_read_lock();
                                        break;
                                }
-                                spin_lock(&files->file_lock);
+                                rcu_read_lock();
                        }
-                        spin_unlock(&files->file_lock);
+                        rcu_read_unlock();
                        put_files_struct(files);
        }
 out:
@@ -1261,9 +1264,9 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
        files = get_files_struct(task);
        if (files) {
-                spin_lock(&files->file_lock);
+                rcu_read_lock();
                if (fcheck_files(files, fd)) {
-                        spin_unlock(&files->file_lock);
+                        rcu_read_unlock();
                        put_files_struct(files);
                        if (task_dumpable(task)) {
                                inode->i_uid = task->euid;
@@ -1275,7 +1278,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
                        security_task_to_inode(task, inode);
                        return 1;
                }
-                spin_unlock(&files->file_lock);
+                rcu_read_unlock();
                put_files_struct(files);
        }
        d_drop(dentry);
@@ -1367,7 +1370,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
        if (!files)
                goto out_unlock;
        inode->i_mode = S_IFLNK;
-        spin_lock(&files->file_lock);
+        rcu_read_lock();
        file = fcheck_files(files, fd);
        if (!file)
                goto out_unlock2;
@@ -1375,7 +1378,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
                inode->i_mode |= S_IRUSR | S_IXUSR;
        if (file->f_mode & 2)
                inode->i_mode |= S_IWUSR | S_IXUSR;
-        spin_unlock(&files->file_lock);
+        rcu_read_unlock();
        put_files_struct(files);
        inode->i_op = &proc_pid_link_inode_operations;
        inode->i_size = 64;
@@ -1385,7 +1388,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
        return NULL;
 out_unlock2:
-        spin_unlock(&files->file_lock);
+        rcu_read_unlock();
        put_files_struct(files);
 out_unlock:
        iput(inode);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 133c28685105..effa6c0c467a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -60,6 +60,8 @@ static void proc_delete_inode(struct inode *inode)
        struct proc_dir_entry *de;
        struct task_struct *tsk;
+        truncate_inode_pages(&inode->i_data, 0);
        /* Let go of any associated process */
        tsk = PROC_I(inode)->task;
        if (tsk)
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index b79162a35478..80f32911c0cb 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -63,6 +63,7 @@ int qnx4_sync_inode(struct inode *inode)
 static void qnx4_delete_inode(struct inode *inode)
 {
        QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
+        truncate_inode_pages(&inode->i_data, 0);
        inode->i_size = 0;
        qnx4_truncate(inode);
        lock_kernel();
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ff291c973a56..1a8a1bf2154d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -33,6 +33,8 @@ void reiserfs_delete_inode(struct inode *inode)
            2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
        struct reiserfs_transaction_handle th;
+        truncate_inode_pages(&inode->i_data, 0);
        reiserfs_write_lock(inode->i_sb);
        /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
diff --git a/fs/select.c b/fs/select.c
index b80e7eb0ac0d..f10a10317d54 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -22,6 +22,7 @@
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/rcupdate.h>
 #include <asm/uaccess.h>
@@ -132,11 +133,13 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
        unsigned long *open_fds;
        unsigned long set;
        int max;
+        struct fdtable *fdt;
        /* handle last in-complete long-word first */
        set = ~(~0UL << (n & (__NFDBITS-1)));
        n /= __NFDBITS;
-        open_fds = current->files->open_fds->fds_bits+n;
+        fdt = files_fdtable(current->files);
+        open_fds = fdt->open_fds->fds_bits+n;
        max = 0;
        if (set) {
                set &= BITS(fds, n);
@@ -183,9 +186,9 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
        int retval, i;
        long __timeout = *timeout;
-        spin_lock(&current->files->file_lock);
+        rcu_read_lock();
        retval = max_select_fd(n, fds);
-        spin_unlock(&current->files->file_lock);
+        rcu_read_unlock();
        if (retval < 0)
                return retval;
@@ -299,6 +302,7 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
        char *bits;
        long timeout;
        int ret, size, max_fdset;
+        struct fdtable *fdt;
        timeout = MAX_SCHEDULE_TIMEOUT;
        if (tvp) {
@@ -326,7 +330,10 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
                goto out_nofds;
        /* max_fdset can increase, so grab it once to avoid race */
-        max_fdset = current->files->max_fdset;
+        rcu_read_lock();
+        fdt = files_fdtable(current->files);
+        max_fdset = fdt->max_fdset;
+        rcu_read_unlock();
        if (n > max_fdset)
                n = max_fdset;
@@ -464,9 +471,15 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
        unsigned int i;
        struct poll_list *head;
        struct poll_list *walk;
+        struct fdtable *fdt;
+        int max_fdset;
        /* Do a sanity check on nfds ... */
-        if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
+        rcu_read_lock();
+        fdt = files_fdtable(current->files);
+        max_fdset = fdt->max_fdset;
+        rcu_read_unlock();
+        if (nfds > max_fdset && nfds > OPEN_MAX)
                return -EINVAL;
        if (timeout) {
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 4765aaac9fd2..10b994428fef 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -331,6 +331,7 @@ static void
 smb_delete_inode(struct inode *ino)
 {
        DEBUG1("ino=%ld\n", ino->i_ino);
+        truncate_inode_pages(&ino->i_data, 0);
        lock_kernel();
        if (smb_close(ino))
                PARANOIA("could not close inode %ld\n", ino->i_ino);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0530077d9dd8..fa33eceb0011 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -292,6 +292,7 @@ int sysv_sync_inode(struct inode * inode)
 static void sysv_delete_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        inode->i_size = 0;
        sysv_truncate(inode);
        lock_kernel();
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 3d68de39fad6..b83890beaaac 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -87,6 +87,8 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 */
 void udf_delete_inode(struct inode * inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        if (is_bad_inode(inode))
                goto no_delete;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 718627ca8b5c..55f4aa16e3fc 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -804,6 +804,7 @@ int ufs_sync_inode (struct inode *inode)
 void ufs_delete_inode (struct inode * inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
        /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
        lock_kernel();
        mark_inode_dirty(inode);
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 8e18ff157247..d8c87fa21ad1 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 #
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of version 2 of the GNU General Public License as
@@ -55,7 +55,18 @@ ifeq ($(CONFIG_XFS_TRACE),y)
 endif
 obj-$(CONFIG_XFS_FS)            += xfs.o
-xfs-$(CONFIG_XFS_QUOTA)         += quota/
+xfs-$(CONFIG_XFS_QUOTA)         += $(addprefix quota/, \
+                                   xfs_dquot.o \
+                                   xfs_dquot_item.o \
+                                   xfs_trans_dquot.o \
+                                   xfs_qm_syscalls.o \
+                                   xfs_qm_bhv.o \
+                                   xfs_qm.o)
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS)           += quota/xfs_qm_stats.o
+endif
 xfs-$(CONFIG_XFS_RT)            += xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)     += xfs_acl.o
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 3dae14c8c55a..fa8394f9437d 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -170,7 +170,7 @@ ktrace_enter(
        void            *val14,
        void            *val15)
 {
-        static lock_t   wrap_lock = SPIN_LOCK_UNLOCKED;
+        static DEFINE_SPINLOCK(wrap_lock);
        unsigned long   flags;
        int             index;
        ktrace_entry_t  *ktep;