Diffstat (limited to 'net')
-rw-r--r--   net/9p/Makefile                              |    1
-rw-r--r--   net/9p/client.c                              |  161
-rw-r--r--   net/9p/fcprint.c                             |    4
-rw-r--r--   net/9p/mod.c                                 |    9
-rw-r--r--   net/9p/mux.c                                 | 1060
-rw-r--r--   net/9p/trans_fd.c                            | 1103
-rw-r--r--   net/9p/trans_virtio.c                        |  357
-rw-r--r--   net/9p/util.c                                |   20
-rw-r--r--   net/bluetooth/hidp/core.c                    |   49
-rw-r--r--   net/bluetooth/rfcomm/tty.c                   |    3
-rw-r--r--   net/core/net_namespace.c                     |    4
-rw-r--r--   net/core/rtnetlink.c                         |   44
-rw-r--r--   net/core/skbuff.c                            |   29
-rw-r--r--   net/dccp/dccp.h                              |    2
-rw-r--r--   net/dccp/ipv4.c                              |   18
-rw-r--r--   net/dccp/ipv6.c                              |   20
-rw-r--r--   net/dccp/proto.c                             |   18
-rw-r--r--   net/ipv4/cipso_ipv4.c                        |    4
-rw-r--r--   net/ipv4/fib_trie.c                          |    3
-rw-r--r--   net/ipv4/icmp.c                              |    3
-rw-r--r--   net/ipv4/inet_connection_sock.c              |    8
-rw-r--r--   net/ipv4/inet_hashtables.c                   |   64
-rw-r--r--   net/ipv4/ipvs/ip_vs_wrr.c                    |    3
-rw-r--r--   net/ipv4/tcp.c                               |    2
-rw-r--r--   net/ipv4/tcp_ipv4.c                          |   31
-rw-r--r--   net/ipv4/xfrm4_mode_beet.c                   |    2
-rw-r--r--   net/ipv6/icmp.c                              |    3
-rw-r--r--   net/ipv6/inet6_hashtables.c                  |    6
-rw-r--r--   net/ipv6/tcp_ipv6.c                          |   19
-rw-r--r--   net/mac80211/Kconfig                         |   13
-rw-r--r--   net/mac80211/ieee80211.c                     |   14
-rw-r--r--   net/mac80211/rc80211_pid_algo.c              |    2
-rw-r--r--   net/mac80211/rc80211_simple.c                |    2
-rw-r--r--   net/mac80211/rx.c                            |    7
-rw-r--r--   net/netlabel/netlabel_cipso_v4.c             |    2
-rw-r--r--   net/netlabel/netlabel_cipso_v4.h             |    3
-rw-r--r--   net/netlabel/netlabel_domainhash.h           |    1
-rw-r--r--   net/netlabel/netlabel_kapi.c                 |  177
-rw-r--r--   net/rfkill/rfkill.c                          |    2
-rw-r--r--   net/sched/cls_flow.c                         |   21
-rw-r--r--   net/sched/em_meta.c                          |   17
-rw-r--r--   net/sctp/auth.c                              |    6
-rw-r--r--   net/sctp/sm_statefuns.c                      |    8
-rw-r--r--   net/sunrpc/Makefile                          |    3
-rw-r--r--   net/sunrpc/auth_gss/svcauth_gss.c            |   93
-rw-r--r--   net/sunrpc/cache.c                           |  152
-rw-r--r--   net/sunrpc/stats.c                           |    7
-rw-r--r--   net/sunrpc/sunrpc_syms.c                     |   52
-rw-r--r--   net/sunrpc/svc.c                             |   90
-rw-r--r--   net/sunrpc/svc_xprt.c                        | 1055
-rw-r--r--   net/sunrpc/svcauth.c                         |    6
-rw-r--r--   net/sunrpc/svcauth_unix.c                    |   59
-rw-r--r--   net/sunrpc/svcsock.c                         | 1311
-rw-r--r--   net/sunrpc/sysctl.c                          |   31
-rw-r--r--   net/sunrpc/xdr.c                             |    8
-rw-r--r--   net/sunrpc/xprtrdma/Makefile                 |    5
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma.c               |  266
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_marshal.c       |  412
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_recvfrom.c      |  586
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_sendto.c        |  520
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_transport.c     | 1080
61 files changed, 6372 insertions(+), 2689 deletions(-)
diff --git a/net/9p/Makefile b/net/9p/Makefile
index d3abb246ccab..8a1051101898 100644
--- a/net/9p/Makefile
+++ b/net/9p/Makefile
@@ -4,7 +4,6 @@ obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
 
 9pnet-objs := \
 	mod.o \
-	mux.o \
 	client.o \
 	conv.o \
 	error.o \
diff --git a/net/9p/client.c b/net/9p/client.c
index af9199364049..84e087e24146 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -3,6 +3,7 @@
  *
  * 9P Client
  *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
  * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -25,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/poll.h>
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
@@ -32,15 +34,97 @@
 #include <net/9p/9p.h>
 #include <linux/parser.h>
 #include <net/9p/transport.h>
-#include <net/9p/conn.h>
 #include <net/9p/client.h>
 
 static struct p9_fid *p9_fid_create(struct p9_client *clnt);
 static void p9_fid_destroy(struct p9_fid *fid);
 static struct p9_stat *p9_clone_stat(struct p9_stat *st, int dotu);
 
-struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
-					int dotu)
+/*
+ * Client Option Parsing (code inspired by NFS code)
+ *  - a little lazy - parse all client options
+ */
+
+enum {
+	Opt_msize,
+	Opt_trans,
+	Opt_legacy,
+	Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_msize, "msize=%u"},
+	{Opt_legacy, "noextend"},
+	{Opt_trans, "trans=%s"},
+	{Opt_err, NULL},
+};
+
+/**
+ * v9fs_parse_options - parse mount options into session structure
+ * @options: options string passed from mount
+ * @v9ses: existing v9fs session information
+ *
+ */
+
+static void parse_opts(char *options, struct p9_client *clnt)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int ret;
+
+	clnt->trans_mod = v9fs_default_trans();
+	clnt->dotu = 1;
+	clnt->msize = 8192;
+
+	if (!options)
+		return;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if (token < Opt_trans) {
+			ret = match_int(&args[0], &option);
+			if (ret < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					"integer field, but no integer?\n");
+				continue;
+			}
+		}
+		switch (token) {
+		case Opt_msize:
+			clnt->msize = option;
+			break;
+		case Opt_trans:
+			clnt->trans_mod = v9fs_match_trans(&args[0]);
+			break;
+		case Opt_legacy:
+			clnt->dotu = 0;
+			break;
+		default:
+			continue;
+		}
+	}
+}
+
+
+/**
+ * p9_client_rpc - sends 9P request and waits until a response is available.
+ *	The function can be interrupted.
+ * @c: client data
+ * @tc: request to be sent
+ * @rc: pointer where a pointer to the response is stored
+ */
+int
+p9_client_rpc(struct p9_client *c, struct p9_fcall *tc,
+	struct p9_fcall **rc)
+{
+	return c->trans->rpc(c->trans, tc, rc);
+}
+
+struct p9_client *p9_client_create(const char *dev_name, char *options)
 {
 	int err, n;
 	struct p9_client *clnt;
@@ -54,12 +138,7 @@ struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
 	if (!clnt)
 		return ERR_PTR(-ENOMEM);
 
-	P9_DPRINTK(P9_DEBUG_9P, "clnt %p trans %p msize %d dotu %d\n",
-		clnt, trans, msize, dotu);
 	spin_lock_init(&clnt->lock);
-	clnt->trans = trans;
-	clnt->msize = msize;
-	clnt->dotu = dotu;
 	INIT_LIST_HEAD(&clnt->fidlist);
 	clnt->fidpool = p9_idpool_create();
 	if (!clnt->fidpool) {
@@ -68,13 +147,29 @@ struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
 		goto error;
 	}
 
-	clnt->conn = p9_conn_create(clnt->trans, clnt->msize, &clnt->dotu);
-	if (IS_ERR(clnt->conn)) {
-		err = PTR_ERR(clnt->conn);
-		clnt->conn = NULL;
+	parse_opts(options, clnt);
+	if (clnt->trans_mod == NULL) {
+		err = -EPROTONOSUPPORT;
+		P9_DPRINTK(P9_DEBUG_ERROR,
+				"No transport defined or default transport\n");
 		goto error;
 	}
 
+	P9_DPRINTK(P9_DEBUG_9P, "clnt %p trans %p msize %d dotu %d\n",
+		clnt, clnt->trans_mod, clnt->msize, clnt->dotu);
+
+
+	clnt->trans = clnt->trans_mod->create(dev_name, options, clnt->msize,
+								clnt->dotu);
+	if (IS_ERR(clnt->trans)) {
+		err = PTR_ERR(clnt->trans);
+		clnt->trans = NULL;
+		goto error;
+	}
+
+	if ((clnt->msize+P9_IOHDRSZ) > clnt->trans_mod->maxsize)
+		clnt->msize = clnt->trans_mod->maxsize-P9_IOHDRSZ;
+
 	tc = p9_create_tversion(clnt->msize, clnt->dotu?"9P2000.u":"9P2000");
 	if (IS_ERR(tc)) {
 		err = PTR_ERR(tc);
@@ -82,7 +177,7 @@ struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -117,10 +212,6 @@ void p9_client_destroy(struct p9_client *clnt)
 	struct p9_fid *fid, *fidptr;
 
 	P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt);
-	if (clnt->conn) {
-		p9_conn_destroy(clnt->conn);
-		clnt->conn = NULL;
-	}
 
 	if (clnt->trans) {
 		clnt->trans->close(clnt->trans);
@@ -142,7 +233,6 @@ void p9_client_disconnect(struct p9_client *clnt)
 {
 	P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt);
 	clnt->trans->status = Disconnected;
-	p9_conn_cancel(clnt->conn, -EIO);
 }
 EXPORT_SYMBOL(p9_client_disconnect);
 
@@ -174,7 +264,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -219,7 +309,7 @@ struct p9_fid *p9_client_auth(struct p9_client *clnt, char *uname,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -270,7 +360,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err) {
 		if (rc && rc->id == P9_RWALK)
 			goto clunk_fid;
@@ -305,7 +395,7 @@ clunk_fid:
 		goto error;
 	}
 
-	p9_conn_rpc(clnt->conn, tc, &rc);
+	p9_client_rpc(clnt, tc, &rc);
 
 error:
 	kfree(tc);
@@ -339,7 +429,7 @@ int p9_client_open(struct p9_fid *fid, int mode)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -378,7 +468,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -411,7 +501,7 @@ int p9_client_clunk(struct p9_fid *fid)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -443,7 +533,7 @@ int p9_client_remove(struct p9_fid *fid)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -485,7 +575,7 @@ int p9_client_read(struct p9_fid *fid, char *data, u64 offset, u32 count)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -542,7 +632,7 @@ int p9_client_write(struct p9_fid *fid, char *data, u64 offset, u32 count)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -596,7 +686,7 @@ p9_client_uread(struct p9_fid *fid, char __user *data, u64 offset, u32 count)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -660,7 +750,7 @@ p9_client_uwrite(struct p9_fid *fid, const char __user *data, u64 offset,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -731,7 +821,7 @@ struct p9_stat *p9_client_stat(struct p9_fid *fid)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -773,7 +863,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 
 done:
 	kfree(tc);
@@ -830,7 +920,7 @@ struct p9_stat *p9_client_dirread(struct p9_fid *fid, u64 offset)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -901,16 +991,21 @@ static struct p9_stat *p9_clone_stat(struct p9_stat *st, int dotu)
 	memmove(ret, st, sizeof(struct p9_stat));
 	p = ((char *) ret) + sizeof(struct p9_stat);
 	memmove(p, st->name.str, st->name.len);
+	ret->name.str = p;
 	p += st->name.len;
 	memmove(p, st->uid.str, st->uid.len);
+	ret->uid.str = p;
 	p += st->uid.len;
 	memmove(p, st->gid.str, st->gid.len);
+	ret->gid.str = p;
 	p += st->gid.len;
 	memmove(p, st->muid.str, st->muid.len);
+	ret->muid.str = p;
 	p += st->muid.len;
 
 	if (dotu) {
 		memmove(p, st->extension.str, st->extension.len);
+		ret->extension.str = p;
 		p += st->extension.len;
 	}
 
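[Note: the parse_opts() routine added above follows the kernel's match_token() idiom: a match_table_t maps option patterns such as "msize=%u" and "trans=%s" to enum values, strsep() splits the comma-separated mount string, and match_int() extracts numeric arguments. As a rough illustration of the same token-table idea, here is a minimal userspace sketch in plain C; it substitutes sscanf() for the kernel's match_int()/match_token(), and the struct and field names are hypothetical, not part of the patch.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for the client fields that parse_opts() fills in. */
struct opts {
	unsigned int msize;	/* maximum message size, from "msize=%u" */
	int dotu;		/* 9P2000.u extensions, cleared by "noextend" */
	char trans[16];		/* transport name, from "trans=%s" */
};

/* Walk a comma-separated option string, mimicking the strsep() loop above. */
static void parse_opts_sketch(char *options, struct opts *o)
{
	char *p;

	/* defaults, as in the patch: dotu on, msize 8192 */
	o->msize = 8192;
	o->dotu = 1;
	strcpy(o->trans, "fd");

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;	/* skip empty tokens, e.g. ",," */
		if (sscanf(p, "msize=%u", &o->msize) == 1)
			continue;
		if (sscanf(p, "trans=%15s", o->trans) == 1)
			continue;
		if (strcmp(p, "noextend") == 0)
			o->dotu = 0;
	}
}

int main(void)
{
	char options[] = "msize=4096,trans=tcp,noextend";
	struct opts o;

	parse_opts_sketch(options, &o);
	printf("msize=%u dotu=%d trans=%s\n", o.msize, o.dotu, o.trans);
	return 0;
}

Unknown options simply fall through, matching the "default: continue" behavior in the patch.]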
diff --git a/net/9p/fcprint.c b/net/9p/fcprint.c
index b1ae8ec57d54..40244fbd9b0d 100644
--- a/net/9p/fcprint.c
+++ b/net/9p/fcprint.c
@@ -347,12 +347,12 @@ p9_printfcall(char *buf, int buflen, struct p9_fcall *fc, int extended)
 
 	return ret;
 }
-
 #else
 int
 p9_printfcall(char *buf, int buflen, struct p9_fcall *fc, int extended)
 {
 	return 0;
 }
-EXPORT_SYMBOL(p9_printfcall);
 #endif /* CONFIG_NET_9P_DEBUG */
+EXPORT_SYMBOL(p9_printfcall);
+
diff --git a/net/9p/mod.c b/net/9p/mod.c
index 8f9763a9dc12..c285aab2af04 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -106,15 +106,10 @@ EXPORT_SYMBOL(v9fs_default_trans);
  */
 static int __init init_p9(void)
 {
-	int ret;
+	int ret = 0;
 
 	p9_error_init();
 	printk(KERN_INFO "Installing 9P2000 support\n");
-	ret = p9_mux_global_init();
-	if (ret) {
-		printk(KERN_WARNING "9p: starting mux failed\n");
-		return ret;
-	}
 
 	return ret;
 }
@@ -126,7 +121,7 @@ static int __init init_p9(void)
 
 static void __exit exit_p9(void)
 {
-	p9_mux_global_exit();
+	printk(KERN_INFO "Unloading 9P2000 support\n");
}
 
 module_init(init_p9)
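[Note: with the mux startup moved out of module init, init_p9()/exit_p9() reduce to the standard kernel module lifecycle. For reference, a minimal sketch of that pattern (the names here are illustrative, not from the patch):

#include <linux/module.h>
#include <linux/kernel.h>

static int __init example_init(void)
{
	printk(KERN_INFO "example: loaded\n");
	return 0;	/* a nonzero return would abort module load */
}

static void __exit example_exit(void)
{
	printk(KERN_INFO "example: unloaded\n");
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
]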
diff --git a/net/9p/mux.c b/net/9p/mux.c
deleted file mode 100644
index c9f0805048e4..000000000000
--- a/net/9p/mux.c
+++ /dev/null
@@ -1,1060 +0,0 @@
-/*
- * net/9p/mux.c
- *
- * Protocol Multiplexer
- *
- * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
- * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include <linux/kthread.h>
-#include <linux/idr.h>
-#include <linux/mutex.h>
-#include <net/9p/9p.h>
-#include <linux/parser.h>
-#include <net/9p/transport.h>
-#include <net/9p/conn.h>
-
-#define ERREQFLUSH	1
-#define SCHED_TIMEOUT	10
-#define MAXPOLLWADDR	2
-
-enum {
-	Rworksched = 1,		/* read work scheduled or running */
-	Rpending = 2,		/* can read */
-	Wworksched = 4,		/* write work scheduled or running */
-	Wpending = 8,		/* can write */
-};
-
-enum {
-	None,
-	Flushing,
-	Flushed,
-};
-
-struct p9_mux_poll_task;
-
-struct p9_req {
-	spinlock_t lock; /* protect request structure */
-	int tag;
-	struct p9_fcall *tcall;
-	struct p9_fcall *rcall;
-	int err;
-	p9_conn_req_callback cb;
-	void *cba;
-	int flush;
-	struct list_head req_list;
-};
-
-struct p9_conn {
-	spinlock_t lock; /* protect lock structure */
-	struct list_head mux_list;
-	struct p9_mux_poll_task *poll_task;
-	int msize;
-	unsigned char *extended;
-	struct p9_trans *trans;
-	struct p9_idpool *tagpool;
-	int err;
-	wait_queue_head_t equeue;
-	struct list_head req_list;
-	struct list_head unsent_req_list;
-	struct p9_fcall *rcall;
-	int rpos;
-	char *rbuf;
-	int wpos;
-	int wsize;
-	char *wbuf;
-	wait_queue_t poll_wait[MAXPOLLWADDR];
-	wait_queue_head_t *poll_waddr[MAXPOLLWADDR];
-	poll_table pt;
-	struct work_struct rq;
-	struct work_struct wq;
-	unsigned long wsched;
-};
-
-struct p9_mux_poll_task {
-	struct task_struct *task;
-	struct list_head mux_list;
-	int muxnum;
-};
-
-struct p9_mux_rpc {
-	struct p9_conn *m;
-	int err;
-	struct p9_fcall *tcall;
-	struct p9_fcall *rcall;
-	wait_queue_head_t wqueue;
-};
-
-static int p9_poll_proc(void *);
-static void p9_read_work(struct work_struct *work);
-static void p9_write_work(struct work_struct *work);
-static void p9_pollwait(struct file *filp, wait_queue_head_t *wait_address,
-			poll_table * p);
-static u16 p9_mux_get_tag(struct p9_conn *);
-static void p9_mux_put_tag(struct p9_conn *, u16);
-
-static DEFINE_MUTEX(p9_mux_task_lock);
-static struct workqueue_struct *p9_mux_wq;
-
-static int p9_mux_num;
-static int p9_mux_poll_task_num;
-static struct p9_mux_poll_task p9_mux_poll_tasks[100];
-
-int p9_mux_global_init(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++)
-		p9_mux_poll_tasks[i].task = NULL;
-
-	p9_mux_wq = create_workqueue("v9fs");
-	if (!p9_mux_wq) {
-		printk(KERN_WARNING "v9fs: mux: creating workqueue failed\n");
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-void p9_mux_global_exit(void)
-{
-	destroy_workqueue(p9_mux_wq);
-}
-
-/**
- * p9_mux_calc_poll_procs - calculates the number of polling procs
- * based on the number of mounted v9fs filesystems.
- *
- * The current implementation returns sqrt of the number of mounts.
- */
-static int p9_mux_calc_poll_procs(int muxnum)
-{
-	int n;
-
-	if (p9_mux_poll_task_num)
-		n = muxnum / p9_mux_poll_task_num +
-		    (muxnum % p9_mux_poll_task_num ? 1 : 0);
-	else
-		n = 1;
-
-	if (n > ARRAY_SIZE(p9_mux_poll_tasks))
-		n = ARRAY_SIZE(p9_mux_poll_tasks);
-
-	return n;
-}
-
-static int p9_mux_poll_start(struct p9_conn *m)
-{
-	int i, n;
-	struct p9_mux_poll_task *vpt, *vptlast;
-	struct task_struct *pproc;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, p9_mux_num,
-		p9_mux_poll_task_num);
-	mutex_lock(&p9_mux_task_lock);
-
-	n = p9_mux_calc_poll_procs(p9_mux_num + 1);
-	if (n > p9_mux_poll_task_num) {
-		for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
-			if (p9_mux_poll_tasks[i].task == NULL) {
-				vpt = &p9_mux_poll_tasks[i];
-				P9_DPRINTK(P9_DEBUG_MUX, "create proc %p\n",
-									vpt);
-				pproc = kthread_create(p9_poll_proc, vpt,
-								"v9fs-poll");
-
-				if (!IS_ERR(pproc)) {
-					vpt->task = pproc;
-					INIT_LIST_HEAD(&vpt->mux_list);
-					vpt->muxnum = 0;
-					p9_mux_poll_task_num++;
-					wake_up_process(vpt->task);
-				}
-				break;
-			}
-		}
-
-		if (i >= ARRAY_SIZE(p9_mux_poll_tasks))
-			P9_DPRINTK(P9_DEBUG_ERROR,
-					"warning: no free poll slots\n");
-	}
-
-	n = (p9_mux_num + 1) / p9_mux_poll_task_num +
-	    ((p9_mux_num + 1) % p9_mux_poll_task_num ? 1 : 0);
-
-	vptlast = NULL;
-	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
-		vpt = &p9_mux_poll_tasks[i];
-		if (vpt->task != NULL) {
-			vptlast = vpt;
-			if (vpt->muxnum < n) {
-				P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
-				list_add(&m->mux_list, &vpt->mux_list);
-				vpt->muxnum++;
-				m->poll_task = vpt;
-				memset(&m->poll_waddr, 0,
-							sizeof(m->poll_waddr));
-				init_poll_funcptr(&m->pt, p9_pollwait);
-				break;
-			}
-		}
-	}
-
-	if (i >= ARRAY_SIZE(p9_mux_poll_tasks)) {
-		if (vptlast == NULL) {
-			mutex_unlock(&p9_mux_task_lock);
-			return -ENOMEM;
-		}
-
-		P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
-		list_add(&m->mux_list, &vptlast->mux_list);
-		vptlast->muxnum++;
-		m->poll_task = vptlast;
-		memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
-		init_poll_funcptr(&m->pt, p9_pollwait);
-	}
-
-	p9_mux_num++;
-	mutex_unlock(&p9_mux_task_lock);
-
-	return 0;
-}
-
-static void p9_mux_poll_stop(struct p9_conn *m)
-{
-	int i;
-	struct p9_mux_poll_task *vpt;
-
-	mutex_lock(&p9_mux_task_lock);
-	vpt = m->poll_task;
-	list_del(&m->mux_list);
-	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
-		if (m->poll_waddr[i] != NULL) {
-			remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]);
-			m->poll_waddr[i] = NULL;
-		}
-	}
-	vpt->muxnum--;
-	if (!vpt->muxnum) {
-		P9_DPRINTK(P9_DEBUG_MUX, "destroy proc %p\n", vpt);
-		kthread_stop(vpt->task);
-		vpt->task = NULL;
-		p9_mux_poll_task_num--;
-	}
-	p9_mux_num--;
-	mutex_unlock(&p9_mux_task_lock);
-}
-
-/**
- * p9_conn_create - allocate and initialize the per-session mux data
- * Creates the polling task if this is the first session.
- *
- * @trans - transport structure
- * @msize - maximum message size
- * @extended - pointer to the extended flag
- */
-struct p9_conn *p9_conn_create(struct p9_trans *trans, int msize,
-				    unsigned char *extended)
-{
-	int i, n;
-	struct p9_conn *m, *mtmp;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "transport %p msize %d\n", trans, msize);
-	m = kmalloc(sizeof(struct p9_conn), GFP_KERNEL);
-	if (!m)
-		return ERR_PTR(-ENOMEM);
-
-	spin_lock_init(&m->lock);
-	INIT_LIST_HEAD(&m->mux_list);
-	m->msize = msize;
-	m->extended = extended;
-	m->trans = trans;
-	m->tagpool = p9_idpool_create();
-	if (IS_ERR(m->tagpool)) {
-		mtmp = ERR_PTR(-ENOMEM);
-		kfree(m);
-		return mtmp;
-	}
-
-	m->err = 0;
-	init_waitqueue_head(&m->equeue);
-	INIT_LIST_HEAD(&m->req_list);
-	INIT_LIST_HEAD(&m->unsent_req_list);
-	m->rcall = NULL;
-	m->rpos = 0;
-	m->rbuf = NULL;
-	m->wpos = m->wsize = 0;
-	m->wbuf = NULL;
-	INIT_WORK(&m->rq, p9_read_work);
-	INIT_WORK(&m->wq, p9_write_work);
-	m->wsched = 0;
-	memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
-	m->poll_task = NULL;
-	n = p9_mux_poll_start(m);
-	if (n) {
-		kfree(m);
-		return ERR_PTR(n);
-	}
-
-	n = trans->poll(trans, &m->pt);
-	if (n & POLLIN) {
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
-		set_bit(Rpending, &m->wsched);
-	}
-
-	if (n & POLLOUT) {
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
-		set_bit(Wpending, &m->wsched);
-	}
-
-	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
-		if (IS_ERR(m->poll_waddr[i])) {
-			p9_mux_poll_stop(m);
-			mtmp = (void *)m->poll_waddr;	/* the error code */
-			kfree(m);
-			m = mtmp;
-			break;
-		}
-	}
-
-	return m;
-}
-EXPORT_SYMBOL(p9_conn_create);
-
-/**
- * p9_mux_destroy - cancels all pending requests and frees mux resources
- */
-void p9_conn_destroy(struct p9_conn *m)
-{
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p prev %p next %p\n", m,
-		m->mux_list.prev, m->mux_list.next);
-	p9_conn_cancel(m, -ECONNRESET);
-
-	if (!list_empty(&m->req_list)) {
-		/* wait until all processes waiting on this session exit */
-		P9_DPRINTK(P9_DEBUG_MUX,
-			"mux %p waiting for empty request queue\n", m);
-		wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000);
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p request queue empty: %d\n", m,
-			list_empty(&m->req_list));
-	}
-
-	p9_mux_poll_stop(m);
-	m->trans = NULL;
-	p9_idpool_destroy(m->tagpool);
-	kfree(m);
-}
-EXPORT_SYMBOL(p9_conn_destroy);
-
-/**
- * p9_pollwait - called by files poll operation to add v9fs-poll task
- * 	to files wait queue
- */
-static void
-p9_pollwait(struct file *filp, wait_queue_head_t *wait_address,
-	      poll_table * p)
-{
-	int i;
-	struct p9_conn *m;
-
-	m = container_of(p, struct p9_conn, pt);
-	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++)
-		if (m->poll_waddr[i] == NULL)
-			break;
-
-	if (i >= ARRAY_SIZE(m->poll_waddr)) {
-		P9_DPRINTK(P9_DEBUG_ERROR, "not enough wait_address slots\n");
-		return;
-	}
-
-	m->poll_waddr[i] = wait_address;
-
-	if (!wait_address) {
-		P9_DPRINTK(P9_DEBUG_ERROR, "no wait_address\n");
-		m->poll_waddr[i] = ERR_PTR(-EIO);
-		return;
-	}
-
-	init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task);
-	add_wait_queue(wait_address, &m->poll_wait[i]);
-}
-
-/**
- * p9_poll_mux - polls a mux and schedules read or write works if necessary
- */
-static void p9_poll_mux(struct p9_conn *m)
-{
-	int n;
-
-	if (m->err < 0)
-		return;
-
-	n = m->trans->poll(m->trans, NULL);
-	if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
-		P9_DPRINTK(P9_DEBUG_MUX, "error mux %p err %d\n", m, n);
-		if (n >= 0)
-			n = -ECONNRESET;
-		p9_conn_cancel(m, n);
-	}
-
-	if (n & POLLIN) {
-		set_bit(Rpending, &m->wsched);
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
-		if (!test_and_set_bit(Rworksched, &m->wsched)) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
-			queue_work(p9_mux_wq, &m->rq);
-		}
-	}
-
-	if (n & POLLOUT) {
-		set_bit(Wpending, &m->wsched);
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
-		if ((m->wsize || !list_empty(&m->unsent_req_list))
-		    && !test_and_set_bit(Wworksched, &m->wsched)) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
-			queue_work(p9_mux_wq, &m->wq);
-		}
-	}
-}
-
-/**
- * p9_poll_proc - polls all v9fs transports for new events and queues
- * 	the appropriate work to the work queue
- */
-static int p9_poll_proc(void *a)
-{
-	struct p9_conn *m, *mtmp;
-	struct p9_mux_poll_task *vpt;
-
-	vpt = a;
-	P9_DPRINTK(P9_DEBUG_MUX, "start %p %p\n", current, vpt);
-	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) {
-			p9_poll_mux(m);
-		}
-
-		P9_DPRINTK(P9_DEBUG_MUX, "sleeping...\n");
-		schedule_timeout(SCHED_TIMEOUT * HZ);
-	}
-
-	__set_current_state(TASK_RUNNING);
-	P9_DPRINTK(P9_DEBUG_MUX, "finish\n");
-	return 0;
-}
-
-/**
- * p9_write_work - called when a transport can send some data
- */
-static void p9_write_work(struct work_struct *work)
-{
-	int n, err;
-	struct p9_conn *m;
-	struct p9_req *req;
-
-	m = container_of(work, struct p9_conn, wq);
-
-	if (m->err < 0) {
-		clear_bit(Wworksched, &m->wsched);
-		return;
-	}
-
-	if (!m->wsize) {
-		if (list_empty(&m->unsent_req_list)) {
-			clear_bit(Wworksched, &m->wsched);
-			return;
-		}
-
-		spin_lock(&m->lock);
-again:
-		req = list_entry(m->unsent_req_list.next, struct p9_req,
-			       req_list);
-		list_move_tail(&req->req_list, &m->req_list);
-		if (req->err == ERREQFLUSH)
-			goto again;
-
-		m->wbuf = req->tcall->sdata;
-		m->wsize = req->tcall->size;
-		m->wpos = 0;
-		spin_unlock(&m->lock);
-	}
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos,
-								m->wsize);
-	clear_bit(Wpending, &m->wsched);
-	err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos);
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p sent %d bytes\n", m, err);
-	if (err == -EAGAIN) {
-		clear_bit(Wworksched, &m->wsched);
-		return;
-	}
-
-	if (err < 0)
-		goto error;
-	else if (err == 0) {
-		err = -EREMOTEIO;
-		goto error;
-	}
-
-	m->wpos += err;
-	if (m->wpos == m->wsize)
-		m->wpos = m->wsize = 0;
-
-	if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
-		if (test_and_clear_bit(Wpending, &m->wsched))
-			n = POLLOUT;
-		else
-			n = m->trans->poll(m->trans, NULL);
-
-		if (n & POLLOUT) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
-			queue_work(p9_mux_wq, &m->wq);
-		} else
-			clear_bit(Wworksched, &m->wsched);
-	} else
-		clear_bit(Wworksched, &m->wsched);
-
-	return;
-
-error:
-	p9_conn_cancel(m, err);
-	clear_bit(Wworksched, &m->wsched);
-}
-
-static void process_request(struct p9_conn *m, struct p9_req *req)
-{
-	int ecode;
-	struct p9_str *ename;
-
-	if (!req->err && req->rcall->id == P9_RERROR) {
-		ecode = req->rcall->params.rerror.errno;
-		ename = &req->rcall->params.rerror.error;
-
-		P9_DPRINTK(P9_DEBUG_MUX, "Rerror %.*s\n", ename->len,
-								ename->str);
-
-		if (*m->extended)
-			req->err = -ecode;
-
-		if (!req->err) {
-			req->err = p9_errstr2errno(ename->str, ename->len);
-
-			if (!req->err) {	/* string match failed */
-				PRINT_FCALL_ERROR("unknown error", req->rcall);
-			}
-
-			if (!req->err)
-				req->err = -ESERVERFAULT;
-		}
-	} else if (req->tcall && req->rcall->id != req->tcall->id + 1) {
-		P9_DPRINTK(P9_DEBUG_ERROR,
-				"fcall mismatch: expected %d, got %d\n",
-				req->tcall->id + 1, req->rcall->id);
-		if (!req->err)
-			req->err = -EIO;
-	}
-}
-
-/**
- * p9_read_work - called when there is some data to be read from a transport
- */
-static void p9_read_work(struct work_struct *work)
-{
-	int n, err;
-	struct p9_conn *m;
-	struct p9_req *req, *rptr, *rreq;
-	struct p9_fcall *rcall;
-	char *rbuf;
-
-	m = container_of(work, struct p9_conn, rq);
-
-	if (m->err < 0)
-		return;
-
-	rcall = NULL;
-	P9_DPRINTK(P9_DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos);
-
-	if (!m->rcall) {
-		m->rcall =
-		    kmalloc(sizeof(struct p9_fcall) + m->msize, GFP_KERNEL);
-		if (!m->rcall) {
-			err = -ENOMEM;
-			goto error;
-		}
-
-		m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
-		m->rpos = 0;
-	}
-
-	clear_bit(Rpending, &m->wsched);
-	err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos);
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p got %d bytes\n", m, err);
-	if (err == -EAGAIN) {
-		clear_bit(Rworksched, &m->wsched);
-		return;
-	}
-
-	if (err <= 0)
-		goto error;
-
-	m->rpos += err;
-	while (m->rpos > 4) {
-		n = le32_to_cpu(*(__le32 *) m->rbuf);
-		if (n >= m->msize) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				"requested packet size too big: %d\n", n);
-			err = -EIO;
-			goto error;
-		}
-
-		if (m->rpos < n)
-			break;
-
-		err =
-		    p9_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended);
-		if (err < 0) {
-			goto error;
-		}
-
-#ifdef CONFIG_NET_9P_DEBUG
-		if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
-			char buf[150];
-
-			p9_printfcall(buf, sizeof(buf), m->rcall,
-				*m->extended);
-			printk(KERN_NOTICE ">>> %p %s\n", m, buf);
-		}
-#endif
-
-		rcall = m->rcall;
-		rbuf = m->rbuf;
-		if (m->rpos > n) {
-			m->rcall = kmalloc(sizeof(struct p9_fcall) + m->msize,
-					   GFP_KERNEL);
-			if (!m->rcall) {
-				err = -ENOMEM;
-				goto error;
-			}
-
-			m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
-			memmove(m->rbuf, rbuf + n, m->rpos - n);
-			m->rpos -= n;
-		} else {
-			m->rcall = NULL;
-			m->rbuf = NULL;
-			m->rpos = 0;
-		}
-
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p fcall id %d tag %d\n", m,
-							rcall->id, rcall->tag);
-
-		req = NULL;
-		spin_lock(&m->lock);
-		list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
-			if (rreq->tag == rcall->tag) {
-				req = rreq;
-				if (req->flush != Flushing)
-					list_del(&req->req_list);
-				break;
-			}
-		}
-		spin_unlock(&m->lock);
-
-		if (req) {
-			req->rcall = rcall;
-			process_request(m, req);
-
-			if (req->flush != Flushing) {
-				if (req->cb)
-					(*req->cb) (req, req->cba);
-				else
-					kfree(req->rcall);
-
-				wake_up(&m->equeue);
-			}
-		} else {
-			if (err >= 0 && rcall->id != P9_RFLUSH)
-				P9_DPRINTK(P9_DEBUG_ERROR,
-				  "unexpected response mux %p id %d tag %d\n",
-				  m, rcall->id, rcall->tag);
-			kfree(rcall);
-		}
-	}
-
-	if (!list_empty(&m->req_list)) {
-		if (test_and_clear_bit(Rpending, &m->wsched))
-			n = POLLIN;
-		else
-			n = m->trans->poll(m->trans, NULL);
-
-		if (n & POLLIN) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
-			queue_work(p9_mux_wq, &m->rq);
-		} else
-			clear_bit(Rworksched, &m->wsched);
-	} else
-		clear_bit(Rworksched, &m->wsched);
-
-	return;
-
-error:
-	p9_conn_cancel(m, err);
-	clear_bit(Rworksched, &m->wsched);
-}
-
-/**
- * p9_send_request - send 9P request
- * The function can sleep until the request is scheduled for sending.
- * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent successfully. Can return errors
- * that can be retrieved by PTR_ERR macros.
- *
- * @m: mux data
- * @tc: request to be sent
- * @cb: callback function to call when response is received
- * @cba: parameter to pass to the callback function
- */
-static struct p9_req *p9_send_request(struct p9_conn *m,
-					  struct p9_fcall *tc,
-					  p9_conn_req_callback cb, void *cba)
-{
-	int n;
-	struct p9_req *req;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current,
-		tc, tc->id);
-	if (m->err < 0)
-		return ERR_PTR(m->err);
-
-	req = kmalloc(sizeof(struct p9_req), GFP_KERNEL);
-	if (!req)
-		return ERR_PTR(-ENOMEM);
-
-	if (tc->id == P9_TVERSION)
-		n = P9_NOTAG;
-	else
-		n = p9_mux_get_tag(m);
-
-	if (n < 0)
-		return ERR_PTR(-ENOMEM);
-
-	p9_set_tag(tc, n);
-
-#ifdef CONFIG_NET_9P_DEBUG
-	if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
-		char buf[150];
-
-		p9_printfcall(buf, sizeof(buf), tc, *m->extended);
-		printk(KERN_NOTICE "<<< %p %s\n", m, buf);
-	}
-#endif
-
-	spin_lock_init(&req->lock);
-	req->tag = n;
-	req->tcall = tc;
-	req->rcall = NULL;
-	req->err = 0;
-	req->cb = cb;
-	req->cba = cba;
-	req->flush = None;
-
-	spin_lock(&m->lock);
-	list_add_tail(&req->req_list, &m->unsent_req_list);
-	spin_unlock(&m->lock);
-
-	if (test_and_clear_bit(Wpending, &m->wsched))
-		n = POLLOUT;
-	else
-		n = m->trans->poll(m->trans, NULL);
-
-	if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
-		queue_work(p9_mux_wq, &m->wq);
-
-	return req;
-}
-
-static void p9_mux_free_request(struct p9_conn *m, struct p9_req *req)
-{
-	p9_mux_put_tag(m, req->tag);
-	kfree(req);
-}
-
-static void p9_mux_flush_cb(struct p9_req *freq, void *a)
-{
-	p9_conn_req_callback cb;
-	int tag;
-	struct p9_conn *m;
-	struct p9_req *req, *rreq, *rptr;
-
-	m = a;
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m,
-		freq->tcall, freq->rcall, freq->err,
-		freq->tcall->params.tflush.oldtag);
-
-	spin_lock(&m->lock);
-	cb = NULL;
-	tag = freq->tcall->params.tflush.oldtag;
-	req = NULL;
-	list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
-		if (rreq->tag == tag) {
-			req = rreq;
-			list_del(&req->req_list);
-			break;
-		}
-	}
-	spin_unlock(&m->lock);
-
-	if (req) {
-		spin_lock(&req->lock);
-		req->flush = Flushed;
-		spin_unlock(&req->lock);
-
-		if (req->cb)
-			(*req->cb) (req, req->cba);
-		else
-			kfree(req->rcall);
-
-		wake_up(&m->equeue);
-	}
-
-	kfree(freq->tcall);
-	kfree(freq->rcall);
-	p9_mux_free_request(m, freq);
-}
-
-static int
-p9_mux_flush_request(struct p9_conn *m, struct p9_req *req)
-{
-	struct p9_fcall *fc;
-	struct p9_req *rreq, *rptr;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag);
-
-	/* if a response was received for a request, do nothing */
-	spin_lock(&req->lock);
-	if (req->rcall || req->err) {
-		spin_unlock(&req->lock);
-		P9_DPRINTK(P9_DEBUG_MUX,
-			"mux %p req %p response already received\n", m, req);
-		return 0;
-	}
-
-	req->flush = Flushing;
-	spin_unlock(&req->lock);
-
-	spin_lock(&m->lock);
-	/* if the request is not sent yet, just remove it from the list */
-	list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) {
-		if (rreq->tag == req->tag) {
-			P9_DPRINTK(P9_DEBUG_MUX,
-			   "mux %p req %p request is not sent yet\n", m, req);
-			list_del(&rreq->req_list);
-			req->flush = Flushed;
-			spin_unlock(&m->lock);
-			if (req->cb)
-				(*req->cb) (req, req->cba);
-			return 0;
-		}
-	}
-	spin_unlock(&m->lock);
-
-	clear_thread_flag(TIF_SIGPENDING);
-	fc = p9_create_tflush(req->tag);
-	p9_send_request(m, fc, p9_mux_flush_cb, m);
-	return 1;
-}
-
-static void
-p9_conn_rpc_cb(struct p9_req *req, void *a)
-{
-	struct p9_mux_rpc *r;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "req %p r %p\n", req, a);
-	r = a;
-	r->rcall = req->rcall;
-	r->err = req->err;
-
-	if (req->flush != None && !req->err)
-		r->err = -ERESTARTSYS;
-
-	wake_up(&r->wqueue);
-}
-
-/**
- * p9_mux_rpc - sends 9P request and waits until a response is available.
- *	The function can be interrupted.
- * @m: mux data
- * @tc: request to be sent
- * @rc: pointer where a pointer to the response is stored
- */
-int
-p9_conn_rpc(struct p9_conn *m, struct p9_fcall *tc,
-	     struct p9_fcall **rc)
-{
-	int err, sigpending;
-	unsigned long flags;
-	struct p9_req *req;
-	struct p9_mux_rpc r;
-
-	r.err = 0;
-	r.tcall = tc;
-	r.rcall = NULL;
-	r.m = m;
-	init_waitqueue_head(&r.wqueue);
-
-	if (rc)
-		*rc = NULL;
-
-	sigpending = 0;
-	if (signal_pending(current)) {
-		sigpending = 1;
-		clear_thread_flag(TIF_SIGPENDING);
-	}
-
-	req = p9_send_request(m, tc, p9_conn_rpc_cb, &r);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
-		return err;
-	}
-
-	err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0);
-	if (r.err < 0)
-		err = r.err;
-
-	if (err == -ERESTARTSYS && m->trans->status == Connected
-							&& m->err == 0) {
-		if (p9_mux_flush_request(m, req)) {
-			/* wait until we get response of the flush message */
-			do {
-				clear_thread_flag(TIF_SIGPENDING);
-				err = wait_event_interruptible(r.wqueue,
-					r.rcall || r.err);
-			} while (!r.rcall && !r.err && err == -ERESTARTSYS &&
-				m->trans->status == Connected && !m->err);
-
-			err = -ERESTARTSYS;
-		}
-		sigpending = 1;
-	}
-
-	if (sigpending) {
-		spin_lock_irqsave(&current->sighand->siglock, flags);
-		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
-	}
-
-	if (rc)
-		*rc = r.rcall;
-	else
-		kfree(r.rcall);
-
-	p9_mux_free_request(m, req);
-	if (err > 0)
-		err = -EIO;
-
-	return err;
-}
-EXPORT_SYMBOL(p9_conn_rpc);
-
-#ifdef P9_NONBLOCK
-/**
- * p9_conn_rpcnb - sends 9P request without waiting for response.
- * @m: mux data
- * @tc: request to be sent
- * @cb: callback function to be called when response arrives
- * @cba: value to pass to the callback function
- */
-int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc,
-		   p9_conn_req_callback cb, void *a)
-{
-	int err;
-	struct p9_req *req;
-
-	req = p9_send_request(m, tc, cb, a);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
-		return PTR_ERR(req);
-	}
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag);
-	return 0;
-}
-EXPORT_SYMBOL(p9_conn_rpcnb);
-#endif /* P9_NONBLOCK */
-
-/**
- * p9_conn_cancel - cancel all pending requests with error
- * @m: mux data
- * @err: error code
- */
-void p9_conn_cancel(struct p9_conn *m, int err)
-{
-	struct p9_req *req, *rtmp;
-	LIST_HEAD(cancel_list);
-
-	P9_DPRINTK(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
-	m->err = err;
-	spin_lock(&m->lock);
-	list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
-		list_move(&req->req_list, &cancel_list);
-	}
-	list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
-		list_move(&req->req_list, &cancel_list);
-	}
-	spin_unlock(&m->lock);
-
-	list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
-		list_del(&req->req_list);
-		if (!req->err)
-			req->err = err;
-
-		if (req->cb)
-			(*req->cb) (req, req->cba);
-		else
-			kfree(req->rcall);
-	}
-
-	wake_up(&m->equeue);
-}
-EXPORT_SYMBOL(p9_conn_cancel);
-
-static u16 p9_mux_get_tag(struct p9_conn *m)
-{
-	int tag;
-
-	tag = p9_idpool_get(m->tagpool);
-	if (tag < 0)
-		return P9_NOTAG;
-	else
-		return (u16) tag;
-}
-
-static void p9_mux_put_tag(struct p9_conn *m, u16 tag)
-{
-	if (tag != P9_NOTAG && p9_idpool_check(tag, m->tagpool))
-		p9_idpool_put(tag, m->tagpool);
-}
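[Note: the read loop deleted above (and re-created in trans_fd.c below) relies on 9P's wire framing: every message begins with a 4-byte little-endian size field that covers the whole message, so the demuxer buffers bytes until at least the size prefix is present, peels off complete messages, and shifts any remainder to the front of the buffer. A minimal userspace sketch of that framing logic, assuming a caller-managed buffer; the function names are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode the 4-byte little-endian size prefix of a 9P message. */
static uint32_t p9_msg_size(const unsigned char *buf)
{
	return buf[0] | (buf[1] << 8) | (buf[2] << 16) |
		((uint32_t)buf[3] << 24);
}

/*
 * Consume complete messages from buf[0..*pos); mirrors the
 * "while (m->rpos > 4)" loop above: stop on a partial message
 * and memmove the tail to the front, as the mux does with rbuf.
 */
static void demux(unsigned char *buf, int *pos)
{
	while (*pos > 4) {
		uint32_t n = p9_msg_size(buf);

		if (*pos < (int)n)
			break;		/* partial message: wait for more data */
		printf("got complete message of %u bytes\n", n);
		memmove(buf, buf + n, *pos - n);
		*pos -= n;
	}
}

The kernel version additionally rejects any size larger than the negotiated msize, since a well-behaved peer can never legally send one.]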
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 62332ed9da4a..1aa9d5175398 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -5,7 +5,7 @@
  *
  * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
  * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
- * Copyright (C) 2004-2007 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
  * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/net.h>
 #include <linux/ipv6.h>
+#include <linux/kthread.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/un.h>
@@ -42,7 +43,9 @@
 
 #define P9_PORT 564
 #define MAX_SOCK_BUF (64*1024)
-
+#define ERREQFLUSH	1
+#define SCHED_TIMEOUT	10
+#define MAXPOLLWADDR	2
 
 struct p9_fd_opts {
 	int rfd;
@@ -53,6 +56,7 @@ struct p9_fd_opts {
 struct p9_trans_fd {
 	struct file *rd;
 	struct file *wr;
+	struct p9_conn *conn;
 };
 
 /*
@@ -72,6 +76,1028 @@ static match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
+enum {
+	Rworksched = 1,		/* read work scheduled or running */
+	Rpending = 2,		/* can read */
+	Wworksched = 4,		/* write work scheduled or running */
+	Wpending = 8,		/* can write */
+};
+
+enum {
+	None,
+	Flushing,
+	Flushed,
+};
+
+struct p9_req;
+
+typedef void (*p9_conn_req_callback)(struct p9_req *req, void *a);
+struct p9_req {
+	spinlock_t lock; /* protect request structure */
+	int tag;
+	struct p9_fcall *tcall;
+	struct p9_fcall *rcall;
+	int err;
+	p9_conn_req_callback cb;
+	void *cba;
+	int flush;
+	struct list_head req_list;
+};
+
+struct p9_mux_poll_task;
+
+struct p9_conn {
+	spinlock_t lock; /* protect lock structure */
+	struct list_head mux_list;
+	struct p9_mux_poll_task *poll_task;
+	int msize;
+	unsigned char extended;
+	struct p9_trans *trans;
+	struct p9_idpool *tagpool;
+	int err;
+	wait_queue_head_t equeue;
+	struct list_head req_list;
+	struct list_head unsent_req_list;
+	struct p9_fcall *rcall;
+	int rpos;
+	char *rbuf;
+	int wpos;
+	int wsize;
+	char *wbuf;
+	wait_queue_t poll_wait[MAXPOLLWADDR];
+	wait_queue_head_t *poll_waddr[MAXPOLLWADDR];
+	poll_table pt;
+	struct work_struct rq;
+	struct work_struct wq;
+	unsigned long wsched;
+};
+
+struct p9_mux_poll_task {
+	struct task_struct *task;
+	struct list_head mux_list;
+	int muxnum;
+};
+
+struct p9_mux_rpc {
+	struct p9_conn *m;
+	int err;
+	struct p9_fcall *tcall;
+	struct p9_fcall *rcall;
+	wait_queue_head_t wqueue;
+};
+
+static int p9_poll_proc(void *);
+static void p9_read_work(struct work_struct *work);
+static void p9_write_work(struct work_struct *work);
+static void p9_pollwait(struct file *filp, wait_queue_head_t *wait_address,
+								poll_table *p);
+static int p9_fd_write(struct p9_trans *trans, void *v, int len);
+static int p9_fd_read(struct p9_trans *trans, void *v, int len);
+
+static DEFINE_MUTEX(p9_mux_task_lock);
+static struct workqueue_struct *p9_mux_wq;
+
+static int p9_mux_num;
+static int p9_mux_poll_task_num;
+static struct p9_mux_poll_task p9_mux_poll_tasks[100];
+
+static void p9_conn_destroy(struct p9_conn *);
+static unsigned int p9_fd_poll(struct p9_trans *trans,
+						struct poll_table_struct *pt);
+
+#ifdef P9_NONBLOCK
+static int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc,
+	p9_conn_req_callback cb, void *a);
+#endif /* P9_NONBLOCK */
+
+static void p9_conn_cancel(struct p9_conn *m, int err);
+
+static int p9_mux_global_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++)
+		p9_mux_poll_tasks[i].task = NULL;
+
+	p9_mux_wq = create_workqueue("v9fs");
+	if (!p9_mux_wq) {
+		printk(KERN_WARNING "v9fs: mux: creating workqueue failed\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static u16 p9_mux_get_tag(struct p9_conn *m)
+{
+	int tag;
+
+	tag = p9_idpool_get(m->tagpool);
+	if (tag < 0)
+		return P9_NOTAG;
+	else
+		return (u16) tag;
+}
+
+static void p9_mux_put_tag(struct p9_conn *m, u16 tag)
+{
+	if (tag != P9_NOTAG && p9_idpool_check(tag, m->tagpool))
+		p9_idpool_put(tag, m->tagpool);
+}
+
+/**
+ * p9_mux_calc_poll_procs - calculates the number of polling procs
+ * based on the number of mounted v9fs filesystems.
+ *
+ * The current implementation returns sqrt of the number of mounts.
+ */
+static int p9_mux_calc_poll_procs(int muxnum)
+{
+	int n;
+
+	if (p9_mux_poll_task_num)
+		n = muxnum / p9_mux_poll_task_num +
+		    (muxnum % p9_mux_poll_task_num ? 1 : 0);
+	else
+		n = 1;
+
+	if (n > ARRAY_SIZE(p9_mux_poll_tasks))
+		n = ARRAY_SIZE(p9_mux_poll_tasks);
+
+	return n;
+}
+
+static int p9_mux_poll_start(struct p9_conn *m)
+{
+	int i, n;
+	struct p9_mux_poll_task *vpt, *vptlast;
+	struct task_struct *pproc;
+
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, p9_mux_num,
+		p9_mux_poll_task_num);
+	mutex_lock(&p9_mux_task_lock);
+
+	n = p9_mux_calc_poll_procs(p9_mux_num + 1);
+	if (n > p9_mux_poll_task_num) {
+		for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
+			if (p9_mux_poll_tasks[i].task == NULL) {
+				vpt = &p9_mux_poll_tasks[i];
+				P9_DPRINTK(P9_DEBUG_MUX, "create proc %p\n",
+									vpt);
+				pproc = kthread_create(p9_poll_proc, vpt,
+								"v9fs-poll");
+
+				if (!IS_ERR(pproc)) {
+					vpt->task = pproc;
+					INIT_LIST_HEAD(&vpt->mux_list);
+					vpt->muxnum = 0;
+					p9_mux_poll_task_num++;
+					wake_up_process(vpt->task);
+				}
+				break;
+			}
+		}
+
+		if (i >= ARRAY_SIZE(p9_mux_poll_tasks))
+			P9_DPRINTK(P9_DEBUG_ERROR,
+					"warning: no free poll slots\n");
+	}
+
+	n = (p9_mux_num + 1) / p9_mux_poll_task_num +
+	    ((p9_mux_num + 1) % p9_mux_poll_task_num ? 1 : 0);
+
+	vptlast = NULL;
+	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
+		vpt = &p9_mux_poll_tasks[i];
+		if (vpt->task != NULL) {
+			vptlast = vpt;
+			if (vpt->muxnum < n) {
+				P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
+				list_add(&m->mux_list, &vpt->mux_list);
+				vpt->muxnum++;
+				m->poll_task = vpt;
+				memset(&m->poll_waddr, 0,
+							sizeof(m->poll_waddr));
+				init_poll_funcptr(&m->pt, p9_pollwait);
+				break;
+			}
+		}
+	}
+
+	if (i >= ARRAY_SIZE(p9_mux_poll_tasks)) {
+		if (vptlast == NULL) {
+			mutex_unlock(&p9_mux_task_lock);
+			return -ENOMEM;
+		}
+
+		P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
+		list_add(&m->mux_list, &vptlast->mux_list);
+		vptlast->muxnum++;
+		m->poll_task = vptlast;
+		memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
+		init_poll_funcptr(&m->pt, p9_pollwait);
+	}
+
+	p9_mux_num++;
+	mutex_unlock(&p9_mux_task_lock);
+
+	return 0;
+}
+
+static void p9_mux_poll_stop(struct p9_conn *m)
+{
+	int i;
+	struct p9_mux_poll_task *vpt;
+
+	mutex_lock(&p9_mux_task_lock);
+	vpt = m->poll_task;
+	list_del(&m->mux_list);
+	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
+		if (m->poll_waddr[i] != NULL) {
+			remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]);
+			m->poll_waddr[i] = NULL;
+		}
+	}
+	vpt->muxnum--;
+	if (!vpt->muxnum) {
+		P9_DPRINTK(P9_DEBUG_MUX, "destroy proc %p\n", vpt);
+		kthread_stop(vpt->task);
+		vpt->task = NULL;
+		p9_mux_poll_task_num--;
+	}
+	p9_mux_num--;
+	mutex_unlock(&p9_mux_task_lock);
+}
+
+/**
+ * p9_conn_create - allocate and initialize the per-session mux data
+ * Creates the polling task if this is the first session.
+ *
+ * @trans - transport structure
+ * @msize - maximum message size
+ * @extended - extended flag
+ */
+static struct p9_conn *p9_conn_create(struct p9_trans *trans)
+{
+	int i, n;
+	struct p9_conn *m, *mtmp;
+
+	P9_DPRINTK(P9_DEBUG_MUX, "transport %p msize %d\n", trans,
+								trans->msize);
+	m = kmalloc(sizeof(struct p9_conn), GFP_KERNEL);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&m->lock);
+	INIT_LIST_HEAD(&m->mux_list);
+	m->msize = trans->msize;
+	m->extended = trans->extended;
+	m->trans = trans;
+	m->tagpool = p9_idpool_create();
+	if (IS_ERR(m->tagpool)) {
+		mtmp = ERR_PTR(-ENOMEM);
+		kfree(m);
+		return mtmp;
+	}
+
+	m->err = 0;
+	init_waitqueue_head(&m->equeue);
+	INIT_LIST_HEAD(&m->req_list);
+	INIT_LIST_HEAD(&m->unsent_req_list);
+	m->rcall = NULL;
+	m->rpos = 0;
+	m->rbuf = NULL;
+	m->wpos = m->wsize = 0;
+	m->wbuf = NULL;
+	INIT_WORK(&m->rq, p9_read_work);
+	INIT_WORK(&m->wq, p9_write_work);
+	m->wsched = 0;
+	memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
+	m->poll_task = NULL;
+	n = p9_mux_poll_start(m);
+	if (n) {
+		kfree(m);
+		return ERR_PTR(n);
+	}
+
+	n = p9_fd_poll(trans, &m->pt);
+	if (n & POLLIN) {
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
+		set_bit(Rpending, &m->wsched);
+	}
+
+	if (n & POLLOUT) {
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
+		set_bit(Wpending, &m->wsched);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
+		if (IS_ERR(m->poll_waddr[i])) {
+			p9_mux_poll_stop(m);
+			mtmp = (void *)m->poll_waddr;	/* the error code */
+			kfree(m);
+			m = mtmp;
+			break;
+		}
+	}
+
+	return m;
+}
+
+/**
+ * p9_mux_destroy - cancels all pending requests and frees mux resources
+ */
+static void p9_conn_destroy(struct p9_conn *m)
+{
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p prev %p next %p\n", m,
+		m->mux_list.prev, m->mux_list.next);
+	p9_conn_cancel(m, -ECONNRESET);
+
+	if (!list_empty(&m->req_list)) {
+		/* wait until all processes waiting on this session exit */
+		P9_DPRINTK(P9_DEBUG_MUX,
+			"mux %p waiting for empty request queue\n", m);
+		wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000);
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p request queue empty: %d\n", m,
+			list_empty(&m->req_list));
+	}
+
+	p9_mux_poll_stop(m);
+	m->trans = NULL;
+	p9_idpool_destroy(m->tagpool);
+	kfree(m);
+}
+
+/**
+ * p9_pollwait - called by files poll operation to add v9fs-poll task
+ * 	to files wait queue
+ */
+static void
+p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+	int i;
+	struct p9_conn *m;
+
+	m = container_of(p, struct p9_conn, pt);
+	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++)
+		if (m->poll_waddr[i] == NULL)
+			break;
+
+	if (i >= ARRAY_SIZE(m->poll_waddr)) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "not enough wait_address slots\n");
+		return;
+	}
+
+	m->poll_waddr[i] = wait_address;
+
+	if (!wait_address) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "no wait_address\n");
+		m->poll_waddr[i] = ERR_PTR(-EIO);
+		return;
+	}
+
+	init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task);
+	add_wait_queue(wait_address, &m->poll_wait[i]);
+}
+
+/**
+ * p9_poll_mux - polls a mux and schedules read or write works if necessary
+ */
+static void p9_poll_mux(struct p9_conn *m)
+{
+	int n;
+
+	if (m->err < 0)
+		return;
+
+	n = p9_fd_poll(m->trans, NULL);
+	if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
+		P9_DPRINTK(P9_DEBUG_MUX, "error mux %p err %d\n", m, n);
+		if (n >= 0)
+			n = -ECONNRESET;
+		p9_conn_cancel(m, n);
+	}
+
+	if (n & POLLIN) {
+		set_bit(Rpending, &m->wsched);
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
+		if (!test_and_set_bit(Rworksched, &m->wsched)) {
+			P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
+			queue_work(p9_mux_wq, &m->rq);
+		}
+	}
+
+	if (n & POLLOUT) {
+		set_bit(Wpending, &m->wsched);
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
+		if ((m->wsize || !list_empty(&m->unsent_req_list))
+		    && !test_and_set_bit(Wworksched, &m->wsched)) {
+			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
+			queue_work(p9_mux_wq, &m->wq);
+		}
+	}
+}
+
+/**
+ * p9_poll_proc - polls all v9fs transports for new events and queues
+ * 	the appropriate work to the work queue
+ */
+static int p9_poll_proc(void *a)
+{
+	struct p9_conn *m, *mtmp;
+	struct p9_mux_poll_task *vpt;
+
+	vpt = a;
+	P9_DPRINTK(P9_DEBUG_MUX, "start %p %p\n", current, vpt);
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) {
+			p9_poll_mux(m);
+		}
+
+		P9_DPRINTK(P9_DEBUG_MUX, "sleeping...\n");
+		schedule_timeout(SCHED_TIMEOUT * HZ);
+	}
+
+	__set_current_state(TASK_RUNNING);
+	P9_DPRINTK(P9_DEBUG_MUX, "finish\n");
+	return 0;
+}
+
+/**
+ * p9_write_work - called when a transport can send some data
+ */
+static void p9_write_work(struct work_struct *work)
+{
+	int n, err;
+	struct p9_conn *m;
+	struct p9_req *req;
+
+	m = container_of(work, struct p9_conn, wq);
+
+	if (m->err < 0) {
+		clear_bit(Wworksched, &m->wsched);
+		return;
+	}
+
+	if (!m->wsize) {
+		if (list_empty(&m->unsent_req_list)) {
+			clear_bit(Wworksched, &m->wsched);
+			return;
+		}
+
+		spin_lock(&m->lock);
+again:
+		req = list_entry(m->unsent_req_list.next, struct p9_req,
+			       req_list);
+		list_move_tail(&req->req_list, &m->req_list);
+		if (req->err == ERREQFLUSH)
+			goto again;
+
+		m->wbuf = req->tcall->sdata;
+		m->wsize = req->tcall->size;
+		m->wpos = 0;
+		spin_unlock(&m->lock);
+	}
+
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos,
+								m->wsize);
+	clear_bit(Wpending, &m->wsched);
+	err = p9_fd_write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos);
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p sent %d bytes\n", m, err);
+	if (err == -EAGAIN) {
+		clear_bit(Wworksched, &m->wsched);
+		return;
+	}
+
+	if (err < 0)
+		goto error;
+	else if (err == 0) {
+		err = -EREMOTEIO;
+		goto error;
+	}
+
+	m->wpos += err;
+	if (m->wpos == m->wsize)
+		m->wpos = m->wsize = 0;
+
+	if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
+		if (test_and_clear_bit(Wpending, &m->wsched))
+			n = POLLOUT;
+		else
+			n = p9_fd_poll(m->trans, NULL);
+
+		if (n & POLLOUT) {
+			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
+			queue_work(p9_mux_wq, &m->wq);
+		} else
+			clear_bit(Wworksched, &m->wsched);
+	} else
+		clear_bit(Wworksched, &m->wsched);
+
+	return;
+
+error:
+	p9_conn_cancel(m, err);
+	clear_bit(Wworksched, &m->wsched);
+}
+
+static void process_request(struct p9_conn *m, struct p9_req *req)
+{
+	int ecode;
+	struct p9_str *ename;
+
+	if (!req->err && req->rcall->id == P9_RERROR) {
+		ecode = req->rcall->params.rerror.errno;
+		ename = &req->rcall->params.rerror.error;
+
+		P9_DPRINTK(P9_DEBUG_MUX, "Rerror %.*s\n", ename->len,
+								ename->str);
+
+		if (m->extended)
+			req->err = -ecode;
+
+		if (!req->err) {
+			req->err = p9_errstr2errno(ename->str, ename->len);
+
+			/* string match failed */
+			if (!req->err) {
+				PRINT_FCALL_ERROR("unknown error", req->rcall);
+				req->err = -ESERVERFAULT;
+			}
+		}
+	} else if (req->tcall && req->rcall->id != req->tcall->id + 1) {
+		P9_DPRINTK(P9_DEBUG_ERROR,
+				"fcall mismatch: expected %d, got %d\n",
+				req->tcall->id + 1, req->rcall->id);
+		if (!req->err)
+			req->err = -EIO;
+	}
+}
+
+/**
+ * p9_read_work - called when there is some data to be read from a transport
+ */
+static void p9_read_work(struct work_struct *work)
+{
+	int n, err;
+	struct p9_conn *m;
+	struct p9_req *req, *rptr, *rreq;
+	struct p9_fcall *rcall;
+	char *rbuf;
+
+	m = container_of(work, struct p9_conn, rq);
+
+	if (m->err < 0)
+		return;
+
+	rcall = NULL;
+	P9_DPRINTK(P9_DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos);
+
+	if (!m->rcall) {
+		m->rcall =
+		    kmalloc(sizeof(struct p9_fcall) + m->msize, GFP_KERNEL);
+		if (!m->rcall) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
+		m->rpos = 0;
+	}
+
+	clear_bit(Rpending, &m->wsched);
+	err = p9_fd_read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos);
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p got %d bytes\n", m, err);
+	if (err == -EAGAIN) {
+		clear_bit(Rworksched, &m->wsched);
+		return;
+	}
+
+	if (err <= 0)
+		goto error;
+
+	m->rpos += err;
+	while (m->rpos > 4) {
+		n = le32_to_cpu(*(__le32 *) m->rbuf);
+		if (n >= m->msize) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				"requested packet size too big: %d\n", n);
+			err = -EIO;
+			goto error;
+		}
+
+		if (m->rpos < n)
+			break;
+
+		err =
+		    p9_deserialize_fcall(m->rbuf, n, m->rcall, m->extended);
+		if (err < 0)
+			goto error;
+
+#ifdef CONFIG_NET_9P_DEBUG
+		if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
+			char buf[150];
+
+			p9_printfcall(buf, sizeof(buf), m->rcall,
+				m->extended);
+			printk(KERN_NOTICE ">>> %p %s\n", m, buf);
+		}
+#endif
+
+		rcall = m->rcall;
+		rbuf = m->rbuf;
711 if (m->rpos > n) {
712 m->rcall = kmalloc(sizeof(struct p9_fcall) + m->msize,
713 GFP_KERNEL);
714 if (!m->rcall) {
715 err = -ENOMEM;
716 goto error;
717 }
718
719 m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
720 memmove(m->rbuf, rbuf + n, m->rpos - n);
721 m->rpos -= n;
722 } else {
723 m->rcall = NULL;
724 m->rbuf = NULL;
725 m->rpos = 0;
726 }
727
728 P9_DPRINTK(P9_DEBUG_MUX, "mux %p fcall id %d tag %d\n", m,
729 rcall->id, rcall->tag);
730
731 req = NULL;
732 spin_lock(&m->lock);
733 list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
734 if (rreq->tag == rcall->tag) {
735 req = rreq;
736 if (req->flush != Flushing)
737 list_del(&req->req_list);
738 break;
739 }
740 }
741 spin_unlock(&m->lock);
742
743 if (req) {
744 req->rcall = rcall;
745 process_request(m, req);
746
747 if (req->flush != Flushing) {
748 if (req->cb)
749 (*req->cb) (req, req->cba);
750 else
751 kfree(req->rcall);
752
753 wake_up(&m->equeue);
754 }
755 } else {
756 if (err >= 0 && rcall->id != P9_RFLUSH)
757 P9_DPRINTK(P9_DEBUG_ERROR,
758 "unexpected response mux %p id %d tag %d\n",
759 m, rcall->id, rcall->tag);
760 kfree(rcall);
761 }
762 }
763
764 if (!list_empty(&m->req_list)) {
765 if (test_and_clear_bit(Rpending, &m->wsched))
766 n = POLLIN;
767 else
768 n = p9_fd_poll(m->trans, NULL);
769
770 if (n & POLLIN) {
771 P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
772 queue_work(p9_mux_wq, &m->rq);
773 } else
774 clear_bit(Rworksched, &m->wsched);
775 } else
776 clear_bit(Rworksched, &m->wsched);
777
778 return;
779
780error:
781 p9_conn_cancel(m, err);
782 clear_bit(Rworksched, &m->wsched);
783}
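
The `while (m->rpos > 4)` loop is reassembling 9P frames: every 9P message begins with size[4], a little-endian count of the whole message including the header, which is what lets a byte stream be cut back into complete fcalls. The framing rule as a sketch:

	/* Sketch: 9P wire framing. With at least 4 bytes buffered, the
	 * leading little-endian u32 gives the total message length; the
	 * message is complete once that many bytes have arrived. */
	u32 size = le32_to_cpu(*(__le32 *)rbuf);	/* whole-message length */
	if (rpos >= size)
		/* rbuf[0..size) is one complete fcall: deserialize it */;
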
784
785/**
786 * p9_send_request - send a 9P request
787 * The function can sleep until the request is scheduled for sending.
788 * It can be interrupted. Returning from the function does not
789 * guarantee that the request was sent successfully; errors are
790 * returned as ERR_PTR values retrievable with the PTR_ERR macro.
791 *
792 * @m: mux data
793 * @tc: request to be sent
794 * @cb: callback function to call when response is received
795 * @cba: parameter to pass to the callback function
796 */
797static struct p9_req *p9_send_request(struct p9_conn *m,
798 struct p9_fcall *tc,
799 p9_conn_req_callback cb, void *cba)
800{
801 int n;
802 struct p9_req *req;
803
804 P9_DPRINTK(P9_DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current,
805 tc, tc->id);
806 if (m->err < 0)
807 return ERR_PTR(m->err);
808
809 req = kmalloc(sizeof(struct p9_req), GFP_KERNEL);
810 if (!req)
811 return ERR_PTR(-ENOMEM);
812
813 if (tc->id == P9_TVERSION)
814 n = P9_NOTAG;
815 else
816 n = p9_mux_get_tag(m);
817
818 if (n < 0)
819 return ERR_PTR(-ENOMEM);
820
821 p9_set_tag(tc, n);
822
823#ifdef CONFIG_NET_9P_DEBUG
824 if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
825 char buf[150];
826
827 p9_printfcall(buf, sizeof(buf), tc, m->extended);
828 printk(KERN_NOTICE "<<< %p %s\n", m, buf);
829 }
830#endif
831
832 spin_lock_init(&req->lock);
833 req->tag = n;
834 req->tcall = tc;
835 req->rcall = NULL;
836 req->err = 0;
837 req->cb = cb;
838 req->cba = cba;
839 req->flush = None;
840
841 spin_lock(&m->lock);
842 list_add_tail(&req->req_list, &m->unsent_req_list);
843 spin_unlock(&m->lock);
844
845 if (test_and_clear_bit(Wpending, &m->wsched))
846 n = POLLOUT;
847 else
848 n = p9_fd_poll(m->trans, NULL);
849
850 if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
851 queue_work(p9_mux_wq, &m->wq);
852
853 return req;
854}
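
Tag handling is the subtle part of the send path: Tversion must carry the reserved P9_NOTAG value because it (re)negotiates the session, while every other request draws a unique tag from the mux's pool so the read side can match responses back. The matching rule, sketched:

	/* Sketch: tags are unique among in-flight requests on a
	 * connection; P9_NOTAG (~0) is reserved for Tversion, which is
	 * never concurrent with other traffic. */
	u16 tag = (tc->id == P9_TVERSION) ? P9_NOTAG : p9_mux_get_tag(m);
	p9_set_tag(tc, tag);
	/* on receive, the request whose req->tag == rcall->tag completes */
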
855
856static void p9_mux_free_request(struct p9_conn *m, struct p9_req *req)
857{
858 p9_mux_put_tag(m, req->tag);
859 kfree(req);
860}
861
862static void p9_mux_flush_cb(struct p9_req *freq, void *a)
863{
864 p9_conn_req_callback cb;
865 int tag;
866 struct p9_conn *m;
867 struct p9_req *req, *rreq, *rptr;
868
869 m = a;
870 P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m,
871 freq->tcall, freq->rcall, freq->err,
872 freq->tcall->params.tflush.oldtag);
873
874 spin_lock(&m->lock);
875 cb = NULL;
876 tag = freq->tcall->params.tflush.oldtag;
877 req = NULL;
878 list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
879 if (rreq->tag == tag) {
880 req = rreq;
881 list_del(&req->req_list);
882 break;
883 }
884 }
885 spin_unlock(&m->lock);
886
887 if (req) {
888 spin_lock(&req->lock);
889 req->flush = Flushed;
890 spin_unlock(&req->lock);
891
892 if (req->cb)
893 (*req->cb) (req, req->cba);
894 else
895 kfree(req->rcall);
896
897 wake_up(&m->equeue);
898 }
899
900 kfree(freq->tcall);
901 kfree(freq->rcall);
902 p9_mux_free_request(m, freq);
903}
904
905static int
906p9_mux_flush_request(struct p9_conn *m, struct p9_req *req)
907{
908 struct p9_fcall *fc;
909 struct p9_req *rreq, *rptr;
910
911 P9_DPRINTK(P9_DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag);
912
913 /* if a response was received for a request, do nothing */
914 spin_lock(&req->lock);
915 if (req->rcall || req->err) {
916 spin_unlock(&req->lock);
917 P9_DPRINTK(P9_DEBUG_MUX,
918 "mux %p req %p response already received\n", m, req);
919 return 0;
920 }
921
922 req->flush = Flushing;
923 spin_unlock(&req->lock);
924
925 spin_lock(&m->lock);
926 /* if the request is not sent yet, just remove it from the list */
927 list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) {
928 if (rreq->tag == req->tag) {
929 P9_DPRINTK(P9_DEBUG_MUX,
930 "mux %p req %p request is not sent yet\n", m, req);
931 list_del(&rreq->req_list);
932 req->flush = Flushed;
933 spin_unlock(&m->lock);
934 if (req->cb)
935 (*req->cb) (req, req->cba);
936 return 0;
937 }
938 }
939 spin_unlock(&m->lock);
940
941 clear_thread_flag(TIF_SIGPENDING);
942 fc = p9_create_tflush(req->tag);
943 p9_send_request(m, fc, p9_mux_flush_cb, m);
944 return 1;
945}
946
947static void
948p9_conn_rpc_cb(struct p9_req *req, void *a)
949{
950 struct p9_mux_rpc *r;
951
952 P9_DPRINTK(P9_DEBUG_MUX, "req %p r %p\n", req, a);
953 r = a;
954 r->rcall = req->rcall;
955 r->err = req->err;
956
957 if (req->flush != None && !req->err)
958 r->err = -ERESTARTSYS;
959
960 wake_up(&r->wqueue);
961}
962
963/**
964 * p9_fd_rpc - sends a 9P request and waits until a response is
965 * available. The function can be interrupted.
966 * @m: mux data
967 * @tc: request to be sent
968 * @rc: pointer where a pointer to the response is stored
969 */
970int
971p9_fd_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc)
972{
973 struct p9_trans_fd *p = t->priv;
974 struct p9_conn *m = p->conn;
975 int err, sigpending;
976 unsigned long flags;
977 struct p9_req *req;
978 struct p9_mux_rpc r;
979
980 r.err = 0;
981 r.tcall = tc;
982 r.rcall = NULL;
983 r.m = m;
984 init_waitqueue_head(&r.wqueue);
985
986 if (rc)
987 *rc = NULL;
988
989 sigpending = 0;
990 if (signal_pending(current)) {
991 sigpending = 1;
992 clear_thread_flag(TIF_SIGPENDING);
993 }
994
995 req = p9_send_request(m, tc, p9_conn_rpc_cb, &r);
996 if (IS_ERR(req)) {
997 err = PTR_ERR(req);
998 P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
999 return err;
1000 }
1001
1002 err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0);
1003 if (r.err < 0)
1004 err = r.err;
1005
1006 if (err == -ERESTARTSYS && m->trans->status == Connected
1007 && m->err == 0) {
1008 if (p9_mux_flush_request(m, req)) {
1009 /* wait until we get response of the flush message */
1010 do {
1011 clear_thread_flag(TIF_SIGPENDING);
1012 err = wait_event_interruptible(r.wqueue,
1013 r.rcall || r.err);
1014 } while (!r.rcall && !r.err && err == -ERESTARTSYS &&
1015 m->trans->status == Connected && !m->err);
1016
1017 err = -ERESTARTSYS;
1018 }
1019 sigpending = 1;
1020 }
1021
1022 if (sigpending) {
1023 spin_lock_irqsave(&current->sighand->siglock, flags);
1024 recalc_sigpending();
1025 spin_unlock_irqrestore(&current->sighand->siglock, flags);
1026 }
1027
1028 if (rc)
1029 *rc = r.rcall;
1030 else
1031 kfree(r.rcall);
1032
1033 p9_mux_free_request(m, req);
1034 if (err > 0)
1035 err = -EIO;
1036
1037 return err;
1038}
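
The sigpending bookkeeping deserves a note: pending signals are parked so the Tflush wait cannot be re-interrupted, then recomputed under the siglock afterwards so nothing is lost. The idiom in isolation (a restatement of the flow above, not new behaviour):

	/* Sketch: park pending signals around a wait that must not be
	 * re-interrupted, then let the kernel recompute TIF_SIGPENDING. */
	if (signal_pending(current)) {
		sigpending = 1;
		clear_thread_flag(TIF_SIGPENDING);
	}
	/* ... send Tflush and wait for its reply ... */
	if (sigpending) {
		spin_lock_irqsave(&current->sighand->siglock, flags);
		recalc_sigpending();
		spin_unlock_irqrestore(&current->sighand->siglock, flags);
	}
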
1039
1040#ifdef P9_NONBLOCK
1041/**
1042 * p9_conn_rpcnb - sends a 9P request without waiting for a response.
1043 * @m: mux data
1044 * @tc: request to be sent
1045 * @cb: callback function to be called when response arrives
1046 * @a: value to pass to the callback function
1047 */
1048int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc,
1049 p9_conn_req_callback cb, void *a)
1050{
1051 int err;
1052 struct p9_req *req;
1053
1054 req = p9_send_request(m, tc, cb, a);
1055 if (IS_ERR(req)) {
1056 err = PTR_ERR(req);
1057 P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
1058 return PTR_ERR(req);
1059 }
1060
1061 P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag);
1062 return 0;
1063}
1064#endif /* P9_NONBLOCK */
1065
1066/**
1067 * p9_conn_cancel - cancel all pending requests with error
1068 * @m: mux data
1069 * @err: error code
1070 */
1071void p9_conn_cancel(struct p9_conn *m, int err)
1072{
1073 struct p9_req *req, *rtmp;
1074 LIST_HEAD(cancel_list);
1075
1076 P9_DPRINTK(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
1077 m->err = err;
1078 spin_lock(&m->lock);
1079 list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
1080 list_move(&req->req_list, &cancel_list);
1081 }
1082 list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
1083 list_move(&req->req_list, &cancel_list);
1084 }
1085 spin_unlock(&m->lock);
1086
1087 list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
1088 list_del(&req->req_list);
1089 if (!req->err)
1090 req->err = err;
1091
1092 if (req->cb)
1093 (*req->cb) (req, req->cba);
1094 else
1095 kfree(req->rcall);
1096 }
1097
1098 wake_up(&m->equeue);
1099}
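
Note the locking discipline in the cancel path: requests are first moved to a private list under m->lock and only completed after the lock is dropped, so a callback that re-enters the mux cannot deadlock. The same bulk move could be written with list_splice_init; a minimal sketch:

	/* Sketch: equivalent bulk move — collect everything under the
	 * lock, run the callbacks lock-free afterwards. */
	LIST_HEAD(cancel_list);
	spin_lock(&m->lock);
	list_splice_init(&m->req_list, &cancel_list);
	list_splice_init(&m->unsent_req_list, &cancel_list);
	spin_unlock(&m->lock);
	/* walk cancel_list, set req->err, invoke req->cb outside the lock */
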
1100
75/** 1101/**
76 * v9fs_parse_options - parse mount options into session structure 1102 * v9fs_parse_options - parse mount options into session structure
77 * @options: options string passed from mount 1103 * @options: options string passed from mount
@@ -268,7 +1294,7 @@ end:
268} 1294}
269 1295
270/** 1296/**
271 * p9_sock_close - shutdown socket 1297 * p9_fd_close - shutdown socket
272 * @trans: private socket structure 1298 * @trans: private socket structure
273 * 1299 *
274 */ 1300 */
@@ -284,6 +1310,8 @@ static void p9_fd_close(struct p9_trans *trans)
284 if (!ts) 1310 if (!ts)
285 return; 1311 return;
286 1312
1313 p9_conn_destroy(ts->conn);
1314
287 trans->status = Disconnected; 1315 trans->status = Disconnected;
288 if (ts->rd) 1316 if (ts->rd)
289 fput(ts->rd); 1317 fput(ts->rd);
@@ -292,13 +1320,15 @@ static void p9_fd_close(struct p9_trans *trans)
292 kfree(ts); 1320 kfree(ts);
293} 1321}
294 1322
295static struct p9_trans *p9_trans_create_tcp(const char *addr, char *args) 1323static struct p9_trans *
1324p9_trans_create_tcp(const char *addr, char *args, int msize, unsigned char dotu)
296{ 1325{
297 int err; 1326 int err;
298 struct p9_trans *trans; 1327 struct p9_trans *trans;
299 struct socket *csocket; 1328 struct socket *csocket;
300 struct sockaddr_in sin_server; 1329 struct sockaddr_in sin_server;
301 struct p9_fd_opts opts; 1330 struct p9_fd_opts opts;
1331 struct p9_trans_fd *p;
302 1332
303 parse_opts(args, &opts); 1333 parse_opts(args, &opts);
304 1334
@@ -306,11 +1336,10 @@ static struct p9_trans *p9_trans_create_tcp(const char *addr, char *args)
306 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); 1336 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL);
307 if (!trans) 1337 if (!trans)
308 return ERR_PTR(-ENOMEM); 1338 return ERR_PTR(-ENOMEM);
309 1339 trans->msize = msize;
310 trans->write = p9_fd_write; 1340 trans->extended = dotu;
311 trans->read = p9_fd_read; 1341 trans->rpc = p9_fd_rpc;
312 trans->close = p9_fd_close; 1342 trans->close = p9_fd_close;
313 trans->poll = p9_fd_poll;
314 1343
315 sin_server.sin_family = AF_INET; 1344 sin_server.sin_family = AF_INET;
316 sin_server.sin_addr.s_addr = in_aton(addr); 1345 sin_server.sin_addr.s_addr = in_aton(addr);
@@ -337,6 +1366,14 @@ static struct p9_trans *p9_trans_create_tcp(const char *addr, char *args)
337 if (err < 0) 1366 if (err < 0)
338 goto error; 1367 goto error;
339 1368
1369 p = (struct p9_trans_fd *) trans->priv;
1370 p->conn = p9_conn_create(trans);
1371 if (IS_ERR(p->conn)) {
1372 err = PTR_ERR(p->conn);
1373 p->conn = NULL;
1374 goto error;
1375 }
1376
340 return trans; 1377 return trans;
341 1378
342error: 1379error:
@@ -347,22 +1384,23 @@ error:
347 return ERR_PTR(err); 1384 return ERR_PTR(err);
348} 1385}
349 1386
350static struct p9_trans *p9_trans_create_unix(const char *addr, char *args) 1387static struct p9_trans *
1388p9_trans_create_unix(const char *addr, char *args, int msize,
1389 unsigned char dotu)
351{ 1390{
352 int err; 1391 int err;
353 struct socket *csocket; 1392 struct socket *csocket;
354 struct sockaddr_un sun_server; 1393 struct sockaddr_un sun_server;
355 struct p9_trans *trans; 1394 struct p9_trans *trans;
1395 struct p9_trans_fd *p;
356 1396
357 csocket = NULL; 1397 csocket = NULL;
358 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); 1398 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL);
359 if (!trans) 1399 if (!trans)
360 return ERR_PTR(-ENOMEM); 1400 return ERR_PTR(-ENOMEM);
361 1401
362 trans->write = p9_fd_write; 1402 trans->rpc = p9_fd_rpc;
363 trans->read = p9_fd_read;
364 trans->close = p9_fd_close; 1403 trans->close = p9_fd_close;
365 trans->poll = p9_fd_poll;
366 1404
367 if (strlen(addr) > UNIX_PATH_MAX) { 1405 if (strlen(addr) > UNIX_PATH_MAX) {
368 P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n", 1406 P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n",
@@ -387,6 +1425,16 @@ static struct p9_trans *p9_trans_create_unix(const char *addr, char *args)
387 if (err < 0) 1425 if (err < 0)
388 goto error; 1426 goto error;
389 1427
1428 trans->msize = msize;
1429 trans->extended = dotu;
1430 p = (struct p9_trans_fd *) trans->priv;
1431 p->conn = p9_conn_create(trans);
1432 if (IS_ERR(p->conn)) {
1433 err = PTR_ERR(p->conn);
1434 p->conn = NULL;
1435 goto error;
1436 }
1437
390 return trans; 1438 return trans;
391 1439
392error: 1440error:
@@ -397,11 +1445,14 @@ error:
397 return ERR_PTR(err); 1445 return ERR_PTR(err);
398} 1446}
399 1447
400static struct p9_trans *p9_trans_create_fd(const char *name, char *args) 1448static struct p9_trans *
1449p9_trans_create_fd(const char *name, char *args, int msize,
1450 unsigned char extended)
401{ 1451{
402 int err; 1452 int err;
403 struct p9_trans *trans; 1453 struct p9_trans *trans;
404 struct p9_fd_opts opts; 1454 struct p9_fd_opts opts;
1455 struct p9_trans_fd *p;
405 1456
406 parse_opts(args, &opts); 1457 parse_opts(args, &opts);
407 1458
@@ -414,15 +1465,23 @@ static struct p9_trans *p9_trans_create_fd(const char *name, char *args)
414 if (!trans) 1465 if (!trans)
415 return ERR_PTR(-ENOMEM); 1466 return ERR_PTR(-ENOMEM);
416 1467
417 trans->write = p9_fd_write; 1468 trans->rpc = p9_fd_rpc;
418 trans->read = p9_fd_read;
419 trans->close = p9_fd_close; 1469 trans->close = p9_fd_close;
420 trans->poll = p9_fd_poll;
421 1470
422 err = p9_fd_open(trans, opts.rfd, opts.wfd); 1471 err = p9_fd_open(trans, opts.rfd, opts.wfd);
423 if (err < 0) 1472 if (err < 0)
424 goto error; 1473 goto error;
425 1474
1475 trans->msize = msize;
1476 trans->extended = extended;
1477 p = (struct p9_trans_fd *) trans->priv;
1478 p->conn = p9_conn_create(trans);
1479 if (IS_ERR(p->conn)) {
1480 err = PTR_ERR(p->conn);
1481 p->conn = NULL;
1482 goto error;
1483 }
1484
426 return trans; 1485 return trans;
427 1486
428error: 1487error:
@@ -453,6 +1512,12 @@ static struct p9_trans_module p9_fd_trans = {
453 1512
454static int __init p9_trans_fd_init(void) 1513static int __init p9_trans_fd_init(void)
455{ 1514{
1515 int ret = p9_mux_global_init();
1516 if (ret) {
1517 printk(KERN_WARNING "9p: starting mux failed\n");
1518 return ret;
1519 }
1520
456 v9fs_register_trans(&p9_tcp_trans); 1521 v9fs_register_trans(&p9_tcp_trans);
457 v9fs_register_trans(&p9_unix_trans); 1522 v9fs_register_trans(&p9_unix_trans);
458 v9fs_register_trans(&p9_fd_trans); 1523 v9fs_register_trans(&p9_fd_trans);
@@ -460,13 +1525,7 @@ static int __init p9_trans_fd_init(void)
460 return 1; 1525 return 1;
461} 1526}
462 1527
463static void __exit p9_trans_fd_exit(void) {
464 printk(KERN_ERR "Removal of 9p transports not implemented\n");
465 BUG();
466}
467
468module_init(p9_trans_fd_init); 1528module_init(p9_trans_fd_init);
469module_exit(p9_trans_fd_exit);
470 1529
471MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); 1530MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
472MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); 1531MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 40b71a29fc3f..0117b9fb8480 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -1,17 +1,8 @@
1/* 1/*
2 * The Guest 9p transport driver 2 * The Guest 9p transport driver
3 * 3 *
4 * This is a trivial pipe-based transport driver based on the lguest console 4 * This is a block-based transport driver based on the lguest block driver
5 * code: we use lguest's DMA mechanism to send bytes out, and register a 5 * code.
6 * DMA buffer to receive bytes in. It is assumed to be present and available
7 * from the very beginning of boot.
8 *
9 * This may be have been done by just instaniating another HVC console,
10 * but HVC's blocksize of 16 bytes is annoying and painful to performance.
11 *
12 * A more efficient transport could be built based on the virtio block driver
13 * but it requires some changes in the 9p transport model (which are in
14 * progress)
15 * 6 *
16 */ 7 */
17/* 8/*
@@ -55,11 +46,25 @@
55#include <linux/virtio.h> 46#include <linux/virtio.h>
56#include <linux/virtio_9p.h> 47#include <linux/virtio_9p.h>
57 48
49#define VIRTQUEUE_NUM 128
50
58/* a single mutex to manage channel initialization and attachment */ 51/* a single mutex to manage channel initialization and attachment */
59static DECLARE_MUTEX(virtio_9p_lock); 52static DECLARE_MUTEX(virtio_9p_lock);
60/* global which tracks highest initialized channel */ 53/* global which tracks highest initialized channel */
61static int chan_index; 54static int chan_index;
62 55
56#define P9_INIT_MAXTAG 16
57
58#define REQ_STATUS_IDLE 0
59#define REQ_STATUS_SENT 1
60#define REQ_STATUS_RCVD 2
61#define REQ_STATUS_FLSH 3
62
63struct p9_req_t {
64 int status;
65 wait_queue_head_t *wq;
66};
67
63/* We keep all per-channel information in a structure. 68/* We keep all per-channel information in a structure.
64 * This structure is allocated within the devices dev->mem space. 69 * This structure is allocated within the devices dev->mem space.
65 * A pointer to the structure will get put in the transport private. 70 * A pointer to the structure will get put in the transport private.
@@ -68,148 +73,198 @@ static struct virtio_chan {
68 bool initialized; /* channel is initialized */ 73 bool initialized; /* channel is initialized */
69 bool inuse; /* channel is in use */ 74 bool inuse; /* channel is in use */
70 75
71 struct virtqueue *in_vq, *out_vq; 76 spinlock_t lock;
77
72 struct virtio_device *vdev; 78 struct virtio_device *vdev;
79 struct virtqueue *vq;
73 80
74 /* This is our input buffer, and how much data is left in it. */ 81 struct p9_idpool *tagpool;
75 unsigned int in_len; 82 struct p9_req_t *reqs;
76 char *in, *inbuf; 83 int max_tag;
77 84
78 wait_queue_head_t wq; /* waitq for buffer */ 85 /* Scatterlist: can be too big for stack. */
86 struct scatterlist sg[VIRTQUEUE_NUM];
79} channels[MAX_9P_CHAN]; 87} channels[MAX_9P_CHAN];
80 88
89/* Lookup requests by tag */
90static struct p9_req_t *p9_lookup_tag(struct virtio_chan *c, u16 tag)
91{
92 /* This looks up the original request by tag so we know which
93 * buffer to read the data into */
94 tag++;
95
96 while (tag >= c->max_tag) {
97 int old_max = c->max_tag;
98 int count;
99
100 if (c->max_tag)
101 c->max_tag *= 2;
102 else
103 c->max_tag = P9_INIT_MAXTAG;
104
105 c->reqs = krealloc(c->reqs, sizeof(struct p9_req_t)*c->max_tag,
106 GFP_ATOMIC);
107 if (!c->reqs) {
108 printk(KERN_ERR "Couldn't grow tag array\n");
109 BUG();
110 }
111 for (count = old_max; count < c->max_tag; count++) {
112 c->reqs[count].status = REQ_STATUS_IDLE;
113 c->reqs[count].wq = kmalloc(sizeof(wait_queue_t),
114 GFP_ATOMIC);
115 if (!c->reqs[count].wq) {
116 printk(KERN_ERR "Couldn't grow tag array\n");
117 BUG();
118 }
119 init_waitqueue_head(c->reqs[count].wq);
120 }
121 }
122
123 return &c->reqs[tag];
124}
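
p9_lookup_tag grows the request array by doubling, so repeated allocation is amortized O(1); krealloc preserves the live entries and only the new tail needs initializing. Stripped to its core (hypothetical names, error handling elided):

	/* Sketch of the doubling growth used above. */
	if (tag >= max) {
		int old = max;
		max = max ? max * 2 : 16;	/* 16 = initial capacity */
		reqs = krealloc(reqs, max * sizeof(*reqs), GFP_ATOMIC);
		for (; old < max; old++)
			reqs[old].status = REQ_STATUS_IDLE;
	}
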
125
126
81/* How many bytes left in this page. */ 127/* How many bytes left in this page. */
82static unsigned int rest_of_page(void *data) 128static unsigned int rest_of_page(void *data)
83{ 129{
84 return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); 130 return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
85} 131}
86 132
87static int p9_virtio_write(struct p9_trans *trans, void *buf, int count) 133static void p9_virtio_close(struct p9_trans *trans)
88{ 134{
89 struct virtio_chan *chan = (struct virtio_chan *) trans->priv; 135 struct virtio_chan *chan = trans->priv;
90 struct virtqueue *out_vq = chan->out_vq; 136 int count;
91 struct scatterlist sg[1]; 137 unsigned int flags;
92 unsigned int len;
93 138
94 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio write (%d)\n", count); 139 spin_lock_irqsave(&chan->lock, flags);
140 p9_idpool_destroy(chan->tagpool);
141 for (count = 0; count < chan->max_tag; count++)
142 kfree(chan->reqs[count].wq);
143 kfree(chan->reqs);
144 chan->max_tag = 0;
145 spin_unlock_irqrestore(&chan->lock, flags);
95 146
96 /* keep it simple - make sure we don't overflow a page */ 147 down(&virtio_9p_lock);
97 if (rest_of_page(buf) < count) 148 chan->inuse = false;
98 count = rest_of_page(buf); 149 up(&virtio_9p_lock);
99 150
100 sg_init_one(sg, buf, count); 151 kfree(trans);
152}
101 153
102 /* add_buf wants a token to identify this buffer: we hand it any 154static void req_done(struct virtqueue *vq)
103 * non-NULL pointer, since there's only ever one buffer. */ 155{
104 if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) { 156 struct virtio_chan *chan = vq->vdev->priv;
105 /* Tell Host to go! */ 157 struct p9_fcall *rc;
106 out_vq->vq_ops->kick(out_vq); 158 unsigned int len;
107 /* Chill out until it's done with the buffer. */ 159 unsigned long flags;
108 while (!out_vq->vq_ops->get_buf(out_vq, &len)) 160 struct p9_req_t *req;
109 cpu_relax(); 161
162 spin_lock_irqsave(&chan->lock, flags);
163 while ((rc = chan->vq->vq_ops->get_buf(chan->vq, &len)) != NULL) {
164 req = p9_lookup_tag(chan, rc->tag);
165 req->status = REQ_STATUS_RCVD;
166 wake_up(req->wq);
110 } 167 }
111 168 /* In case queue is stopped waiting for more buffers. */
112 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio wrote (%d)\n", count); 169 spin_unlock_irqrestore(&chan->lock, flags);
113
114 /* We're expected to return the amount of data we wrote: all of it. */
115 return count;
116} 170}
117 171
118/* Create a scatter-gather list representing our input buffer and put it in the 172static int
119 * queue. */ 173pack_sg_list(struct scatterlist *sg, int start, int limit, char *data,
120static void add_inbuf(struct virtio_chan *chan) 174 int count)
121{ 175{
122 struct scatterlist sg[1]; 176 int s;
123 177 int index = start;
124 sg_init_one(sg, chan->inbuf, PAGE_SIZE); 178
179 while (count) {
180 s = rest_of_page(data);
181 if (s > count)
182 s = count;
183 sg_set_buf(&sg[index++], data, s);
184 count -= s;
185 data += s;
186 if (index > limit)
187 BUG();
188 }
125 189
126 /* We should always be able to add one buffer to an empty queue. */ 190 return index-start;
127 if (chan->in_vq->vq_ops->add_buf(chan->in_vq, sg, 0, 1, chan->inbuf))
128 BUG();
129 chan->in_vq->vq_ops->kick(chan->in_vq);
130} 191}
131 192
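
pack_sg_list() exists because a scatterlist entry must not span a page boundary, so a linear buffer is chopped into rest_of_page()-sized pieces, one sg entry each. The core loop as a sketch:

	/* Sketch: page-bounded segmentation of a linear buffer. */
	while (count) {
		int s = min_t(int, rest_of_page(data), count);
		sg_set_buf(&sg[index++], data, s);	/* never crosses a page */
		data += s;
		count -= s;
	}
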
132static int p9_virtio_read(struct p9_trans *trans, void *buf, int count) 193static int
194p9_virtio_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc)
133{ 195{
134 struct virtio_chan *chan = (struct virtio_chan *) trans->priv; 196 int in, out;
135 struct virtqueue *in_vq = chan->in_vq; 197 int n, err, size;
136 198 struct virtio_chan *chan = t->priv;
137 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio read (%d)\n", count); 199 char *rdata;
200 struct p9_req_t *req;
201 unsigned long flags;
202
203 if (*rc == NULL) {
204 *rc = kmalloc(sizeof(struct p9_fcall) + t->msize, GFP_KERNEL);
205 if (!*rc)
206 return -ENOMEM;
207 }
138 208
139 /* If we don't have an input queue yet, we can't get input. */ 209 rdata = (char *)*rc+sizeof(struct p9_fcall);
140 BUG_ON(!in_vq);
141 210
142 /* No buffer? Try to get one. */ 211 n = P9_NOTAG;
143 if (!chan->in_len) { 212 if (tc->id != P9_TVERSION) {
144 chan->in = in_vq->vq_ops->get_buf(in_vq, &chan->in_len); 213 n = p9_idpool_get(chan->tagpool);
145 if (!chan->in) 214 if (n < 0)
146 return 0; 215 return -ENOMEM;
147 } 216 }
148 217
149 /* You want more than we have to give? Well, try wanting less! */ 218 spin_lock_irqsave(&chan->lock, flags);
150 if (chan->in_len < count) 219 req = p9_lookup_tag(chan, n);
151 count = chan->in_len; 220 spin_unlock_irqrestore(&chan->lock, flags);
152 221
153 /* Copy across to their buffer and increment offset. */ 222 p9_set_tag(tc, n);
154 memcpy(buf, chan->in, count);
155 chan->in += count;
156 chan->in_len -= count;
157 223
158 /* Finished? Re-register buffer so Host will use it again. */ 224 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio rpc tag %d\n", n);
159 if (chan->in_len == 0)
160 add_inbuf(chan);
161 225
162 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio finished read (%d)\n", 226 out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, tc->sdata, tc->size);
163 count); 227 in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM-out, rdata, t->msize);
164
165 return count;
166}
167 228
168/* The poll function is used by 9p transports to determine if there 229 req->status = REQ_STATUS_SENT;
169 * is there is activity available on a particular channel. In our case
170 * we use it to wait for a callback from the input routines.
171 */
172static unsigned int
173p9_virtio_poll(struct p9_trans *trans, struct poll_table_struct *pt)
174{
175 struct virtio_chan *chan = (struct virtio_chan *)trans->priv;
176 struct virtqueue *in_vq = chan->in_vq;
177 int ret = POLLOUT; /* we can always handle more output */
178 230
179 poll_wait(NULL, &chan->wq, pt); 231 if (chan->vq->vq_ops->add_buf(chan->vq, chan->sg, out, in, tc)) {
232 P9_DPRINTK(P9_DEBUG_TRANS,
233 "9p debug: virtio rpc add_buf returned failure");
234 return -EIO;
235 }
180 236
181 /* No buffer? Try to get one. */ 237 chan->vq->vq_ops->kick(chan->vq);
182 if (!chan->in_len)
183 chan->in = in_vq->vq_ops->get_buf(in_vq, &chan->in_len);
184 238
185 if (chan->in_len) 239 wait_event(*req->wq, req->status == REQ_STATUS_RCVD);
186 ret |= POLLIN;
187 240
188 return ret; 241 size = le32_to_cpu(*(__le32 *) rdata);
189}
190 242
191static void p9_virtio_close(struct p9_trans *trans) 243 err = p9_deserialize_fcall(rdata, size, *rc, t->extended);
192{ 244 if (err < 0) {
193 struct virtio_chan *chan = trans->priv; 245 P9_DPRINTK(P9_DEBUG_TRANS,
246 "9p debug: virtio rpc deserialize returned %d\n", err);
247 return err;
248 }
194 249
195 down(&virtio_9p_lock); 250#ifdef CONFIG_NET_9P_DEBUG
196 chan->inuse = false; 251 if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
197 up(&virtio_9p_lock); 252 char buf[150];
198 253
199 kfree(trans); 254 p9_printfcall(buf, sizeof(buf), *rc, t->extended);
200} 255 printk(KERN_NOTICE ">>> %p %s\n", t, buf);
256 }
257#endif
201 258
202static bool p9_virtio_intr(struct virtqueue *q) 259 if (n != P9_NOTAG && p9_idpool_check(n, chan->tagpool))
203{ 260 p9_idpool_put(n, chan->tagpool);
204 struct virtio_chan *chan = q->vdev->priv;
205 261
206 P9_DPRINTK(P9_DEBUG_TRANS, "9p poll_wakeup: %p\n", &chan->wq); 262 req->status = REQ_STATUS_IDLE;
207 wake_up_interruptible(&chan->wq);
208 263
209 return true; 264 return 0;
210} 265}
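
The rpc path is fully synchronous: the submitter marks the slot sent, kicks the ring, and sleeps until req_done() — the virtqueue callback — flips the status and wakes the queue. The three lines that carry the protocol, restated as a sketch:

	/* Sketch: request lifecycle, submitter side. */
	req->status = REQ_STATUS_SENT;
	chan->vq->vq_ops->kick(chan->vq);		/* notify the host */
	wait_event(*req->wq, req->status == REQ_STATUS_RCVD);
	/* req_done() set RCVD under chan->lock and woke req->wq */
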
211 266
212static int p9_virtio_probe(struct virtio_device *dev) 267static int p9_virtio_probe(struct virtio_device *vdev)
213{ 268{
214 int err; 269 int err;
215 struct virtio_chan *chan; 270 struct virtio_chan *chan;
@@ -223,44 +278,29 @@ static int p9_virtio_probe(struct virtio_device *dev)
223 if (chan_index > MAX_9P_CHAN) { 278 if (chan_index > MAX_9P_CHAN) {
224 printk(KERN_ERR "9p: virtio: Maximum channels exceeded\n"); 279 printk(KERN_ERR "9p: virtio: Maximum channels exceeded\n");
225 BUG(); 280 BUG();
226 }
227
228 chan->vdev = dev;
229
230 /* This is the scratch page we use to receive console input */
231 chan->inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
232 if (!chan->inbuf) {
233 err = -ENOMEM; 281 err = -ENOMEM;
234 goto fail; 282 goto fail;
235 } 283 }
236 284
237 /* Find the input queue. */ 285 chan->vdev = vdev;
238 dev->priv = chan;
239 chan->in_vq = dev->config->find_vq(dev, p9_virtio_intr);
240 if (IS_ERR(chan->in_vq)) {
241 err = PTR_ERR(chan->in_vq);
242 goto free;
243 }
244 286
245 chan->out_vq = dev->config->find_vq(dev, NULL); 287 /* We expect one virtqueue, for requests. */
246 if (IS_ERR(chan->out_vq)) { 288 chan->vq = vdev->config->find_vq(vdev, 0, req_done);
247 err = PTR_ERR(chan->out_vq); 289 if (IS_ERR(chan->vq)) {
248 goto free_in_vq; 290 err = PTR_ERR(chan->vq);
291 goto out_free_vq;
249 } 292 }
293 chan->vq->vdev->priv = chan;
294 spin_lock_init(&chan->lock);
250 295
251 init_waitqueue_head(&chan->wq); 296 sg_init_table(chan->sg, VIRTQUEUE_NUM);
252 297
253 /* Register the input buffer the first time. */
254 add_inbuf(chan);
255 chan->inuse = false; 298 chan->inuse = false;
256 chan->initialized = true; 299 chan->initialized = true;
257
258 return 0; 300 return 0;
259 301
260free_in_vq: 302out_free_vq:
261 dev->config->del_vq(chan->in_vq); 303 vdev->config->del_vq(chan->vq);
262free:
263 kfree(chan->inbuf);
264fail: 304fail:
265 down(&virtio_9p_lock); 305 down(&virtio_9p_lock);
266 chan_index--; 306 chan_index--;
@@ -273,11 +313,13 @@ fail:
273 * alternate channels by matching devname versus a virtio_config entry. 313 * alternate channels by matching devname versus a virtio_config entry.
274 * We use a simple reference count mechanism to ensure that only a single 314 * We use a simple reference count mechanism to ensure that only a single
275 * mount has a channel open at a time. */ 315 * mount has a channel open at a time. */
276static struct p9_trans *p9_virtio_create(const char *devname, char *args) 316static struct p9_trans *
317p9_virtio_create(const char *devname, char *args, int msize,
318 unsigned char extended)
277{ 319{
278 struct p9_trans *trans; 320 struct p9_trans *trans;
279 int index = 0;
280 struct virtio_chan *chan = channels; 321 struct virtio_chan *chan = channels;
322 int index = 0;
281 323
282 down(&virtio_9p_lock); 324 down(&virtio_9p_lock);
283 while (index < MAX_9P_CHAN) { 325 while (index < MAX_9P_CHAN) {
@@ -292,25 +334,45 @@ static struct p9_trans *p9_virtio_create(const char *devname, char *args)
292 up(&virtio_9p_lock); 334 up(&virtio_9p_lock);
293 335
294 if (index >= MAX_9P_CHAN) { 336 if (index >= MAX_9P_CHAN) {
295 printk(KERN_ERR "9p: virtio: couldn't find a free channel\n"); 337 printk(KERN_ERR "9p: no channels available\n");
296 return NULL; 338 return ERR_PTR(-ENODEV);
297 } 339 }
298 340
341 chan->tagpool = p9_idpool_create();
342 if (IS_ERR(chan->tagpool)) {
343 printk(KERN_ERR "9p: couldn't allocate tagpool\n");
344 return ERR_PTR(-ENOMEM);
345 }
346 p9_idpool_get(chan->tagpool); /* reserve tag 0 */
347 chan->max_tag = 0;
348 chan->reqs = NULL;
349
299 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); 350 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL);
300 if (!trans) { 351 if (!trans) {
301 printk(KERN_ERR "9p: couldn't allocate transport\n"); 352 printk(KERN_ERR "9p: couldn't allocate transport\n");
302 return ERR_PTR(-ENOMEM); 353 return ERR_PTR(-ENOMEM);
303 } 354 }
304 355 trans->extended = extended;
305 trans->write = p9_virtio_write; 356 trans->msize = msize;
306 trans->read = p9_virtio_read;
307 trans->close = p9_virtio_close; 357 trans->close = p9_virtio_close;
308 trans->poll = p9_virtio_poll; 358 trans->rpc = p9_virtio_rpc;
309 trans->priv = chan; 359 trans->priv = chan;
310 360
311 return trans; 361 return trans;
312} 362}
313 363
364static void p9_virtio_remove(struct virtio_device *vdev)
365{
366 struct virtio_chan *chan = vdev->priv;
367
368 BUG_ON(chan->inuse);
369
370 if (chan->initialized) {
371 vdev->config->del_vq(chan->vq);
372 chan->initialized = false;
373 }
374}
375
314#define VIRTIO_ID_9P 9 376#define VIRTIO_ID_9P 9
315 377
316static struct virtio_device_id id_table[] = { 378static struct virtio_device_id id_table[] = {
@@ -324,12 +386,13 @@ static struct virtio_driver p9_virtio_drv = {
324 .driver.owner = THIS_MODULE, 386 .driver.owner = THIS_MODULE,
325 .id_table = id_table, 387 .id_table = id_table,
326 .probe = p9_virtio_probe, 388 .probe = p9_virtio_probe,
389 .remove = p9_virtio_remove,
327}; 390};
328 391
329static struct p9_trans_module p9_virtio_trans = { 392static struct p9_trans_module p9_virtio_trans = {
330 .name = "virtio", 393 .name = "virtio",
331 .create = p9_virtio_create, 394 .create = p9_virtio_create,
332 .maxsize = PAGE_SIZE, 395 .maxsize = PAGE_SIZE*16,
333 .def = 0, 396 .def = 0,
334}; 397};
335 398
@@ -345,7 +408,13 @@ static int __init p9_virtio_init(void)
345 return register_virtio_driver(&p9_virtio_drv); 408 return register_virtio_driver(&p9_virtio_drv);
346} 409}
347 410
411static void __exit p9_virtio_cleanup(void)
412{
413 unregister_virtio_driver(&p9_virtio_drv);
414}
415
348module_init(p9_virtio_init); 416module_init(p9_virtio_init);
417module_exit(p9_virtio_cleanup);
349 418
350MODULE_DEVICE_TABLE(virtio, id_table); 419MODULE_DEVICE_TABLE(virtio, id_table);
351MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); 420MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
diff --git a/net/9p/util.c b/net/9p/util.c
index 22077b79395d..ef7215565d88 100644
--- a/net/9p/util.c
+++ b/net/9p/util.c
@@ -33,7 +33,7 @@
33#include <net/9p/9p.h> 33#include <net/9p/9p.h>
34 34
35struct p9_idpool { 35struct p9_idpool {
36 struct semaphore lock; 36 spinlock_t lock;
37 struct idr pool; 37 struct idr pool;
38}; 38};
39 39
@@ -45,7 +45,7 @@ struct p9_idpool *p9_idpool_create(void)
45 if (!p) 45 if (!p)
46 return ERR_PTR(-ENOMEM); 46 return ERR_PTR(-ENOMEM);
47 47
48 init_MUTEX(&p->lock); 48 spin_lock_init(&p->lock);
49 idr_init(&p->pool); 49 idr_init(&p->pool);
50 50
51 return p; 51 return p;
@@ -71,19 +71,17 @@ int p9_idpool_get(struct p9_idpool *p)
71{ 71{
72 int i = 0; 72 int i = 0;
73 int error; 73 int error;
74 unsigned int flags;
74 75
75retry: 76retry:
76 if (idr_pre_get(&p->pool, GFP_KERNEL) == 0) 77 if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
77 return 0; 78 return 0;
78 79
79 if (down_interruptible(&p->lock) == -EINTR) { 80 spin_lock_irqsave(&p->lock, flags);
80 P9_EPRINTK(KERN_WARNING, "Interrupted while locking\n");
81 return -1;
82 }
83 81
84 /* no need to store exactly p, we just need something non-null */ 82 /* no need to store exactly p, we just need something non-null */
85 error = idr_get_new(&p->pool, p, &i); 83 error = idr_get_new(&p->pool, p, &i);
86 up(&p->lock); 84 spin_unlock_irqrestore(&p->lock, flags);
87 85
88 if (error == -EAGAIN) 86 if (error == -EAGAIN)
89 goto retry; 87 goto retry;
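
The switch to a spinlock is safe because idr_pre_get() does its GFP_KERNEL allocation outside the lock and idr_get_new() only consumes preallocated nodes; -EAGAIN means a racing caller stole the preload, so the loop simply retries. The classic pre-ida pattern, as used above:

	/* Sketch: preload outside the lock, allocate under it, retry on
	 * -EAGAIN (keeping this file's existing return-0-on-OOM
	 * convention). */
retry:
	if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
		return 0;			/* out of memory */
	spin_lock_irqsave(&p->lock, flags);
	error = idr_get_new(&p->pool, p, &i);
	spin_unlock_irqrestore(&p->lock, flags);
	if (error == -EAGAIN)
		goto retry;
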
@@ -104,12 +102,10 @@ EXPORT_SYMBOL(p9_idpool_get);
104 102
105void p9_idpool_put(int id, struct p9_idpool *p) 103void p9_idpool_put(int id, struct p9_idpool *p)
106{ 104{
107 if (down_interruptible(&p->lock) == -EINTR) { 105 unsigned int flags;
108 P9_EPRINTK(KERN_WARNING, "Interrupted while locking\n"); 106 spin_lock_irqsave(&p->lock, flags);
109 return;
110 }
111 idr_remove(&p->pool, id); 107 idr_remove(&p->pool, id);
112 up(&p->lock); 108 spin_unlock_irqrestore(&p->lock, flags);
113} 109}
114EXPORT_SYMBOL(p9_idpool_put); 110EXPORT_SYMBOL(p9_idpool_put);
115 111
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 782a22602b86..519cdb920f93 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -135,8 +135,8 @@ static void __hidp_copy_session(struct hidp_session *session, struct hidp_connin
135 } 135 }
136} 136}
137 137
138static inline int hidp_queue_event(struct hidp_session *session, struct input_dev *dev, 138static int hidp_queue_event(struct hidp_session *session, struct input_dev *dev,
139 unsigned int type, unsigned int code, int value) 139 unsigned int type, unsigned int code, int value)
140{ 140{
141 unsigned char newleds; 141 unsigned char newleds;
142 struct sk_buff *skb; 142 struct sk_buff *skb;
@@ -243,7 +243,8 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
243 input_sync(dev); 243 input_sync(dev);
244} 244}
245 245
246static inline int hidp_queue_report(struct hidp_session *session, unsigned char *data, int size) 246static int hidp_queue_report(struct hidp_session *session,
247 unsigned char *data, int size)
247{ 248{
248 struct sk_buff *skb; 249 struct sk_buff *skb;
249 250
@@ -287,7 +288,7 @@ static void hidp_idle_timeout(unsigned long arg)
287 hidp_schedule(session); 288 hidp_schedule(session);
288} 289}
289 290
290static inline void hidp_set_timer(struct hidp_session *session) 291static void hidp_set_timer(struct hidp_session *session)
291{ 292{
292 if (session->idle_to > 0) 293 if (session->idle_to > 0)
293 mod_timer(&session->timer, jiffies + HZ * session->idle_to); 294 mod_timer(&session->timer, jiffies + HZ * session->idle_to);
@@ -332,7 +333,8 @@ static inline int hidp_send_ctrl_message(struct hidp_session *session,
332 return err; 333 return err;
333} 334}
334 335
335static inline void hidp_process_handshake(struct hidp_session *session, unsigned char param) 336static void hidp_process_handshake(struct hidp_session *session,
337 unsigned char param)
336{ 338{
337 BT_DBG("session %p param 0x%02x", session, param); 339 BT_DBG("session %p param 0x%02x", session, param);
338 340
@@ -365,38 +367,23 @@ static inline void hidp_process_handshake(struct hidp_session *session, unsigned
365 } 367 }
366} 368}
367 369
368static inline void hidp_process_hid_control(struct hidp_session *session, unsigned char param) 370static void hidp_process_hid_control(struct hidp_session *session,
371 unsigned char param)
369{ 372{
370 BT_DBG("session %p param 0x%02x", session, param); 373 BT_DBG("session %p param 0x%02x", session, param);
371 374
372 switch (param) { 375 if (param == HIDP_CTRL_VIRTUAL_CABLE_UNPLUG) {
373 case HIDP_CTRL_NOP:
374 break;
375
376 case HIDP_CTRL_VIRTUAL_CABLE_UNPLUG:
377 /* Flush the transmit queues */ 376 /* Flush the transmit queues */
378 skb_queue_purge(&session->ctrl_transmit); 377 skb_queue_purge(&session->ctrl_transmit);
379 skb_queue_purge(&session->intr_transmit); 378 skb_queue_purge(&session->intr_transmit);
380 379
381 /* Kill session thread */ 380 /* Kill session thread */
382 atomic_inc(&session->terminate); 381 atomic_inc(&session->terminate);
383 break;
384
385 case HIDP_CTRL_HARD_RESET:
386 case HIDP_CTRL_SOFT_RESET:
387 case HIDP_CTRL_SUSPEND:
388 case HIDP_CTRL_EXIT_SUSPEND:
389 /* FIXME: We have to parse these and return no error */
390 break;
391
392 default:
393 __hidp_send_ctrl_message(session,
394 HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
395 break;
396 } 382 }
397} 383}
398 384
399static inline void hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param) 385static void hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
386 unsigned char param)
400{ 387{
401 BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param); 388 BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
402 389
@@ -423,7 +410,8 @@ static inline void hidp_process_data(struct hidp_session *session, struct sk_buf
423 } 410 }
424} 411}
425 412
426static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_buff *skb) 413static void hidp_recv_ctrl_frame(struct hidp_session *session,
414 struct sk_buff *skb)
427{ 415{
428 unsigned char hdr, type, param; 416 unsigned char hdr, type, param;
429 417
@@ -457,7 +445,8 @@ static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_
457 kfree_skb(skb); 445 kfree_skb(skb);
458} 446}
459 447
460static inline void hidp_recv_intr_frame(struct hidp_session *session, struct sk_buff *skb) 448static void hidp_recv_intr_frame(struct hidp_session *session,
449 struct sk_buff *skb)
461{ 450{
462 unsigned char hdr; 451 unsigned char hdr;
463 452
@@ -625,7 +614,8 @@ static struct device *hidp_get_device(struct hidp_session *session)
625 return conn ? &conn->dev : NULL; 614 return conn ? &conn->dev : NULL;
626} 615}
627 616
628static inline int hidp_setup_input(struct hidp_session *session, struct hidp_connadd_req *req) 617static int hidp_setup_input(struct hidp_session *session,
618 struct hidp_connadd_req *req)
629{ 619{
630 struct input_dev *input = session->input; 620 struct input_dev *input = session->input;
631 int i; 621 int i;
@@ -702,7 +692,8 @@ static void hidp_setup_quirks(struct hid_device *hid)
702 hid->quirks = hidp_blacklist[n].quirks; 692 hid->quirks = hidp_blacklist[n].quirks;
703} 693}
704 694
705static inline void hidp_setup_hid(struct hidp_session *session, struct hidp_connadd_req *req) 695static void hidp_setup_hid(struct hidp_session *session,
696 struct hidp_connadd_req *req)
706{ 697{
707 struct hid_device *hid = session->hid; 698 struct hid_device *hid = session->hid;
708 struct hid_report *report; 699 struct hid_report *report;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 788c70321858..e4c779bb8d76 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -429,7 +429,8 @@ static int rfcomm_release_dev(void __user *arg)
429 if (dev->tty) 429 if (dev->tty)
430 tty_vhangup(dev->tty); 430 tty_vhangup(dev->tty);
431 431
432 rfcomm_dev_del(dev); 432 if (!test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags))
433 rfcomm_dev_del(dev);
433 rfcomm_dev_put(dev); 434 rfcomm_dev_put(dev);
434 return 0; 435 return 0;
435} 436}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 26e941d912e8..7b660834a4c2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
287 * @ops: pernet operations structure to manipulate 287 * @ops: pernet operations structure to manipulate
288 * 288 *
289 * Remove the pernet operations structure from the list to be 289 * Remove the pernet operations structure from the list to be
290 * used when network namespaces are created or destoryed. In 290 * used when network namespaces are created or destroyed. In
291 * addition run the exit method for all existing network 291 * addition run the exit method for all existing network
292 * namespaces. 292 * namespaces.
293 */ 293 */
@@ -335,7 +335,7 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
335 * @ops: pernet operations structure to manipulate 335 * @ops: pernet operations structure to manipulate
336 * 336 *
337 * Remove the pernet operations structure from the list to be 337 * Remove the pernet operations structure from the list to be
338 * used when network namespaces are created or destoryed. In 338 * used when network namespaces are created or destroyed. In
339 * addition run the exit method for all existing network 339 * addition run the exit method for all existing network
340 * namespaces. 340 * namespaces.
341 */ 341 */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ddbdde82a700..61ac8d06292c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -82,32 +82,6 @@ int rtnl_trylock(void)
82 return mutex_trylock(&rtnl_mutex); 82 return mutex_trylock(&rtnl_mutex);
83} 83}
84 84
85int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
86{
87 memset(tb, 0, sizeof(struct rtattr*)*maxattr);
88
89 while (RTA_OK(rta, len)) {
90 unsigned flavor = rta->rta_type;
91 if (flavor && flavor <= maxattr)
92 tb[flavor-1] = rta;
93 rta = RTA_NEXT(rta, len);
94 }
95 return 0;
96}
97
98int __rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
99 struct rtattr *rta, int len)
100{
101 if (RTA_PAYLOAD(rta) < len)
102 return -1;
103 if (RTA_PAYLOAD(rta) >= RTA_ALIGN(len) + sizeof(struct rtattr)) {
104 rta = RTA_DATA(rta) + RTA_ALIGN(len);
105 return rtattr_parse_nested(tb, maxattr, rta);
106 }
107 memset(tb, 0, sizeof(struct rtattr *) * maxattr);
108 return 0;
109}
110
111static struct rtnl_link *rtnl_msg_handlers[NPROTO]; 85static struct rtnl_link *rtnl_msg_handlers[NPROTO];
112 86
113static inline int rtm_msgindex(int msgtype) 87static inline int rtm_msgindex(int msgtype)
@@ -442,21 +416,6 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
442 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); 416 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
443} 417}
444 418
445size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
446{
447 size_t ret = RTA_PAYLOAD(rta);
448 char *src = RTA_DATA(rta);
449
450 if (ret > 0 && src[ret - 1] == '\0')
451 ret--;
452 if (size > 0) {
453 size_t len = (ret >= size) ? size - 1 : ret;
454 memset(dest, 0, size);
455 memcpy(dest, src, len);
456 }
457 return ret;
458}
459
460int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) 419int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
461{ 420{
462 struct sock *rtnl = net->rtnl; 421 struct sock *rtnl = net->rtnl;
@@ -1411,9 +1370,6 @@ void __init rtnetlink_init(void)
1411} 1370}
1412 1371
1413EXPORT_SYMBOL(__rta_fill); 1372EXPORT_SYMBOL(__rta_fill);
1414EXPORT_SYMBOL(rtattr_strlcpy);
1415EXPORT_SYMBOL(rtattr_parse);
1416EXPORT_SYMBOL(__rtattr_parse_nested_compat);
1417EXPORT_SYMBOL(rtnetlink_put_metrics); 1373EXPORT_SYMBOL(rtnetlink_put_metrics);
1418EXPORT_SYMBOL(rtnl_lock); 1374EXPORT_SYMBOL(rtnl_lock);
1419EXPORT_SYMBOL(rtnl_trylock); 1375EXPORT_SYMBOL(rtnl_trylock);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 98420f9c4b6d..4e354221ec23 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2461,6 +2461,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2461 return elt; 2461 return elt;
2462} 2462}
2463 2463
2464/**
2465 * skb_partial_csum_set - set up and verify partial csum values for packet
2466 * @skb: the skb to set
2467 * @start: the number of bytes after skb->data to start checksumming.
2468 * @off: the offset from start to place the checksum.
2469 *
2470 * For untrusted partially-checksummed packets, we need to make sure the values
2471 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
2472 *
2473 * This function checks and sets those values and skb->ip_summed: if this
2474 * returns false you should drop the packet.
2475 */
2476bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
2477{
2478 if (unlikely(start > skb->len - 2) ||
2479 unlikely((int)start + off > skb->len - 2)) {
2480 if (net_ratelimit())
2481 printk(KERN_WARNING
2482 "bad partial csum: csum=%u/%u len=%u\n",
2483 start, off, skb->len);
2484 return false;
2485 }
2486 skb->ip_summed = CHECKSUM_PARTIAL;
2487 skb->csum_start = skb_headroom(skb) + start;
2488 skb->csum_offset = off;
2489 return true;
2490}
2491
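
A caller is expected to drop the packet when the helper returns false. A hypothetical receive path (the hdr structure and its fields are illustrative assumptions, not part of this patch's API):

	/* Sketch: validate untrusted csum offsets before accepting a
	 * CHECKSUM_PARTIAL packet from a guest or device. */
	if (!skb_partial_csum_set(skb, hdr->csum_start, hdr->csum_offset)) {
		kfree_skb(skb);		/* bogus offsets: drop, don't oops */
		return -EINVAL;
	}
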
2464EXPORT_SYMBOL(___pskb_trim); 2492EXPORT_SYMBOL(___pskb_trim);
2465EXPORT_SYMBOL(__kfree_skb); 2493EXPORT_SYMBOL(__kfree_skb);
2466EXPORT_SYMBOL(kfree_skb); 2494EXPORT_SYMBOL(kfree_skb);
@@ -2497,3 +2525,4 @@ EXPORT_SYMBOL(skb_append_datato_frags);
2497 2525
2498EXPORT_SYMBOL_GPL(skb_to_sgvec); 2526EXPORT_SYMBOL_GPL(skb_to_sgvec);
2499EXPORT_SYMBOL_GPL(skb_cow_data); 2527EXPORT_SYMBOL_GPL(skb_cow_data);
2528EXPORT_SYMBOL_GPL(skb_partial_csum_set);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index ebe59d98721a..287a62bc2e0f 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -271,8 +271,6 @@ extern struct sk_buff *dccp_make_response(struct sock *sk,
271 271
272extern int dccp_connect(struct sock *sk); 272extern int dccp_connect(struct sock *sk);
273extern int dccp_disconnect(struct sock *sk, int flags); 273extern int dccp_disconnect(struct sock *sk, int flags);
274extern void dccp_hash(struct sock *sk);
275extern void dccp_unhash(struct sock *sk);
276extern int dccp_getsockopt(struct sock *sk, int level, int optname, 274extern int dccp_getsockopt(struct sock *sk, int level, int optname,
277 char __user *optval, int __user *optlen); 275 char __user *optval, int __user *optlen);
278extern int dccp_setsockopt(struct sock *sk, int level, int optname, 276extern int dccp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index c982ad88223d..474075adbde4 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -38,12 +38,6 @@
38 */ 38 */
39static struct socket *dccp_v4_ctl_socket; 39static struct socket *dccp_v4_ctl_socket;
40 40
41static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
42{
43 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
44 inet_csk_bind_conflict);
45}
46
47int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 41int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
48{ 42{
49 struct inet_sock *inet = inet_sk(sk); 43 struct inet_sock *inet = inet_sk(sk);
@@ -408,8 +402,8 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
408 402
409 dccp_sync_mss(newsk, dst_mtu(dst)); 403 dccp_sync_mss(newsk, dst_mtu(dst));
410 404
411 __inet_hash_nolisten(&dccp_hashinfo, newsk); 405 __inet_hash_nolisten(newsk);
412 __inet_inherit_port(&dccp_hashinfo, sk, newsk); 406 __inet_inherit_port(sk, newsk);
413 407
414 return newsk; 408 return newsk;
415 409
@@ -898,6 +892,7 @@ static struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
898 .getsockopt = ip_getsockopt, 892 .getsockopt = ip_getsockopt,
899 .addr2sockaddr = inet_csk_addr2sockaddr, 893 .addr2sockaddr = inet_csk_addr2sockaddr,
900 .sockaddr_len = sizeof(struct sockaddr_in), 894 .sockaddr_len = sizeof(struct sockaddr_in),
895 .bind_conflict = inet_csk_bind_conflict,
901#ifdef CONFIG_COMPAT 896#ifdef CONFIG_COMPAT
902 .compat_setsockopt = compat_ip_setsockopt, 897 .compat_setsockopt = compat_ip_setsockopt,
903 .compat_getsockopt = compat_ip_getsockopt, 898 .compat_getsockopt = compat_ip_getsockopt,
@@ -937,10 +932,10 @@ static struct proto dccp_v4_prot = {
937 .sendmsg = dccp_sendmsg, 932 .sendmsg = dccp_sendmsg,
938 .recvmsg = dccp_recvmsg, 933 .recvmsg = dccp_recvmsg,
939 .backlog_rcv = dccp_v4_do_rcv, 934 .backlog_rcv = dccp_v4_do_rcv,
940 .hash = dccp_hash, 935 .hash = inet_hash,
941 .unhash = dccp_unhash, 936 .unhash = inet_unhash,
942 .accept = inet_csk_accept, 937 .accept = inet_csk_accept,
943 .get_port = dccp_v4_get_port, 938 .get_port = inet_csk_get_port,
944 .shutdown = dccp_shutdown, 939 .shutdown = dccp_shutdown,
945 .destroy = dccp_destroy_sock, 940 .destroy = dccp_destroy_sock,
946 .orphan_count = &dccp_orphan_count, 941 .orphan_count = &dccp_orphan_count,
@@ -948,6 +943,7 @@ static struct proto dccp_v4_prot = {
948 .obj_size = sizeof(struct dccp_sock), 943 .obj_size = sizeof(struct dccp_sock),
949 .rsk_prot = &dccp_request_sock_ops, 944 .rsk_prot = &dccp_request_sock_ops,
950 .twsk_prot = &dccp_timewait_sock_ops, 945 .twsk_prot = &dccp_timewait_sock_ops,
946 .hashinfo = &dccp_hashinfo,
951#ifdef CONFIG_COMPAT 947#ifdef CONFIG_COMPAT
952 .compat_setsockopt = compat_dccp_setsockopt, 948 .compat_setsockopt = compat_dccp_setsockopt,
953 .compat_getsockopt = compat_dccp_getsockopt, 949 .compat_getsockopt = compat_dccp_getsockopt,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ed0a0053a797..490333d47c7b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -39,21 +39,15 @@ static struct socket *dccp_v6_ctl_socket;
39static struct inet_connection_sock_af_ops dccp_ipv6_mapped; 39static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
40static struct inet_connection_sock_af_ops dccp_ipv6_af_ops; 40static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
41 41
42static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
43{
44 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
45 inet6_csk_bind_conflict);
46}
47
48static void dccp_v6_hash(struct sock *sk) 42static void dccp_v6_hash(struct sock *sk)
49{ 43{
50 if (sk->sk_state != DCCP_CLOSED) { 44 if (sk->sk_state != DCCP_CLOSED) {
51 if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { 45 if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
52 dccp_hash(sk); 46 inet_hash(sk);
53 return; 47 return;
54 } 48 }
55 local_bh_disable(); 49 local_bh_disable();
56 __inet6_hash(&dccp_hashinfo, sk); 50 __inet6_hash(sk);
57 local_bh_enable(); 51 local_bh_enable();
58 } 52 }
59} 53}
@@ -630,8 +624,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
630 624
631 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; 625 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
632 626
633 __inet6_hash(&dccp_hashinfo, newsk); 627 __inet6_hash(newsk);
634 inet_inherit_port(&dccp_hashinfo, sk, newsk); 628 inet_inherit_port(sk, newsk);
635 629
636 return newsk; 630 return newsk;
637 631
@@ -1054,6 +1048,7 @@ static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
1054 .getsockopt = ipv6_getsockopt, 1048 .getsockopt = ipv6_getsockopt,
1055 .addr2sockaddr = inet6_csk_addr2sockaddr, 1049 .addr2sockaddr = inet6_csk_addr2sockaddr,
1056 .sockaddr_len = sizeof(struct sockaddr_in6), 1050 .sockaddr_len = sizeof(struct sockaddr_in6),
1051 .bind_conflict = inet6_csk_bind_conflict,
1057#ifdef CONFIG_COMPAT 1052#ifdef CONFIG_COMPAT
1058 .compat_setsockopt = compat_ipv6_setsockopt, 1053 .compat_setsockopt = compat_ipv6_setsockopt,
1059 .compat_getsockopt = compat_ipv6_getsockopt, 1054 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1123,9 +1118,9 @@ static struct proto dccp_v6_prot = {
1123 .recvmsg = dccp_recvmsg, 1118 .recvmsg = dccp_recvmsg,
1124 .backlog_rcv = dccp_v6_do_rcv, 1119 .backlog_rcv = dccp_v6_do_rcv,
1125 .hash = dccp_v6_hash, 1120 .hash = dccp_v6_hash,
1126 .unhash = dccp_unhash, 1121 .unhash = inet_unhash,
1127 .accept = inet_csk_accept, 1122 .accept = inet_csk_accept,
1128 .get_port = dccp_v6_get_port, 1123 .get_port = inet_csk_get_port,
1129 .shutdown = dccp_shutdown, 1124 .shutdown = dccp_shutdown,
1130 .destroy = dccp_v6_destroy_sock, 1125 .destroy = dccp_v6_destroy_sock,
1131 .orphan_count = &dccp_orphan_count, 1126 .orphan_count = &dccp_orphan_count,
@@ -1133,6 +1128,7 @@ static struct proto dccp_v6_prot = {
1133 .obj_size = sizeof(struct dccp6_sock), 1128 .obj_size = sizeof(struct dccp6_sock),
1134 .rsk_prot = &dccp6_request_sock_ops, 1129 .rsk_prot = &dccp6_request_sock_ops,
1135 .twsk_prot = &dccp6_timewait_sock_ops, 1130 .twsk_prot = &dccp6_timewait_sock_ops,
1131 .hashinfo = &dccp_hashinfo,
1136#ifdef CONFIG_COMPAT 1132#ifdef CONFIG_COMPAT
1137 .compat_setsockopt = compat_dccp_setsockopt, 1133 .compat_setsockopt = compat_dccp_setsockopt,
1138 .compat_getsockopt = compat_dccp_getsockopt, 1134 .compat_getsockopt = compat_dccp_getsockopt,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0bed4a6095b7..e3f5d37b84be 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -78,7 +78,7 @@ void dccp_set_state(struct sock *sk, const int state)
78 sk->sk_prot->unhash(sk); 78 sk->sk_prot->unhash(sk);
79 if (inet_csk(sk)->icsk_bind_hash != NULL && 79 if (inet_csk(sk)->icsk_bind_hash != NULL &&
80 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) 80 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
81 inet_put_port(&dccp_hashinfo, sk); 81 inet_put_port(sk);
82 /* fall through */ 82 /* fall through */
83 default: 83 default:
84 if (oldstate == DCCP_OPEN) 84 if (oldstate == DCCP_OPEN)
@@ -173,20 +173,6 @@ const char *dccp_state_name(const int state)
173 173
174EXPORT_SYMBOL_GPL(dccp_state_name); 174EXPORT_SYMBOL_GPL(dccp_state_name);
175 175
176void dccp_hash(struct sock *sk)
177{
178 inet_hash(&dccp_hashinfo, sk);
179}
180
181EXPORT_SYMBOL_GPL(dccp_hash);
182
183void dccp_unhash(struct sock *sk)
184{
185 inet_unhash(&dccp_hashinfo, sk);
186}
187
188EXPORT_SYMBOL_GPL(dccp_unhash);
189
190int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 176int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
191{ 177{
192 struct dccp_sock *dp = dccp_sk(sk); 178 struct dccp_sock *dp = dccp_sk(sk);
@@ -268,7 +254,7 @@ int dccp_destroy_sock(struct sock *sk)
268 254
269 /* Clean up a referenced DCCP bind bucket. */ 255 /* Clean up a referenced DCCP bind bucket. */
270 if (inet_csk(sk)->icsk_bind_hash != NULL) 256 if (inet_csk(sk)->icsk_bind_hash != NULL)
271 inet_put_port(&dccp_hashinfo, sk); 257 inet_put_port(sk);
272 258
273 kfree(dp->dccps_service_list); 259 kfree(dp->dccps_service_list);
274 dp->dccps_service_list = NULL; 260 dp->dccps_service_list = NULL;
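Taken together, the DCCP hunks above (and the matching TCP hunks further down) are one refactor: struct proto grows a .hashinfo pointer, so the generic inet_hash()/inet_unhash()/inet_put_port()/inet_csk_get_port() can recover the per-protocol table from the socket itself, and the one-line dccp_hash()/dccp_unhash() wrappers become dead code. A minimal, self-contained C model of that pattern; every name below is invented for illustration and is not a kernel symbol:

#include <stdio.h>

struct hashtable { const char *name; };

struct proto_model {                    /* stands in for struct proto */
        struct hashtable *hashinfo;     /* the new .hashinfo member   */
};

struct sock_model {                     /* stands in for struct sock  */
        const struct proto_model *prot; /* i.e. sk->sk_prot           */
};

/* Generic helper: the table is derived from the socket, not passed in. */
static void generic_unhash(struct sock_model *sk)
{
        struct hashtable *h = sk->prot->hashinfo;
        printf("unhash from %s\n", h->name);
}

int main(void)
{
        static struct hashtable dccp_table = { "dccp_hashinfo" };
        static const struct proto_model dccp_prot = { .hashinfo = &dccp_table };
        struct sock_model sk = { .prot = &dccp_prot };

        generic_unhash(&sk);    /* replaces a wrapper that curried the table */
        return 0;
}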
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index a2241060113b..8cd357f41283 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -547,8 +547,8 @@ int cipso_v4_doi_remove(u32 doi,
547 rcu_read_lock(); 547 rcu_read_lock();
548 list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list) 548 list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list)
549 if (dom_iter->valid) 549 if (dom_iter->valid)
550 netlbl_domhsh_remove(dom_iter->domain, 550 netlbl_cfg_map_del(dom_iter->domain,
551 audit_info); 551 audit_info);
552 rcu_read_unlock(); 552 rcu_read_unlock();
553 cipso_v4_cache_invalidate(); 553 cipso_v4_cache_invalidate();
554 call_rcu(&doi_def->rcu, callback); 554 call_rcu(&doi_def->rcu, callback);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 35851c96bdfb..f5fba3f71c06 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2431,8 +2431,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2431 rtn_type(buf2, sizeof(buf2), 2431 rtn_type(buf2, sizeof(buf2),
2432 fa->fa_type)); 2432 fa->fa_type));
2433 if (fa->fa_tos) 2433 if (fa->fa_tos)
2434 seq_printf(seq, "tos =%d\n", 2434 seq_printf(seq, " tos=%d", fa->fa_tos);
2435 fa->fa_tos);
2436 seq_putc(seq, '\n'); 2435 seq_putc(seq, '\n');
2437 } 2436 }
2438 } 2437 }
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a7321a82df6d..a13c074dac09 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1015,7 +1015,8 @@ int icmp_rcv(struct sk_buff *skb)
1015 goto error; 1015 goto error;
1016 } 1016 }
1017 1017
1018 __skb_pull(skb, sizeof(*icmph)); 1018 if (!pskb_pull(skb, sizeof(*icmph)))
1019 goto error;
1019 1020
1020 icmph = icmp_hdr(skb); 1021 icmph = icmp_hdr(skb);
1021 1022
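The one-line ICMP change above (and its IPv6 twin in net/ipv6/icmp.c below) swaps an unchecked pull for a checked one: __skb_pull() assumes the requested bytes already sit in the skb's linear head and will advance past its end on a paged skb, while pskb_pull() first makes sizeof(*icmph) bytes linear and returns NULL if it cannot, so the handler can drop the packet cleanly. Kernel-style fragment (not a standalone program) showing the idiom; the re-read of icmph afterwards matters because pskb_pull() may have had to move data, invalidating old pointers into the packet:

        if (!pskb_pull(skb, sizeof(*icmph)))
                goto error;        /* header not linearizable: drop */

        icmph = icmp_hdr(skb);     /* re-read after a possible move */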
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index de5a41de191a..b189278c7bc1 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,11 +78,9 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
78/* Obtain a reference to a local port for the given sock, 78/* Obtain a reference to a local port for the given sock,
79 * if snum is zero it means select any available local port. 79 * if snum is zero it means select any available local port.
80 */ 80 */
81int inet_csk_get_port(struct inet_hashinfo *hashinfo, 81int inet_csk_get_port(struct sock *sk, unsigned short snum)
82 struct sock *sk, unsigned short snum,
83 int (*bind_conflict)(const struct sock *sk,
84 const struct inet_bind_bucket *tb))
85{ 82{
83 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
86 struct inet_bind_hashbucket *head; 84 struct inet_bind_hashbucket *head;
87 struct hlist_node *node; 85 struct hlist_node *node;
88 struct inet_bind_bucket *tb; 86 struct inet_bind_bucket *tb;
@@ -142,7 +140,7 @@ tb_found:
142 goto success; 140 goto success;
143 } else { 141 } else {
144 ret = 1; 142 ret = 1;
145 if (bind_conflict(sk, tb)) 143 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
146 goto fail_unlock; 144 goto fail_unlock;
147 } 145 }
148 } 146 }
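The inet_csk_get_port() change applies the same de-currying to the bind-conflict callback: instead of each protocol wrapping the function just to supply its conflict test (the deleted tcp_v4_get_port(), tcp_v6_get_port() and dccp_v6_get_port() wrappers elsewhere in this patch), the callback now lives in the address-family ops and is fetched at the point of use. The call shape before and after, as a non-compilable sketch:

        /* before: table and callback curried in by a per-protocol wrapper */
        inet_csk_get_port(&tcp_hashinfo, sk, snum, inet_csk_bind_conflict);

        /* after: both are reached through the socket */
        inet_csk_get_port(sk, snum);
        /* ...which internally resolves the test as: */
        inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb);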
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 48d45008f749..9cac6c034abd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -66,8 +66,9 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
66/* 66/*
67 * Get rid of any references to a local port held by the given sock. 67 * Get rid of any references to a local port held by the given sock.
68 */ 68 */
69static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) 69static void __inet_put_port(struct sock *sk)
70{ 70{
71 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
71 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); 72 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
72 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 73 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
73 struct inet_bind_bucket *tb; 74 struct inet_bind_bucket *tb;
@@ -81,10 +82,10 @@ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
81 spin_unlock(&head->lock); 82 spin_unlock(&head->lock);
82} 83}
83 84
84void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) 85void inet_put_port(struct sock *sk)
85{ 86{
86 local_bh_disable(); 87 local_bh_disable();
87 __inet_put_port(hashinfo, sk); 88 __inet_put_port(sk);
88 local_bh_enable(); 89 local_bh_enable();
89} 90}
90 91
@@ -317,8 +318,9 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
317 inet->dport); 318 inet->dport);
318} 319}
319 320
320void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk) 321void __inet_hash_nolisten(struct sock *sk)
321{ 322{
323 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
322 struct hlist_head *list; 324 struct hlist_head *list;
323 rwlock_t *lock; 325 rwlock_t *lock;
324 struct inet_ehash_bucket *head; 326 struct inet_ehash_bucket *head;
@@ -337,13 +339,14 @@ void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk)
337} 339}
338EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 340EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
339 341
340void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk) 342static void __inet_hash(struct sock *sk)
341{ 343{
344 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
342 struct hlist_head *list; 345 struct hlist_head *list;
343 rwlock_t *lock; 346 rwlock_t *lock;
344 347
345 if (sk->sk_state != TCP_LISTEN) { 348 if (sk->sk_state != TCP_LISTEN) {
346 __inet_hash_nolisten(hashinfo, sk); 349 __inet_hash_nolisten(sk);
347 return; 350 return;
348 } 351 }
349 352
@@ -357,13 +360,48 @@ void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
357 write_unlock(lock); 360 write_unlock(lock);
358 wake_up(&hashinfo->lhash_wait); 361 wake_up(&hashinfo->lhash_wait);
359} 362}
360EXPORT_SYMBOL_GPL(__inet_hash); 363
364void inet_hash(struct sock *sk)
365{
366 if (sk->sk_state != TCP_CLOSE) {
367 local_bh_disable();
368 __inet_hash(sk);
369 local_bh_enable();
370 }
371}
372EXPORT_SYMBOL_GPL(inet_hash);
373
374void inet_unhash(struct sock *sk)
375{
376 rwlock_t *lock;
377 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
378
379 if (sk_unhashed(sk))
380 goto out;
381
382 if (sk->sk_state == TCP_LISTEN) {
383 local_bh_disable();
384 inet_listen_wlock(hashinfo);
385 lock = &hashinfo->lhash_lock;
386 } else {
387 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
388 write_lock_bh(lock);
389 }
390
391 if (__sk_del_node_init(sk))
392 sock_prot_inuse_add(sk->sk_prot, -1);
393 write_unlock_bh(lock);
394out:
395 if (sk->sk_state == TCP_LISTEN)
396 wake_up(&hashinfo->lhash_wait);
397}
398EXPORT_SYMBOL_GPL(inet_unhash);
361 399
362int __inet_hash_connect(struct inet_timewait_death_row *death_row, 400int __inet_hash_connect(struct inet_timewait_death_row *death_row,
363 struct sock *sk, 401 struct sock *sk, u32 port_offset,
364 int (*check_established)(struct inet_timewait_death_row *, 402 int (*check_established)(struct inet_timewait_death_row *,
365 struct sock *, __u16, struct inet_timewait_sock **), 403 struct sock *, __u16, struct inet_timewait_sock **),
366 void (*hash)(struct inet_hashinfo *, struct sock *)) 404 void (*hash)(struct sock *sk))
367{ 405{
368 struct inet_hashinfo *hinfo = death_row->hashinfo; 406 struct inet_hashinfo *hinfo = death_row->hashinfo;
369 const unsigned short snum = inet_sk(sk)->num; 407 const unsigned short snum = inet_sk(sk)->num;
@@ -375,7 +413,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
375 if (!snum) { 413 if (!snum) {
376 int i, remaining, low, high, port; 414 int i, remaining, low, high, port;
377 static u32 hint; 415 static u32 hint;
378 u32 offset = hint + inet_sk_port_offset(sk); 416 u32 offset = hint + port_offset;
379 struct hlist_node *node; 417 struct hlist_node *node;
380 struct inet_timewait_sock *tw = NULL; 418 struct inet_timewait_sock *tw = NULL;
381 419
@@ -427,7 +465,7 @@ ok:
427 inet_bind_hash(sk, tb, port); 465 inet_bind_hash(sk, tb, port);
428 if (sk_unhashed(sk)) { 466 if (sk_unhashed(sk)) {
429 inet_sk(sk)->sport = htons(port); 467 inet_sk(sk)->sport = htons(port);
430 hash(hinfo, sk); 468 hash(sk);
431 } 469 }
432 spin_unlock(&head->lock); 470 spin_unlock(&head->lock);
433 471
@@ -444,7 +482,7 @@ ok:
444 tb = inet_csk(sk)->icsk_bind_hash; 482 tb = inet_csk(sk)->icsk_bind_hash;
445 spin_lock_bh(&head->lock); 483 spin_lock_bh(&head->lock);
446 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 484 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
447 hash(hinfo, sk); 485 hash(sk);
448 spin_unlock_bh(&head->lock); 486 spin_unlock_bh(&head->lock);
449 return 0; 487 return 0;
450 } else { 488 } else {
@@ -464,7 +502,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_connect);
464int inet_hash_connect(struct inet_timewait_death_row *death_row, 502int inet_hash_connect(struct inet_timewait_death_row *death_row,
465 struct sock *sk) 503 struct sock *sk)
466{ 504{
467 return __inet_hash_connect(death_row, sk, 505 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
468 __inet_check_established, __inet_hash_nolisten); 506 __inet_check_established, __inet_hash_nolisten);
469} 507}
470 508
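__inet_hash_connect() gets the matching treatment: the two family-specific ingredients, the ephemeral-port offset and the hash function, become parameters, so the IPv6 connect path (see the inet6_hashtables.c hunk below) can reuse the IPv4 search loop instead of duplicating it. The two resulting call sites, quoted from this patch:

        /* IPv4, net/ipv4/inet_hashtables.c */
        return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
                                   __inet_check_established, __inet_hash_nolisten);

        /* IPv6, net/ipv6/inet6_hashtables.c */
        return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
                                   __inet6_check_established, __inet6_hash);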
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
index 749fa044eca5..85c680add6df 100644
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/net.h>
25 26
26#include <net/ip_vs.h> 27#include <net/ip_vs.h>
27 28
@@ -169,7 +170,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
169 */ 170 */
170 if (mark->cw == 0) { 171 if (mark->cw == 0) {
171 mark->cl = &svc->destinations; 172 mark->cl = &svc->destinations;
172 IP_VS_INFO("ip_vs_wrr_schedule(): " 173 IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
173 "no available servers\n"); 174 "no available servers\n");
174 dest = NULL; 175 dest = NULL;
175 goto out; 176 goto out;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a0d373bd9065..071e83a894ad 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1669,7 +1669,7 @@ void tcp_set_state(struct sock *sk, int state)
1669 sk->sk_prot->unhash(sk); 1669 sk->sk_prot->unhash(sk);
1670 if (inet_csk(sk)->icsk_bind_hash && 1670 if (inet_csk(sk)->icsk_bind_hash &&
1671 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) 1671 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1672 inet_put_port(&tcp_hashinfo, sk); 1672 inet_put_port(sk);
1673 /* fall through */ 1673 /* fall through */
1674 default: 1674 default:
1675 if (oldstate==TCP_ESTABLISHED) 1675 if (oldstate==TCP_ESTABLISHED)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77c1939a2b0d..63414ea427c5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -108,22 +108,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
108 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), 108 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
109}; 109};
110 110
111static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
112{
113 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
114 inet_csk_bind_conflict);
115}
116
117static void tcp_v4_hash(struct sock *sk)
118{
119 inet_hash(&tcp_hashinfo, sk);
120}
121
122void tcp_unhash(struct sock *sk)
123{
124 inet_unhash(&tcp_hashinfo, sk);
125}
126
127static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 111static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
128{ 112{
129 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 113 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
@@ -1478,8 +1462,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1478 } 1462 }
1479#endif 1463#endif
1480 1464
1481 __inet_hash_nolisten(&tcp_hashinfo, newsk); 1465 __inet_hash_nolisten(newsk);
1482 __inet_inherit_port(&tcp_hashinfo, sk, newsk); 1466 __inet_inherit_port(sk, newsk);
1483 1467
1484 return newsk; 1468 return newsk;
1485 1469
@@ -1827,6 +1811,7 @@ struct inet_connection_sock_af_ops ipv4_specific = {
1827 .getsockopt = ip_getsockopt, 1811 .getsockopt = ip_getsockopt,
1828 .addr2sockaddr = inet_csk_addr2sockaddr, 1812 .addr2sockaddr = inet_csk_addr2sockaddr,
1829 .sockaddr_len = sizeof(struct sockaddr_in), 1813 .sockaddr_len = sizeof(struct sockaddr_in),
1814 .bind_conflict = inet_csk_bind_conflict,
1830#ifdef CONFIG_COMPAT 1815#ifdef CONFIG_COMPAT
1831 .compat_setsockopt = compat_ip_setsockopt, 1816 .compat_setsockopt = compat_ip_setsockopt,
1832 .compat_getsockopt = compat_ip_getsockopt, 1817 .compat_getsockopt = compat_ip_getsockopt,
@@ -1926,7 +1911,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
1926 1911
1927 /* Clean up a referenced TCP bind bucket. */ 1912 /* Clean up a referenced TCP bind bucket. */
1928 if (inet_csk(sk)->icsk_bind_hash) 1913 if (inet_csk(sk)->icsk_bind_hash)
1929 inet_put_port(&tcp_hashinfo, sk); 1914 inet_put_port(sk);
1930 1915
1931 /* 1916 /*
1932 * If sendmsg cached page exists, toss it. 1917 * If sendmsg cached page exists, toss it.
@@ -2435,9 +2420,9 @@ struct proto tcp_prot = {
2435 .getsockopt = tcp_getsockopt, 2420 .getsockopt = tcp_getsockopt,
2436 .recvmsg = tcp_recvmsg, 2421 .recvmsg = tcp_recvmsg,
2437 .backlog_rcv = tcp_v4_do_rcv, 2422 .backlog_rcv = tcp_v4_do_rcv,
2438 .hash = tcp_v4_hash, 2423 .hash = inet_hash,
2439 .unhash = tcp_unhash, 2424 .unhash = inet_unhash,
2440 .get_port = tcp_v4_get_port, 2425 .get_port = inet_csk_get_port,
2441 .enter_memory_pressure = tcp_enter_memory_pressure, 2426 .enter_memory_pressure = tcp_enter_memory_pressure,
2442 .sockets_allocated = &tcp_sockets_allocated, 2427 .sockets_allocated = &tcp_sockets_allocated,
2443 .orphan_count = &tcp_orphan_count, 2428 .orphan_count = &tcp_orphan_count,
@@ -2450,6 +2435,7 @@ struct proto tcp_prot = {
2450 .obj_size = sizeof(struct tcp_sock), 2435 .obj_size = sizeof(struct tcp_sock),
2451 .twsk_prot = &tcp_timewait_sock_ops, 2436 .twsk_prot = &tcp_timewait_sock_ops,
2452 .rsk_prot = &tcp_request_sock_ops, 2437 .rsk_prot = &tcp_request_sock_ops,
2438 .hashinfo = &tcp_hashinfo,
2453#ifdef CONFIG_COMPAT 2439#ifdef CONFIG_COMPAT
2454 .compat_setsockopt = compat_tcp_setsockopt, 2440 .compat_setsockopt = compat_tcp_setsockopt,
2455 .compat_getsockopt = compat_tcp_getsockopt, 2441 .compat_getsockopt = compat_tcp_getsockopt,
@@ -2467,7 +2453,6 @@ void __init tcp_v4_init(struct net_proto_family *ops)
2467EXPORT_SYMBOL(ipv4_specific); 2453EXPORT_SYMBOL(ipv4_specific);
2468EXPORT_SYMBOL(tcp_hashinfo); 2454EXPORT_SYMBOL(tcp_hashinfo);
2469EXPORT_SYMBOL(tcp_prot); 2455EXPORT_SYMBOL(tcp_prot);
2470EXPORT_SYMBOL(tcp_unhash);
2471EXPORT_SYMBOL(tcp_v4_conn_request); 2456EXPORT_SYMBOL(tcp_v4_conn_request);
2472EXPORT_SYMBOL(tcp_v4_connect); 2457EXPORT_SYMBOL(tcp_v4_connect);
2473EXPORT_SYMBOL(tcp_v4_do_rcv); 2458EXPORT_SYMBOL(tcp_v4_do_rcv);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index e093a7b59e18..b47030ba162b 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -102,7 +102,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
102 102
103 XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; 103 XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
104 104
105 if (!pskb_may_pull(skb, phlen)); 105 if (!pskb_may_pull(skb, phlen))
106 goto out; 106 goto out;
107 __skb_pull(skb, phlen); 107 __skb_pull(skb, phlen);
108 } 108 }
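The xfrm4_mode_beet.c hunk is a one-character fix for a stray semicolon: "if (cond);" is an if with an empty body, so the "goto out;" on the next line ran unconditionally and the __skb_pull() after it was unreachable. A self-contained demonstration of the trap (names invented):

#include <stdio.h>

static int may_pull(void) { return 1; }  /* stands in for pskb_may_pull() */

int main(void)
{
        if (!may_pull());                /* BUG: ';' closes the if        */
                puts("runs unconditionally, despite the indentation");

        if (!may_pull())                 /* fixed: the body is guarded    */
                puts("correctly skipped when may_pull() succeeds");
        return 0;
}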
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index cbb5b9cf84ad..121d517bf91c 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -683,7 +683,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
683 } 683 }
684 } 684 }
685 685
686 __skb_pull(skb, sizeof(*hdr)); 686 if (!pskb_pull(skb, sizeof(*hdr)))
687 goto discard_it;
687 688
688 hdr = icmp6_hdr(skb); 689 hdr = icmp6_hdr(skb);
689 690
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d325a9958909..99fd25f7f005 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -22,9 +22,9 @@
22#include <net/inet6_hashtables.h> 22#include <net/inet6_hashtables.h>
23#include <net/ip.h> 23#include <net/ip.h>
24 24
25void __inet6_hash(struct inet_hashinfo *hashinfo, 25void __inet6_hash(struct sock *sk)
26 struct sock *sk)
27{ 26{
27 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
28 struct hlist_head *list; 28 struct hlist_head *list;
29 rwlock_t *lock; 29 rwlock_t *lock;
30 30
@@ -236,7 +236,7 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
236int inet6_hash_connect(struct inet_timewait_death_row *death_row, 236int inet6_hash_connect(struct inet_timewait_death_row *death_row,
237 struct sock *sk) 237 struct sock *sk)
238{ 238{
239 return __inet_hash_connect(death_row, sk, 239 return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
240 __inet6_check_established, __inet6_hash); 240 __inet6_check_established, __inet6_hash);
241} 241}
242 242
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 59d0029e93a7..12750f2b05ab 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -86,12 +86,6 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_specific;
86static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; 86static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
87#endif 87#endif
88 88
89static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
90{
91 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
92 inet6_csk_bind_conflict);
93}
94
95static void tcp_v6_hash(struct sock *sk) 89static void tcp_v6_hash(struct sock *sk)
96{ 90{
97 if (sk->sk_state != TCP_CLOSE) { 91 if (sk->sk_state != TCP_CLOSE) {
@@ -100,7 +94,7 @@ static void tcp_v6_hash(struct sock *sk)
100 return; 94 return;
101 } 95 }
102 local_bh_disable(); 96 local_bh_disable();
103 __inet6_hash(&tcp_hashinfo, sk); 97 __inet6_hash(sk);
104 local_bh_enable(); 98 local_bh_enable();
105 } 99 }
106} 100}
@@ -1504,8 +1498,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1504 } 1498 }
1505#endif 1499#endif
1506 1500
1507 __inet6_hash(&tcp_hashinfo, newsk); 1501 __inet6_hash(newsk);
1508 inet_inherit_port(&tcp_hashinfo, sk, newsk); 1502 inet_inherit_port(sk, newsk);
1509 1503
1510 return newsk; 1504 return newsk;
1511 1505
@@ -1833,6 +1827,7 @@ static struct inet_connection_sock_af_ops ipv6_specific = {
1833 .getsockopt = ipv6_getsockopt, 1827 .getsockopt = ipv6_getsockopt,
1834 .addr2sockaddr = inet6_csk_addr2sockaddr, 1828 .addr2sockaddr = inet6_csk_addr2sockaddr,
1835 .sockaddr_len = sizeof(struct sockaddr_in6), 1829 .sockaddr_len = sizeof(struct sockaddr_in6),
1830 .bind_conflict = inet6_csk_bind_conflict,
1836#ifdef CONFIG_COMPAT 1831#ifdef CONFIG_COMPAT
1837 .compat_setsockopt = compat_ipv6_setsockopt, 1832 .compat_setsockopt = compat_ipv6_setsockopt,
1838 .compat_getsockopt = compat_ipv6_getsockopt, 1833 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1864,6 +1859,7 @@ static struct inet_connection_sock_af_ops ipv6_mapped = {
1864 .getsockopt = ipv6_getsockopt, 1859 .getsockopt = ipv6_getsockopt,
1865 .addr2sockaddr = inet6_csk_addr2sockaddr, 1860 .addr2sockaddr = inet6_csk_addr2sockaddr,
1866 .sockaddr_len = sizeof(struct sockaddr_in6), 1861 .sockaddr_len = sizeof(struct sockaddr_in6),
1862 .bind_conflict = inet6_csk_bind_conflict,
1867#ifdef CONFIG_COMPAT 1863#ifdef CONFIG_COMPAT
1868 .compat_setsockopt = compat_ipv6_setsockopt, 1864 .compat_setsockopt = compat_ipv6_setsockopt,
1869 .compat_getsockopt = compat_ipv6_getsockopt, 1865 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -2127,8 +2123,8 @@ struct proto tcpv6_prot = {
2127 .recvmsg = tcp_recvmsg, 2123 .recvmsg = tcp_recvmsg,
2128 .backlog_rcv = tcp_v6_do_rcv, 2124 .backlog_rcv = tcp_v6_do_rcv,
2129 .hash = tcp_v6_hash, 2125 .hash = tcp_v6_hash,
2130 .unhash = tcp_unhash, 2126 .unhash = inet_unhash,
2131 .get_port = tcp_v6_get_port, 2127 .get_port = inet_csk_get_port,
2132 .enter_memory_pressure = tcp_enter_memory_pressure, 2128 .enter_memory_pressure = tcp_enter_memory_pressure,
2133 .sockets_allocated = &tcp_sockets_allocated, 2129 .sockets_allocated = &tcp_sockets_allocated,
2134 .memory_allocated = &tcp_memory_allocated, 2130 .memory_allocated = &tcp_memory_allocated,
@@ -2141,6 +2137,7 @@ struct proto tcpv6_prot = {
2141 .obj_size = sizeof(struct tcp6_sock), 2137 .obj_size = sizeof(struct tcp6_sock),
2142 .twsk_prot = &tcp6_timewait_sock_ops, 2138 .twsk_prot = &tcp6_timewait_sock_ops,
2143 .rsk_prot = &tcp6_request_sock_ops, 2139 .rsk_prot = &tcp6_request_sock_ops,
2140 .hashinfo = &tcp_hashinfo,
2144#ifdef CONFIG_COMPAT 2141#ifdef CONFIG_COMPAT
2145 .compat_setsockopt = compat_tcp_setsockopt, 2142 .compat_setsockopt = compat_tcp_setsockopt,
2146 .compat_getsockopt = compat_tcp_getsockopt, 2143 .compat_getsockopt = compat_tcp_getsockopt,
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 09c255002e56..45c7c0c3875e 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -1,6 +1,5 @@
1config MAC80211 1config MAC80211
2 tristate "Generic IEEE 802.11 Networking Stack (mac80211)" 2 tristate "Generic IEEE 802.11 Networking Stack (mac80211)"
3 depends on EXPERIMENTAL
4 select CRYPTO 3 select CRYPTO
5 select CRYPTO_ECB 4 select CRYPTO_ECB
6 select CRYPTO_ARC4 5 select CRYPTO_ARC4
@@ -98,6 +97,18 @@ config MAC80211_DEBUGFS
98 97
99 Say N unless you know you need this. 98 Say N unless you know you need this.
100 99
100config MAC80211_DEBUG_PACKET_ALIGNMENT
101 bool "Enable packet alignment debugging"
102 depends on MAC80211
103 help
104 This option is recommended for driver authors and strongly
 105 discouraged for everybody else; it will trigger a warning
106 when a driver hands mac80211 a buffer that is aligned in
107 a way that will cause problems with the IP stack on some
108 architectures.
109
110 Say N unless you're writing a mac80211 based driver.
111
101config MAC80211_DEBUG 112config MAC80211_DEBUG
102 bool "Enable debugging output" 113 bool "Enable debugging output"
103 depends on MAC80211 114 depends on MAC80211
diff --git a/net/mac80211/ieee80211.c b/net/mac80211/ieee80211.c
index 5dcc2d61551f..67b7c75c430d 100644
--- a/net/mac80211/ieee80211.c
+++ b/net/mac80211/ieee80211.c
@@ -1344,17 +1344,17 @@ static int __init ieee80211_init(void)
1344 1344
1345 ret = rc80211_simple_init(); 1345 ret = rc80211_simple_init();
1346 if (ret) 1346 if (ret)
1347 goto fail; 1347 goto out;
1348 1348
1349 ret = rc80211_pid_init(); 1349 ret = rc80211_pid_init();
1350 if (ret) 1350 if (ret)
1351 goto fail_simple; 1351 goto out_cleanup_simple;
1352 1352
1353 ret = ieee80211_wme_register(); 1353 ret = ieee80211_wme_register();
1354 if (ret) { 1354 if (ret) {
1355 printk(KERN_DEBUG "ieee80211_init: failed to " 1355 printk(KERN_DEBUG "ieee80211_init: failed to "
1356 "initialize WME (err=%d)\n", ret); 1356 "initialize WME (err=%d)\n", ret);
1357 goto fail_pid; 1357 goto out_cleanup_pid;
1358 } 1358 }
1359 1359
1360 ieee80211_debugfs_netdev_init(); 1360 ieee80211_debugfs_netdev_init();
@@ -1362,11 +1362,11 @@ static int __init ieee80211_init(void)
1362 1362
1363 return 0; 1363 return 0;
1364 1364
1365 fail_pid: 1365 out_cleanup_pid:
1366 rc80211_simple_exit();
1367 fail_simple:
1368 rc80211_pid_exit(); 1366 rc80211_pid_exit();
1369 fail: 1367 out_cleanup_simple:
1368 rc80211_simple_exit();
1369 out:
1370 return ret; 1370 return ret;
1371} 1371}
1372 1372
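The relabeled error path in ieee80211_init() restores the usual kernel unwind convention: clean up in exact reverse order of initialization, with each label named after the next thing to undo. The old labels were crossed, so a failing rc80211_pid_init() jumped to a label that called rc80211_pid_exit() (never initialized) and leaked the simple rate controller. A compact, self-contained model of the corrected ladder, mirroring the three steps above with invented stand-ins:

#include <stdio.h>

static int  init_simple(void) { puts("simple: init"); return 0; }
static void exit_simple(void) { puts("simple: exit"); }
static int  init_pid(void)    { puts("pid: init");    return 0; }
static void exit_pid(void)    { puts("pid: exit");    }
static int  init_wme(void)    { puts("wme: failed");  return -1; }

static int demo_init(void)
{
        int ret;

        ret = init_simple();
        if (ret)
                goto out;
        ret = init_pid();
        if (ret)
                goto out_cleanup_simple;
        ret = init_wme();
        if (ret)
                goto out_cleanup_pid;
        return 0;

out_cleanup_pid:                 /* undo in reverse order of setup */
        exit_pid();
out_cleanup_simple:
        exit_simple();
out:
        return ret;
}

int main(void) { return demo_init() ? 1 : 0; }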
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index 554c4baed6fb..c339571632b2 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -538,7 +538,7 @@ int __init rc80211_pid_init(void)
538 return ieee80211_rate_control_register(&mac80211_rcpid); 538 return ieee80211_rate_control_register(&mac80211_rcpid);
539} 539}
540 540
541void __exit rc80211_pid_exit(void) 541void rc80211_pid_exit(void)
542{ 542{
543 ieee80211_rate_control_unregister(&mac80211_rcpid); 543 ieee80211_rate_control_unregister(&mac80211_rcpid);
544} 544}
diff --git a/net/mac80211/rc80211_simple.c b/net/mac80211/rc80211_simple.c
index 934676d687d6..9a78b116acff 100644
--- a/net/mac80211/rc80211_simple.c
+++ b/net/mac80211/rc80211_simple.c
@@ -389,7 +389,7 @@ int __init rc80211_simple_init(void)
389 return ieee80211_rate_control_register(&mac80211_rcsimple); 389 return ieee80211_rate_control_register(&mac80211_rcsimple);
390} 390}
391 391
392void __exit rc80211_simple_exit(void) 392void rc80211_simple_exit(void)
393{ 393{
394 ieee80211_rate_control_unregister(&mac80211_rcsimple); 394 ieee80211_rate_control_unregister(&mac80211_rcsimple);
395} 395}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index d44c87269bcb..535407d07fa4 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -340,11 +340,15 @@ static u32 ieee80211_rx_load_stats(struct ieee80211_local *local,
340 return load; 340 return load;
341} 341}
342 342
343#ifdef CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT
343static ieee80211_txrx_result 344static ieee80211_txrx_result
344ieee80211_rx_h_verify_ip_alignment(struct ieee80211_txrx_data *rx) 345ieee80211_rx_h_verify_ip_alignment(struct ieee80211_txrx_data *rx)
345{ 346{
346 int hdrlen; 347 int hdrlen;
347 348
349 if (!WLAN_FC_DATA_PRESENT(rx->fc))
350 return TXRX_CONTINUE;
351
348 /* 352 /*
349 * Drivers are required to align the payload data in a way that 353 * Drivers are required to align the payload data in a way that
350 * guarantees that the contained IP header is aligned to a four- 354 * guarantees that the contained IP header is aligned to a four-
@@ -371,11 +375,14 @@ ieee80211_rx_h_verify_ip_alignment(struct ieee80211_txrx_data *rx)
371 375
372 return TXRX_CONTINUE; 376 return TXRX_CONTINUE;
373} 377}
378#endif
374 379
375ieee80211_rx_handler ieee80211_rx_pre_handlers[] = 380ieee80211_rx_handler ieee80211_rx_pre_handlers[] =
376{ 381{
377 ieee80211_rx_h_parse_qos, 382 ieee80211_rx_h_parse_qos,
383#ifdef CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT
378 ieee80211_rx_h_verify_ip_alignment, 384 ieee80211_rx_h_verify_ip_alignment,
385#endif
379 NULL 386 NULL
380}; 387};
381 388
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index becf91a952ae..c7ad64d664ad 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -90,7 +90,7 @@ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1
90 * safely. 90 * safely.
91 * 91 *
92 */ 92 */
93static void netlbl_cipsov4_doi_free(struct rcu_head *entry) 93void netlbl_cipsov4_doi_free(struct rcu_head *entry)
94{ 94{
95 struct cipso_v4_doi *ptr; 95 struct cipso_v4_doi *ptr;
96 96
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
index f03cf9b78286..220cb9d06b49 100644
--- a/net/netlabel/netlabel_cipso_v4.h
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -163,4 +163,7 @@ enum {
163/* NetLabel protocol functions */ 163/* NetLabel protocol functions */
164int netlbl_cipsov4_genl_init(void); 164int netlbl_cipsov4_genl_init(void);
165 165
166/* Free the memory associated with a CIPSOv4 DOI definition */
167void netlbl_cipsov4_doi_free(struct rcu_head *entry);
168
166#endif 169#endif
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 3689956c3436..8220990ceb96 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -61,6 +61,7 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
61 struct netlbl_audit *audit_info); 61 struct netlbl_audit *audit_info);
62int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, 62int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
63 struct netlbl_audit *audit_info); 63 struct netlbl_audit *audit_info);
64int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
64int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info); 65int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
65struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain); 66struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
66int netlbl_domhsh_walk(u32 *skip_bkt, 67int netlbl_domhsh_walk(u32 *skip_bkt,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index c69e3e1f05c3..39793a1a93aa 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -30,6 +30,7 @@
30 30
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/types.h> 32#include <linux/types.h>
33#include <linux/audit.h>
33#include <net/ip.h> 34#include <net/ip.h>
34#include <net/netlabel.h> 35#include <net/netlabel.h>
35#include <net/cipso_ipv4.h> 36#include <net/cipso_ipv4.h>
@@ -38,10 +39,186 @@
38 39
39#include "netlabel_domainhash.h" 40#include "netlabel_domainhash.h"
40#include "netlabel_unlabeled.h" 41#include "netlabel_unlabeled.h"
42#include "netlabel_cipso_v4.h"
41#include "netlabel_user.h" 43#include "netlabel_user.h"
42#include "netlabel_mgmt.h" 44#include "netlabel_mgmt.h"
43 45
44/* 46/*
47 * Configuration Functions
48 */
49
50/**
51 * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping
52 * @domain: the domain mapping to remove
53 * @audit_info: NetLabel audit information
54 *
55 * Description:
56 * Removes a NetLabel/LSM domain mapping. A @domain value of NULL causes the
57 * default domain mapping to be removed. Returns zero on success, negative
58 * values on failure.
59 *
60 */
61int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info)
62{
63 return netlbl_domhsh_remove(domain, audit_info);
64}
65
66/**
67 * netlbl_cfg_unlbl_add_map - Add an unlabeled NetLabel/LSM domain mapping
68 * @domain: the domain mapping to add
69 * @audit_info: NetLabel audit information
70 *
71 * Description:
72 * Adds a new unlabeled NetLabel/LSM domain mapping. A @domain value of NULL
73 * causes a new default domain mapping to be added. Returns zero on success,
74 * negative values on failure.
75 *
76 */
77int netlbl_cfg_unlbl_add_map(const char *domain,
78 struct netlbl_audit *audit_info)
79{
80 int ret_val = -ENOMEM;
81 struct netlbl_dom_map *entry;
82
83 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
84 if (entry == NULL)
85 goto cfg_unlbl_add_map_failure;
86 if (domain != NULL) {
87 entry->domain = kstrdup(domain, GFP_ATOMIC);
88 if (entry->domain == NULL)
89 goto cfg_unlbl_add_map_failure;
90 }
91 entry->type = NETLBL_NLTYPE_UNLABELED;
92
93 ret_val = netlbl_domhsh_add(entry, audit_info);
94 if (ret_val != 0)
95 goto cfg_unlbl_add_map_failure;
96
97 return 0;
98
99cfg_unlbl_add_map_failure:
100 if (entry != NULL)
101 kfree(entry->domain);
102 kfree(entry);
103 return ret_val;
104}
105
106/**
107 * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
108 * @doi_def: the DOI definition
109 * @audit_info: NetLabel audit information
110 *
111 * Description:
112 * Add a new CIPSOv4 DOI definition to the NetLabel subsystem. Returns zero on
113 * success, negative values on failure.
114 *
115 */
116int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
117 struct netlbl_audit *audit_info)
118{
119 int ret_val;
120 const char *type_str;
121 struct audit_buffer *audit_buf;
122
123 ret_val = cipso_v4_doi_add(doi_def);
124
125 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
126 audit_info);
127 if (audit_buf != NULL) {
128 switch (doi_def->type) {
129 case CIPSO_V4_MAP_STD:
130 type_str = "std";
131 break;
132 case CIPSO_V4_MAP_PASS:
133 type_str = "pass";
134 break;
135 default:
136 type_str = "(unknown)";
137 }
138 audit_log_format(audit_buf,
139 " cipso_doi=%u cipso_type=%s res=%u",
140 doi_def->doi,
141 type_str,
142 ret_val == 0 ? 1 : 0);
143 audit_log_end(audit_buf);
144 }
145
146 return ret_val;
147}
148
149/**
150 * netlbl_cfg_cipsov4_add_map - Add a new CIPSOv4 DOI definition and mapping
151 * @doi_def: the DOI definition
152 * @domain: the domain mapping to add
153 * @audit_info: NetLabel audit information
154 *
155 * Description:
156 * Add a new CIPSOv4 DOI definition and NetLabel/LSM domain mapping for this
157 * new DOI definition to the NetLabel subsystem. A @domain value of NULL adds
158 * a new default domain mapping. Returns zero on success, negative values on
159 * failure.
160 *
161 */
162int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
163 const char *domain,
164 struct netlbl_audit *audit_info)
165{
166 int ret_val = -ENOMEM;
167 struct netlbl_dom_map *entry;
168
169 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
170 if (entry == NULL)
171 goto cfg_cipsov4_add_map_failure;
172 if (domain != NULL) {
173 entry->domain = kstrdup(domain, GFP_ATOMIC);
174 if (entry->domain == NULL)
175 goto cfg_cipsov4_add_map_failure;
176 }
177 entry->type = NETLBL_NLTYPE_CIPSOV4;
178 entry->type_def.cipsov4 = doi_def;
179
 180 /* Grab an RCU read lock here so nothing happens to the doi_def variable
181 * between adding it to the CIPSOv4 protocol engine and adding a
182 * domain mapping for it. */
183
184 rcu_read_lock();
185 ret_val = netlbl_cfg_cipsov4_add(doi_def, audit_info);
186 if (ret_val != 0)
187 goto cfg_cipsov4_add_map_failure_unlock;
188 ret_val = netlbl_domhsh_add(entry, audit_info);
189 if (ret_val != 0)
190 goto cfg_cipsov4_add_map_failure_remove_doi;
191 rcu_read_unlock();
192
193 return 0;
194
195cfg_cipsov4_add_map_failure_remove_doi:
196 cipso_v4_doi_remove(doi_def->doi, audit_info, netlbl_cipsov4_doi_free);
197cfg_cipsov4_add_map_failure_unlock:
198 rcu_read_unlock();
199cfg_cipsov4_add_map_failure:
200 if (entry != NULL)
201 kfree(entry->domain);
202 kfree(entry);
203 return ret_val;
204}
205
206/**
 207 * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition
208 * @doi: the CIPSO DOI value
209 * @audit_info: NetLabel audit information
210 *
211 * Description:
212 * Removes an existing CIPSOv4 DOI definition from the NetLabel subsystem.
213 * Returns zero on success, negative values on failure.
214 *
215 */
216int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
217{
218 return cipso_v4_doi_remove(doi, audit_info, netlbl_cipsov4_doi_free);
219}
220
221/*
45 * Security Attribute Functions 222 * Security Attribute Functions
46 */ 223 */
47 224
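The block above turns what used to be netlink-only operations into an in-kernel configuration API (the netlbl_cfg_* family), which is what lets cipso_ipv4.c call netlbl_cfg_map_del() in the hunk further up instead of reaching into the domain hash directly. A hedged usage sketch for an in-kernel caller; the domain string is invented, and a real caller must populate struct netlbl_audit from its own audit context:

        struct netlbl_audit audit_info; /* filled in by the caller */
        int err;

        /* pass traffic for one domain as unlabeled... */
        err = netlbl_cfg_unlbl_add_map("example_domain", &audit_info);
        if (err)
                return err;

        /* ...and tear the mapping down again later */
        err = netlbl_cfg_map_del("example_domain", &audit_info);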
diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 6562f868e82f..1a47f5d1be17 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -340,7 +340,7 @@ EXPORT_SYMBOL(rfkill_allocate);
340 * rfkill_free - Mark rfkill structure for deletion 340 * rfkill_free - Mark rfkill structure for deletion
341 * @rfkill: rfkill structure to be destroyed 341 * @rfkill: rfkill structure to be destroyed
342 * 342 *
343 * Decrements reference count of rfkill structure so it is destoryed. 343 * Decrements reference count of rfkill structure so it is destroyed.
344 * Note that rfkill_free() should _not_ be called after rfkill_unregister(). 344 * Note that rfkill_free() should _not_ be called after rfkill_unregister().
345 */ 345 */
346void rfkill_free(struct rfkill *rfkill) 346void rfkill_free(struct rfkill *rfkill)
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5a7f6a3060fc..971b867e0484 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -19,6 +19,7 @@
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/ip.h> 20#include <linux/ip.h>
21#include <linux/ipv6.h> 21#include <linux/ipv6.h>
22#include <linux/if_vlan.h>
22 23
23#include <net/pkt_cls.h> 24#include <net/pkt_cls.h>
24#include <net/ip.h> 25#include <net/ip.h>
@@ -270,6 +271,15 @@ static u32 flow_get_skgid(const struct sk_buff *skb)
270 return 0; 271 return 0;
271} 272}
272 273
274static u32 flow_get_vlan_tag(const struct sk_buff *skb)
275{
276 u16 uninitialized_var(tag);
277
278 if (vlan_get_tag(skb, &tag) < 0)
279 return 0;
280 return tag & VLAN_VID_MASK;
281}
282
273static u32 flow_key_get(const struct sk_buff *skb, int key) 283static u32 flow_key_get(const struct sk_buff *skb, int key)
274{ 284{
275 switch (key) { 285 switch (key) {
@@ -305,6 +315,8 @@ static u32 flow_key_get(const struct sk_buff *skb, int key)
305 return flow_get_skuid(skb); 315 return flow_get_skuid(skb);
306 case FLOW_KEY_SKGID: 316 case FLOW_KEY_SKGID:
307 return flow_get_skgid(skb); 317 return flow_get_skgid(skb);
318 case FLOW_KEY_VLAN_TAG:
319 return flow_get_vlan_tag(skb);
308 default: 320 default:
309 WARN_ON(1); 321 WARN_ON(1);
310 return 0; 322 return 0;
@@ -402,12 +414,13 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,
402 414
403 if (tb[TCA_FLOW_KEYS]) { 415 if (tb[TCA_FLOW_KEYS]) {
404 keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); 416 keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
405 if (fls(keymask) - 1 > FLOW_KEY_MAX)
406 return -EOPNOTSUPP;
407 417
408 nkeys = hweight32(keymask); 418 nkeys = hweight32(keymask);
409 if (nkeys == 0) 419 if (nkeys == 0)
410 return -EINVAL; 420 return -EINVAL;
421
422 if (fls(keymask) - 1 > FLOW_KEY_MAX)
423 return -EOPNOTSUPP;
411 } 424 }
412 425
413 err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); 426 err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
@@ -594,11 +607,11 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,
594 607
595 if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) 608 if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
596 goto nla_put_failure; 609 goto nla_put_failure;
597 610#ifdef CONFIG_NET_EMATCH
598 if (f->ematches.hdr.nmatches && 611 if (f->ematches.hdr.nmatches &&
599 tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) 612 tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
600 goto nla_put_failure; 613 goto nla_put_failure;
601 614#endif
602 nla_nest_end(skb, nest); 615 nla_nest_end(skb, nest);
603 616
604 if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) 617 if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
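flow_get_vlan_tag() above masks the 16-bit VLAN TCI down to the 12-bit VLAN ID with VLAN_VID_MASK, discarding the priority (PCP) and DEI bits. A self-contained illustration of that masking; 0x0fff matches the VLAN_VID_MASK definition in linux/if_vlan.h:

#include <stdio.h>
#include <stdint.h>

#define VLAN_VID_MASK 0x0fff

int main(void)
{
        uint16_t tci = 0xa064;  /* PCP=5, DEI=0, VID=0x064 */

        printf("vid = 0x%03x\n", (unsigned)(tci & VLAN_VID_MASK)); /* 0x064 */
        return 0;
}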
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index a1e5619b1876..2a7e648fbcf4 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -65,6 +65,7 @@
65#include <linux/string.h> 65#include <linux/string.h>
66#include <linux/skbuff.h> 66#include <linux/skbuff.h>
67#include <linux/random.h> 67#include <linux/random.h>
68#include <linux/if_vlan.h>
68#include <linux/tc_ematch/tc_em_meta.h> 69#include <linux/tc_ematch/tc_em_meta.h>
69#include <net/dst.h> 70#include <net/dst.h>
70#include <net/route.h> 71#include <net/route.h>
@@ -170,6 +171,21 @@ META_COLLECTOR(var_dev)
170} 171}
171 172
172/************************************************************************** 173/**************************************************************************
174 * vlan tag
175 **************************************************************************/
176
177META_COLLECTOR(int_vlan_tag)
178{
179 unsigned short uninitialized_var(tag);
180 if (vlan_get_tag(skb, &tag) < 0)
181 *err = -1;
182 else
183 dst->value = tag;
184}
185
186
187
188/**************************************************************************
173 * skb attributes 189 * skb attributes
174 **************************************************************************/ 190 **************************************************************************/
175 191
@@ -520,6 +536,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
520 [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo), 536 [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
521 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), 537 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
522 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), 538 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
539 [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
523 } 540 }
524}; 541};
525 542
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 97e6ebd14500..ae367c82e512 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -420,15 +420,15 @@ struct sctp_shared_key *sctp_auth_get_shkey(
420 const struct sctp_association *asoc, 420 const struct sctp_association *asoc,
421 __u16 key_id) 421 __u16 key_id)
422{ 422{
423 struct sctp_shared_key *key = NULL; 423 struct sctp_shared_key *key;
424 424
425 /* First search associations set of endpoint pair shared keys */ 425 /* First search associations set of endpoint pair shared keys */
426 key_for_each(key, &asoc->endpoint_shared_keys) { 426 key_for_each(key, &asoc->endpoint_shared_keys) {
427 if (key->key_id == key_id) 427 if (key->key_id == key_id)
428 break; 428 return key;
429 } 429 }
430 430
431 return key; 431 return NULL;
432} 432}
433 433
434/* 434/*
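The sctp_auth_get_shkey() rewrite fixes a classic list-macro pitfall: key_for_each() is a list_for_each_entry()-style iterator, so when the loop runs to completion the cursor is not NULL but the bogus container_of() computed for the list head, and the old "break, then return key" shape could hand back a wild pointer whenever no key_id matched. Returning from inside the loop, with NULL after it, is the idiomatic shape the hunk adopts:

        key_for_each(key, &asoc->endpoint_shared_keys) {
                if (key->key_id == key_id)
                        return key;  /* cursor is only valid in here */
        }
        return NULL;                 /* exhausted: cursor is not an entry */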
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 61cbd5a8dd0c..f98658782d4f 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -537,7 +537,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
537 * 537 *
538 * This means that if we only want to abort associations 538 * This means that if we only want to abort associations
539 * in an authenticated way (i.e AUTH+ABORT), then we 539 * in an authenticated way (i.e AUTH+ABORT), then we
 540 * can't destory this association just becuase the packet 540 * can't destroy this association just because the packet
541 * was malformed. 541 * was malformed.
542 */ 542 */
543 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) 543 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
@@ -3865,6 +3865,10 @@ sctp_disposition_t sctp_sf_eat_auth(const struct sctp_endpoint *ep,
3865 struct sctp_chunk *err_chunk; 3865 struct sctp_chunk *err_chunk;
3866 sctp_ierror_t error; 3866 sctp_ierror_t error;
3867 3867
 3868 /* Make sure that the peer is AUTH capable */
3869 if (!asoc->peer.auth_capable)
3870 return sctp_sf_unk_chunk(ep, asoc, type, arg, commands);
3871
3868 if (!sctp_vtag_verify(chunk, asoc)) { 3872 if (!sctp_vtag_verify(chunk, asoc)) {
3869 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, 3873 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
3870 SCTP_NULL()); 3874 SCTP_NULL());
@@ -4130,7 +4134,7 @@ static sctp_disposition_t sctp_sf_abort_violation(
4130 * 4134 *
4131 * This means that if we only want to abort associations 4135 * This means that if we only want to abort associations
4132 * in an authenticated way (i.e AUTH+ABORT), then we 4136 * in an authenticated way (i.e AUTH+ABORT), then we
 4133 * can't destory this association just becuase the packet 4137 * can't destroy this association just because the packet
4134 * was malformed. 4138 * was malformed.
4135 */ 4139 */
4136 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) 4140 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 5c69a725e530..92e1dbe50947 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
11 auth.o auth_null.o auth_unix.o \ 11 auth.o auth_null.o auth_unix.o \
12 svc.o svcsock.o svcauth.o svcauth_unix.o \ 12 svc.o svcsock.o svcauth.o svcauth_unix.o \
13 rpcb_clnt.o timer.o xdr.o \ 13 rpcb_clnt.o timer.o xdr.o \
14 sunrpc_syms.o cache.o rpc_pipe.o 14 sunrpc_syms.o cache.o rpc_pipe.o \
15 svc_xprt.o
15sunrpc-$(CONFIG_PROC_FS) += stats.o 16sunrpc-$(CONFIG_PROC_FS) += stats.o
16sunrpc-$(CONFIG_SYSCTL) += sysctl.o 17sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 73940df6c460..481f984e9a22 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -224,38 +224,34 @@ static int rsi_parse(struct cache_detail *cd,
224 224
225 /* major/minor */ 225 /* major/minor */
226 len = qword_get(&mesg, buf, mlen); 226 len = qword_get(&mesg, buf, mlen);
227 if (len < 0) 227 if (len <= 0)
228 goto out; 228 goto out;
229 if (len == 0) { 229 rsii.major_status = simple_strtoul(buf, &ep, 10);
230 if (*ep)
231 goto out;
232 len = qword_get(&mesg, buf, mlen);
233 if (len <= 0)
234 goto out;
235 rsii.minor_status = simple_strtoul(buf, &ep, 10);
236 if (*ep)
230 goto out; 237 goto out;
231 } else {
232 rsii.major_status = simple_strtoul(buf, &ep, 10);
233 if (*ep)
234 goto out;
235 len = qword_get(&mesg, buf, mlen);
236 if (len <= 0)
237 goto out;
238 rsii.minor_status = simple_strtoul(buf, &ep, 10);
239 if (*ep)
240 goto out;
241 238
242 /* out_handle */ 239 /* out_handle */
243 len = qword_get(&mesg, buf, mlen); 240 len = qword_get(&mesg, buf, mlen);
244 if (len < 0) 241 if (len < 0)
245 goto out; 242 goto out;
246 status = -ENOMEM; 243 status = -ENOMEM;
247 if (dup_to_netobj(&rsii.out_handle, buf, len)) 244 if (dup_to_netobj(&rsii.out_handle, buf, len))
248 goto out; 245 goto out;
249 246
250 /* out_token */ 247 /* out_token */
251 len = qword_get(&mesg, buf, mlen); 248 len = qword_get(&mesg, buf, mlen);
252 status = -EINVAL; 249 status = -EINVAL;
253 if (len < 0) 250 if (len < 0)
254 goto out; 251 goto out;
255 status = -ENOMEM; 252 status = -ENOMEM;
256 if (dup_to_netobj(&rsii.out_token, buf, len)) 253 if (dup_to_netobj(&rsii.out_token, buf, len))
257 goto out; 254 goto out;
258 }
259 rsii.h.expiry_time = expiry; 255 rsii.h.expiry_time = expiry;
260 rsip = rsi_update(&rsii, rsip); 256 rsip = rsi_update(&rsii, rsip);
261 status = 0; 257 status = 0;
@@ -975,6 +971,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
975 struct kvec *resv = &rqstp->rq_res.head[0]; 971 struct kvec *resv = &rqstp->rq_res.head[0];
976 struct xdr_netobj tmpobj; 972 struct xdr_netobj tmpobj;
977 struct rsi *rsip, rsikey; 973 struct rsi *rsip, rsikey;
974 int ret;
978 975
979 /* Read the verifier; should be NULL: */ 976 /* Read the verifier; should be NULL: */
980 *authp = rpc_autherr_badverf; 977 *authp = rpc_autherr_badverf;
@@ -1014,23 +1011,27 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
1014 /* No upcall result: */ 1011 /* No upcall result: */
1015 return SVC_DROP; 1012 return SVC_DROP;
1016 case 0: 1013 case 0:
1014 ret = SVC_DROP;
1017 /* Got an answer to the upcall; use it: */ 1015 /* Got an answer to the upcall; use it: */
1018 if (gss_write_init_verf(rqstp, rsip)) 1016 if (gss_write_init_verf(rqstp, rsip))
1019 return SVC_DROP; 1017 goto out;
1020 if (resv->iov_len + 4 > PAGE_SIZE) 1018 if (resv->iov_len + 4 > PAGE_SIZE)
1021 return SVC_DROP; 1019 goto out;
1022 svc_putnl(resv, RPC_SUCCESS); 1020 svc_putnl(resv, RPC_SUCCESS);
1023 if (svc_safe_putnetobj(resv, &rsip->out_handle)) 1021 if (svc_safe_putnetobj(resv, &rsip->out_handle))
1024 return SVC_DROP; 1022 goto out;
1025 if (resv->iov_len + 3 * 4 > PAGE_SIZE) 1023 if (resv->iov_len + 3 * 4 > PAGE_SIZE)
1026 return SVC_DROP; 1024 goto out;
1027 svc_putnl(resv, rsip->major_status); 1025 svc_putnl(resv, rsip->major_status);
1028 svc_putnl(resv, rsip->minor_status); 1026 svc_putnl(resv, rsip->minor_status);
1029 svc_putnl(resv, GSS_SEQ_WIN); 1027 svc_putnl(resv, GSS_SEQ_WIN);
1030 if (svc_safe_putnetobj(resv, &rsip->out_token)) 1028 if (svc_safe_putnetobj(resv, &rsip->out_token))
1031 return SVC_DROP; 1029 goto out;
1032 } 1030 }
1033 return SVC_COMPLETE; 1031 ret = SVC_COMPLETE;
1032out:
1033 cache_put(&rsip->h, &rsi_cache);
1034 return ret;
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1125,6 +1126,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
1125 case RPC_GSS_PROC_DESTROY: 1126 case RPC_GSS_PROC_DESTROY:
1126 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) 1127 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
1127 goto auth_err; 1128 goto auth_err;
1129 rsci->h.expiry_time = get_seconds();
1128 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1130 set_bit(CACHE_NEGATIVE, &rsci->h.flags);
1129 if (resv->iov_len + 4 > PAGE_SIZE) 1131 if (resv->iov_len + 4 > PAGE_SIZE)
1130 goto drop; 1132 goto drop;
@@ -1386,19 +1388,26 @@ int
1386gss_svc_init(void) 1388gss_svc_init(void)
1387{ 1389{
1388 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); 1390 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
1389 if (rv == 0) { 1391 if (rv)
1390 cache_register(&rsc_cache); 1392 return rv;
1391 cache_register(&rsi_cache); 1393 rv = cache_register(&rsc_cache);
1392 } 1394 if (rv)
1395 goto out1;
1396 rv = cache_register(&rsi_cache);
1397 if (rv)
1398 goto out2;
1399 return 0;
1400out2:
1401 cache_unregister(&rsc_cache);
1402out1:
1403 svc_auth_unregister(RPC_AUTH_GSS);
1393 return rv; 1404 return rv;
1394} 1405}
1395 1406
1396void 1407void
1397gss_svc_shutdown(void) 1408gss_svc_shutdown(void)
1398{ 1409{
1399 if (cache_unregister(&rsc_cache)) 1410 cache_unregister(&rsc_cache);
1400 printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); 1411 cache_unregister(&rsi_cache);
1401 if (cache_unregister(&rsi_cache))
1402 printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
1403 svc_auth_unregister(RPC_AUTH_GSS); 1412 svc_auth_unregister(RPC_AUTH_GSS);
1404} 1413}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 73f053d0cc7a..636c8e04e0be 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -245,6 +245,7 @@ int cache_check(struct cache_detail *detail,
245 cache_put(h, detail); 245 cache_put(h, detail);
246 return rv; 246 return rv;
247} 247}
248EXPORT_SYMBOL(cache_check);
248 249
249/* 250/*
250 * caches need to be periodically cleaned. 251 * caches need to be periodically cleaned.
@@ -290,44 +291,78 @@ static const struct file_operations cache_flush_operations;
290static void do_cache_clean(struct work_struct *work); 291static void do_cache_clean(struct work_struct *work);
291static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); 292static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
292 293
293void cache_register(struct cache_detail *cd) 294static void remove_cache_proc_entries(struct cache_detail *cd)
294{ 295{
295 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); 296 if (cd->proc_ent == NULL)
296 if (cd->proc_ent) { 297 return;
297 struct proc_dir_entry *p; 298 if (cd->flush_ent)
298 cd->proc_ent->owner = cd->owner; 299 remove_proc_entry("flush", cd->proc_ent);
299 cd->channel_ent = cd->content_ent = NULL; 300 if (cd->channel_ent)
301 remove_proc_entry("channel", cd->proc_ent);
302 if (cd->content_ent)
303 remove_proc_entry("content", cd->proc_ent);
304 cd->proc_ent = NULL;
305 remove_proc_entry(cd->name, proc_net_rpc);
306}
300 307
301 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, 308#ifdef CONFIG_PROC_FS
302 cd->proc_ent); 309static int create_cache_proc_entries(struct cache_detail *cd)
303 cd->flush_ent = p; 310{
304 if (p) { 311 struct proc_dir_entry *p;
305 p->proc_fops = &cache_flush_operations;
306 p->owner = cd->owner;
307 p->data = cd;
308 }
309 312
310 if (cd->cache_request || cd->cache_parse) { 313 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
311 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, 314 if (cd->proc_ent == NULL)
312 cd->proc_ent); 315 goto out_nomem;
313 cd->channel_ent = p; 316 cd->proc_ent->owner = cd->owner;
314 if (p) { 317 cd->channel_ent = cd->content_ent = NULL;
315 p->proc_fops = &cache_file_operations; 318
316 p->owner = cd->owner; 319 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent);
317 p->data = cd; 320 cd->flush_ent = p;
318 } 321 if (p == NULL)
319 } 322 goto out_nomem;
320 if (cd->cache_show) { 323 p->proc_fops = &cache_flush_operations;
321 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, 324 p->owner = cd->owner;
322 cd->proc_ent); 325 p->data = cd;
323 cd->content_ent = p; 326
324 if (p) { 327 if (cd->cache_request || cd->cache_parse) {
325 p->proc_fops = &content_file_operations; 328 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
326 p->owner = cd->owner; 329 cd->proc_ent);
327 p->data = cd; 330 cd->channel_ent = p;
328 } 331 if (p == NULL)
329 } 332 goto out_nomem;
333 p->proc_fops = &cache_file_operations;
334 p->owner = cd->owner;
335 p->data = cd;
330 } 336 }
337 if (cd->cache_show) {
338 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
339 cd->proc_ent);
340 cd->content_ent = p;
341 if (p == NULL)
342 goto out_nomem;
343 p->proc_fops = &content_file_operations;
344 p->owner = cd->owner;
345 p->data = cd;
346 }
347 return 0;
348out_nomem:
349 remove_cache_proc_entries(cd);
350 return -ENOMEM;
351}
352#else /* CONFIG_PROC_FS */
353static int create_cache_proc_entries(struct cache_detail *cd)
354{
355 return 0;
356}
357#endif
358
359int cache_register(struct cache_detail *cd)
360{
361 int ret;
362
363 ret = create_cache_proc_entries(cd);
364 if (ret)
365 return ret;
331 rwlock_init(&cd->hash_lock); 366 rwlock_init(&cd->hash_lock);
332 INIT_LIST_HEAD(&cd->queue); 367 INIT_LIST_HEAD(&cd->queue);
333 spin_lock(&cache_list_lock); 368 spin_lock(&cache_list_lock);
@@ -341,9 +376,11 @@ void cache_register(struct cache_detail *cd)
341 376
342 /* start the cleaning process */ 377 /* start the cleaning process */
343 schedule_delayed_work(&cache_cleaner, 0); 378 schedule_delayed_work(&cache_cleaner, 0);
379 return 0;
344} 380}
381EXPORT_SYMBOL(cache_register);
345 382
346int cache_unregister(struct cache_detail *cd) 383void cache_unregister(struct cache_detail *cd)
347{ 384{
348 cache_purge(cd); 385 cache_purge(cd);
349 spin_lock(&cache_list_lock); 386 spin_lock(&cache_list_lock);
@@ -351,30 +388,23 @@ int cache_unregister(struct cache_detail *cd)
351 if (cd->entries || atomic_read(&cd->inuse)) { 388 if (cd->entries || atomic_read(&cd->inuse)) {
352 write_unlock(&cd->hash_lock); 389 write_unlock(&cd->hash_lock);
353 spin_unlock(&cache_list_lock); 390 spin_unlock(&cache_list_lock);
354 return -EBUSY; 391 goto out;
355 } 392 }
356 if (current_detail == cd) 393 if (current_detail == cd)
357 current_detail = NULL; 394 current_detail = NULL;
358 list_del_init(&cd->others); 395 list_del_init(&cd->others);
359 write_unlock(&cd->hash_lock); 396 write_unlock(&cd->hash_lock);
360 spin_unlock(&cache_list_lock); 397 spin_unlock(&cache_list_lock);
361 if (cd->proc_ent) { 398 remove_cache_proc_entries(cd);
362 if (cd->flush_ent)
363 remove_proc_entry("flush", cd->proc_ent);
364 if (cd->channel_ent)
365 remove_proc_entry("channel", cd->proc_ent);
366 if (cd->content_ent)
367 remove_proc_entry("content", cd->proc_ent);
368
369 cd->proc_ent = NULL;
370 remove_proc_entry(cd->name, proc_net_rpc);
371 }
372 if (list_empty(&cache_list)) { 399 if (list_empty(&cache_list)) {
373 /* module must be being unloaded so it's safe to kill the worker */ 400 /* module must be being unloaded so it's safe to kill the worker */
374 cancel_delayed_work_sync(&cache_cleaner); 401 cancel_delayed_work_sync(&cache_cleaner);
375 } 402 }
376 return 0; 403 return;
404out:
405 printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
377} 406}
407EXPORT_SYMBOL(cache_unregister);
378 408
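With cache_register() now returning an error and cache_unregister() returning void, callers invert their error handling. A minimal sketch of the updated caller pattern, assuming a hypothetical my_cache detail (hash table and cache_request/cache_parse hooks elided):

/* Hedged sketch: 'my_cache' is a hypothetical cache_detail instance. */
static struct cache_detail my_cache = {
	.name = "my_cache",
};

static int __init my_init(void)
{
	/* can now fail, e.g. -ENOMEM from create_cache_proc_entries() */
	return cache_register(&my_cache);
}

static void __exit my_exit(void)
{
	/* no longer returns a status; failure is logged internally */
	cache_unregister(&my_cache);
}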
379/* clean cache tries to find something to clean 409/* clean cache tries to find something to clean
380 * and cleans it. 410 * and cleans it.
@@ -489,6 +519,7 @@ void cache_flush(void)
489 while (cache_clean() != -1) 519 while (cache_clean() != -1)
490 cond_resched(); 520 cond_resched();
491} 521}
522EXPORT_SYMBOL(cache_flush);
492 523
493void cache_purge(struct cache_detail *detail) 524void cache_purge(struct cache_detail *detail)
494{ 525{
@@ -497,7 +528,7 @@ void cache_purge(struct cache_detail *detail)
497 cache_flush(); 528 cache_flush();
498 detail->flush_time = 1; 529 detail->flush_time = 1;
499} 530}
500 531EXPORT_SYMBOL(cache_purge);
501 532
502 533
503/* 534/*
@@ -634,13 +665,13 @@ void cache_clean_deferred(void *owner)
634/* 665/*
635 * communicate with user-space 666 * communicate with user-space
636 * 667 *
637 * We have a magic /proc file - /proc/sunrpc/cache 668 * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
638 * On read, you get a full request, or block 669 * On read, you get a full request, or block.
639 * On write, an update request is processed 670 * On write, an update request is processed.
640 * Poll works if anything to read, and always allows write 671 * Poll works if anything to read, and always allows write.
641 * 672 *
642 * Implemented by linked list of requests. Each open file has 673 * Implemented by linked list of requests. Each open file has
643 * a ->private that also exists in this list. New request are added 674 * a ->private that also exists in this list. New requests are added
644 * to the end and may wakeup and preceding readers. 675 * to the end and may wakeup and preceding readers.
645 * New readers are added to the head. If, on read, an item is found with 676 * New readers are added to the head. If, on read, an item is found with
646 * CACHE_UPCALLING clear, we free it from the list. 677 * CACHE_UPCALLING clear, we free it from the list.
@@ -963,6 +994,7 @@ void qword_add(char **bpp, int *lp, char *str)
963 *bpp = bp; 994 *bpp = bp;
964 *lp = len; 995 *lp = len;
965} 996}
997EXPORT_SYMBOL(qword_add);
966 998
967void qword_addhex(char **bpp, int *lp, char *buf, int blen) 999void qword_addhex(char **bpp, int *lp, char *buf, int blen)
968{ 1000{
@@ -991,6 +1023,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
991 *bpp = bp; 1023 *bpp = bp;
992 *lp = len; 1024 *lp = len;
993} 1025}
1026EXPORT_SYMBOL(qword_addhex);
994 1027
995static void warn_no_listener(struct cache_detail *detail) 1028static void warn_no_listener(struct cache_detail *detail)
996{ 1029{
@@ -1113,6 +1146,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
1113 *dest = '\0'; 1146 *dest = '\0';
1114 return len; 1147 return len;
1115} 1148}
1149EXPORT_SYMBOL(qword_get);
1116 1150
1117 1151
1118/* 1152/*
@@ -1244,18 +1278,18 @@ static ssize_t read_flush(struct file *file, char __user *buf,
1244 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; 1278 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data;
1245 char tbuf[20]; 1279 char tbuf[20];
1246 unsigned long p = *ppos; 1280 unsigned long p = *ppos;
1247 int len; 1281 size_t len;
1248 1282
1249 sprintf(tbuf, "%lu\n", cd->flush_time); 1283 sprintf(tbuf, "%lu\n", cd->flush_time);
1250 len = strlen(tbuf); 1284 len = strlen(tbuf);
1251 if (p >= len) 1285 if (p >= len)
1252 return 0; 1286 return 0;
1253 len -= p; 1287 len -= p;
1254 if (len > count) len = count; 1288 if (len > count)
1289 len = count;
1255 if (copy_to_user(buf, (void*)(tbuf+p), len)) 1290 if (copy_to_user(buf, (void*)(tbuf+p), len))
1256 len = -EFAULT; 1291 return -EFAULT;
1257 else 1292 *ppos += len;
1258 *ppos += len;
1259 return len; 1293 return len;
1260} 1294}
1261 1295
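A side note on why the error path changed shape along with the int-to-size_t switch above: a negative errno stored in an unsigned size_t becomes a huge positive count, so the function must return -EFAULT directly rather than smuggling it through len. A tiny userspace illustration of the pitfall (not kernel code):

#include <stdio.h>

int main(void)
{
	size_t len = -14;	/* -EFAULT stuffed into an unsigned type */

	/* prints 18446744073709551602 on a 64-bit box, not -14 */
	printf("%zu\n", len);
	return 0;
}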
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 74df2d358e61..5a16875f5ac8 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -33,7 +33,7 @@ struct proc_dir_entry *proc_net_rpc = NULL;
33static int rpc_proc_show(struct seq_file *seq, void *v) { 33static int rpc_proc_show(struct seq_file *seq, void *v) {
34 const struct rpc_stat *statp = seq->private; 34 const struct rpc_stat *statp = seq->private;
35 const struct rpc_program *prog = statp->program; 35 const struct rpc_program *prog = statp->program;
36 int i, j; 36 unsigned int i, j;
37 37
38 seq_printf(seq, 38 seq_printf(seq,
39 "net %u %u %u %u\n", 39 "net %u %u %u %u\n",
@@ -81,7 +81,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
81 const struct svc_program *prog = statp->program; 81 const struct svc_program *prog = statp->program;
82 const struct svc_procedure *proc; 82 const struct svc_procedure *proc;
83 const struct svc_version *vers; 83 const struct svc_version *vers;
84 int i, j; 84 unsigned int i, j;
85 85
86 seq_printf(seq, 86 seq_printf(seq,
87 "net %u %u %u %u\n", 87 "net %u %u %u %u\n",
@@ -106,6 +106,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
106 seq_putc(seq, '\n'); 106 seq_putc(seq, '\n');
107 } 107 }
108} 108}
109EXPORT_SYMBOL(svc_seq_show);
109 110
110/** 111/**
111 * rpc_alloc_iostats - allocate an rpc_iostats structure 112 * rpc_alloc_iostats - allocate an rpc_iostats structure
@@ -255,12 +256,14 @@ svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
255{ 256{
256 return do_register(statp->program->pg_name, statp, fops); 257 return do_register(statp->program->pg_name, statp, fops);
257} 258}
259EXPORT_SYMBOL(svc_proc_register);
258 260
259void 261void
260svc_proc_unregister(const char *name) 262svc_proc_unregister(const char *name)
261{ 263{
262 remove_proc_entry(name, proc_net_rpc); 264 remove_proc_entry(name, proc_net_rpc);
263} 265}
266EXPORT_SYMBOL(svc_proc_unregister);
264 267
265void 268void
266rpc_proc_init(void) 269rpc_proc_init(void)
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1a7e309d008b..843629f55763 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,48 +22,6 @@
22#include <linux/sunrpc/rpc_pipe_fs.h> 22#include <linux/sunrpc/rpc_pipe_fs.h>
23#include <linux/sunrpc/xprtsock.h> 23#include <linux/sunrpc/xprtsock.h>
24 24
25/* RPC server stuff */
26EXPORT_SYMBOL(svc_create);
27EXPORT_SYMBOL(svc_create_thread);
28EXPORT_SYMBOL(svc_create_pooled);
29EXPORT_SYMBOL(svc_set_num_threads);
30EXPORT_SYMBOL(svc_exit_thread);
31EXPORT_SYMBOL(svc_destroy);
32EXPORT_SYMBOL(svc_drop);
33EXPORT_SYMBOL(svc_process);
34EXPORT_SYMBOL(svc_recv);
35EXPORT_SYMBOL(svc_wake_up);
36EXPORT_SYMBOL(svc_makesock);
37EXPORT_SYMBOL(svc_reserve);
38EXPORT_SYMBOL(svc_auth_register);
39EXPORT_SYMBOL(auth_domain_lookup);
40EXPORT_SYMBOL(svc_authenticate);
41EXPORT_SYMBOL(svc_set_client);
42
43/* RPC statistics */
44#ifdef CONFIG_PROC_FS
45EXPORT_SYMBOL(svc_proc_register);
46EXPORT_SYMBOL(svc_proc_unregister);
47EXPORT_SYMBOL(svc_seq_show);
48#endif
49
50/* caching... */
51EXPORT_SYMBOL(auth_domain_find);
52EXPORT_SYMBOL(auth_domain_put);
53EXPORT_SYMBOL(auth_unix_add_addr);
54EXPORT_SYMBOL(auth_unix_forget_old);
55EXPORT_SYMBOL(auth_unix_lookup);
56EXPORT_SYMBOL(cache_check);
57EXPORT_SYMBOL(cache_flush);
58EXPORT_SYMBOL(cache_purge);
59EXPORT_SYMBOL(cache_register);
60EXPORT_SYMBOL(cache_unregister);
61EXPORT_SYMBOL(qword_add);
62EXPORT_SYMBOL(qword_addhex);
63EXPORT_SYMBOL(qword_get);
64EXPORT_SYMBOL(svcauth_unix_purge);
65EXPORT_SYMBOL(unix_domain_find);
66
67extern struct cache_detail ip_map_cache, unix_gid_cache; 25extern struct cache_detail ip_map_cache, unix_gid_cache;
68 26
69static int __init 27static int __init
@@ -85,7 +43,8 @@ init_sunrpc(void)
85#endif 43#endif
86 cache_register(&ip_map_cache); 44 cache_register(&ip_map_cache);
87 cache_register(&unix_gid_cache); 45 cache_register(&unix_gid_cache);
88 init_socket_xprt(); 46 svc_init_xprt_sock(); /* svc sock transport */
47 init_socket_xprt(); /* clnt sock transport */
89 rpcauth_init_module(); 48 rpcauth_init_module();
90out: 49out:
91 return err; 50 return err;
@@ -96,12 +55,11 @@ cleanup_sunrpc(void)
96{ 55{
97 rpcauth_remove_module(); 56 rpcauth_remove_module();
98 cleanup_socket_xprt(); 57 cleanup_socket_xprt();
58 svc_cleanup_xprt_sock();
99 unregister_rpc_pipefs(); 59 unregister_rpc_pipefs();
100 rpc_destroy_mempool(); 60 rpc_destroy_mempool();
101 if (cache_unregister(&ip_map_cache)) 61 cache_unregister(&ip_map_cache);
102 printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); 62 cache_unregister(&unix_gid_cache);
103 if (cache_unregister(&unix_gid_cache))
104 printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n");
105#ifdef RPC_DEBUG 63#ifdef RPC_DEBUG
106 rpc_unregister_sysctl(); 64 rpc_unregister_sysctl();
107#endif 65#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4ad5fbbb18b4..a290e1523297 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -364,7 +364,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
364 void (*shutdown)(struct svc_serv *serv)) 364 void (*shutdown)(struct svc_serv *serv))
365{ 365{
366 struct svc_serv *serv; 366 struct svc_serv *serv;
367 int vers; 367 unsigned int vers;
368 unsigned int xdrsize; 368 unsigned int xdrsize;
369 unsigned int i; 369 unsigned int i;
370 370
@@ -433,6 +433,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize,
433{ 433{
434 return __svc_create(prog, bufsize, /*npools*/1, shutdown); 434 return __svc_create(prog, bufsize, /*npools*/1, shutdown);
435} 435}
436EXPORT_SYMBOL(svc_create);
436 437
437struct svc_serv * 438struct svc_serv *
438svc_create_pooled(struct svc_program *prog, unsigned int bufsize, 439svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
@@ -452,6 +453,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
452 453
453 return serv; 454 return serv;
454} 455}
456EXPORT_SYMBOL(svc_create_pooled);
455 457
456/* 458/*
457 * Destroy an RPC service. Should be called with the BKL held 459 * Destroy an RPC service. Should be called with the BKL held
@@ -459,9 +461,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
459void 461void
460svc_destroy(struct svc_serv *serv) 462svc_destroy(struct svc_serv *serv)
461{ 463{
462 struct svc_sock *svsk;
463 struct svc_sock *tmp;
464
465 dprintk("svc: svc_destroy(%s, %d)\n", 464 dprintk("svc: svc_destroy(%s, %d)\n",
466 serv->sv_program->pg_name, 465 serv->sv_program->pg_name,
467 serv->sv_nrthreads); 466 serv->sv_nrthreads);
@@ -476,14 +475,12 @@ svc_destroy(struct svc_serv *serv)
476 475
477 del_timer_sync(&serv->sv_temptimer); 476 del_timer_sync(&serv->sv_temptimer);
478 477
479 list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) 478 svc_close_all(&serv->sv_tempsocks);
480 svc_force_close_socket(svsk);
481 479
482 if (serv->sv_shutdown) 480 if (serv->sv_shutdown)
483 serv->sv_shutdown(serv); 481 serv->sv_shutdown(serv);
484 482
485 list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) 483 svc_close_all(&serv->sv_permsocks);
486 svc_force_close_socket(svsk);
487 484
488 BUG_ON(!list_empty(&serv->sv_permsocks)); 485 BUG_ON(!list_empty(&serv->sv_permsocks));
489 BUG_ON(!list_empty(&serv->sv_tempsocks)); 486 BUG_ON(!list_empty(&serv->sv_tempsocks));
@@ -498,6 +495,7 @@ svc_destroy(struct svc_serv *serv)
498 kfree(serv->sv_pools); 495 kfree(serv->sv_pools);
499 kfree(serv); 496 kfree(serv);
500} 497}
498EXPORT_SYMBOL(svc_destroy);
501 499
502/* 500/*
503 * Allocate an RPC server's buffer space. 501 * Allocate an RPC server's buffer space.
@@ -536,31 +534,17 @@ svc_release_buffer(struct svc_rqst *rqstp)
536 put_page(rqstp->rq_pages[i]); 534 put_page(rqstp->rq_pages[i]);
537} 535}
538 536
539/*
540 * Create a thread in the given pool. Caller must hold BKL.
541 * On a NUMA or SMP machine, with a multi-pool serv, the thread
542 * will be restricted to run on the cpus belonging to the pool.
543 */
544static int
545__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
546 struct svc_pool *pool)
537struct svc_rqst *
538svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
547{ 539{
548 struct svc_rqst *rqstp; 540 struct svc_rqst *rqstp;
549 int error = -ENOMEM;
550 int have_oldmask = 0;
551 cpumask_t oldmask;
552 541
553 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); 542 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
554 if (!rqstp) 543 if (!rqstp)
555 goto out; 544 goto out_enomem;
556 545
557 init_waitqueue_head(&rqstp->rq_wait); 546 init_waitqueue_head(&rqstp->rq_wait);
558 547
559 if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
560 || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
561 || !svc_init_buffer(rqstp, serv->sv_max_mesg))
562 goto out_thread;
563
564 serv->sv_nrthreads++; 548 serv->sv_nrthreads++;
565 spin_lock_bh(&pool->sp_lock); 549 spin_lock_bh(&pool->sp_lock);
566 pool->sp_nrthreads++; 550 pool->sp_nrthreads++;
@@ -569,6 +553,45 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
569 rqstp->rq_server = serv; 553 rqstp->rq_server = serv;
570 rqstp->rq_pool = pool; 554 rqstp->rq_pool = pool;
571 555
556 rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
557 if (!rqstp->rq_argp)
558 goto out_thread;
559
560 rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
561 if (!rqstp->rq_resp)
562 goto out_thread;
563
564 if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
565 goto out_thread;
566
567 return rqstp;
568out_thread:
569 svc_exit_thread(rqstp);
570out_enomem:
571 return ERR_PTR(-ENOMEM);
572}
573EXPORT_SYMBOL(svc_prepare_thread);
574
575/*
576 * Create a thread in the given pool. Caller must hold BKL.
577 * On a NUMA or SMP machine, with a multi-pool serv, the thread
578 * will be restricted to run on the cpus belonging to the pool.
579 */
580static int
581__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
582 struct svc_pool *pool)
583{
584 struct svc_rqst *rqstp;
585 int error = -ENOMEM;
586 int have_oldmask = 0;
587 cpumask_t oldmask;
588
589 rqstp = svc_prepare_thread(serv, pool);
590 if (IS_ERR(rqstp)) {
591 error = PTR_ERR(rqstp);
592 goto out;
593 }
594
572 if (serv->sv_nrpools > 1) 595 if (serv->sv_nrpools > 1)
573 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); 596 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
574 597
@@ -597,6 +620,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv)
597{ 620{
598 return __svc_create_thread(func, serv, &serv->sv_pools[0]); 621 return __svc_create_thread(func, serv, &serv->sv_pools[0]);
599} 622}
623EXPORT_SYMBOL(svc_create_thread);
600 624
601/* 625/*
602 * Choose a pool in which to create a new thread, for svc_set_num_threads 626 * Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -700,6 +724,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
700 724
701 return error; 725 return error;
702} 726}
727EXPORT_SYMBOL(svc_set_num_threads);
703 728
704/* 729/*
705 * Called from a server thread as it's exiting. Caller must hold BKL. 730 * Called from a server thread as it's exiting. Caller must hold BKL.
@@ -726,6 +751,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
726 if (serv) 751 if (serv)
727 svc_destroy(serv); 752 svc_destroy(serv);
728} 753}
754EXPORT_SYMBOL(svc_exit_thread);
729 755
730/* 756/*
731 * Register an RPC service with the local portmapper. 757 * Register an RPC service with the local portmapper.
@@ -737,7 +763,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
737{ 763{
738 struct svc_program *progp; 764 struct svc_program *progp;
739 unsigned long flags; 765 unsigned long flags;
740 int i, error = 0, dummy; 766 unsigned int i;
767 int error = 0, dummy;
741 768
742 if (!port) 769 if (!port)
743 clear_thread_flag(TIF_SIGPENDING); 770 clear_thread_flag(TIF_SIGPENDING);
@@ -840,9 +867,9 @@ svc_process(struct svc_rqst *rqstp)
840 rqstp->rq_res.tail[0].iov_len = 0; 867 rqstp->rq_res.tail[0].iov_len = 0;
841 /* Will be turned off only in gss privacy case: */ 868 /* Will be turned off only in gss privacy case: */
842 rqstp->rq_splice_ok = 1; 869 rqstp->rq_splice_ok = 1;
843 /* tcp needs a space for the record length... */ 870
844 if (rqstp->rq_prot == IPPROTO_TCP) 871 /* Setup reply header */
845 svc_putnl(resv, 0); 872 rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
846 873
847 rqstp->rq_xid = svc_getu32(argv); 874 rqstp->rq_xid = svc_getu32(argv);
848 svc_putu32(resv, rqstp->rq_xid); 875 svc_putu32(resv, rqstp->rq_xid);
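Here the TCP-only special case becomes a transport callback: each class supplies an xpo_prep_reply_hdr. A hedged sketch of what the TCP flavor would do (function name hypothetical) is exactly the old svc_putnl() placeholder, while a datagram transport's callback can be a no-op:

static void my_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
{
	struct kvec *resv = &rqstp->rq_res.head[0];

	/* reserve 4 bytes for the TCP record marker, filled in at send */
	svc_putnl(resv, 0);
}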
@@ -1049,16 +1076,15 @@ err_bad:
1049 svc_putnl(resv, ntohl(rpc_stat)); 1076 svc_putnl(resv, ntohl(rpc_stat));
1050 goto sendit; 1077 goto sendit;
1051} 1078}
1079EXPORT_SYMBOL(svc_process);
1052 1080
1053/* 1081/*
1054 * Return (transport-specific) limit on the rpc payload. 1082 * Return (transport-specific) limit on the rpc payload.
1055 */ 1083 */
1056u32 svc_max_payload(const struct svc_rqst *rqstp) 1084u32 svc_max_payload(const struct svc_rqst *rqstp)
1057{ 1085{
1058 int max = RPCSVC_MAXPAYLOAD_TCP; 1086 u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;
1059 1087
1060 if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM)
1061 max = RPCSVC_MAXPAYLOAD_UDP;
1062 if (rqstp->rq_server->sv_max_payload < max) 1088 if (rqstp->rq_server->sv_max_payload < max)
1063 max = rqstp->rq_server->sv_max_payload; 1089 max = rqstp->rq_server->sv_max_payload;
1064 return max; 1090 return max;
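The rewritten svc_max_payload() consults the transport class instead of sniffing the socket type. A hedged sketch of how the old UDP/TCP constants would migrate into per-class fields (initializers illustrative):

/* Sketch: each socket class advertises its own payload ceiling, so
 * svc_max_payload() needs no protocol checks. */
static struct svc_xprt_class svc_udp_class = {
	.xcl_name	 = "udp",
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
	/* .xcl_owner, .xcl_ops ... */
};

static struct svc_xprt_class svc_tcp_class = {
	.xcl_name	 = "tcp",
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
	/* .xcl_owner, .xcl_ops ... */
};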
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644
index 000000000000..ea377e06afae
--- /dev/null
+++ b/net/sunrpc/svc_xprt.c
@@ -0,0 +1,1055 @@
1/*
2 * linux/net/sunrpc/svc_xprt.c
3 *
4 * Author: Tom Tucker <tom@opengridcomputing.com>
5 */
6
7#include <linux/sched.h>
8#include <linux/errno.h>
9#include <linux/fcntl.h>
10#include <linux/net.h>
11#include <linux/in.h>
12#include <linux/inet.h>
13#include <linux/udp.h>
14#include <linux/tcp.h>
15#include <linux/unistd.h>
16#include <linux/slab.h>
17#include <linux/netdevice.h>
18#include <linux/skbuff.h>
19#include <linux/file.h>
20#include <linux/freezer.h>
21#include <net/sock.h>
22#include <net/checksum.h>
23#include <net/ip.h>
24#include <net/ipv6.h>
25#include <net/tcp_states.h>
26#include <linux/uaccess.h>
27#include <asm/ioctls.h>
28
29#include <linux/sunrpc/types.h>
30#include <linux/sunrpc/clnt.h>
31#include <linux/sunrpc/xdr.h>
32#include <linux/sunrpc/stats.h>
33#include <linux/sunrpc/svc_xprt.h>
34
35#define RPCDBG_FACILITY RPCDBG_SVCXPRT
36
37static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
38static int svc_deferred_recv(struct svc_rqst *rqstp);
39static struct cache_deferred_req *svc_defer(struct cache_req *req);
40static void svc_age_temp_xprts(unsigned long closure);
41
42/* apparently the "standard" is that clients close
43 * idle connections after 5 minutes, servers after
44 * 6 minutes
45 * http://www.connectathon.org/talks96/nfstcp.pdf
46 */
47static int svc_conn_age_period = 6*60;
48
49/* List of registered transport classes */
50static DEFINE_SPINLOCK(svc_xprt_class_lock);
51static LIST_HEAD(svc_xprt_class_list);
52
53/* SMP locking strategy:
54 *
55 * svc_pool->sp_lock protects most of the fields of that pool.
56 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
57 * when both need to be taken (rare), svc_serv->sv_lock is first.
58 * BKL protects svc_serv->sv_nrthread.
59 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
60 * and the ->sk_info_authunix cache.
61 *
62 * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
63 * enqueued multiply. During normal transport processing this bit
64 * is set by svc_xprt_enqueue and cleared by svc_xprt_received.
65 * Providers should not manipulate this bit directly.
66 *
67 * Some flags can be set to certain values at any time
68 * providing that certain rules are followed:
69 *
70 * XPT_CONN, XPT_DATA:
71 * - Can be set or cleared at any time.
72 * - After a set, svc_xprt_enqueue must be called to enqueue
73 * the transport for processing.
74 * - After a clear, the transport must be read/accepted.
75 * If this succeeds, it must be set again.
76 * XPT_CLOSE:
77 * - Can be set at any time. It is never cleared.
78 * XPT_DEAD:
79 * - Can only be set while XPT_BUSY is held which ensures
80 * that no other thread will be using the transport or will
81 * try to set XPT_DEAD.
82 */
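The XPT_CONN/XPT_DATA rule above is easiest to see from the producer side; a hedged sketch of a provider's data-ready callback (name and sk_user_data linkage assumed, modeled on the socket transport):

static void my_data_ready(struct sock *sk, int count)
{
	struct svc_xprt *xprt = sk->sk_user_data;	/* assumed linkage */

	if (xprt) {
		/* set the flag, then ask for the transport to be queued */
		set_bit(XPT_DATA, &xprt->xpt_flags);
		svc_xprt_enqueue(xprt);
	}
}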
83
84int svc_reg_xprt_class(struct svc_xprt_class *xcl)
85{
86 struct svc_xprt_class *cl;
87 int res = -EEXIST;
88
89 dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
90
91 INIT_LIST_HEAD(&xcl->xcl_list);
92 spin_lock(&svc_xprt_class_lock);
93 /* Make sure there isn't already a class with the same name */
94 list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) {
95 if (strcmp(xcl->xcl_name, cl->xcl_name) == 0)
96 goto out;
97 }
98 list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
99 res = 0;
100out:
101 spin_unlock(&svc_xprt_class_lock);
102 return res;
103}
104EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
105
106void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
107{
108 dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
109 spin_lock(&svc_xprt_class_lock);
110 list_del_init(&xcl->xcl_list);
111 spin_unlock(&svc_xprt_class_lock);
112}
113EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
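Registration is a module-lifetime affair for a provider. A minimal, hedged sketch (all names hypothetical; the ops table is elided):

static struct svc_xprt_ops my_xprt_ops;	/* xpo_create, xpo_accept, ... */

static struct svc_xprt_class my_xprt_class = {
	.xcl_name	 = "my-transport",
	.xcl_owner	 = THIS_MODULE,
	.xcl_ops	 = &my_xprt_ops,
	.xcl_max_payload = 32768,		/* illustrative limit */
};

static int __init my_xprt_init(void)
{
	/* -EEXIST if another class already claimed the name */
	return svc_reg_xprt_class(&my_xprt_class);
}

static void __exit my_xprt_exit(void)
{
	svc_unreg_xprt_class(&my_xprt_class);
}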
114
115/*
116 * Format the transport list for printing
117 */
118int svc_print_xprts(char *buf, int maxlen)
119{
120 struct list_head *le;
121 char tmpstr[80];
122 int len = 0;
123 buf[0] = '\0';
124
125 spin_lock(&svc_xprt_class_lock);
126 list_for_each(le, &svc_xprt_class_list) {
127 int slen;
128 struct svc_xprt_class *xcl =
129 list_entry(le, struct svc_xprt_class, xcl_list);
130
131 sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
132 slen = strlen(tmpstr);
133 if (len + slen > maxlen)
134 break;
135 len += slen;
136 strcat(buf, tmpstr);
137 }
138 spin_unlock(&svc_xprt_class_lock);
139
140 return len;
141}
142
143static void svc_xprt_free(struct kref *kref)
144{
145 struct svc_xprt *xprt =
146 container_of(kref, struct svc_xprt, xpt_ref);
147 struct module *owner = xprt->xpt_class->xcl_owner;
148 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)
149 && xprt->xpt_auth_cache != NULL)
150 svcauth_unix_info_release(xprt->xpt_auth_cache);
151 xprt->xpt_ops->xpo_free(xprt);
152 module_put(owner);
153}
154
155void svc_xprt_put(struct svc_xprt *xprt)
156{
157 kref_put(&xprt->xpt_ref, svc_xprt_free);
158}
159EXPORT_SYMBOL_GPL(svc_xprt_put);
160
161/*
162 * Called by transport drivers to initialize the transport independent
163 * portion of the transport instance.
164 */
165void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
166 struct svc_serv *serv)
167{
168 memset(xprt, 0, sizeof(*xprt));
169 xprt->xpt_class = xcl;
170 xprt->xpt_ops = xcl->xcl_ops;
171 kref_init(&xprt->xpt_ref);
172 xprt->xpt_server = serv;
173 INIT_LIST_HEAD(&xprt->xpt_list);
174 INIT_LIST_HEAD(&xprt->xpt_ready);
175 INIT_LIST_HEAD(&xprt->xpt_deferred);
176 mutex_init(&xprt->xpt_mutex);
177 spin_lock_init(&xprt->xpt_lock);
178 set_bit(XPT_BUSY, &xprt->xpt_flags);
179}
180EXPORT_SYMBOL_GPL(svc_xprt_init);
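A provider is expected to embed the generic svc_xprt at the head of its per-connection structure and initialize it from xpo_create; since svc_xprt_init() leaves XPT_BUSY set, the creator must later clear it or call svc_xprt_received(). A hedged sketch:

struct my_svc_xprt {
	struct svc_xprt xprt;	/* generic part, must be initialized first */
	/* provider-private state: sockets, queues, ... */
};

static struct svc_xprt *my_xpo_create(struct svc_serv *serv,
				      struct sockaddr *sa, int salen,
				      int flags)
{
	struct my_svc_xprt *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);
	svc_xprt_init(&my_xprt_class, &p->xprt, serv);
	/* ... bind/listen on 'sa' using salen/flags ... */
	return &p->xprt;
}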
181
182int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
183 int flags)
184{
185 struct svc_xprt_class *xcl;
186 struct sockaddr_in sin = {
187 .sin_family = AF_INET,
188 .sin_addr.s_addr = INADDR_ANY,
189 .sin_port = htons(port),
190 };
191 dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
192 spin_lock(&svc_xprt_class_lock);
193 list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
194 struct svc_xprt *newxprt;
195
196 if (strcmp(xprt_name, xcl->xcl_name))
197 continue;
198
199 if (!try_module_get(xcl->xcl_owner))
200 goto err;
201
202 spin_unlock(&svc_xprt_class_lock);
203 newxprt = xcl->xcl_ops->
204 xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
205 flags);
206 if (IS_ERR(newxprt)) {
207 module_put(xcl->xcl_owner);
208 return PTR_ERR(newxprt);
209 }
210
211 clear_bit(XPT_TEMP, &newxprt->xpt_flags);
212 spin_lock_bh(&serv->sv_lock);
213 list_add(&newxprt->xpt_list, &serv->sv_permsocks);
214 spin_unlock_bh(&serv->sv_lock);
215 clear_bit(XPT_BUSY, &newxprt->xpt_flags);
216 return svc_xprt_local_port(newxprt);
217 }
218 err:
219 spin_unlock(&svc_xprt_class_lock);
220 dprintk("svc: transport %s not found\n", xprt_name);
221 return -ENOENT;
222}
223EXPORT_SYMBOL_GPL(svc_create_xprt);
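From the service side, svc_create_xprt() replaces svc_makesock()-style calls by naming the class. A hedged usage sketch (SVC_SOCK_DEFAULTS assumed from svcsock.h):

/* create an IPv4 listener on the NFS port via the "tcp" class;
 * returns the bound local port on success, -ENOENT if no such class */
int port = svc_create_xprt(serv, "tcp", 2049, SVC_SOCK_DEFAULTS);

if (port < 0)
	printk(KERN_WARNING "svc: transport setup failed: %d\n", port);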
224
225/*
226 * Copy the local and remote xprt addresses to the rqstp structure
227 */
228void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
229{
230 struct sockaddr *sin;
231
232 memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
233 rqstp->rq_addrlen = xprt->xpt_remotelen;
234
235 /*
236 * Destination address in request is needed for binding the
237 * source address in RPC replies/callbacks later.
238 */
239 sin = (struct sockaddr *)&xprt->xpt_local;
240 switch (sin->sa_family) {
241 case AF_INET:
242 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
243 break;
244 case AF_INET6:
245 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
246 break;
247 }
248}
249EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
250
251/**
252 * svc_print_addr - Format rq_addr field for printing
253 * @rqstp: svc_rqst struct containing address to print
254 * @buf: target buffer for formatted address
255 * @len: length of target buffer
256 *
257 */
258char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
259{
260 return __svc_print_addr(svc_addr(rqstp), buf, len);
261}
262EXPORT_SYMBOL_GPL(svc_print_addr);
263
264/*
265 * Queue up an idle server thread. Must have pool->sp_lock held.
266 * Note: this is really a stack rather than a queue, so that we only
267 * use as many different threads as we need, and the rest don't pollute
268 * the cache.
269 */
270static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
271{
272 list_add(&rqstp->rq_list, &pool->sp_threads);
273}
274
275/*
276 * Dequeue an nfsd thread. Must have pool->sp_lock held.
277 */
278static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
279{
280 list_del(&rqstp->rq_list);
281}
282
283/*
284 * Queue up a transport with data pending. If there are idle nfsd
285 * processes, wake 'em up.
286 *
287 */
288void svc_xprt_enqueue(struct svc_xprt *xprt)
289{
290 struct svc_serv *serv = xprt->xpt_server;
291 struct svc_pool *pool;
292 struct svc_rqst *rqstp;
293 int cpu;
294
295 if (!(xprt->xpt_flags &
296 ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
297 return;
298 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
299 return;
300
301 cpu = get_cpu();
302 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
303 put_cpu();
304
305 spin_lock_bh(&pool->sp_lock);
306
307 if (!list_empty(&pool->sp_threads) &&
308 !list_empty(&pool->sp_sockets))
309 printk(KERN_ERR
310 "svc_xprt_enqueue: "
311 "threads and transports both waiting??\n");
312
313 if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
314 /* Don't enqueue dead transports */
315 dprintk("svc: transport %p is dead, not enqueued\n", xprt);
316 goto out_unlock;
317 }
318
319 /* Mark transport as busy. It will remain in this state until
320 * the provider calls svc_xprt_received. We update XPT_BUSY
321 * atomically because it also guards against trying to enqueue
322 * the transport twice.
323 */
324 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
325 /* Don't enqueue transport while already enqueued */
326 dprintk("svc: transport %p busy, not enqueued\n", xprt);
327 goto out_unlock;
328 }
329 BUG_ON(xprt->xpt_pool != NULL);
330 xprt->xpt_pool = pool;
331
332 /* Handle pending connection */
333 if (test_bit(XPT_CONN, &xprt->xpt_flags))
334 goto process;
335
336 /* Handle close in-progress */
337 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
338 goto process;
339
340 /* Check if we have space to reply to a request */
341 if (!xprt->xpt_ops->xpo_has_wspace(xprt)) {
342 /* Don't enqueue while not enough space for reply */
343 dprintk("svc: no write space, transport %p not enqueued\n",
344 xprt);
345 xprt->xpt_pool = NULL;
346 clear_bit(XPT_BUSY, &xprt->xpt_flags);
347 goto out_unlock;
348 }
349
350 process:
351 if (!list_empty(&pool->sp_threads)) {
352 rqstp = list_entry(pool->sp_threads.next,
353 struct svc_rqst,
354 rq_list);
355 dprintk("svc: transport %p served by daemon %p\n",
356 xprt, rqstp);
357 svc_thread_dequeue(pool, rqstp);
358 if (rqstp->rq_xprt)
359 printk(KERN_ERR
360 "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
361 rqstp, rqstp->rq_xprt);
362 rqstp->rq_xprt = xprt;
363 svc_xprt_get(xprt);
364 rqstp->rq_reserved = serv->sv_max_mesg;
365 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
366 BUG_ON(xprt->xpt_pool != pool);
367 wake_up(&rqstp->rq_wait);
368 } else {
369 dprintk("svc: transport %p put into queue\n", xprt);
370 list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
371 BUG_ON(xprt->xpt_pool != pool);
372 }
373
374out_unlock:
375 spin_unlock_bh(&pool->sp_lock);
376}
377EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
378
379/*
380 * Dequeue the first transport. Must be called with the pool->sp_lock held.
381 */
382static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
383{
384 struct svc_xprt *xprt;
385
386 if (list_empty(&pool->sp_sockets))
387 return NULL;
388
389 xprt = list_entry(pool->sp_sockets.next,
390 struct svc_xprt, xpt_ready);
391 list_del_init(&xprt->xpt_ready);
392
393 dprintk("svc: transport %p dequeued, inuse=%d\n",
394 xprt, atomic_read(&xprt->xpt_ref.refcount));
395
396 return xprt;
397}
398
399/*
400 * svc_xprt_received conditionally queues the transport for processing
401 * by another thread. The caller must hold the XPT_BUSY bit and must
402 * not thereafter touch transport data.
403 *
404 * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
405 * insufficient) data.
406 */
407void svc_xprt_received(struct svc_xprt *xprt)
408{
409 BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
410 xprt->xpt_pool = NULL;
411 clear_bit(XPT_BUSY, &xprt->xpt_flags);
412 svc_xprt_enqueue(xprt);
413}
414EXPORT_SYMBOL_GPL(svc_xprt_received);
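On the consuming side, a provider's xpo_recvfrom typically clears XPT_DATA before reading, re-sets it if more input may be queued, and hands the transport back via svc_xprt_received(). A condensed, hedged sketch (helper name hypothetical):

static int my_xpo_recvfrom(struct svc_rqst *rqstp)
{
	struct svc_xprt *xprt = rqstp->rq_xprt;
	int len;

	clear_bit(XPT_DATA, &xprt->xpt_flags);
	len = my_read_request(xprt, rqstp);	/* hypothetical helper */
	if (len > 0)
		set_bit(XPT_DATA, &xprt->xpt_flags);
	/* drop XPT_BUSY so another thread can service this transport */
	svc_xprt_received(xprt);
	return len;
}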
415
416/**
417 * svc_reserve - change the space reserved for the reply to a request.
418 * @rqstp: The request in question
419 * @space: new max space to reserve
420 *
421 * Each request reserves some space on the output queue of the transport
422 * to make sure the reply fits. This function reduces that reserved
423 * space to be the amount of space used already, plus @space.
424 *
425 */
426void svc_reserve(struct svc_rqst *rqstp, int space)
427{
428 space += rqstp->rq_res.head[0].iov_len;
429
430 if (space < rqstp->rq_reserved) {
431 struct svc_xprt *xprt = rqstp->rq_xprt;
432 atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
433 rqstp->rq_reserved = space;
434
435 svc_xprt_enqueue(xprt);
436 }
437}
438EXPORT_SYMBOL(svc_reserve);
439
440static void svc_xprt_release(struct svc_rqst *rqstp)
441{
442 struct svc_xprt *xprt = rqstp->rq_xprt;
443
444 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
445
446 svc_free_res_pages(rqstp);
447 rqstp->rq_res.page_len = 0;
448 rqstp->rq_res.page_base = 0;
449
450 /* Reset response buffer and release
451 * the reservation.
452 * But first, check that enough space was reserved
453 * for the reply, otherwise we have a bug!
454 */
455 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
456 printk(KERN_ERR "RPC request reserved %d but used %d\n",
457 rqstp->rq_reserved,
458 rqstp->rq_res.len);
459
460 rqstp->rq_res.head[0].iov_len = 0;
461 svc_reserve(rqstp, 0);
462 rqstp->rq_xprt = NULL;
463
464 svc_xprt_put(xprt);
465}
466
467/*
468 * External function to wake up a server waiting for data
469 * This really only makes sense for services like lockd
470 * which have exactly one thread anyway.
471 */
472void svc_wake_up(struct svc_serv *serv)
473{
474 struct svc_rqst *rqstp;
475 unsigned int i;
476 struct svc_pool *pool;
477
478 for (i = 0; i < serv->sv_nrpools; i++) {
479 pool = &serv->sv_pools[i];
480
481 spin_lock_bh(&pool->sp_lock);
482 if (!list_empty(&pool->sp_threads)) {
483 rqstp = list_entry(pool->sp_threads.next,
484 struct svc_rqst,
485 rq_list);
486 dprintk("svc: daemon %p woken up.\n", rqstp);
487 /*
488 svc_thread_dequeue(pool, rqstp);
489 rqstp->rq_xprt = NULL;
490 */
491 wake_up(&rqstp->rq_wait);
492 }
493 spin_unlock_bh(&pool->sp_lock);
494 }
495}
496EXPORT_SYMBOL(svc_wake_up);
497
498int svc_port_is_privileged(struct sockaddr *sin)
499{
500 switch (sin->sa_family) {
501 case AF_INET:
502 return ntohs(((struct sockaddr_in *)sin)->sin_port)
503 < PROT_SOCK;
504 case AF_INET6:
505 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
506 < PROT_SOCK;
507 default:
508 return 0;
509 }
510}
511
512/*
513 * Make sure that we don't have too many active connections. If we
514 * have, something must be dropped.
515 *
516 * There's no point in trying to do random drop here for DoS
517 * prevention. NFS clients do one reconnect in 15 seconds. An
518 * attacker can easily beat that.
519 *
520 * The only somewhat efficient mechanism would be to drop old
521 * connections from the same IP first. But right now we don't even
522 * record the client IP in svc_sock.
523 */
524static void svc_check_conn_limits(struct svc_serv *serv)
525{
526 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
527 struct svc_xprt *xprt = NULL;
528 spin_lock_bh(&serv->sv_lock);
529 if (!list_empty(&serv->sv_tempsocks)) {
530 if (net_ratelimit()) {
531 /* Try to help the admin */
532 printk(KERN_NOTICE "%s: too many open "
533 "connections, consider increasing the "
534 "number of nfsd threads\n",
535 serv->sv_name);
536 }
537 /*
538 * Always select the oldest connection. It's not fair,
539 * but so is life
540 */
541 xprt = list_entry(serv->sv_tempsocks.prev,
542 struct svc_xprt,
543 xpt_list);
544 set_bit(XPT_CLOSE, &xprt->xpt_flags);
545 svc_xprt_get(xprt);
546 }
547 spin_unlock_bh(&serv->sv_lock);
548
549 if (xprt) {
550 svc_xprt_enqueue(xprt);
551 svc_xprt_put(xprt);
552 }
553 }
554}
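For scale: with the common default of eight nfsd threads, the threshold above works out to (8 + 3) * 20 = 220 temporary connections before the oldest one is marked XPT_CLOSE and queued for a thread to reap.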
555
556/*
557 * Receive the next request on any transport. This code is carefully
558 * organised not to touch any cachelines in the shared svc_serv
559 * structure, only cachelines in the local svc_pool.
560 */
561int svc_recv(struct svc_rqst *rqstp, long timeout)
562{
563 struct svc_xprt *xprt = NULL;
564 struct svc_serv *serv = rqstp->rq_server;
565 struct svc_pool *pool = rqstp->rq_pool;
566 int len, i;
567 int pages;
568 struct xdr_buf *arg;
569 DECLARE_WAITQUEUE(wait, current);
570
571 dprintk("svc: server %p waiting for data (to = %ld)\n",
572 rqstp, timeout);
573
574 if (rqstp->rq_xprt)
575 printk(KERN_ERR
576 "svc_recv: service %p, transport not NULL!\n",
577 rqstp);
578 if (waitqueue_active(&rqstp->rq_wait))
579 printk(KERN_ERR
580 "svc_recv: service %p, wait queue active!\n",
581 rqstp);
582
583 /* now allocate needed pages. If we get a failure, sleep briefly */
584 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
585 for (i = 0; i < pages ; i++)
586 while (rqstp->rq_pages[i] == NULL) {
587 struct page *p = alloc_page(GFP_KERNEL);
588 if (!p) {
589 int j = msecs_to_jiffies(500);
590 schedule_timeout_uninterruptible(j);
591 }
592 rqstp->rq_pages[i] = p;
593 }
594 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
595 BUG_ON(pages >= RPCSVC_MAXPAGES);
596
597 /* Make arg->head point to first page and arg->pages point to rest */
598 arg = &rqstp->rq_arg;
599 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
600 arg->head[0].iov_len = PAGE_SIZE;
601 arg->pages = rqstp->rq_pages + 1;
602 arg->page_base = 0;
603 /* save at least one page for response */
604 arg->page_len = (pages-2)*PAGE_SIZE;
605 arg->len = (pages-1)*PAGE_SIZE;
606 arg->tail[0].iov_len = 0;
607
608 try_to_freeze();
609 cond_resched();
610 if (signalled())
611 return -EINTR;
612
613 spin_lock_bh(&pool->sp_lock);
614 xprt = svc_xprt_dequeue(pool);
615 if (xprt) {
616 rqstp->rq_xprt = xprt;
617 svc_xprt_get(xprt);
618 rqstp->rq_reserved = serv->sv_max_mesg;
619 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
620 } else {
621 /* No data pending. Go to sleep */
622 svc_thread_enqueue(pool, rqstp);
623
624 /*
625 * We have to be able to interrupt this wait
626 * to bring down the daemons ...
627 */
628 set_current_state(TASK_INTERRUPTIBLE);
629 add_wait_queue(&rqstp->rq_wait, &wait);
630 spin_unlock_bh(&pool->sp_lock);
631
632 schedule_timeout(timeout);
633
634 try_to_freeze();
635
636 spin_lock_bh(&pool->sp_lock);
637 remove_wait_queue(&rqstp->rq_wait, &wait);
638
639 xprt = rqstp->rq_xprt;
640 if (!xprt) {
641 svc_thread_dequeue(pool, rqstp);
642 spin_unlock_bh(&pool->sp_lock);
643 dprintk("svc: server %p, no data yet\n", rqstp);
644 return signalled()? -EINTR : -EAGAIN;
645 }
646 }
647 spin_unlock_bh(&pool->sp_lock);
648
649 len = 0;
650 if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
651 dprintk("svc_recv: found XPT_CLOSE\n");
652 svc_delete_xprt(xprt);
653 } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
654 struct svc_xprt *newxpt;
655 newxpt = xprt->xpt_ops->xpo_accept(xprt);
656 if (newxpt) {
657 /*
658 * We know this module_get will succeed because the
659 * listener holds a reference too
660 */
661 __module_get(newxpt->xpt_class->xcl_owner);
662 svc_check_conn_limits(xprt->xpt_server);
663 spin_lock_bh(&serv->sv_lock);
664 set_bit(XPT_TEMP, &newxpt->xpt_flags);
665 list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
666 serv->sv_tmpcnt++;
667 if (serv->sv_temptimer.function == NULL) {
668 /* setup timer to age temp transports */
669 setup_timer(&serv->sv_temptimer,
670 svc_age_temp_xprts,
671 (unsigned long)serv);
672 mod_timer(&serv->sv_temptimer,
673 jiffies + svc_conn_age_period * HZ);
674 }
675 spin_unlock_bh(&serv->sv_lock);
676 svc_xprt_received(newxpt);
677 }
678 svc_xprt_received(xprt);
679 } else {
680 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
681 rqstp, pool->sp_id, xprt,
682 atomic_read(&xprt->xpt_ref.refcount));
683 rqstp->rq_deferred = svc_deferred_dequeue(xprt);
684 if (rqstp->rq_deferred) {
685 svc_xprt_received(xprt);
686 len = svc_deferred_recv(rqstp);
687 } else
688 len = xprt->xpt_ops->xpo_recvfrom(rqstp);
689 dprintk("svc: got len=%d\n", len);
690 }
691
692 /* No data, incomplete (TCP) read, or accept() */
693 if (len == 0 || len == -EAGAIN) {
694 rqstp->rq_res.len = 0;
695 svc_xprt_release(rqstp);
696 return -EAGAIN;
697 }
698 clear_bit(XPT_OLD, &xprt->xpt_flags);
699
700 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
701 rqstp->rq_chandle.defer = svc_defer;
702
703 if (serv->sv_stats)
704 serv->sv_stats->netcnt++;
705 return len;
706}
707EXPORT_SYMBOL(svc_recv);
708
709/*
710 * Drop request
711 */
712void svc_drop(struct svc_rqst *rqstp)
713{
714 dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
715 svc_xprt_release(rqstp);
716}
717EXPORT_SYMBOL(svc_drop);
718
719/*
720 * Return reply to client.
721 */
722int svc_send(struct svc_rqst *rqstp)
723{
724 struct svc_xprt *xprt;
725 int len;
726 struct xdr_buf *xb;
727
728 xprt = rqstp->rq_xprt;
729 if (!xprt)
730 return -EFAULT;
731
732 /* release the receive skb before sending the reply */
733 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
734
735 /* calculate over-all length */
736 xb = &rqstp->rq_res;
737 xb->len = xb->head[0].iov_len +
738 xb->page_len +
739 xb->tail[0].iov_len;
740
741 /* Grab mutex to serialize outgoing data. */
742 mutex_lock(&xprt->xpt_mutex);
743 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
744 len = -ENOTCONN;
745 else
746 len = xprt->xpt_ops->xpo_sendto(rqstp);
747 mutex_unlock(&xprt->xpt_mutex);
748 svc_xprt_release(rqstp);
749
750 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
751 return 0;
752 return len;
753}
754
755/*
756 * Timer function to close old temporary transports, using
757 * a mark-and-sweep algorithm.
758 */
759static void svc_age_temp_xprts(unsigned long closure)
760{
761 struct svc_serv *serv = (struct svc_serv *)closure;
762 struct svc_xprt *xprt;
763 struct list_head *le, *next;
764 LIST_HEAD(to_be_aged);
765
766 dprintk("svc_age_temp_xprts\n");
767
768 if (!spin_trylock_bh(&serv->sv_lock)) {
769 /* busy, try again 1 sec later */
770 dprintk("svc_age_temp_xprts: busy\n");
771 mod_timer(&serv->sv_temptimer, jiffies + HZ);
772 return;
773 }
774
775 list_for_each_safe(le, next, &serv->sv_tempsocks) {
776 xprt = list_entry(le, struct svc_xprt, xpt_list);
777
778 /* First time through, just mark it OLD. Second time
779 * through, close it. */
780 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
781 continue;
782 if (atomic_read(&xprt->xpt_ref.refcount) > 1
783 || test_bit(XPT_BUSY, &xprt->xpt_flags))
784 continue;
785 svc_xprt_get(xprt);
786 list_move(le, &to_be_aged);
787 set_bit(XPT_CLOSE, &xprt->xpt_flags);
788 set_bit(XPT_DETACHED, &xprt->xpt_flags);
789 }
790 spin_unlock_bh(&serv->sv_lock);
791
792 while (!list_empty(&to_be_aged)) {
793 le = to_be_aged.next;
794 /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
795 list_del_init(le);
796 xprt = list_entry(le, struct svc_xprt, xpt_list);
797
798 dprintk("queuing xprt %p for closing\n", xprt);
799
800 /* a thread will dequeue and close it soon */
801 svc_xprt_enqueue(xprt);
802 svc_xprt_put(xprt);
803 }
804
805 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
806}
807
808/*
809 * Remove a dead transport
810 */
811void svc_delete_xprt(struct svc_xprt *xprt)
812{
813 struct svc_serv *serv = xprt->xpt_server;
814
815 dprintk("svc: svc_delete_xprt(%p)\n", xprt);
816 xprt->xpt_ops->xpo_detach(xprt);
817
818 spin_lock_bh(&serv->sv_lock);
819 if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags))
820 list_del_init(&xprt->xpt_list);
821 /*
822 * We used to delete the transport from whichever list
823 * its sk_xprt.xpt_ready node was on, but we don't actually
824 * need to. This is because the only time we're called
825 * while still attached to a queue, the queue itself
826 * is about to be destroyed (in svc_destroy).
827 */
828 if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) {
829 BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2);
830 if (test_bit(XPT_TEMP, &xprt->xpt_flags))
831 serv->sv_tmpcnt--;
832 svc_xprt_put(xprt);
833 }
834 spin_unlock_bh(&serv->sv_lock);
835}
836
837void svc_close_xprt(struct svc_xprt *xprt)
838{
839 set_bit(XPT_CLOSE, &xprt->xpt_flags);
840 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
841 /* someone else will have to effect the close */
842 return;
843
844 svc_xprt_get(xprt);
845 svc_delete_xprt(xprt);
846 clear_bit(XPT_BUSY, &xprt->xpt_flags);
847 svc_xprt_put(xprt);
848}
849EXPORT_SYMBOL_GPL(svc_close_xprt);
850
851void svc_close_all(struct list_head *xprt_list)
852{
853 struct svc_xprt *xprt;
854 struct svc_xprt *tmp;
855
856 list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
857 set_bit(XPT_CLOSE, &xprt->xpt_flags);
858 if (test_bit(XPT_BUSY, &xprt->xpt_flags)) {
859 /* Waiting to be processed, but no threads left,
860 * So just remove it from the waiting list
861 */
862 list_del_init(&xprt->xpt_ready);
863 clear_bit(XPT_BUSY, &xprt->xpt_flags);
864 }
865 svc_close_xprt(xprt);
866 }
867}
868
869/*
870 * Handle defer and revisit of requests
871 */
872
873static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
874{
875 struct svc_deferred_req *dr =
876 container_of(dreq, struct svc_deferred_req, handle);
877 struct svc_xprt *xprt = dr->xprt;
878
879 if (too_many) {
880 svc_xprt_put(xprt);
881 kfree(dr);
882 return;
883 }
884 dprintk("revisit queued\n");
885 dr->xprt = NULL;
886 spin_lock(&xprt->xpt_lock);
887 list_add(&dr->handle.recent, &xprt->xpt_deferred);
888 spin_unlock(&xprt->xpt_lock);
889 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
890 svc_xprt_enqueue(xprt);
891 svc_xprt_put(xprt);
892}
893
894/*
895 * Save the request off for later processing. The request buffer looks
896 * like this:
897 *
898 * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
899 *
900 * This code can only handle requests that consist of an xprt-header
901 * and rpc-header.
902 */
903static struct cache_deferred_req *svc_defer(struct cache_req *req)
904{
905 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
906 struct svc_deferred_req *dr;
907
908 if (rqstp->rq_arg.page_len)
909 return NULL; /* if more than a page, give up FIXME */
910 if (rqstp->rq_deferred) {
911 dr = rqstp->rq_deferred;
912 rqstp->rq_deferred = NULL;
913 } else {
914 size_t skip;
915 size_t size;
916 /* FIXME maybe discard if size too large */
917 size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
918 dr = kmalloc(size, GFP_KERNEL);
919 if (dr == NULL)
920 return NULL;
921
922 dr->handle.owner = rqstp->rq_server;
923 dr->prot = rqstp->rq_prot;
924 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
925 dr->addrlen = rqstp->rq_addrlen;
926 dr->daddr = rqstp->rq_daddr;
927 dr->argslen = rqstp->rq_arg.len >> 2;
928 dr->xprt_hlen = rqstp->rq_xprt_hlen;
929
930 /* back up head to the start of the buffer and copy */
931 skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
932 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
933 dr->argslen << 2);
934 }
935 svc_xprt_get(rqstp->rq_xprt);
936 dr->xprt = rqstp->rq_xprt;
937
938 dr->handle.revisit = svc_revisit;
939 return &dr->handle;
940}
941
942/*
943 * recv data from a deferred request into an active one
944 */
945static int svc_deferred_recv(struct svc_rqst *rqstp)
946{
947 struct svc_deferred_req *dr = rqstp->rq_deferred;
948
949 /* setup iov_base past transport header */
950 rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
951 /* The iov_len does not include the transport header bytes */
952 rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
953 rqstp->rq_arg.page_len = 0;
954 /* The rq_arg.len includes the transport header bytes */
955 rqstp->rq_arg.len = dr->argslen<<2;
956 rqstp->rq_prot = dr->prot;
957 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
958 rqstp->rq_addrlen = dr->addrlen;
959 /* Save off transport header len in case we get deferred again */
960 rqstp->rq_xprt_hlen = dr->xprt_hlen;
961 rqstp->rq_daddr = dr->daddr;
962 rqstp->rq_respages = rqstp->rq_pages;
963 return (dr->argslen<<2) - dr->xprt_hlen;
964}
965
966
967static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
968{
969 struct svc_deferred_req *dr = NULL;
970
971 if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
972 return NULL;
973 spin_lock(&xprt->xpt_lock);
974 clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
975 if (!list_empty(&xprt->xpt_deferred)) {
976 dr = list_entry(xprt->xpt_deferred.next,
977 struct svc_deferred_req,
978 handle.recent);
979 list_del_init(&dr->handle.recent);
980 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
981 }
982 spin_unlock(&xprt->xpt_lock);
983 return dr;
984}
985
986/*
987 * Return the transport instance pointer for the endpoint accepting
988 * connections/peer traffic from the specified transport class,
989 * address family and port.
990 *
991 * Specifying 0 for the address family or port is effectively a
992 * wild-card, and will result in matching the first transport in the
993 * service's list that has a matching class name.
994 */
995struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
996 int af, int port)
997{
998 struct svc_xprt *xprt;
999 struct svc_xprt *found = NULL;
1000
1001 /* Sanity check the args */
1002 if (!serv || !xcl_name)
1003 return found;
1004
1005 spin_lock_bh(&serv->sv_lock);
1006 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1007 if (strcmp(xprt->xpt_class->xcl_name, xcl_name))
1008 continue;
1009 if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
1010 continue;
1011 if (port && port != svc_xprt_local_port(xprt))
1012 continue;
1013 found = xprt;
1014 svc_xprt_get(xprt);
1015 break;
1016 }
1017 spin_unlock_bh(&serv->sv_lock);
1018 return found;
1019}
1020EXPORT_SYMBOL_GPL(svc_find_xprt);
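A typical lookup passes 0 as the wildcard the comment describes; note that svc_find_xprt() takes a reference the caller must drop. A hedged usage sketch ("tcp" assumed to be a registered class):

struct svc_xprt *xprt = svc_find_xprt(serv, "tcp", 0, 0);

if (xprt) {
	int port = svc_xprt_local_port(xprt);	/* bound port number */
	/* ... inspect the listener ... */
	svc_xprt_put(xprt);			/* drop the reference */
}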
1021
1022/*
1023 * Format a buffer with a list of the active transports. A zero for
1024 * the buflen parameter disables target buffer overflow checking.
1025 */
1026int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
1027{
1028 struct svc_xprt *xprt;
1029 char xprt_str[64];
1030 int totlen = 0;
1031 int len;
1032
1033 /* Sanity check args */
1034 if (!serv)
1035 return 0;
1036
1037 spin_lock_bh(&serv->sv_lock);
1038 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1039 len = snprintf(xprt_str, sizeof(xprt_str),
1040 "%s %d\n", xprt->xpt_class->xcl_name,
1041 svc_xprt_local_port(xprt));
1042 /* If the string was truncated, replace with error string */
1043 if (len >= sizeof(xprt_str))
1044 strcpy(xprt_str, "name-too-long\n");
1045 /* Don't overflow buffer */
1046 len = strlen(xprt_str);
1047 if (buflen && (len + totlen >= buflen))
1048 break;
1049 strcpy(buf+totlen, xprt_str);
1050 totlen += len;
1051 }
1052 spin_unlock_bh(&serv->sv_lock);
1053 return totlen;
1054}
1055EXPORT_SYMBOL_GPL(svc_xprt_names);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index af7c5f05c6e1..8a73cbb16052 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -57,11 +57,13 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
57 rqstp->rq_authop = aops; 57 rqstp->rq_authop = aops;
58 return aops->accept(rqstp, authp); 58 return aops->accept(rqstp, authp);
59} 59}
60EXPORT_SYMBOL(svc_authenticate);
60 61
61int svc_set_client(struct svc_rqst *rqstp) 62int svc_set_client(struct svc_rqst *rqstp)
62{ 63{
63 return rqstp->rq_authop->set_client(rqstp); 64 return rqstp->rq_authop->set_client(rqstp);
64} 65}
66EXPORT_SYMBOL(svc_set_client);
65 67
66/* A request, which was authenticated, has now executed. 68/* A request, which was authenticated, has now executed.
67 * Time to finalise the credentials and verifier 69 * Time to finalise the credentials and verifier
@@ -93,6 +95,7 @@ svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
93 spin_unlock(&authtab_lock); 95 spin_unlock(&authtab_lock);
94 return rv; 96 return rv;
95} 97}
98EXPORT_SYMBOL(svc_auth_register);
96 99
97void 100void
98svc_auth_unregister(rpc_authflavor_t flavor) 101svc_auth_unregister(rpc_authflavor_t flavor)
@@ -129,6 +132,7 @@ void auth_domain_put(struct auth_domain *dom)
129 spin_unlock(&auth_domain_lock); 132 spin_unlock(&auth_domain_lock);
130 } 133 }
131} 134}
135EXPORT_SYMBOL(auth_domain_put);
132 136
133struct auth_domain * 137struct auth_domain *
134auth_domain_lookup(char *name, struct auth_domain *new) 138auth_domain_lookup(char *name, struct auth_domain *new)
@@ -153,8 +157,10 @@ auth_domain_lookup(char *name, struct auth_domain *new)
153 spin_unlock(&auth_domain_lock); 157 spin_unlock(&auth_domain_lock);
154 return new; 158 return new;
155} 159}
160EXPORT_SYMBOL(auth_domain_lookup);
156 161
157struct auth_domain *auth_domain_find(char *name) 162struct auth_domain *auth_domain_find(char *name)
158{ 163{
159 return auth_domain_lookup(name, NULL); 164 return auth_domain_lookup(name, NULL);
160} 165}
166EXPORT_SYMBOL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 411479411b21..3c64051e4555 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -63,6 +63,7 @@ struct auth_domain *unix_domain_find(char *name)
63 rv = auth_domain_lookup(name, &new->h); 63 rv = auth_domain_lookup(name, &new->h);
64 } 64 }
65} 65}
66EXPORT_SYMBOL(unix_domain_find);
66 67
67static void svcauth_unix_domain_release(struct auth_domain *dom) 68static void svcauth_unix_domain_release(struct auth_domain *dom)
68{ 69{
@@ -340,6 +341,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
340 else 341 else
341 return -ENOMEM; 342 return -ENOMEM;
342} 343}
344EXPORT_SYMBOL(auth_unix_add_addr);
343 345
344int auth_unix_forget_old(struct auth_domain *dom) 346int auth_unix_forget_old(struct auth_domain *dom)
345{ 347{
@@ -351,6 +353,7 @@ int auth_unix_forget_old(struct auth_domain *dom)
351 udom->addr_changes++; 353 udom->addr_changes++;
352 return 0; 354 return 0;
353} 355}
356EXPORT_SYMBOL(auth_unix_forget_old);
354 357
355struct auth_domain *auth_unix_lookup(struct in_addr addr) 358struct auth_domain *auth_unix_lookup(struct in_addr addr)
356{ 359{
@@ -375,50 +378,56 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
375 cache_put(&ipm->h, &ip_map_cache); 378 cache_put(&ipm->h, &ip_map_cache);
376 return rv; 379 return rv;
377} 380}
381EXPORT_SYMBOL(auth_unix_lookup);
378 382
379void svcauth_unix_purge(void) 383void svcauth_unix_purge(void)
380{ 384{
381 cache_purge(&ip_map_cache); 385 cache_purge(&ip_map_cache);
382} 386}
387EXPORT_SYMBOL(svcauth_unix_purge);
383 388
384static inline struct ip_map * 389static inline struct ip_map *
385ip_map_cached_get(struct svc_rqst *rqstp) 390ip_map_cached_get(struct svc_rqst *rqstp)
386{ 391{
387 struct ip_map *ipm;
388 struct svc_sock *svsk = rqstp->rq_sock;
389 spin_lock(&svsk->sk_lock);
390 ipm = svsk->sk_info_authunix;
391 if (ipm != NULL) {
392 if (!cache_valid(&ipm->h)) {
393 /*
394 * The entry has been invalidated since it was
395 * remembered, e.g. by a second mount from the
396 * same IP address.
397 */
398 svsk->sk_info_authunix = NULL;
399 spin_unlock(&svsk->sk_lock);
400 cache_put(&ipm->h, &ip_map_cache);
401 return NULL;
402 }
403 cache_get(&ipm->h);
404 }
405 spin_unlock(&svsk->sk_lock);
392 struct ip_map *ipm = NULL;
393 struct svc_xprt *xprt = rqstp->rq_xprt;
394
395 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
396 spin_lock(&xprt->xpt_lock);
397 ipm = xprt->xpt_auth_cache;
398 if (ipm != NULL) {
399 if (!cache_valid(&ipm->h)) {
400 /*
401 * The entry has been invalidated since it was
402 * remembered, e.g. by a second mount from the
403 * same IP address.
404 */
405 xprt->xpt_auth_cache = NULL;
406 spin_unlock(&xprt->xpt_lock);
407 cache_put(&ipm->h, &ip_map_cache);
408 return NULL;
409 }
410 cache_get(&ipm->h);
411 }
412 spin_unlock(&xprt->xpt_lock);
413 }
406 return ipm; 414 return ipm;
407} 415}
408 416
409static inline void 417static inline void
410ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) 418ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
411{ 419{
412 struct svc_sock *svsk = rqstp->rq_sock;
413
414 spin_lock(&svsk->sk_lock);
415 if (svsk->sk_sock->type == SOCK_STREAM &&
416 svsk->sk_info_authunix == NULL) {
417 /* newly cached, keep the reference */
418 svsk->sk_info_authunix = ipm;
419 ipm = NULL;
420 struct svc_xprt *xprt = rqstp->rq_xprt;
421
422 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
423 spin_lock(&xprt->xpt_lock);
424 if (xprt->xpt_auth_cache == NULL) {
425 /* newly cached, keep the reference */
426 xprt->xpt_auth_cache = ipm;
427 ipm = NULL;
428 }
429 spin_unlock(&xprt->xpt_lock);
420 } 430 }
421 spin_unlock(&svsk->sk_lock);
422 if (ipm) 431 if (ipm)
423 cache_put(&ipm->h, &ip_map_cache); 432 cache_put(&ipm->h, &ip_map_cache);
424} 433}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c75bffeb89eb..1d3e5fcc2cc4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * The server scheduling algorithm does not always distribute the load 6 * The server scheduling algorithm does not always distribute the load
7 * evenly when servicing a single client. May need to modify the 7 * evenly when servicing a single client. May need to modify the
8 * svc_sock_enqueue procedure... 8 * svc_xprt_enqueue procedure...
9 * 9 *
10 * TCP support is largely untested and may be a little slow. The problem 10 * TCP support is largely untested and may be a little slow. The problem
11 * is that we currently do two separate recvfrom's, one for the 4-byte 11 * is that we currently do two separate recvfrom's, one for the 4-byte
@@ -48,72 +48,40 @@
48#include <linux/sunrpc/svcsock.h> 48#include <linux/sunrpc/svcsock.h>
49#include <linux/sunrpc/stats.h> 49#include <linux/sunrpc/stats.h>
50 50
51/* SMP locking strategy:
52 *
53 * svc_pool->sp_lock protects most of the fields of that pool.
54 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
55 * when both need to be taken (rare), svc_serv->sv_lock is first.
56 * BKL protects svc_serv->sv_nrthread.
57 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
58 * and the ->sk_info_authunix cache.
59 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
60 *
61 * Some flags can be set to certain values at any time
62 * providing that certain rules are followed:
63 *
64 * SK_CONN, SK_DATA, can be set or cleared at any time.
65 * after a set, svc_sock_enqueue must be called.
66 * after a clear, the socket must be read/accepted
67 * if this succeeds, it must be set again.
68 * SK_CLOSE can set at any time. It is never cleared.
69 * sk_inuse contains a bias of '1' until SK_DEAD is set.
70 * so when sk_inuse hits zero, we know the socket is dead
71 * and no-one is using it.
72 * SK_DEAD can only be set while SK_BUSY is held which ensures
73 * no other thread will be using the socket or will try to
74 * set SK_DEAD.
75 *
76 */
77
78#define RPCDBG_FACILITY RPCDBG_SVCSOCK
79 52
80 53
81static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 54static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
82 int *errp, int flags); 55 int *errp, int flags);
83static void svc_delete_socket(struct svc_sock *svsk);
84static void svc_udp_data_ready(struct sock *, int); 56static void svc_udp_data_ready(struct sock *, int);
85static int svc_udp_recvfrom(struct svc_rqst *); 57static int svc_udp_recvfrom(struct svc_rqst *);
86static int svc_udp_sendto(struct svc_rqst *); 58static int svc_udp_sendto(struct svc_rqst *);
87static void svc_close_socket(struct svc_sock *svsk); 59static void svc_sock_detach(struct svc_xprt *);
88 60static void svc_sock_free(struct svc_xprt *);
89static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
90static int svc_deferred_recv(struct svc_rqst *rqstp);
91static struct cache_deferred_req *svc_defer(struct cache_req *req);
92
93/* apparently the "standard" is that clients close
94 * idle connections after 5 minutes, servers after
95 * 6 minutes
96 * http://www.connectathon.org/talks96/nfstcp.pdf
97 */
98static int svc_conn_age_period = 6*60;
99 61
62static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
63 struct sockaddr *, int, int);
100#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
101static struct lock_class_key svc_key[2]; 65static struct lock_class_key svc_key[2];
102static struct lock_class_key svc_slock_key[2]; 66static struct lock_class_key svc_slock_key[2];
103 67
104static inline void svc_reclassify_socket(struct socket *sock) 68static void svc_reclassify_socket(struct socket *sock)
105{ 69{
106 struct sock *sk = sock->sk; 70 struct sock *sk = sock->sk;
107 BUG_ON(sock_owned_by_user(sk)); 71 BUG_ON(sock_owned_by_user(sk));
108 switch (sk->sk_family) { 72 switch (sk->sk_family) {
109 case AF_INET: 73 case AF_INET:
110 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", 74 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
111 &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); 75 &svc_slock_key[0],
76 "sk_xprt.xpt_lock-AF_INET-NFSD",
77 &svc_key[0]);
112 break; 78 break;
113 79
114 case AF_INET6: 80 case AF_INET6:
115 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", 81 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
116 &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); 82 &svc_slock_key[1],
83 "sk_xprt.xpt_lock-AF_INET6-NFSD",
84 &svc_key[1]);
117 break; 85 break;
118 86
119 default: 87 default:
@@ -121,81 +89,26 @@ static inline void svc_reclassify_socket(struct socket *sock)
121 } 89 }
122} 90}
123#else 91#else
124static inline void svc_reclassify_socket(struct socket *sock) 92static void svc_reclassify_socket(struct socket *sock)
125{ 93{
126} 94}
127#endif 95#endif
128 96
129static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len)
130{
131 switch (addr->sa_family) {
132 case AF_INET:
133 snprintf(buf, len, "%u.%u.%u.%u, port=%u",
134 NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
135 ntohs(((struct sockaddr_in *) addr)->sin_port));
136 break;
137
138 case AF_INET6:
139 snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
140 NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
141 ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
142 break;
143
144 default:
145 snprintf(buf, len, "unknown address type: %d", addr->sa_family);
146 break;
147 }
148 return buf;
149}
150
151/**
152 * svc_print_addr - Format rq_addr field for printing
153 * @rqstp: svc_rqst struct containing address to print
154 * @buf: target buffer for formatted address
155 * @len: length of target buffer
156 *
157 */
158char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
159{
160 return __svc_print_addr(svc_addr(rqstp), buf, len);
161}
162EXPORT_SYMBOL_GPL(svc_print_addr);
163
164/*
165 * Queue up an idle server thread. Must have pool->sp_lock held.
166 * Note: this is really a stack rather than a queue, so that we only
167 * use as many different threads as we need, and the rest don't pollute
168 * the cache.
169 */
170static inline void
171svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
172{
173 list_add(&rqstp->rq_list, &pool->sp_threads);
174}
175
176/*
177 * Dequeue an nfsd thread. Must have pool->sp_lock held.
178 */
179static inline void
180svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
181{
182 list_del(&rqstp->rq_list);
183}
184
185/* 97/*
186 * Release an skbuff after use 98 * Release an skbuff after use
187 */ 99 */
188static inline void 100static void svc_release_skb(struct svc_rqst *rqstp)
189svc_release_skb(struct svc_rqst *rqstp)
190{ 101{
191 struct sk_buff *skb = rqstp->rq_skbuff; 102 struct sk_buff *skb = rqstp->rq_xprt_ctxt;
192 struct svc_deferred_req *dr = rqstp->rq_deferred; 103 struct svc_deferred_req *dr = rqstp->rq_deferred;
193 104
194 if (skb) { 105 if (skb) {
195 rqstp->rq_skbuff = NULL; 106 struct svc_sock *svsk =
107 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
108 rqstp->rq_xprt_ctxt = NULL;
196 109
197 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 110 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
198 skb_free_datagram(rqstp->rq_sock->sk_sk, skb); 111 skb_free_datagram(svsk->sk_sk, skb);
199 } 112 }
200 if (dr) { 113 if (dr) {
201 rqstp->rq_deferred = NULL; 114 rqstp->rq_deferred = NULL;
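
svc_release_skb above shows the mechanical half of this patch: code that used rqstp->rq_sock now receives only the generic rqstp->rq_xprt and recovers the socket-private state with container_of(), relying on struct svc_sock embedding its struct svc_xprt as the sk_xprt member. A standalone demonstration of that embedding trick (types invented for the example):

    #include <stddef.h>
    #include <stdio.h>

    /* Recover a pointer to the enclosing struct from a member pointer. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct xprt { int flags; };          /* generic part */

    struct sock_xprt {                   /* socket-specific wrapper */
        int fd;
        struct xprt xprt;                /* embedded, like svc_sock.sk_xprt */
    };

    int main(void)
    {
        struct sock_xprt s = { .fd = 42 };
        struct xprt *generic = &s.xprt;  /* what generic code passes around */

        struct sock_xprt *back = container_of(generic, struct sock_xprt, xprt);
        printf("fd = %d\n", back->fd);   /* prints 42 */
        return 0;
    }

Because the member offset is computed at compile time, the generic layer pays nothing for this indirection.
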
@@ -203,253 +116,6 @@ svc_release_skb(struct svc_rqst *rqstp)
203 } 116 }
204} 117}
205 118
206/*
207 * Any space to write?
208 */
209static inline unsigned long
210svc_sock_wspace(struct svc_sock *svsk)
211{
212 int wspace;
213
214 if (svsk->sk_sock->type == SOCK_STREAM)
215 wspace = sk_stream_wspace(svsk->sk_sk);
216 else
217 wspace = sock_wspace(svsk->sk_sk);
218
219 return wspace;
220}
221
222/*
223 * Queue up a socket with data pending. If there are idle nfsd
224 * processes, wake 'em up.
225 *
226 */
227static void
228svc_sock_enqueue(struct svc_sock *svsk)
229{
230 struct svc_serv *serv = svsk->sk_server;
231 struct svc_pool *pool;
232 struct svc_rqst *rqstp;
233 int cpu;
234
235 if (!(svsk->sk_flags &
236 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
237 return;
238 if (test_bit(SK_DEAD, &svsk->sk_flags))
239 return;
240
241 cpu = get_cpu();
242 pool = svc_pool_for_cpu(svsk->sk_server, cpu);
243 put_cpu();
244
245 spin_lock_bh(&pool->sp_lock);
246
247 if (!list_empty(&pool->sp_threads) &&
248 !list_empty(&pool->sp_sockets))
249 printk(KERN_ERR
250 "svc_sock_enqueue: threads and sockets both waiting??\n");
251
252 if (test_bit(SK_DEAD, &svsk->sk_flags)) {
253 /* Don't enqueue dead sockets */
254 dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
255 goto out_unlock;
256 }
257
258 /* Mark socket as busy. It will remain in this state until the
259 * server has processed all pending data and put the socket back
260 * on the idle list. We update SK_BUSY atomically because
261 * it also guards against trying to enqueue the svc_sock twice.
262 */
263 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) {
264 /* Don't enqueue socket while already enqueued */
265 dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
266 goto out_unlock;
267 }
268 BUG_ON(svsk->sk_pool != NULL);
269 svsk->sk_pool = pool;
270
271 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
272 if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
273 > svc_sock_wspace(svsk))
274 && !test_bit(SK_CLOSE, &svsk->sk_flags)
275 && !test_bit(SK_CONN, &svsk->sk_flags)) {
276 /* Don't enqueue while not enough space for reply */
277 dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
278 svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
279 svc_sock_wspace(svsk));
280 svsk->sk_pool = NULL;
281 clear_bit(SK_BUSY, &svsk->sk_flags);
282 goto out_unlock;
283 }
284 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
285
286
287 if (!list_empty(&pool->sp_threads)) {
288 rqstp = list_entry(pool->sp_threads.next,
289 struct svc_rqst,
290 rq_list);
291 dprintk("svc: socket %p served by daemon %p\n",
292 svsk->sk_sk, rqstp);
293 svc_thread_dequeue(pool, rqstp);
294 if (rqstp->rq_sock)
295 printk(KERN_ERR
296 "svc_sock_enqueue: server %p, rq_sock=%p!\n",
297 rqstp, rqstp->rq_sock);
298 rqstp->rq_sock = svsk;
299 atomic_inc(&svsk->sk_inuse);
300 rqstp->rq_reserved = serv->sv_max_mesg;
301 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
302 BUG_ON(svsk->sk_pool != pool);
303 wake_up(&rqstp->rq_wait);
304 } else {
305 dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
306 list_add_tail(&svsk->sk_ready, &pool->sp_sockets);
307 BUG_ON(svsk->sk_pool != pool);
308 }
309
310out_unlock:
311 spin_unlock_bh(&pool->sp_lock);
312}
313
314/*
315 * Dequeue the first socket. Must be called with the pool->sp_lock held.
316 */
317static inline struct svc_sock *
318svc_sock_dequeue(struct svc_pool *pool)
319{
320 struct svc_sock *svsk;
321
322 if (list_empty(&pool->sp_sockets))
323 return NULL;
324
325 svsk = list_entry(pool->sp_sockets.next,
326 struct svc_sock, sk_ready);
327 list_del_init(&svsk->sk_ready);
328
329 dprintk("svc: socket %p dequeued, inuse=%d\n",
330 svsk->sk_sk, atomic_read(&svsk->sk_inuse));
331
332 return svsk;
333}
334
335/*
336 * Having read something from a socket, check whether it
337 * needs to be re-enqueued.
338 * Note: SK_DATA only gets cleared when a read-attempt finds
339 * no (or insufficient) data.
340 */
341static inline void
342svc_sock_received(struct svc_sock *svsk)
343{
344 svsk->sk_pool = NULL;
345 clear_bit(SK_BUSY, &svsk->sk_flags);
346 svc_sock_enqueue(svsk);
347}
348
349
350/**
351 * svc_reserve - change the space reserved for the reply to a request.
352 * @rqstp: The request in question
353 * @space: new max space to reserve
354 *
355 * Each request reserves some space on the output queue of the socket
356 * to make sure the reply fits. This function reduces that reserved
357 * space to be the amount of space used already, plus @space.
358 *
359 */
360void svc_reserve(struct svc_rqst *rqstp, int space)
361{
362 space += rqstp->rq_res.head[0].iov_len;
363
364 if (space < rqstp->rq_reserved) {
365 struct svc_sock *svsk = rqstp->rq_sock;
366 atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved);
367 rqstp->rq_reserved = space;
368
369 svc_sock_enqueue(svsk);
370 }
371}
372
373/*
374 * Release a socket after use.
375 */
376static inline void
377svc_sock_put(struct svc_sock *svsk)
378{
379 if (atomic_dec_and_test(&svsk->sk_inuse)) {
380 BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags));
381
382 dprintk("svc: releasing dead socket\n");
383 if (svsk->sk_sock->file)
384 sockfd_put(svsk->sk_sock);
385 else
386 sock_release(svsk->sk_sock);
387 if (svsk->sk_info_authunix != NULL)
388 svcauth_unix_info_release(svsk->sk_info_authunix);
389 kfree(svsk);
390 }
391}
392
393static void
394svc_sock_release(struct svc_rqst *rqstp)
395{
396 struct svc_sock *svsk = rqstp->rq_sock;
397
398 svc_release_skb(rqstp);
399
400 svc_free_res_pages(rqstp);
401 rqstp->rq_res.page_len = 0;
402 rqstp->rq_res.page_base = 0;
403
404
405 /* Reset response buffer and release
406 * the reservation.
407 * But first, check that enough space was reserved
408 * for the reply, otherwise we have a bug!
409 */
410 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
411 printk(KERN_ERR "RPC request reserved %d but used %d\n",
412 rqstp->rq_reserved,
413 rqstp->rq_res.len);
414
415 rqstp->rq_res.head[0].iov_len = 0;
416 svc_reserve(rqstp, 0);
417 rqstp->rq_sock = NULL;
418
419 svc_sock_put(svsk);
420}
421
422/*
423 * External function to wake up a server waiting for data
424 * This really only makes sense for services like lockd
425 * which have exactly one thread anyway.
426 */
427void
428svc_wake_up(struct svc_serv *serv)
429{
430 struct svc_rqst *rqstp;
431 unsigned int i;
432 struct svc_pool *pool;
433
434 for (i = 0; i < serv->sv_nrpools; i++) {
435 pool = &serv->sv_pools[i];
436
437 spin_lock_bh(&pool->sp_lock);
438 if (!list_empty(&pool->sp_threads)) {
439 rqstp = list_entry(pool->sp_threads.next,
440 struct svc_rqst,
441 rq_list);
442 dprintk("svc: daemon %p woken up.\n", rqstp);
443 /*
444 svc_thread_dequeue(pool, rqstp);
445 rqstp->rq_sock = NULL;
446 */
447 wake_up(&rqstp->rq_wait);
448 }
449 spin_unlock_bh(&pool->sp_lock);
450 }
451}
452
453union svc_pktinfo_u { 119union svc_pktinfo_u {
454 struct in_pktinfo pkti; 120 struct in_pktinfo pkti;
455 struct in6_pktinfo pkti6; 121 struct in6_pktinfo pkti6;
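
The big deletion above is mostly relocation: svc_sock_enqueue, svc_sock_dequeue, svc_reserve, svc_sock_put and svc_wake_up move, nearly verbatim, into the new generic net/sunrpc/svc_xprt.c. The one transport-specific piece, the write-space test, stays behind and resurfaces below as the xpo_has_wspace methods. The heuristic it preserves: every dispatched request reserves sv_max_mesg bytes of reply room, and new work is accepted only while twice the outstanding reservation still fits in the send buffer. As a sketch, with invented names:

    /* Sketch of the write-space gate that survives as xpo_has_wspace
     * (names invented; the kernel works in bytes of socket send buffer). */
    static int has_wspace(long reserved, long max_mesg, long wspace)
    {
        long required = reserved + max_mesg;  /* room for one more reply */
        return required * 2 <= wspace;        /* demand 2x headroom */
    }

The kernel variant also toggles SOCK_NOSPACE around the test, so a failed check arms the socket's write-space callback to re-enqueue the transport once room appears.
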
@@ -459,7 +125,9 @@ union svc_pktinfo_u {
459 125
460static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) 126static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
461{ 127{
462 switch (rqstp->rq_sock->sk_sk->sk_family) { 128 struct svc_sock *svsk =
129 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
130 switch (svsk->sk_sk->sk_family) {
463 case AF_INET: { 131 case AF_INET: {
464 struct in_pktinfo *pki = CMSG_DATA(cmh); 132 struct in_pktinfo *pki = CMSG_DATA(cmh);
465 133
@@ -489,10 +157,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
489/* 157/*
490 * Generic sendto routine 158 * Generic sendto routine
491 */ 159 */
492static int 160static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
493svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
494{ 161{
495 struct svc_sock *svsk = rqstp->rq_sock; 162 struct svc_sock *svsk =
163 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
496 struct socket *sock = svsk->sk_sock; 164 struct socket *sock = svsk->sk_sock;
497 int slen; 165 int slen;
498 union { 166 union {
@@ -565,7 +233,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
565 } 233 }
566out: 234out:
567 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 235 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
568 rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, 236 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
569 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 237 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
570 238
571 return len; 239 return len;
@@ -602,7 +270,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
602 if (!serv) 270 if (!serv)
603 return 0; 271 return 0;
604 spin_lock_bh(&serv->sv_lock); 272 spin_lock_bh(&serv->sv_lock);
605 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { 273 list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
606 int onelen = one_sock_name(buf+len, svsk); 274 int onelen = one_sock_name(buf+len, svsk);
607 if (toclose && strcmp(toclose, buf+len) == 0) 275 if (toclose && strcmp(toclose, buf+len) == 0)
608 closesk = svsk; 276 closesk = svsk;
@@ -614,7 +282,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
614 /* Should unregister with portmap, but you cannot 282 /* Should unregister with portmap, but you cannot
615 * unregister just one protocol... 283 * unregister just one protocol...
616 */ 284 */
617 svc_close_socket(closesk); 285 svc_close_xprt(&closesk->sk_xprt);
618 else if (toclose) 286 else if (toclose)
619 return -ENOENT; 287 return -ENOENT;
620 return len; 288 return len;
@@ -624,8 +292,7 @@ EXPORT_SYMBOL(svc_sock_names);
624/* 292/*
625 * Check input queue length 293 * Check input queue length
626 */ 294 */
627static int 295static int svc_recv_available(struct svc_sock *svsk)
628svc_recv_available(struct svc_sock *svsk)
629{ 296{
630 struct socket *sock = svsk->sk_sock; 297 struct socket *sock = svsk->sk_sock;
631 int avail, err; 298 int avail, err;
@@ -638,48 +305,31 @@ svc_recv_available(struct svc_sock *svsk)
638/* 305/*
639 * Generic recvfrom routine. 306 * Generic recvfrom routine.
640 */ 307 */
641static int 308static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
642svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) 309 int buflen)
643{ 310{
644 struct svc_sock *svsk = rqstp->rq_sock; 311 struct svc_sock *svsk =
312 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
645 struct msghdr msg = { 313 struct msghdr msg = {
646 .msg_flags = MSG_DONTWAIT, 314 .msg_flags = MSG_DONTWAIT,
647 }; 315 };
648 struct sockaddr *sin;
649 int len; 316 int len;
650 317
318 rqstp->rq_xprt_hlen = 0;
319
651 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, 320 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
652 msg.msg_flags); 321 msg.msg_flags);
653 322
654 /* sock_recvmsg doesn't fill in the name/namelen, so we must..
655 */
656 memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen);
657 rqstp->rq_addrlen = svsk->sk_remotelen;
658
659 /* Destination address in request is needed for binding the
660 * source address in RPC callbacks later.
661 */
662 sin = (struct sockaddr *)&svsk->sk_local;
663 switch (sin->sa_family) {
664 case AF_INET:
665 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
666 break;
667 case AF_INET6:
668 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
669 break;
670 }
671
672 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 323 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
673 svsk, iov[0].iov_base, iov[0].iov_len, len); 324 svsk, iov[0].iov_base, iov[0].iov_len, len);
674
675 return len; 325 return len;
676} 326}
677 327
678/* 328/*
679 * Set socket snd and rcv buffer lengths 329 * Set socket snd and rcv buffer lengths
680 */ 330 */
681static inline void 331static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
682svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) 332 unsigned int rcv)
683{ 333{
684#if 0 334#if 0
685 mm_segment_t oldfs; 335 mm_segment_t oldfs;
@@ -704,16 +354,16 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
704/* 354/*
705 * INET callback when data has been received on the socket. 355 * INET callback when data has been received on the socket.
706 */ 356 */
707static void 357static void svc_udp_data_ready(struct sock *sk, int count)
708svc_udp_data_ready(struct sock *sk, int count)
709{ 358{
710 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 359 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
711 360
712 if (svsk) { 361 if (svsk) {
713 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 362 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
714 svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); 363 svsk, sk, count,
715 set_bit(SK_DATA, &svsk->sk_flags); 364 test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
716 svc_sock_enqueue(svsk); 365 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
366 svc_xprt_enqueue(&svsk->sk_xprt);
717 } 367 }
718 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 368 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
719 wake_up_interruptible(sk->sk_sleep); 369 wake_up_interruptible(sk->sk_sleep);
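
svc_udp_data_ready above keeps the shape shared by every sk_* callback in this file: record the event as a flag bit on the transport, enqueue the transport for a worker, then wake anyone sleeping directly on the socket. A userspace analogue of that flag-then-wake protocol, using a condition variable where the kernel uses wait queues (illustrative only):

    #include <pthread.h>

    struct xprt {
        pthread_mutex_t lock;
        pthread_cond_t  wait;
        unsigned long   flags;           /* bit 0 plays the role of XPT_DATA */
    };

    /* Called from the notification side when data arrives. */
    static void data_ready(struct xprt *x)
    {
        pthread_mutex_lock(&x->lock);
        x->flags |= 1UL;                 /* set_bit(XPT_DATA, ...) */
        pthread_cond_signal(&x->wait);   /* svc_xprt_enqueue() + wake_up() */
        pthread_mutex_unlock(&x->lock);
    }

    /* Worker side: sleep until the flag is set, then clear and service it. */
    static void wait_for_data(struct xprt *x)
    {
        pthread_mutex_lock(&x->lock);
        while (!(x->flags & 1UL))
            pthread_cond_wait(&x->wait, &x->lock);
        x->flags &= ~1UL;                /* clear_bit before reading */
        pthread_mutex_unlock(&x->lock);
    }
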
@@ -722,15 +372,14 @@ svc_udp_data_ready(struct sock *sk, int count)
722/* 372/*
723 * INET callback when space is newly available on the socket. 373 * INET callback when space is newly available on the socket.
724 */ 374 */
725static void 375static void svc_write_space(struct sock *sk)
726svc_write_space(struct sock *sk)
727{ 376{
728 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 377 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
729 378
730 if (svsk) { 379 if (svsk) {
731 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 380 dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
732 svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); 381 svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
733 svc_sock_enqueue(svsk); 382 svc_xprt_enqueue(&svsk->sk_xprt);
734 } 383 }
735 384
736 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { 385 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
@@ -740,10 +389,19 @@ svc_write_space(struct sock *sk)
740 } 389 }
741} 390}
742 391
743static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, 392/*
744 struct cmsghdr *cmh) 393 * Copy the UDP datagram's destination address to the rqstp structure.
394 * The 'destination' address in this case is the address to which the
395 * peer sent the datagram, i.e. our local address. For multihomed
396 * hosts, this can change from msg to msg. Note that only the IP
397 * address changes, the port number should remain the same.
398 */
399static void svc_udp_get_dest_address(struct svc_rqst *rqstp,
400 struct cmsghdr *cmh)
745{ 401{
746 switch (rqstp->rq_sock->sk_sk->sk_family) { 402 struct svc_sock *svsk =
403 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
404 switch (svsk->sk_sk->sk_family) {
747 case AF_INET: { 405 case AF_INET: {
748 struct in_pktinfo *pki = CMSG_DATA(cmh); 406 struct in_pktinfo *pki = CMSG_DATA(cmh);
749 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; 407 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
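
The comment added above explains the purpose: on a multihomed host the reply must be sourced from whichever local address the client targeted, and that address travels as IP_PKTINFO (or IPV6_PKTINFO) ancillary data rather than in the ordinary msghdr fields. A userspace sketch of reading it for IPv4; note the kernel path above uses ipi_spec_dst, while ipi_addr carries the datagram's literal header destination (error handling trimmed):

    #define _GNU_SOURCE
    #include <netinet/in.h>
    #include <sys/socket.h>

    /* Receive one datagram and report the local address it was sent to.
     * Requires the socket to have been prepared with:
     *   int on = 1;
     *   setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
     */
    static ssize_t recv_with_dst(int fd, void *buf, size_t len,
                                 struct in_addr *dst)
    {
        char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
            .msg_iov        = &iov,
            .msg_iovlen     = 1,
            .msg_control    = cbuf,
            .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cm;
        ssize_t n = recvmsg(fd, &msg, 0);

        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
            if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_PKTINFO) {
                struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cm);
                *dst = pki->ipi_addr;   /* header destination address */
                break;
            }
        return n;
    }
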
@@ -760,11 +418,11 @@ static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp,
760/* 418/*
761 * Receive a datagram from a UDP socket. 419 * Receive a datagram from a UDP socket.
762 */ 420 */
763static int 421static int svc_udp_recvfrom(struct svc_rqst *rqstp)
764svc_udp_recvfrom(struct svc_rqst *rqstp)
765{ 422{
766 struct svc_sock *svsk = rqstp->rq_sock; 423 struct svc_sock *svsk =
767 struct svc_serv *serv = svsk->sk_server; 424 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
425 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
768 struct sk_buff *skb; 426 struct sk_buff *skb;
769 union { 427 union {
770 struct cmsghdr hdr; 428 struct cmsghdr hdr;
@@ -779,7 +437,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
779 .msg_flags = MSG_DONTWAIT, 437 .msg_flags = MSG_DONTWAIT,
780 }; 438 };
781 439
782 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 440 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
783 /* udp sockets need large rcvbuf as all pending 441 /* udp sockets need large rcvbuf as all pending
784 * requests are still in that buffer. sndbuf must 442 * requests are still in that buffer. sndbuf must
785 * also be large enough that there is enough space 443 * also be large enough that there is enough space
@@ -792,17 +450,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
792 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 450 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
793 (serv->sv_nrthreads+3) * serv->sv_max_mesg); 451 (serv->sv_nrthreads+3) * serv->sv_max_mesg);
794 452
795 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 453 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
796 svc_sock_received(svsk);
797 return svc_deferred_recv(rqstp);
798 }
799
800 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
801 svc_delete_socket(svsk);
802 return 0;
803 }
804
805 clear_bit(SK_DATA, &svsk->sk_flags);
806 skb = NULL; 454 skb = NULL;
807 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 455 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
808 0, 0, MSG_PEEK | MSG_DONTWAIT); 456 0, 0, MSG_PEEK | MSG_DONTWAIT);
@@ -813,24 +461,27 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
813 if (err != -EAGAIN) { 461 if (err != -EAGAIN) {
814 /* possibly an icmp error */ 462 /* possibly an icmp error */
815 dprintk("svc: recvfrom returned error %d\n", -err); 463 dprintk("svc: recvfrom returned error %d\n", -err);
816 set_bit(SK_DATA, &svsk->sk_flags); 464 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
817 } 465 }
818 svc_sock_received(svsk); 466 svc_xprt_received(&svsk->sk_xprt);
819 return -EAGAIN; 467 return -EAGAIN;
820 } 468 }
821 rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 469 len = svc_addr_len(svc_addr(rqstp));
470 if (len < 0)
471 return len;
472 rqstp->rq_addrlen = len;
822 if (skb->tstamp.tv64 == 0) { 473 if (skb->tstamp.tv64 == 0) {
823 skb->tstamp = ktime_get_real(); 474 skb->tstamp = ktime_get_real();
824 /* Don't enable netstamp, sunrpc doesn't 475 /* Don't enable netstamp, sunrpc doesn't
825 need that much accuracy */ 476 need that much accuracy */
826 } 477 }
827 svsk->sk_sk->sk_stamp = skb->tstamp; 478 svsk->sk_sk->sk_stamp = skb->tstamp;
828 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 479 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
829 480
830 /* 481 /*
831 * Maybe more packets - kick another thread ASAP. 482 * Maybe more packets - kick another thread ASAP.
832 */ 483 */
833 svc_sock_received(svsk); 484 svc_xprt_received(&svsk->sk_xprt);
834 485
835 len = skb->len - sizeof(struct udphdr); 486 len = skb->len - sizeof(struct udphdr);
836 rqstp->rq_arg.len = len; 487 rqstp->rq_arg.len = len;
@@ -861,13 +512,14 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
861 skb_free_datagram(svsk->sk_sk, skb); 512 skb_free_datagram(svsk->sk_sk, skb);
862 } else { 513 } else {
863 /* we can use it in-place */ 514 /* we can use it in-place */
864 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 515 rqstp->rq_arg.head[0].iov_base = skb->data +
516 sizeof(struct udphdr);
865 rqstp->rq_arg.head[0].iov_len = len; 517 rqstp->rq_arg.head[0].iov_len = len;
866 if (skb_checksum_complete(skb)) { 518 if (skb_checksum_complete(skb)) {
867 skb_free_datagram(svsk->sk_sk, skb); 519 skb_free_datagram(svsk->sk_sk, skb);
868 return 0; 520 return 0;
869 } 521 }
870 rqstp->rq_skbuff = skb; 522 rqstp->rq_xprt_ctxt = skb;
871 } 523 }
872 524
873 rqstp->rq_arg.page_base = 0; 525 rqstp->rq_arg.page_base = 0;
@@ -900,27 +552,81 @@ svc_udp_sendto(struct svc_rqst *rqstp)
900 return error; 552 return error;
901} 553}
902 554
903static void 555static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
904svc_udp_init(struct svc_sock *svsk) 556{
557}
558
559static int svc_udp_has_wspace(struct svc_xprt *xprt)
560{
561 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
562 struct svc_serv *serv = xprt->xpt_server;
563 unsigned long required;
564
565 /*
566 * Set the SOCK_NOSPACE flag before checking the available
567 * sock space.
568 */
569 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
570 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
571 if (required*2 > sock_wspace(svsk->sk_sk))
572 return 0;
573 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
574 return 1;
575}
576
577static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
578{
579 BUG();
580 return NULL;
581}
582
583static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
584 struct sockaddr *sa, int salen,
585 int flags)
586{
587 return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
588}
589
590static struct svc_xprt_ops svc_udp_ops = {
591 .xpo_create = svc_udp_create,
592 .xpo_recvfrom = svc_udp_recvfrom,
593 .xpo_sendto = svc_udp_sendto,
594 .xpo_release_rqst = svc_release_skb,
595 .xpo_detach = svc_sock_detach,
596 .xpo_free = svc_sock_free,
597 .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
598 .xpo_has_wspace = svc_udp_has_wspace,
599 .xpo_accept = svc_udp_accept,
600};
601
602static struct svc_xprt_class svc_udp_class = {
603 .xcl_name = "udp",
604 .xcl_owner = THIS_MODULE,
605 .xcl_ops = &svc_udp_ops,
606 .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
607};
608
609static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
905{ 610{
906 int one = 1; 611 int one = 1;
907 mm_segment_t oldfs; 612 mm_segment_t oldfs;
908 613
614 svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
615 clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
909 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 616 svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
910 svsk->sk_sk->sk_write_space = svc_write_space; 617 svsk->sk_sk->sk_write_space = svc_write_space;
911 svsk->sk_recvfrom = svc_udp_recvfrom;
912 svsk->sk_sendto = svc_udp_sendto;
913 618
914 /* initialise setting must have enough space to 619 /* initialise setting must have enough space to
915 * receive and respond to one request. 620 * receive and respond to one request.
916 * svc_udp_recvfrom will re-adjust if necessary 621 * svc_udp_recvfrom will re-adjust if necessary
917 */ 622 */
918 svc_sock_setbufsize(svsk->sk_sock, 623 svc_sock_setbufsize(svsk->sk_sock,
919 3 * svsk->sk_server->sv_max_mesg, 624 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
920 3 * svsk->sk_server->sv_max_mesg); 625 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
921 626
922 set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ 627 /* data might have come in before data_ready set up */
923 set_bit(SK_CHNGBUF, &svsk->sk_flags); 628 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
629 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
924 630
925 oldfs = get_fs(); 631 oldfs = get_fs();
926 set_fs(KERNEL_DS); 632 set_fs(KERNEL_DS);
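
The svc_udp_ops/svc_udp_class pair above (with TCP twins later in this patch) is the heart of the conversion: transport behaviour becomes a table of methods, a class groups the table with its payload limit, and svc_udp_init now simply binds a socket to its class through svc_xprt_init. The same pattern reduced to plain C, stripped of everything kernel-specific (all names invented):

    #include <stdio.h>

    struct xprt;                              /* forward declaration */

    struct xprt_ops {                         /* role of svc_xprt_ops */
        int (*recvfrom)(struct xprt *);
        int (*sendto)(struct xprt *);
    };

    struct xprt_class {                       /* role of svc_xprt_class */
        const char            *name;
        const struct xprt_ops *ops;
        int                    max_payload;
    };

    struct xprt {
        const struct xprt_class *class;       /* bound at init time */
    };

    static int udp_recvfrom(struct xprt *x) { (void)x; return 0; }
    static int udp_sendto(struct xprt *x)   { (void)x; return 0; }

    static const struct xprt_ops udp_ops = {
        .recvfrom = udp_recvfrom,
        .sendto   = udp_sendto,
    };

    static const struct xprt_class udp_class = {
        .name = "udp", .ops = &udp_ops, .max_payload = 32768,
    };

    int main(void)
    {
        struct xprt x = { .class = &udp_class };
        /* generic code never knows which transport it is driving */
        x.class->ops->recvfrom(&x);
        printf("served one request over %s\n", x.class->name);
        return 0;
    }

Once the generic code drives only the ops table, a new transport can register its own class without the generic code changing.
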
@@ -934,8 +640,7 @@ svc_udp_init(struct svc_sock *svsk)
934 * A data_ready event on a listening socket means there's a connection 640 * A data_ready event on a listening socket means there's a connection
935 * pending. Do not use state_change as a substitute for it. 641 * pending. Do not use state_change as a substitute for it.
936 */ 642 */
937static void 643static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
938svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
939{ 644{
940 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 645 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
941 646
@@ -954,8 +659,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
954 */ 659 */
955 if (sk->sk_state == TCP_LISTEN) { 660 if (sk->sk_state == TCP_LISTEN) {
956 if (svsk) { 661 if (svsk) {
957 set_bit(SK_CONN, &svsk->sk_flags); 662 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
958 svc_sock_enqueue(svsk); 663 svc_xprt_enqueue(&svsk->sk_xprt);
959 } else 664 } else
960 printk("svc: socket %p: no user data\n", sk); 665 printk("svc: socket %p: no user data\n", sk);
961 } 666 }
@@ -967,8 +672,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
967/* 672/*
968 * A state change on a connected socket means it's dying or dead. 673 * A state change on a connected socket means it's dying or dead.
969 */ 674 */
970static void 675static void svc_tcp_state_change(struct sock *sk)
971svc_tcp_state_change(struct sock *sk)
972{ 676{
973 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 677 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
974 678
@@ -978,51 +682,36 @@ svc_tcp_state_change(struct sock *sk)
978 if (!svsk) 682 if (!svsk)
979 printk("svc: socket %p: no user data\n", sk); 683 printk("svc: socket %p: no user data\n", sk);
980 else { 684 else {
981 set_bit(SK_CLOSE, &svsk->sk_flags); 685 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
982 svc_sock_enqueue(svsk); 686 svc_xprt_enqueue(&svsk->sk_xprt);
983 } 687 }
984 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 688 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
985 wake_up_interruptible_all(sk->sk_sleep); 689 wake_up_interruptible_all(sk->sk_sleep);
986} 690}
987 691
988static void 692static void svc_tcp_data_ready(struct sock *sk, int count)
989svc_tcp_data_ready(struct sock *sk, int count)
990{ 693{
991 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 694 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
992 695
993 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 696 dprintk("svc: socket %p TCP data ready (svsk %p)\n",
994 sk, sk->sk_user_data); 697 sk, sk->sk_user_data);
995 if (svsk) { 698 if (svsk) {
996 set_bit(SK_DATA, &svsk->sk_flags); 699 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
997 svc_sock_enqueue(svsk); 700 svc_xprt_enqueue(&svsk->sk_xprt);
998 } 701 }
999 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 702 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1000 wake_up_interruptible(sk->sk_sleep); 703 wake_up_interruptible(sk->sk_sleep);
1001} 704}
1002 705
1003static inline int svc_port_is_privileged(struct sockaddr *sin)
1004{
1005 switch (sin->sa_family) {
1006 case AF_INET:
1007 return ntohs(((struct sockaddr_in *)sin)->sin_port)
1008 < PROT_SOCK;
1009 case AF_INET6:
1010 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
1011 < PROT_SOCK;
1012 default:
1013 return 0;
1014 }
1015}
1016
1017/* 706/*
1018 * Accept a TCP connection 707 * Accept a TCP connection
1019 */ 708 */
1020static void 709static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
1021svc_tcp_accept(struct svc_sock *svsk)
1022{ 710{
711 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1023 struct sockaddr_storage addr; 712 struct sockaddr_storage addr;
1024 struct sockaddr *sin = (struct sockaddr *) &addr; 713 struct sockaddr *sin = (struct sockaddr *) &addr;
1025 struct svc_serv *serv = svsk->sk_server; 714 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1026 struct socket *sock = svsk->sk_sock; 715 struct socket *sock = svsk->sk_sock;
1027 struct socket *newsock; 716 struct socket *newsock;
1028 struct svc_sock *newsvsk; 717 struct svc_sock *newsvsk;
@@ -1031,9 +720,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1031 720
1032 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 721 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
1033 if (!sock) 722 if (!sock)
1034 return; 723 return NULL;
1035 724
1036 clear_bit(SK_CONN, &svsk->sk_flags); 725 clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1037 err = kernel_accept(sock, &newsock, O_NONBLOCK); 726 err = kernel_accept(sock, &newsock, O_NONBLOCK);
1038 if (err < 0) { 727 if (err < 0) {
1039 if (err == -ENOMEM) 728 if (err == -ENOMEM)
@@ -1042,11 +731,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1042 else if (err != -EAGAIN && net_ratelimit()) 731 else if (err != -EAGAIN && net_ratelimit())
1043 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 732 printk(KERN_WARNING "%s: accept failed (err %d)!\n",
1044 serv->sv_name, -err); 733 serv->sv_name, -err);
1045 return; 734 return NULL;
1046 } 735 }
1047 736 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1048 set_bit(SK_CONN, &svsk->sk_flags);
1049 svc_sock_enqueue(svsk);
1050 737
1051 err = kernel_getpeername(newsock, sin, &slen); 738 err = kernel_getpeername(newsock, sin, &slen);
1052 if (err < 0) { 739 if (err < 0) {
@@ -1077,106 +764,42 @@ svc_tcp_accept(struct svc_sock *svsk)
1077 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 764 if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
1078 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) 765 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
1079 goto failed; 766 goto failed;
1080 memcpy(&newsvsk->sk_remote, sin, slen); 767 svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
1081 newsvsk->sk_remotelen = slen;
1082 err = kernel_getsockname(newsock, sin, &slen); 768 err = kernel_getsockname(newsock, sin, &slen);
1083 if (unlikely(err < 0)) { 769 if (unlikely(err < 0)) {
1084 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); 770 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
1085 slen = offsetof(struct sockaddr, sa_data); 771 slen = offsetof(struct sockaddr, sa_data);
1086 } 772 }
1087 memcpy(&newsvsk->sk_local, sin, slen); 773 svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
1088
1089 svc_sock_received(newsvsk);
1090
1091 /* make sure that we don't have too many active connections.
1092 * If we have, something must be dropped.
1093 *
1094 * There's no point in trying to do random drop here for
1095 * DoS prevention. The NFS clients does 1 reconnect in 15
1096 * seconds. An attacker can easily beat that.
1097 *
1098 * The only somewhat efficient mechanism would be if drop
1099 * old connections from the same IP first. But right now
1100 * we don't even record the client IP in svc_sock.
1101 */
1102 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
1103 struct svc_sock *svsk = NULL;
1104 spin_lock_bh(&serv->sv_lock);
1105 if (!list_empty(&serv->sv_tempsocks)) {
1106 if (net_ratelimit()) {
1107 /* Try to help the admin */
1108 printk(KERN_NOTICE "%s: too many open TCP "
1109 "sockets, consider increasing the "
1110 "number of nfsd threads\n",
1111 serv->sv_name);
1112 printk(KERN_NOTICE
1113 "%s: last TCP connect from %s\n",
1114 serv->sv_name, __svc_print_addr(sin,
1115 buf, sizeof(buf)));
1116 }
1117 /*
1118 * Always select the oldest socket. It's not fair,
1119 * but so is life
1120 */
1121 svsk = list_entry(serv->sv_tempsocks.prev,
1122 struct svc_sock,
1123 sk_list);
1124 set_bit(SK_CLOSE, &svsk->sk_flags);
1125 atomic_inc(&svsk->sk_inuse);
1126 }
1127 spin_unlock_bh(&serv->sv_lock);
1128
1129 if (svsk) {
1130 svc_sock_enqueue(svsk);
1131 svc_sock_put(svsk);
1132 }
1133
1134 }
1135 774
1136 if (serv->sv_stats) 775 if (serv->sv_stats)
1137 serv->sv_stats->nettcpconn++; 776 serv->sv_stats->nettcpconn++;
1138 777
1139 return; 778 return &newsvsk->sk_xprt;
1140 779
1141failed: 780failed:
1142 sock_release(newsock); 781 sock_release(newsock);
1143 return; 782 return NULL;
1144} 783}
1145 784
1146/* 785/*
1147 * Receive data from a TCP socket. 786 * Receive data from a TCP socket.
1148 */ 787 */
1149static int 788static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
1150svc_tcp_recvfrom(struct svc_rqst *rqstp)
1151{ 789{
1152 struct svc_sock *svsk = rqstp->rq_sock; 790 struct svc_sock *svsk =
1153 struct svc_serv *serv = svsk->sk_server; 791 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
792 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1154 int len; 793 int len;
1155 struct kvec *vec; 794 struct kvec *vec;
1156 int pnum, vlen; 795 int pnum, vlen;
1157 796
1158 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 797 dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
1159 svsk, test_bit(SK_DATA, &svsk->sk_flags), 798 svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
1160 test_bit(SK_CONN, &svsk->sk_flags), 799 test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
1161 test_bit(SK_CLOSE, &svsk->sk_flags)); 800 test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
1162 801
1163 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 802 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
1164 svc_sock_received(svsk);
1165 return svc_deferred_recv(rqstp);
1166 }
1167
1168 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
1169 svc_delete_socket(svsk);
1170 return 0;
1171 }
1172
1173 if (svsk->sk_sk->sk_state == TCP_LISTEN) {
1174 svc_tcp_accept(svsk);
1175 svc_sock_received(svsk);
1176 return 0;
1177 }
1178
1179 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
1180 /* sndbuf needs to have room for one request 803 /* sndbuf needs to have room for one request
1181 * per thread, otherwise we can stall even when the 804 * per thread, otherwise we can stall even when the
1182 * network isn't a bottleneck. 805 * network isn't a bottleneck.
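
The checks deleted from svc_tcp_recvfrom above (deferred-request replay, XPT_CLOSE teardown, TCP_LISTEN accept) do not disappear; they become transport-independent and run in the generic receive path in the new svc_xprt.c, which consults the flag bits and xpo_accept before ever calling xpo_recvfrom. Approximately, with stand-in types so the sketch compiles on its own (control flow paraphrased, not the literal kernel code):

    #include <stddef.h>

    /* Stand-ins so the sketch compiles; the real helpers live in the
     * new net/sunrpc/svc_xprt.c and differ in detail. */
    enum { XPT_CLOSE, XPT_LISTENER };
    struct svc_rqst;
    struct svc_xprt;
    struct svc_xprt_ops {
        struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
        int (*xpo_recvfrom)(struct svc_rqst *);
    };
    struct svc_xprt { unsigned long flags; const struct svc_xprt_ops *ops; };
    struct svc_rqst { struct svc_xprt *xprt; void *deferred; };

    static int  test_bit(int b, const unsigned long *f) { return (*f >> b) & 1; }
    static void svc_delete_xprt(struct svc_xprt *x)     { (void)x; }
    static void svc_xprt_received(struct svc_xprt *x)   { (void)x; }
    static void *svc_deferred_dequeue(struct svc_xprt *x) { (void)x; return NULL; }
    static int  svc_deferred_recv(struct svc_rqst *r)   { (void)r; return 0; }

    /* Paraphrase of the generic dispatch that replaces the deleted checks. */
    static int handle_xprt(struct svc_rqst *rqstp)
    {
        struct svc_xprt *xprt = rqstp->xprt;

        if (test_bit(XPT_CLOSE, &xprt->flags)) {
            svc_delete_xprt(xprt);                  /* dying: tear it down */
            return 0;
        }
        if (test_bit(XPT_LISTENER, &xprt->flags)) {
            xprt->ops->xpo_accept(xprt);            /* connection, not data */
            svc_xprt_received(xprt);
            return 0;
        }
        if ((rqstp->deferred = svc_deferred_dequeue(xprt)) != NULL)
            return svc_deferred_recv(rqstp);        /* replay a deferral */
        return xprt->ops->xpo_recvfrom(rqstp);      /* ordinary receive */
    }

    int main(void) { (void)handle_xprt; return 0; }
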
@@ -1193,7 +816,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1193 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 816 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
1194 3 * serv->sv_max_mesg); 817 3 * serv->sv_max_mesg);
1195 818
1196 clear_bit(SK_DATA, &svsk->sk_flags); 819 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1197 820
1198 /* Receive data. If we haven't got the record length yet, get 821 /* Receive data. If we haven't got the record length yet, get
1199 * the next four bytes. Otherwise try to gobble up as much as 822 * the next four bytes. Otherwise try to gobble up as much as
@@ -1212,7 +835,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1212 if (len < want) { 835 if (len < want) {
1213 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", 836 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
1214 len, want); 837 len, want);
1215 svc_sock_received(svsk); 838 svc_xprt_received(&svsk->sk_xprt);
1216 return -EAGAIN; /* record header not complete */ 839 return -EAGAIN; /* record header not complete */
1217 } 840 }
1218 841
@@ -1248,11 +871,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1248 if (len < svsk->sk_reclen) { 871 if (len < svsk->sk_reclen) {
1249 dprintk("svc: incomplete TCP record (%d of %d)\n", 872 dprintk("svc: incomplete TCP record (%d of %d)\n",
1250 len, svsk->sk_reclen); 873 len, svsk->sk_reclen);
1251 svc_sock_received(svsk); 874 svc_xprt_received(&svsk->sk_xprt);
1252 return -EAGAIN; /* record not complete */ 875 return -EAGAIN; /* record not complete */
1253 } 876 }
1254 len = svsk->sk_reclen; 877 len = svsk->sk_reclen;
1255 set_bit(SK_DATA, &svsk->sk_flags); 878 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1256 879
1257 vec = rqstp->rq_vec; 880 vec = rqstp->rq_vec;
1258 vec[0] = rqstp->rq_arg.head[0]; 881 vec[0] = rqstp->rq_arg.head[0];
@@ -1281,30 +904,31 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1281 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 904 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
1282 } 905 }
1283 906
1284 rqstp->rq_skbuff = NULL; 907 rqstp->rq_xprt_ctxt = NULL;
1285 rqstp->rq_prot = IPPROTO_TCP; 908 rqstp->rq_prot = IPPROTO_TCP;
1286 909
1287 /* Reset TCP read info */ 910 /* Reset TCP read info */
1288 svsk->sk_reclen = 0; 911 svsk->sk_reclen = 0;
1289 svsk->sk_tcplen = 0; 912 svsk->sk_tcplen = 0;
1290 913
1291 svc_sock_received(svsk); 914 svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
915 svc_xprt_received(&svsk->sk_xprt);
1292 if (serv->sv_stats) 916 if (serv->sv_stats)
1293 serv->sv_stats->nettcpcnt++; 917 serv->sv_stats->nettcpcnt++;
1294 918
1295 return len; 919 return len;
1296 920
1297 err_delete: 921 err_delete:
1298 svc_delete_socket(svsk); 922 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1299 return -EAGAIN; 923 return -EAGAIN;
1300 924
1301 error: 925 error:
1302 if (len == -EAGAIN) { 926 if (len == -EAGAIN) {
1303 dprintk("RPC: TCP recvfrom got EAGAIN\n"); 927 dprintk("RPC: TCP recvfrom got EAGAIN\n");
1304 svc_sock_received(svsk); 928 svc_xprt_received(&svsk->sk_xprt);
1305 } else { 929 } else {
1306 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 930 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
1307 svsk->sk_server->sv_name, -len); 931 svsk->sk_xprt.xpt_server->sv_name, -len);
1308 goto err_delete; 932 goto err_delete;
1309 } 933 }
1310 934
@@ -1314,8 +938,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1314/* 938/*
1315 * Send out data on TCP socket. 939 * Send out data on TCP socket.
1316 */ 940 */
1317static int 941static int svc_tcp_sendto(struct svc_rqst *rqstp)
1318svc_tcp_sendto(struct svc_rqst *rqstp)
1319{ 942{
1320 struct xdr_buf *xbufp = &rqstp->rq_res; 943 struct xdr_buf *xbufp = &rqstp->rq_res;
1321 int sent; 944 int sent;
@@ -1328,35 +951,109 @@ svc_tcp_sendto(struct svc_rqst *rqstp)
1328 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 951 reclen = htonl(0x80000000|((xbufp->len ) - 4));
1329 memcpy(xbufp->head[0].iov_base, &reclen, 4); 952 memcpy(xbufp->head[0].iov_base, &reclen, 4);
1330 953
1331 if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) 954 if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
1332 return -ENOTCONN; 955 return -ENOTCONN;
1333 956
1334 sent = svc_sendto(rqstp, &rqstp->rq_res); 957 sent = svc_sendto(rqstp, &rqstp->rq_res);
1335 if (sent != xbufp->len) { 958 if (sent != xbufp->len) {
1336 printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", 959 printk(KERN_NOTICE
1337 rqstp->rq_sock->sk_server->sv_name, 960 "rpc-srv/tcp: %s: %s %d when sending %d bytes "
961 "- shutting down socket\n",
962 rqstp->rq_xprt->xpt_server->sv_name,
1338 (sent<0)?"got error":"sent only", 963 (sent<0)?"got error":"sent only",
1339 sent, xbufp->len); 964 sent, xbufp->len);
1340 set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); 965 set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
1341 svc_sock_enqueue(rqstp->rq_sock); 966 svc_xprt_enqueue(rqstp->rq_xprt);
1342 sent = -EAGAIN; 967 sent = -EAGAIN;
1343 } 968 }
1344 return sent; 969 return sent;
1345} 970}
1346 971
1347static void 972/*
1348svc_tcp_init(struct svc_sock *svsk) 973 * Setup response header. TCP has a 4B record length field.
974 */
975static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
976{
977 struct kvec *resv = &rqstp->rq_res.head[0];
978
979 /* tcp needs a space for the record length... */
980 svc_putnl(resv, 0);
981}
982
983static int svc_tcp_has_wspace(struct svc_xprt *xprt)
984{
985 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
986 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
987 int required;
988 int wspace;
989
990 /*
991 * Set the SOCK_NOSPACE flag before checking the available
992 * sock space.
993 */
994 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
995 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
996 wspace = sk_stream_wspace(svsk->sk_sk);
997
998 if (wspace < sk_stream_min_wspace(svsk->sk_sk))
999 return 0;
1000 if (required * 2 > wspace)
1001 return 0;
1002
1003 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
1004 return 1;
1005}
1006
1007static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
1008 struct sockaddr *sa, int salen,
1009 int flags)
1010{
1011 return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
1012}
1013
1014static struct svc_xprt_ops svc_tcp_ops = {
1015 .xpo_create = svc_tcp_create,
1016 .xpo_recvfrom = svc_tcp_recvfrom,
1017 .xpo_sendto = svc_tcp_sendto,
1018 .xpo_release_rqst = svc_release_skb,
1019 .xpo_detach = svc_sock_detach,
1020 .xpo_free = svc_sock_free,
1021 .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
1022 .xpo_has_wspace = svc_tcp_has_wspace,
1023 .xpo_accept = svc_tcp_accept,
1024};
1025
1026static struct svc_xprt_class svc_tcp_class = {
1027 .xcl_name = "tcp",
1028 .xcl_owner = THIS_MODULE,
1029 .xcl_ops = &svc_tcp_ops,
1030 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
1031};
1032
1033void svc_init_xprt_sock(void)
1034{
1035 svc_reg_xprt_class(&svc_tcp_class);
1036 svc_reg_xprt_class(&svc_udp_class);
1037}
1038
1039void svc_cleanup_xprt_sock(void)
1040{
1041 svc_unreg_xprt_class(&svc_tcp_class);
1042 svc_unreg_xprt_class(&svc_udp_class);
1043}
1044
1045static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
1349{ 1046{
1350 struct sock *sk = svsk->sk_sk; 1047 struct sock *sk = svsk->sk_sk;
1351 struct tcp_sock *tp = tcp_sk(sk); 1048 struct tcp_sock *tp = tcp_sk(sk);
1352 1049
1353 svsk->sk_recvfrom = svc_tcp_recvfrom; 1050 svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
1354 svsk->sk_sendto = svc_tcp_sendto; 1051 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
1355
1356 if (sk->sk_state == TCP_LISTEN) { 1052 if (sk->sk_state == TCP_LISTEN) {
1357 dprintk("setting up TCP socket for listening\n"); 1053 dprintk("setting up TCP socket for listening\n");
1054 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
1358 sk->sk_data_ready = svc_tcp_listen_data_ready; 1055 sk->sk_data_ready = svc_tcp_listen_data_ready;
1359 set_bit(SK_CONN, &svsk->sk_flags); 1056 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1360 } else { 1057 } else {
1361 dprintk("setting up TCP socket for reading\n"); 1058 dprintk("setting up TCP socket for reading\n");
1362 sk->sk_state_change = svc_tcp_state_change; 1059 sk->sk_state_change = svc_tcp_state_change;
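
svc_tcp_prep_reply_hdr and svc_tcp_sendto above cooperate on RPC record marking over TCP (RFC 1831, section 10): the reply is prefixed by a 4-byte big-endian word whose top bit marks the final fragment and whose low 31 bits give the fragment length, which is why prep_reply_hdr reserves the space with svc_putnl(resv, 0) and sendto later overwrites it with htonl(0x80000000 | (len - 4)). A tiny standalone encoder/decoder for that word:

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdint.h>

    /* Build the 4-byte RPC record marker: high bit = last fragment,
     * low 31 bits = fragment length (the reply minus the marker itself). */
    static uint32_t rpc_record_marker(uint32_t reply_len)
    {
        return htonl(0x80000000u | (reply_len - 4));
    }

    static uint32_t rpc_fragment_len(uint32_t marker)
    {
        return ntohl(marker) & 0x7fffffffu;
    }

    static int rpc_is_last_fragment(uint32_t marker)
    {
        return (ntohl(marker) & 0x80000000u) != 0;
    }

    int main(void)
    {
        uint32_t m = rpc_record_marker(132);  /* 128-byte payload + marker */
        assert(rpc_fragment_len(m) == 128);
        assert(rpc_is_last_fragment(m));
        return 0;
    }
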
@@ -1373,18 +1070,17 @@ svc_tcp_init(struct svc_sock *svsk)
1373 * svc_tcp_recvfrom will re-adjust if necessary 1070 * svc_tcp_recvfrom will re-adjust if necessary
1374 */ 1071 */
1375 svc_sock_setbufsize(svsk->sk_sock, 1072 svc_sock_setbufsize(svsk->sk_sock,
1376 3 * svsk->sk_server->sv_max_mesg, 1073 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
1377 3 * svsk->sk_server->sv_max_mesg); 1074 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
1378 1075
1379 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1076 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1380 set_bit(SK_DATA, &svsk->sk_flags); 1077 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1381 if (sk->sk_state != TCP_ESTABLISHED) 1078 if (sk->sk_state != TCP_ESTABLISHED)
1382 set_bit(SK_CLOSE, &svsk->sk_flags); 1079 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1383 } 1080 }
1384} 1081}
1385 1082
1386void 1083void svc_sock_update_bufs(struct svc_serv *serv)
1387svc_sock_update_bufs(struct svc_serv *serv)
1388{ 1084{
1389 /* 1085 /*
1390 * The number of server threads has changed. Update 1086 * The number of server threads has changed. Update
@@ -1395,232 +1091,18 @@ svc_sock_update_bufs(struct svc_serv *serv)
1395 spin_lock_bh(&serv->sv_lock); 1091 spin_lock_bh(&serv->sv_lock);
1396 list_for_each(le, &serv->sv_permsocks) { 1092 list_for_each(le, &serv->sv_permsocks) {
1397 struct svc_sock *svsk = 1093 struct svc_sock *svsk =
1398 list_entry(le, struct svc_sock, sk_list); 1094 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1399 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1095 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1400 } 1096 }
1401 list_for_each(le, &serv->sv_tempsocks) { 1097 list_for_each(le, &serv->sv_tempsocks) {
1402 struct svc_sock *svsk = 1098 struct svc_sock *svsk =
1403 list_entry(le, struct svc_sock, sk_list); 1099 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1404 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1100 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1405 } 1101 }
1406 spin_unlock_bh(&serv->sv_lock); 1102 spin_unlock_bh(&serv->sv_lock);
1407} 1103}
1408 1104
1409/* 1105/*
1410 * Receive the next request on any socket. This code is carefully
1411 * organised not to touch any cachelines in the shared svc_serv
1412 * structure, only cachelines in the local svc_pool.
1413 */
1414int
1415svc_recv(struct svc_rqst *rqstp, long timeout)
1416{
1417 struct svc_sock *svsk = NULL;
1418 struct svc_serv *serv = rqstp->rq_server;
1419 struct svc_pool *pool = rqstp->rq_pool;
1420 int len, i;
1421 int pages;
1422 struct xdr_buf *arg;
1423 DECLARE_WAITQUEUE(wait, current);
1424
1425 dprintk("svc: server %p waiting for data (to = %ld)\n",
1426 rqstp, timeout);
1427
1428 if (rqstp->rq_sock)
1429 printk(KERN_ERR
1430 "svc_recv: service %p, socket not NULL!\n",
1431 rqstp);
1432 if (waitqueue_active(&rqstp->rq_wait))
1433 printk(KERN_ERR
1434 "svc_recv: service %p, wait queue active!\n",
1435 rqstp);
1436
1437
1438 /* now allocate needed pages. If we get a failure, sleep briefly */
1439 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
1440 for (i=0; i < pages ; i++)
1441 while (rqstp->rq_pages[i] == NULL) {
1442 struct page *p = alloc_page(GFP_KERNEL);
1443 if (!p)
1444 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
1445 rqstp->rq_pages[i] = p;
1446 }
1447 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
1448 BUG_ON(pages >= RPCSVC_MAXPAGES);
1449
1450 /* Make arg->head point to first page and arg->pages point to rest */
1451 arg = &rqstp->rq_arg;
1452 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
1453 arg->head[0].iov_len = PAGE_SIZE;
1454 arg->pages = rqstp->rq_pages + 1;
1455 arg->page_base = 0;
1456 /* save at least one page for response */
1457 arg->page_len = (pages-2)*PAGE_SIZE;
1458 arg->len = (pages-1)*PAGE_SIZE;
1459 arg->tail[0].iov_len = 0;
1460
1461 try_to_freeze();
1462 cond_resched();
1463 if (signalled())
1464 return -EINTR;
1465
1466 spin_lock_bh(&pool->sp_lock);
1467 if ((svsk = svc_sock_dequeue(pool)) != NULL) {
1468 rqstp->rq_sock = svsk;
1469 atomic_inc(&svsk->sk_inuse);
1470 rqstp->rq_reserved = serv->sv_max_mesg;
1471 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
1472 } else {
1473 /* No data pending. Go to sleep */
1474 svc_thread_enqueue(pool, rqstp);
1475
1476 /*
1477 * We have to be able to interrupt this wait
1478 * to bring down the daemons ...
1479 */
1480 set_current_state(TASK_INTERRUPTIBLE);
1481 add_wait_queue(&rqstp->rq_wait, &wait);
1482 spin_unlock_bh(&pool->sp_lock);
1483
1484 schedule_timeout(timeout);
1485
1486 try_to_freeze();
1487
1488 spin_lock_bh(&pool->sp_lock);
1489 remove_wait_queue(&rqstp->rq_wait, &wait);
1490
1491 if (!(svsk = rqstp->rq_sock)) {
1492 svc_thread_dequeue(pool, rqstp);
1493 spin_unlock_bh(&pool->sp_lock);
1494 dprintk("svc: server %p, no data yet\n", rqstp);
1495 return signalled()? -EINTR : -EAGAIN;
1496 }
1497 }
1498 spin_unlock_bh(&pool->sp_lock);
1499
1500 dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
1501 rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
1502 len = svsk->sk_recvfrom(rqstp);
1503 dprintk("svc: got len=%d\n", len);
1504
1505 /* No data, incomplete (TCP) read, or accept() */
1506 if (len == 0 || len == -EAGAIN) {
1507 rqstp->rq_res.len = 0;
1508 svc_sock_release(rqstp);
1509 return -EAGAIN;
1510 }
1511 svsk->sk_lastrecv = get_seconds();
1512 clear_bit(SK_OLD, &svsk->sk_flags);
1513
1514 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
1515 rqstp->rq_chandle.defer = svc_defer;
1516
1517 if (serv->sv_stats)
1518 serv->sv_stats->netcnt++;
1519 return len;
1520}
1521
1522/*
1523 * Drop request
1524 */
1525void
1526svc_drop(struct svc_rqst *rqstp)
1527{
1528 dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
1529 svc_sock_release(rqstp);
1530}
1531
1532/*
1533 * Return reply to client.
1534 */
1535int
1536svc_send(struct svc_rqst *rqstp)
1537{
1538 struct svc_sock *svsk;
1539 int len;
1540 struct xdr_buf *xb;
1541
1542 if ((svsk = rqstp->rq_sock) == NULL) {
1543 printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
1544 __FILE__, __LINE__);
1545 return -EFAULT;
1546 }
1547
1548 /* release the receive skb before sending the reply */
1549 svc_release_skb(rqstp);
1550
1551 /* calculate over-all length */
1552 xb = & rqstp->rq_res;
1553 xb->len = xb->head[0].iov_len +
1554 xb->page_len +
1555 xb->tail[0].iov_len;
1556
1557 /* Grab svsk->sk_mutex to serialize outgoing data. */
1558 mutex_lock(&svsk->sk_mutex);
1559 if (test_bit(SK_DEAD, &svsk->sk_flags))
1560 len = -ENOTCONN;
1561 else
1562 len = svsk->sk_sendto(rqstp);
1563 mutex_unlock(&svsk->sk_mutex);
1564 svc_sock_release(rqstp);
1565
1566 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
1567 return 0;
1568 return len;
1569}
1570
1571/*
1572 * Timer function to close old temporary sockets, using
1573 * a mark-and-sweep algorithm.
1574 */
1575static void
1576svc_age_temp_sockets(unsigned long closure)
1577{
1578 struct svc_serv *serv = (struct svc_serv *)closure;
1579 struct svc_sock *svsk;
1580 struct list_head *le, *next;
1581 LIST_HEAD(to_be_aged);
1582
1583 dprintk("svc_age_temp_sockets\n");
1584
1585 if (!spin_trylock_bh(&serv->sv_lock)) {
1586 /* busy, try again 1 sec later */
1587 dprintk("svc_age_temp_sockets: busy\n");
1588 mod_timer(&serv->sv_temptimer, jiffies + HZ);
1589 return;
1590 }
1591
1592 list_for_each_safe(le, next, &serv->sv_tempsocks) {
1593 svsk = list_entry(le, struct svc_sock, sk_list);
1594
1595 if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
1596 continue;
1597 if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags))
1598 continue;
1599 atomic_inc(&svsk->sk_inuse);
1600 list_move(le, &to_be_aged);
1601 set_bit(SK_CLOSE, &svsk->sk_flags);
1602 set_bit(SK_DETACHED, &svsk->sk_flags);
1603 }
1604 spin_unlock_bh(&serv->sv_lock);
1605
1606 while (!list_empty(&to_be_aged)) {
1607 le = to_be_aged.next;
1608 /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
1609 list_del_init(le);
1610 svsk = list_entry(le, struct svc_sock, sk_list);
1611
1612 dprintk("queuing svsk %p for closing, %lu seconds old\n",
1613 svsk, get_seconds() - svsk->sk_lastrecv);
1614
1615 /* a thread will dequeue and close it soon */
1616 svc_sock_enqueue(svsk);
1617 svc_sock_put(svsk);
1618 }
1619
1620 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
1621}
1622
1623/*
1624 * Initialize socket for RPC use and create svc_sock struct 1106 * Initialize socket for RPC use and create svc_sock struct
1625 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 1107 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
1626 */ 1108 */
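
svc_age_temp_sockets, removed above, likewise survives in the generic code (as svc_age_temp_xprts). Its mark-and-sweep idea: each timer period sets an OLD bit on every temporary connection, and any connection still marked a full period later has seen no traffic, because every successful receive clears the bit in between; those are queued for closing. A compact sketch of the scheme (illustrative names):

    #include <stdbool.h>

    struct conn {
        bool old;      /* plays the role of SK_OLD */
        bool close;    /* plays the role of SK_CLOSE / XPT_CLOSE */
        bool busy;     /* currently being serviced */
    };

    /* Called on every successful receive. */
    static void conn_saw_traffic(struct conn *c)
    {
        c->old = false;
    }

    /* Called once per aging period for each temporary connection. */
    static void conn_age(struct conn *c)
    {
        if (!c->old) {
            c->old = true;       /* mark: grant one more period */
            return;
        }
        if (c->busy)
            return;              /* in use right now: spare it */
        c->close = true;         /* swept: idle for a whole period */
    }
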
@@ -1631,7 +1113,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1631 struct svc_sock *svsk; 1113 struct svc_sock *svsk;
1632 struct sock *inet; 1114 struct sock *inet;
1633 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1115 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
1634 int is_temporary = flags & SVC_SOCK_TEMPORARY;
1635 1116
1636 dprintk("svc: svc_setup_socket %p\n", sock); 1117 dprintk("svc: svc_setup_socket %p\n", sock);
1637 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1118 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1651,44 +1132,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1651 return NULL; 1132 return NULL;
1652 } 1133 }
1653 1134
1654 set_bit(SK_BUSY, &svsk->sk_flags);
1655 inet->sk_user_data = svsk; 1135 inet->sk_user_data = svsk;
1656 svsk->sk_sock = sock; 1136 svsk->sk_sock = sock;
1657 svsk->sk_sk = inet; 1137 svsk->sk_sk = inet;
1658 svsk->sk_ostate = inet->sk_state_change; 1138 svsk->sk_ostate = inet->sk_state_change;
1659 svsk->sk_odata = inet->sk_data_ready; 1139 svsk->sk_odata = inet->sk_data_ready;
1660 svsk->sk_owspace = inet->sk_write_space; 1140 svsk->sk_owspace = inet->sk_write_space;
1661 svsk->sk_server = serv;
1662 atomic_set(&svsk->sk_inuse, 1);
1663 svsk->sk_lastrecv = get_seconds();
1664 spin_lock_init(&svsk->sk_lock);
1665 INIT_LIST_HEAD(&svsk->sk_deferred);
1666 INIT_LIST_HEAD(&svsk->sk_ready);
1667 mutex_init(&svsk->sk_mutex);
1668 1141
1669 /* Initialize the socket */ 1142 /* Initialize the socket */
1670 if (sock->type == SOCK_DGRAM) 1143 if (sock->type == SOCK_DGRAM)
1671 svc_udp_init(svsk); 1144 svc_udp_init(svsk, serv);
1672 else 1145 else
1673 svc_tcp_init(svsk); 1146 svc_tcp_init(svsk, serv);
1674
1675 spin_lock_bh(&serv->sv_lock);
1676 if (is_temporary) {
1677 set_bit(SK_TEMP, &svsk->sk_flags);
1678 list_add(&svsk->sk_list, &serv->sv_tempsocks);
1679 serv->sv_tmpcnt++;
1680 if (serv->sv_temptimer.function == NULL) {
1681 /* setup timer to age temp sockets */
1682 setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
1683 (unsigned long)serv);
1684 mod_timer(&serv->sv_temptimer,
1685 jiffies + svc_conn_age_period * HZ);
1686 }
1687 } else {
1688 clear_bit(SK_TEMP, &svsk->sk_flags);
1689 list_add(&svsk->sk_list, &serv->sv_permsocks);
1690 }
1691 spin_unlock_bh(&serv->sv_lock);
1692 1147
1693 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1148 dprintk("svc: svc_setup_socket created %p (inet %p)\n",
1694 svsk, svsk->sk_sk); 1149 svsk, svsk->sk_sk);
@@ -1717,7 +1172,16 @@ int svc_addsock(struct svc_serv *serv,
1717 else { 1172 else {
1718 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); 1173 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS);
1719 if (svsk) { 1174 if (svsk) {
1720 svc_sock_received(svsk); 1175 struct sockaddr_storage addr;
1176 struct sockaddr *sin = (struct sockaddr *)&addr;
1177 int salen;
1178 if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
1179 svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
1180 clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
1181 spin_lock_bh(&serv->sv_lock);
1182 list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
1183 spin_unlock_bh(&serv->sv_lock);
1184 svc_xprt_received(&svsk->sk_xprt);
1721 err = 0; 1185 err = 0;
1722 } 1186 }
1723 } 1187 }
@@ -1733,14 +1197,19 @@ EXPORT_SYMBOL_GPL(svc_addsock);
1733/* 1197/*
1734 * Create socket for RPC service. 1198 * Create socket for RPC service.
1735 */ 1199 */
1736static int svc_create_socket(struct svc_serv *serv, int protocol, 1200static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1737 struct sockaddr *sin, int len, int flags) 1201 int protocol,
1202 struct sockaddr *sin, int len,
1203 int flags)
1738{ 1204{
1739 struct svc_sock *svsk; 1205 struct svc_sock *svsk;
1740 struct socket *sock; 1206 struct socket *sock;
1741 int error; 1207 int error;
1742 int type; 1208 int type;
1743 char buf[RPC_MAX_ADDRBUFLEN]; 1209 char buf[RPC_MAX_ADDRBUFLEN];
1210 struct sockaddr_storage addr;
1211 struct sockaddr *newsin = (struct sockaddr *)&addr;
1212 int newlen;
1744 1213
1745 dprintk("svc: svc_create_socket(%s, %d, %s)\n", 1214 dprintk("svc: svc_create_socket(%s, %d, %s)\n",
1746 serv->sv_program->pg_name, protocol, 1215 serv->sv_program->pg_name, protocol,
@@ -1749,13 +1218,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1749 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1218 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
1750 printk(KERN_WARNING "svc: only UDP and TCP " 1219 printk(KERN_WARNING "svc: only UDP and TCP "
1751 "sockets supported\n"); 1220 "sockets supported\n");
1752 return -EINVAL; 1221 return ERR_PTR(-EINVAL);
1753 } 1222 }
1754 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1223 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1755 1224
1756 error = sock_create_kern(sin->sa_family, type, protocol, &sock); 1225 error = sock_create_kern(sin->sa_family, type, protocol, &sock);
1757 if (error < 0) 1226 if (error < 0)
1758 return error; 1227 return ERR_PTR(error);
1759 1228
1760 svc_reclassify_socket(sock); 1229 svc_reclassify_socket(sock);
1761 1230
@@ -1765,203 +1234,55 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1765 if (error < 0) 1234 if (error < 0)
1766 goto bummer; 1235 goto bummer;
1767 1236
1237 newlen = len;
1238 error = kernel_getsockname(sock, newsin, &newlen);
1239 if (error < 0)
1240 goto bummer;
1241
1768 if (protocol == IPPROTO_TCP) { 1242 if (protocol == IPPROTO_TCP) {
1769 if ((error = kernel_listen(sock, 64)) < 0) 1243 if ((error = kernel_listen(sock, 64)) < 0)
1770 goto bummer; 1244 goto bummer;
1771 } 1245 }
1772 1246
1773 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { 1247 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
1774 svc_sock_received(svsk); 1248 svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
1775 return ntohs(inet_sk(svsk->sk_sk)->sport); 1249 return (struct svc_xprt *)svsk;
1776 } 1250 }
1777 1251
1778bummer: 1252bummer:
1779 dprintk("svc: svc_create_socket error = %d\n", -error); 1253 dprintk("svc: svc_create_socket error = %d\n", -error);
1780 sock_release(sock); 1254 sock_release(sock);
1781 return error; 1255 return ERR_PTR(error);
1782} 1256}
1783 1257
1784/* 1258/*
1785 * Remove a dead socket 1259 * Detach the svc_sock from the socket so that no
1260 * more callbacks occur.
1786 */ 1261 */
1787static void 1262static void svc_sock_detach(struct svc_xprt *xprt)
1788svc_delete_socket(struct svc_sock *svsk)
1789{ 1263{
1790 struct svc_serv *serv; 1264 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1791 struct sock *sk; 1265 struct sock *sk = svsk->sk_sk;
1792
1793 dprintk("svc: svc_delete_socket(%p)\n", svsk);
1794 1266
1795 serv = svsk->sk_server; 1267 dprintk("svc: svc_sock_detach(%p)\n", svsk);
1796 sk = svsk->sk_sk;
1797 1268
1269 /* put back the old socket callbacks */
1798 sk->sk_state_change = svsk->sk_ostate; 1270 sk->sk_state_change = svsk->sk_ostate;
1799 sk->sk_data_ready = svsk->sk_odata; 1271 sk->sk_data_ready = svsk->sk_odata;
1800 sk->sk_write_space = svsk->sk_owspace; 1272 sk->sk_write_space = svsk->sk_owspace;
1801
1802 spin_lock_bh(&serv->sv_lock);
1803
1804 if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
1805 list_del_init(&svsk->sk_list);
1806 /*
1807 * We used to delete the svc_sock from whichever list
1808	 * its sk_ready node was on, but we don't actually
1809 * need to. This is because the only time we're called
1810 * while still attached to a queue, the queue itself
1811 * is about to be destroyed (in svc_destroy).
1812 */
1813 if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) {
1814 BUG_ON(atomic_read(&svsk->sk_inuse)<2);
1815 atomic_dec(&svsk->sk_inuse);
1816 if (test_bit(SK_TEMP, &svsk->sk_flags))
1817 serv->sv_tmpcnt--;
1818 }
1819
1820 spin_unlock_bh(&serv->sv_lock);
1821}
1822
1823static void svc_close_socket(struct svc_sock *svsk)
1824{
1825 set_bit(SK_CLOSE, &svsk->sk_flags);
1826 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags))
1827 /* someone else will have to effect the close */
1828 return;
1829
1830 atomic_inc(&svsk->sk_inuse);
1831 svc_delete_socket(svsk);
1832 clear_bit(SK_BUSY, &svsk->sk_flags);
1833 svc_sock_put(svsk);
1834}
1835
1836void svc_force_close_socket(struct svc_sock *svsk)
1837{
1838 set_bit(SK_CLOSE, &svsk->sk_flags);
1839 if (test_bit(SK_BUSY, &svsk->sk_flags)) {
1840 /* Waiting to be processed, but no threads left,
1841 * So just remove it from the waiting list
1842 */
1843 list_del_init(&svsk->sk_ready);
1844 clear_bit(SK_BUSY, &svsk->sk_flags);
1845 }
1846 svc_close_socket(svsk);
1847}
1848
1849/**
1850 * svc_makesock - Make a socket for nfsd and lockd
1851 * @serv: RPC server structure
1852 * @protocol: transport protocol to use
1853 * @port: port to use
1854 * @flags: requested socket characteristics
1855 *
1856 */
1857int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port,
1858 int flags)
1859{
1860 struct sockaddr_in sin = {
1861 .sin_family = AF_INET,
1862 .sin_addr.s_addr = INADDR_ANY,
1863 .sin_port = htons(port),
1864 };
1865
1866 dprintk("svc: creating socket proto = %d\n", protocol);
1867 return svc_create_socket(serv, protocol, (struct sockaddr *) &sin,
1868 sizeof(sin), flags);
1869} 1273}
1870 1274
1871/* 1275/*
1872 * Handle defer and revisit of requests 1276 * Free the svc_sock's socket resources and the svc_sock itself.
1873 */ 1277 */
1874 1278static void svc_sock_free(struct svc_xprt *xprt)
1875static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
1876{ 1279{
1877 struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); 1280 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1878 struct svc_sock *svsk; 1281 dprintk("svc: svc_sock_free(%p)\n", svsk);
1879 1282
1880 if (too_many) { 1283 if (svsk->sk_sock->file)
1881 svc_sock_put(dr->svsk); 1284 sockfd_put(svsk->sk_sock);
1882 kfree(dr); 1285 else
1883 return; 1286 sock_release(svsk->sk_sock);
1884 } 1287 kfree(svsk);
1885 dprintk("revisit queued\n");
1886 svsk = dr->svsk;
1887 dr->svsk = NULL;
1888 spin_lock(&svsk->sk_lock);
1889 list_add(&dr->handle.recent, &svsk->sk_deferred);
1890 spin_unlock(&svsk->sk_lock);
1891 set_bit(SK_DEFERRED, &svsk->sk_flags);
1892 svc_sock_enqueue(svsk);
1893 svc_sock_put(svsk);
1894}
1895
1896static struct cache_deferred_req *
1897svc_defer(struct cache_req *req)
1898{
1899 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
1900 int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
1901 struct svc_deferred_req *dr;
1902
1903 if (rqstp->rq_arg.page_len)
1904 return NULL; /* if more than a page, give up FIXME */
1905 if (rqstp->rq_deferred) {
1906 dr = rqstp->rq_deferred;
1907 rqstp->rq_deferred = NULL;
1908 } else {
1909 int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
1910 /* FIXME maybe discard if size too large */
1911 dr = kmalloc(size, GFP_KERNEL);
1912 if (dr == NULL)
1913 return NULL;
1914
1915 dr->handle.owner = rqstp->rq_server;
1916 dr->prot = rqstp->rq_prot;
1917 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
1918 dr->addrlen = rqstp->rq_addrlen;
1919 dr->daddr = rqstp->rq_daddr;
1920 dr->argslen = rqstp->rq_arg.len >> 2;
1921 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
1922 }
1923 atomic_inc(&rqstp->rq_sock->sk_inuse);
1924 dr->svsk = rqstp->rq_sock;
1925
1926 dr->handle.revisit = svc_revisit;
1927 return &dr->handle;
1928}
1929
1930/*
1931 * recv data from a deferred request into an active one
1932 */
1933static int svc_deferred_recv(struct svc_rqst *rqstp)
1934{
1935 struct svc_deferred_req *dr = rqstp->rq_deferred;
1936
1937 rqstp->rq_arg.head[0].iov_base = dr->args;
1938 rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
1939 rqstp->rq_arg.page_len = 0;
1940 rqstp->rq_arg.len = dr->argslen<<2;
1941 rqstp->rq_prot = dr->prot;
1942 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
1943 rqstp->rq_addrlen = dr->addrlen;
1944 rqstp->rq_daddr = dr->daddr;
1945 rqstp->rq_respages = rqstp->rq_pages;
1946 return dr->argslen<<2;
1947}
1948
1949
1950static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
1951{
1952 struct svc_deferred_req *dr = NULL;
1953
1954 if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
1955 return NULL;
1956 spin_lock(&svsk->sk_lock);
1957 clear_bit(SK_DEFERRED, &svsk->sk_flags);
1958 if (!list_empty(&svsk->sk_deferred)) {
1959 dr = list_entry(svsk->sk_deferred.next,
1960 struct svc_deferred_req,
1961 handle.recent);
1962 list_del_init(&dr->handle.recent);
1963 set_bit(SK_DEFERRED, &svsk->sk_flags);
1964 }
1965 spin_unlock(&svsk->sk_lock);
1966 return dr;
1967} 1288}
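The deferral trio above (svc_defer, svc_revisit, svc_deferred_recv/svc_deferred_dequeue) is a park-and-replay scheme: rather than block a server thread on an upcall, svc_defer snapshots the raw request bytes; when the blocking work completes, svc_revisit hangs the snapshot on the socket's sk_deferred list and re-enqueues the socket; the next thread to service it replays the snapshot through svc_deferred_recv as though it had just arrived off the wire. A compressed, single-threaded userspace sketch of that flow (names are hypothetical, and the real code also carries address and protocol state in the snapshot):

	#include <stdlib.h>
	#include <string.h>

	/* A parked request: just the raw bytes, like svc_deferred_req. */
	struct deferred {
		struct deferred *next;
		size_t len;
		unsigned char bytes[];
	};

	static struct deferred *deferred_q;	/* models sk_deferred */

	/* svc_defer(): snapshot the request instead of blocking. */
	struct deferred *defer(const void *req, size_t len)
	{
		struct deferred *d = malloc(sizeof(*d) + len);

		if (d) {
			d->len = len;
			memcpy(d->bytes, req, len);
		}
		return d;
	}

	/* svc_revisit(): the slow work finished; queue for replay. */
	void revisit(struct deferred *d)
	{
		d->next = deferred_q;
		deferred_q = d;	/* real code also re-enqueues the socket */
	}

	/* svc_deferred_dequeue() + svc_deferred_recv(): replay parked
	 * requests before reading anything new from the wire. */
	struct deferred *next_request(void)
	{
		struct deferred *d = deferred_q;

		if (d)
			deferred_q = d->next;
		return d;	/* caller parses d->bytes as a fresh request */
	}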
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index bada7de0c2fc..0f8c439b848a 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -18,6 +18,7 @@
18#include <linux/sunrpc/types.h> 18#include <linux/sunrpc/types.h>
19#include <linux/sunrpc/sched.h> 19#include <linux/sunrpc/sched.h>
20#include <linux/sunrpc/stats.h> 20#include <linux/sunrpc/stats.h>
21#include <linux/sunrpc/svc_xprt.h>
21 22
22/* 23/*
23 * Declare the debug flags here 24 * Declare the debug flags here
@@ -55,6 +56,30 @@ rpc_unregister_sysctl(void)
55 } 56 }
56} 57}
57 58
59static int proc_do_xprt(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos)
61{
62 char tmpbuf[256];
63 int len;
64 if ((*ppos && !write) || !*lenp) {
65 *lenp = 0;
66 return 0;
67 }
68 if (write)
69 return -EINVAL;
70 else {
71 len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
72 if (!access_ok(VERIFY_WRITE, buffer, len))
73 return -EFAULT;
74
75 if (__copy_to_user(buffer, tmpbuf, len))
76 return -EFAULT;
77 }
78 *lenp -= len;
79 *ppos += len;
80 return 0;
81}
82
58static int 83static int
59proc_dodebug(ctl_table *table, int write, struct file *file, 84proc_dodebug(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -147,6 +172,12 @@ static ctl_table debug_table[] = {
147 .mode = 0644, 172 .mode = 0644,
148 .proc_handler = &proc_dodebug 173 .proc_handler = &proc_dodebug
149 }, 174 },
175 {
176 .procname = "transports",
177 .maxlen = 256,
178 .mode = 0444,
179 .proc_handler = &proc_do_xprt,
180 },
150 { .ctl_name = 0 } 181 { .ctl_name = 0 }
151}; 182};
152 183
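The new read-only "transports" entry exposes the registered transport classes to userspace; assuming the usual sysctl-to-procfs mapping of the sunrpc table, it shows up as /proc/sys/sunrpc/transports (the path is inferred from the tables above, not spelled out in the patch). A minimal reader:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/sys/sunrpc/transports", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* one registered class per line */
		fclose(f);
		return 0;
	}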
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 54264062ea69..995c3fdc16c2 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -96,11 +96,13 @@ xdr_encode_string(__be32 *p, const char *string)
96EXPORT_SYMBOL(xdr_encode_string); 96EXPORT_SYMBOL(xdr_encode_string);
97 97
98__be32 * 98__be32 *
99xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) 99xdr_decode_string_inplace(__be32 *p, char **sp,
100 unsigned int *lenp, unsigned int maxlen)
100{ 101{
101 unsigned int len; 102 u32 len;
102 103
103 if ((len = ntohl(*p++)) > maxlen) 104 len = ntohl(*p++);
105 if (len > maxlen)
104 return NULL; 106 return NULL;
105 *lenp = len; 107 *lenp = len;
106 *sp = (char *) p; 108 *sp = (char *) p;
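With the switch to unsigned lengths, callers of xdr_decode_string_inplace() can no longer be confused by a negative length sneaking past the maxlen check. A hedged kernel-style caller sketch (demo_decode_name is hypothetical; the decoded string points into the XDR buffer and is not NUL-terminated):

	#include <linux/errno.h>
	#include <linux/sunrpc/xdr.h>

	static int demo_decode_name(__be32 *p, char **namep, unsigned int *lenp)
	{
		/* Refuse anything longer than 64 bytes; a NULL return means
		 * the on-the-wire length exceeded maxlen. */
		p = xdr_decode_string_inplace(p, namep, lenp, 64);
		if (p == NULL)
			return -EINVAL;
		return 0;
	}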
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 264f0feeb513..5a8f268bdd30 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,3 +1,8 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o
4
5obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
6
7svcrdma-y := svc_rdma.o svc_rdma_transport.o \
8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 000000000000..88c0ca20bb1e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,266 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41#include <linux/module.h>
42#include <linux/init.h>
43#include <linux/fs.h>
44#include <linux/sysctl.h>
45#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/sched.h>
47#include <linux/sunrpc/svc_rdma.h>
48
49#define RPCDBG_FACILITY RPCDBG_SVCXPRT
50
51/* RPC/RDMA parameters */
52unsigned int svcrdma_ord = RPCRDMA_ORD;
53static unsigned int min_ord = 1;
54static unsigned int max_ord = 4096;
55unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
56static unsigned int min_max_requests = 4;
57static unsigned int max_max_requests = 16384;
58unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
59static unsigned int min_max_inline = 4096;
60static unsigned int max_max_inline = 65536;
61
62atomic_t rdma_stat_recv;
63atomic_t rdma_stat_read;
64atomic_t rdma_stat_write;
65atomic_t rdma_stat_sq_starve;
66atomic_t rdma_stat_rq_starve;
67atomic_t rdma_stat_rq_poll;
68atomic_t rdma_stat_rq_prod;
69atomic_t rdma_stat_sq_poll;
70atomic_t rdma_stat_sq_prod;
71
72/*
73 * This function implements reading and resetting an atomic_t stat
74 * variable through read/write to a proc file. Any write to the file
75 * resets the associated statistic to zero. Any read returns its
76 * current value.
77 */
78static int read_reset_stat(ctl_table *table, int write,
79 struct file *filp, void __user *buffer, size_t *lenp,
80 loff_t *ppos)
81{
82 atomic_t *stat = (atomic_t *)table->data;
83
84 if (!stat)
85 return -EINVAL;
86
87 if (write)
88 atomic_set(stat, 0);
89 else {
90 char str_buf[32];
91 char *data;
92 int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
93 if (len >= 32)
94 return -EFAULT;
95 len = strlen(str_buf);
96 if (*ppos > len) {
97 *lenp = 0;
98 return 0;
99 }
100 data = &str_buf[*ppos];
101 len -= *ppos;
102 if (len > *lenp)
103 len = *lenp;
104		if (len && copy_to_user(buffer, data, len))
105 return -EFAULT;
106 *lenp = len;
107 *ppos += len;
108 }
109 return 0;
110}
111
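Each counter wired to read_reset_stat() in the tables below should appear (again assuming the standard sysctl-to-procfs mapping) as a file under /proc/sys/sunrpc/svc_rdma/: a read returns the current value and any write resets it to zero. A userspace sketch exercising both paths; the exact path is an assumption:

	#include <stdio.h>

	#define STAT "/proc/sys/sunrpc/svc_rdma/rdma_stat_read"

	int main(void)
	{
		char buf[32];
		FILE *f = fopen(STAT, "r");

		if (f) {
			if (fgets(buf, sizeof(buf), f))
				printf("rdma_stat_read = %s", buf);
			fclose(f);
		}

		f = fopen(STAT, "w");	/* any write zeroes the counter */
		if (f) {
			fputs("0\n", f);
			fclose(f);
		}
		return 0;
	}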
112static struct ctl_table_header *svcrdma_table_header;
113static ctl_table svcrdma_parm_table[] = {
114 {
115 .procname = "max_requests",
116 .data = &svcrdma_max_requests,
117 .maxlen = sizeof(unsigned int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_minmax,
120 .strategy = &sysctl_intvec,
121 .extra1 = &min_max_requests,
122 .extra2 = &max_max_requests
123 },
124 {
125 .procname = "max_req_size",
126 .data = &svcrdma_max_req_size,
127 .maxlen = sizeof(unsigned int),
128 .mode = 0644,
129 .proc_handler = &proc_dointvec_minmax,
130 .strategy = &sysctl_intvec,
131 .extra1 = &min_max_inline,
132 .extra2 = &max_max_inline
133 },
134 {
135 .procname = "max_outbound_read_requests",
136 .data = &svcrdma_ord,
137 .maxlen = sizeof(unsigned int),
138 .mode = 0644,
139 .proc_handler = &proc_dointvec_minmax,
140 .strategy = &sysctl_intvec,
141 .extra1 = &min_ord,
142 .extra2 = &max_ord,
143 },
144
145 {
146 .procname = "rdma_stat_read",
147 .data = &rdma_stat_read,
148 .maxlen = sizeof(atomic_t),
149 .mode = 0644,
150 .proc_handler = &read_reset_stat,
151 },
152 {
153 .procname = "rdma_stat_recv",
154 .data = &rdma_stat_recv,
155 .maxlen = sizeof(atomic_t),
156 .mode = 0644,
157 .proc_handler = &read_reset_stat,
158 },
159 {
160 .procname = "rdma_stat_write",
161 .data = &rdma_stat_write,
162 .maxlen = sizeof(atomic_t),
163 .mode = 0644,
164 .proc_handler = &read_reset_stat,
165 },
166 {
167 .procname = "rdma_stat_sq_starve",
168 .data = &rdma_stat_sq_starve,
169 .maxlen = sizeof(atomic_t),
170 .mode = 0644,
171 .proc_handler = &read_reset_stat,
172 },
173 {
174 .procname = "rdma_stat_rq_starve",
175 .data = &rdma_stat_rq_starve,
176 .maxlen = sizeof(atomic_t),
177 .mode = 0644,
178 .proc_handler = &read_reset_stat,
179 },
180 {
181 .procname = "rdma_stat_rq_poll",
182 .data = &rdma_stat_rq_poll,
183 .maxlen = sizeof(atomic_t),
184 .mode = 0644,
185 .proc_handler = &read_reset_stat,
186 },
187 {
188 .procname = "rdma_stat_rq_prod",
189 .data = &rdma_stat_rq_prod,
190 .maxlen = sizeof(atomic_t),
191 .mode = 0644,
192 .proc_handler = &read_reset_stat,
193 },
194 {
195 .procname = "rdma_stat_sq_poll",
196 .data = &rdma_stat_sq_poll,
197 .maxlen = sizeof(atomic_t),
198 .mode = 0644,
199 .proc_handler = &read_reset_stat,
200 },
201 {
202 .procname = "rdma_stat_sq_prod",
203 .data = &rdma_stat_sq_prod,
204 .maxlen = sizeof(atomic_t),
205 .mode = 0644,
206 .proc_handler = &read_reset_stat,
207 },
208 {
209 .ctl_name = 0,
210 },
211};
212
213static ctl_table svcrdma_table[] = {
214 {
215 .procname = "svc_rdma",
216 .mode = 0555,
217 .child = svcrdma_parm_table
218 },
219 {
220 .ctl_name = 0,
221 },
222};
223
224static ctl_table svcrdma_root_table[] = {
225 {
226 .ctl_name = CTL_SUNRPC,
227 .procname = "sunrpc",
228 .mode = 0555,
229 .child = svcrdma_table
230 },
231 {
232 .ctl_name = 0,
233 },
234};
235
236void svc_rdma_cleanup(void)
237{
238 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
239 if (svcrdma_table_header) {
240 unregister_sysctl_table(svcrdma_table_header);
241 svcrdma_table_header = NULL;
242 }
243 svc_unreg_xprt_class(&svc_rdma_class);
244}
245
246int svc_rdma_init(void)
247{
248 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
249 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
250 dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
251 dprintk("\tsq_depth : %d\n",
252 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
253 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
254 if (!svcrdma_table_header)
255 svcrdma_table_header =
256 register_sysctl_table(svcrdma_root_table);
257
258 /* Register RDMA with the SVC transport switch */
259 svc_reg_xprt_class(&svc_rdma_class);
260 return 0;
261}
262MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
263MODULE_DESCRIPTION("SVC RDMA Transport");
264MODULE_LICENSE("Dual BSD/GPL");
265module_init(svc_rdma_init);
266module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 000000000000..9530ef2d40dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,412 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/xdr.h>
43#include <linux/sunrpc/debug.h>
44#include <asm/unaligned.h>
45#include <linux/sunrpc/rpc_rdma.h>
46#include <linux/sunrpc/svc_rdma.h>
47
48#define RPCDBG_FACILITY RPCDBG_SVCXPRT
49
50/*
51 * Decodes a read chunk list. The expected format is as follows:
52 *	discrim    : xdr_one
53 * position : u32 offset into XDR stream
54 * handle : u32 RKEY
55 * . . .
56 * end-of-list: xdr_zero
57 */
58static u32 *decode_read_list(u32 *va, u32 *vaend)
59{
60 struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
61
62 while (ch->rc_discrim != xdr_zero) {
63 u64 ch_offset;
64
65 if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
66 (unsigned long)vaend) {
67 dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
68 return NULL;
69 }
70
71 ch->rc_discrim = ntohl(ch->rc_discrim);
72 ch->rc_position = ntohl(ch->rc_position);
73 ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
74 ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
75 va = (u32 *)&ch->rc_target.rs_offset;
76 xdr_decode_hyper(va, &ch_offset);
77 put_unaligned(ch_offset, (u64 *)va);
78 ch++;
79 }
80 return (u32 *)&ch->rc_position;
81}
82
83/*
84 * Determine number of chunks and total bytes in chunk list. The chunk
85 * list has already been verified to fit within the RPCRDMA header.
86 */
87void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
88 int *ch_count, int *byte_count)
89{
90 /* compute the number of bytes represented by read chunks */
91 *byte_count = 0;
92 *ch_count = 0;
93 for (; ch->rc_discrim != 0; ch++) {
94 *byte_count = *byte_count + ch->rc_target.rs_length;
95 *ch_count = *ch_count + 1;
96 }
97}
98
99/*
100 * Decodes a write chunk list. The expected format is as follows:
101 *  discrim  : xdr_one
102 *  nchunks  : <count>
103 *   handle  : u32 RKEY              ---+
104 *   length  : u32 <len of segment>     |
105 *   offset  : remote va + <count>      |
106 *   . . .                              |
107 *                                   ---+
108 */
109static u32 *decode_write_list(u32 *va, u32 *vaend)
110{
111 int ch_no;
112 struct rpcrdma_write_array *ary =
113 (struct rpcrdma_write_array *)va;
114
115	/* Check for no write-array */
116 if (ary->wc_discrim == xdr_zero)
117 return (u32 *)&ary->wc_nchunks;
118
119 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
120 (unsigned long)vaend) {
121 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
122 return NULL;
123 }
124 ary->wc_discrim = ntohl(ary->wc_discrim);
125 ary->wc_nchunks = ntohl(ary->wc_nchunks);
126 if (((unsigned long)&ary->wc_array[0] +
127 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
128 (unsigned long)vaend) {
129 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
130 ary, ary->wc_nchunks, vaend);
131 return NULL;
132 }
133 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
134 u64 ch_offset;
135
136 ary->wc_array[ch_no].wc_target.rs_handle =
137 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
138 ary->wc_array[ch_no].wc_target.rs_length =
139 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
140 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
141 xdr_decode_hyper(va, &ch_offset);
142 put_unaligned(ch_offset, (u64 *)va);
143 }
144
145 /*
146 * rs_length is the 2nd 4B field in wc_target and taking its
147 * address skips the list terminator
148 */
149 return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length;
150}
151
152static u32 *decode_reply_array(u32 *va, u32 *vaend)
153{
154 int ch_no;
155 struct rpcrdma_write_array *ary =
156 (struct rpcrdma_write_array *)va;
157
158 /* Check for no reply-array */
159 if (ary->wc_discrim == xdr_zero)
160 return (u32 *)&ary->wc_nchunks;
161
162 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
163 (unsigned long)vaend) {
164 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
165 return NULL;
166 }
167 ary->wc_discrim = ntohl(ary->wc_discrim);
168 ary->wc_nchunks = ntohl(ary->wc_nchunks);
169 if (((unsigned long)&ary->wc_array[0] +
170 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
171 (unsigned long)vaend) {
172 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
173 ary, ary->wc_nchunks, vaend);
174 return NULL;
175 }
176 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
177 u64 ch_offset;
178
179 ary->wc_array[ch_no].wc_target.rs_handle =
180 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
181 ary->wc_array[ch_no].wc_target.rs_length =
182 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
183 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
184 xdr_decode_hyper(va, &ch_offset);
185 put_unaligned(ch_offset, (u64 *)va);
186 }
187
188 return (u32 *)&ary->wc_array[ch_no];
189}
190
191int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
192 struct svc_rqst *rqstp)
193{
194 struct rpcrdma_msg *rmsgp = NULL;
195 u32 *va;
196 u32 *vaend;
197 u32 hdr_len;
198
199 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
200
201	/* Verify that there are enough bytes for header + something */
202 if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
203 dprintk("svcrdma: header too short = %d\n",
204 rqstp->rq_arg.len);
205 return -EINVAL;
206 }
207
208 /* Decode the header */
209 rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
210 rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
211 rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
212 rmsgp->rm_type = ntohl(rmsgp->rm_type);
213
214 if (rmsgp->rm_vers != RPCRDMA_VERSION)
215 return -ENOSYS;
216
217 /* Pull in the extra for the padded case and bump our pointer */
218 if (rmsgp->rm_type == RDMA_MSGP) {
219 int hdrlen;
220 rmsgp->rm_body.rm_padded.rm_align =
221 ntohl(rmsgp->rm_body.rm_padded.rm_align);
222 rmsgp->rm_body.rm_padded.rm_thresh =
223 ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
224
225 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
226 rqstp->rq_arg.head[0].iov_base = va;
227 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
228 rqstp->rq_arg.head[0].iov_len -= hdrlen;
229 if (hdrlen > rqstp->rq_arg.len)
230 return -EINVAL;
231 return hdrlen;
232 }
233
234 /* The chunk list may contain either a read chunk list or a write
235 * chunk list and a reply chunk list.
236 */
237 va = &rmsgp->rm_body.rm_chunks[0];
238 vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
239 va = decode_read_list(va, vaend);
240 if (!va)
241 return -EINVAL;
242 va = decode_write_list(va, vaend);
243 if (!va)
244 return -EINVAL;
245 va = decode_reply_array(va, vaend);
246 if (!va)
247 return -EINVAL;
248
249 rqstp->rq_arg.head[0].iov_base = va;
250 hdr_len = (unsigned long)va - (unsigned long)rmsgp;
251 rqstp->rq_arg.head[0].iov_len -= hdr_len;
252
253 *rdma_req = rmsgp;
254 return hdr_len;
255}
256
257int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
258{
259 struct rpcrdma_msg *rmsgp = NULL;
260 struct rpcrdma_read_chunk *ch;
261 struct rpcrdma_write_array *ary;
262 u32 *va;
263 u32 hdrlen;
264
265 dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
266 rqstp);
267 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
268
269 /* Pull in the extra for the padded case and bump our pointer */
270 if (rmsgp->rm_type == RDMA_MSGP) {
271 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
272 rqstp->rq_arg.head[0].iov_base = va;
273 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
274 rqstp->rq_arg.head[0].iov_len -= hdrlen;
275 return hdrlen;
276 }
277
278 /*
279 * Skip all chunks to find RPC msg. These were previously processed
280 */
281 va = &rmsgp->rm_body.rm_chunks[0];
282
283 /* Skip read-list */
284 for (ch = (struct rpcrdma_read_chunk *)va;
285 ch->rc_discrim != xdr_zero; ch++);
286 va = (u32 *)&ch->rc_position;
287
288 /* Skip write-list */
289 ary = (struct rpcrdma_write_array *)va;
290 if (ary->wc_discrim == xdr_zero)
291 va = (u32 *)&ary->wc_nchunks;
292 else
293 /*
294 * rs_length is the 2nd 4B field in wc_target and taking its
295 * address skips the list terminator
296 */
297 va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
298
299 /* Skip reply-array */
300 ary = (struct rpcrdma_write_array *)va;
301 if (ary->wc_discrim == xdr_zero)
302 va = (u32 *)&ary->wc_nchunks;
303 else
304 va = (u32 *)&ary->wc_array[ary->wc_nchunks];
305
306 rqstp->rq_arg.head[0].iov_base = va;
307 hdrlen = (unsigned long)va - (unsigned long)rmsgp;
308 rqstp->rq_arg.head[0].iov_len -= hdrlen;
309
310 return hdrlen;
311}
312
313int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
314 struct rpcrdma_msg *rmsgp,
315 enum rpcrdma_errcode err, u32 *va)
316{
317 u32 *startp = va;
318
319 *va++ = htonl(rmsgp->rm_xid);
320 *va++ = htonl(rmsgp->rm_vers);
321 *va++ = htonl(xprt->sc_max_requests);
322 *va++ = htonl(RDMA_ERROR);
323 *va++ = htonl(err);
324 if (err == ERR_VERS) {
325 *va++ = htonl(RPCRDMA_VERSION);
326 *va++ = htonl(RPCRDMA_VERSION);
327 }
328
329 return (int)((unsigned long)va - (unsigned long)startp);
330}
331
332int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
333{
334 struct rpcrdma_write_array *wr_ary;
335
336 /* There is no read-list in a reply */
337
338 /* skip write list */
339 wr_ary = (struct rpcrdma_write_array *)
340 &rmsgp->rm_body.rm_chunks[1];
341 if (wr_ary->wc_discrim)
342 wr_ary = (struct rpcrdma_write_array *)
343 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
344 wc_target.rs_length;
345 else
346 wr_ary = (struct rpcrdma_write_array *)
347 &wr_ary->wc_nchunks;
348
349 /* skip reply array */
350 if (wr_ary->wc_discrim)
351 wr_ary = (struct rpcrdma_write_array *)
352 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
353 else
354 wr_ary = (struct rpcrdma_write_array *)
355 &wr_ary->wc_nchunks;
356
357 return (unsigned long) wr_ary - (unsigned long) rmsgp;
358}
359
360void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
361{
362 struct rpcrdma_write_array *ary;
363
364 /* no read-list */
365 rmsgp->rm_body.rm_chunks[0] = xdr_zero;
366
367 /* write-array discrim */
368 ary = (struct rpcrdma_write_array *)
369 &rmsgp->rm_body.rm_chunks[1];
370 ary->wc_discrim = xdr_one;
371 ary->wc_nchunks = htonl(chunks);
372
373 /* write-list terminator */
374 ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
375
376 /* reply-array discriminator */
377 ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
378}
379
380void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
381 int chunks)
382{
383 ary->wc_discrim = xdr_one;
384 ary->wc_nchunks = htonl(chunks);
385}
386
387void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
388 int chunk_no,
389 u32 rs_handle, u64 rs_offset,
390 u32 write_len)
391{
392 struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
393 seg->rs_handle = htonl(rs_handle);
394 seg->rs_length = htonl(write_len);
395 xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset);
396}
397
398void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
399 struct rpcrdma_msg *rdma_argp,
400 struct rpcrdma_msg *rdma_resp,
401 enum rpcrdma_proc rdma_type)
402{
403 rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
404 rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
405 rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
406 rdma_resp->rm_type = htonl(rdma_type);
407
408	/* Encode <nul> chunk lists */
409 rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
410 rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
411 rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
412}
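For reference, the chunkless reply header built just above is seven 32-bit XDR words on the wire. A worked layout (the example values are illustrative, not taken from the patch):

	/* Word  Field         Example (host order, before htonl)
	 *  0    rm_xid        0x1234abcd - echoed from rdma_argp
	 *  1    rm_vers       1          - echoed from rdma_argp
	 *  2    rm_credit     16         - xprt->sc_max_requests
	 *  3    rm_type       RDMA_MSG   - the rdma_type argument
	 *  4    rm_chunks[0]  xdr_zero   - empty read list
	 *  5    rm_chunks[1]  xdr_zero   - empty write list
	 *  6    rm_chunks[2]  xdr_zero   - empty reply chunk
	 */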
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 000000000000..ab54a736486e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/*
53 * Replace the pages in the rq_pages array with the pages from the SGE in
54 * the RDMA_RECV completion. The SGL should contain full pages up until the
55 * last one.
56 */
57static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
58 struct svc_rdma_op_ctxt *ctxt,
59 u32 byte_count)
60{
61 struct page *page;
62 u32 bc;
63 int sge_no;
64
65 /* Swap the page in the SGE with the page in argpages */
66 page = ctxt->pages[0];
67 put_page(rqstp->rq_pages[0]);
68 rqstp->rq_pages[0] = page;
69
70 /* Set up the XDR head */
71 rqstp->rq_arg.head[0].iov_base = page_address(page);
72 rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
73 rqstp->rq_arg.len = byte_count;
74 rqstp->rq_arg.buflen = byte_count;
75
76 /* Compute bytes past head in the SGL */
77 bc = byte_count - rqstp->rq_arg.head[0].iov_len;
78
79 /* If data remains, store it in the pagelist */
80 rqstp->rq_arg.page_len = bc;
81 rqstp->rq_arg.page_base = 0;
82 rqstp->rq_arg.pages = &rqstp->rq_pages[1];
83 sge_no = 1;
84 while (bc && sge_no < ctxt->count) {
85 page = ctxt->pages[sge_no];
86 put_page(rqstp->rq_pages[sge_no]);
87 rqstp->rq_pages[sge_no] = page;
88 bc -= min(bc, ctxt->sge[sge_no].length);
89 rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
90 sge_no++;
91 }
92 rqstp->rq_respages = &rqstp->rq_pages[sge_no];
93
94 /* We should never run out of SGE because the limit is defined to
95 * support the max allowed RPC data length
96 */
97 BUG_ON(bc && (sge_no == ctxt->count));
98 BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
99 != byte_count);
100 BUG_ON(rqstp->rq_arg.len != byte_count);
101
102 /* If not all pages were used from the SGL, free the remaining ones */
103 bc = sge_no;
104 while (sge_no < ctxt->count) {
105 page = ctxt->pages[sge_no++];
106 put_page(page);
107 }
108 ctxt->count = bc;
109
110 /* Set up tail */
111 rqstp->rq_arg.tail[0].iov_base = NULL;
112 rqstp->rq_arg.tail[0].iov_len = 0;
113}
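A worked example may make the bookkeeping above concrete. Assume byte_count = 5000 and receive SGEs that each cover a full 4096-byte page (sizes are illustrative):

	/*
	 *   head[0].iov_len = min(5000, 4096) = 4096  (first swapped page)
	 *   bc              = 5000 - 4096     = 904   (spills to pagelist)
	 *   rq_arg.page_len = 904, held in the swapped-in pages[1]
	 *   rq_respages     = &rq_pages[2]            (first free page)
	 *
	 * Every SGL page beyond the second is put_page()'d and ctxt->count
	 * is trimmed to match.
	 */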
114
115struct chunk_sge {
116 int start; /* sge no for this chunk */
117 int count; /* sge count for this chunk */
118};
119
120/* Encode a read-chunk-list as an array of IB SGE
121 *
122 * Assumptions:
123 * - chunk[0]->position points to pages[0] at an offset of 0
124 * - pages[] is not physically or virtually contiguous and consists of
125 * PAGE_SIZE elements.
126 *
127 * Output:
128 * - sge array pointing into pages[] array.
129 * - chunk_sge array specifying sge index and count for each
130 * chunk in the read list
131 *
132 */
133static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
134 struct svc_rqst *rqstp,
135 struct svc_rdma_op_ctxt *head,
136 struct rpcrdma_msg *rmsgp,
137 struct ib_sge *sge,
138 struct chunk_sge *ch_sge_ary,
139 int ch_count,
140 int byte_count)
141{
142 int sge_no;
143 int sge_bytes;
144 int page_off;
145 int page_no;
146 int ch_bytes;
147 int ch_no;
148 struct rpcrdma_read_chunk *ch;
149
150 sge_no = 0;
151 page_no = 0;
152 page_off = 0;
153 ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
154 ch_no = 0;
155 ch_bytes = ch->rc_target.rs_length;
156 head->arg.head[0] = rqstp->rq_arg.head[0];
157 head->arg.tail[0] = rqstp->rq_arg.tail[0];
158 head->arg.pages = &head->pages[head->count];
159 head->sge[0].length = head->count; /* save count of hdr pages */
160 head->arg.page_base = 0;
161 head->arg.page_len = ch_bytes;
162 head->arg.len = rqstp->rq_arg.len + ch_bytes;
163 head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
164 head->count++;
165 ch_sge_ary[0].start = 0;
166 while (byte_count) {
167 sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
168 sge[sge_no].addr =
169 ib_dma_map_page(xprt->sc_cm_id->device,
170 rqstp->rq_arg.pages[page_no],
171 page_off, sge_bytes,
172 DMA_FROM_DEVICE);
173 sge[sge_no].length = sge_bytes;
174 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
175 /*
176 * Don't bump head->count here because the same page
177 * may be used by multiple SGE.
178 */
179 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
180 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
181
182 byte_count -= sge_bytes;
183 ch_bytes -= sge_bytes;
184 sge_no++;
185 /*
186 * If all bytes for this chunk have been mapped to an
187		 * SGE, move to the next chunk
188 */
189 if (ch_bytes == 0) {
190 ch_sge_ary[ch_no].count =
191 sge_no - ch_sge_ary[ch_no].start;
192 ch_no++;
193 ch++;
194 ch_sge_ary[ch_no].start = sge_no;
195 ch_bytes = ch->rc_target.rs_length;
196			/* If bytes remain, account for the next chunk */
197 if (byte_count) {
198 head->arg.page_len += ch_bytes;
199 head->arg.len += ch_bytes;
200 head->arg.buflen += ch_bytes;
201 }
202 }
203 /*
204 * If this SGE consumed all of the page, move to the
205 * next page
206 */
207 if ((sge_bytes + page_off) == PAGE_SIZE) {
208 page_no++;
209 page_off = 0;
210 /*
211 * If there are still bytes left to map, bump
212 * the page count
213 */
214 if (byte_count)
215 head->count++;
216 } else
217 page_off += sge_bytes;
218 }
219 BUG_ON(byte_count != 0);
220 return sge_no;
221}
222
223static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
224 struct ib_sge *sge,
225 u64 *sgl_offset,
226 int count)
227{
228 int i;
229
230 ctxt->count = count;
231 for (i = 0; i < count; i++) {
232 ctxt->sge[i].addr = sge[i].addr;
233 ctxt->sge[i].length = sge[i].length;
234 *sgl_offset = *sgl_offset + sge[i].length;
235 }
236}
237
238static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
239{
240#ifdef RDMA_TRANSPORT_IWARP
241 if ((RDMA_TRANSPORT_IWARP ==
242 rdma_node_get_transport(xprt->sc_cm_id->
243 device->node_type))
244 && sge_count > 1)
245 return 1;
246 else
247#endif
248 return min_t(int, sge_count, xprt->sc_max_sge);
249}
250
251/*
252 * Use RDMA_READ to read data from the advertised client buffer into the
253 * XDR stream starting at rq_arg.head[0].iov_base.
254 * Each chunk in the array
255 * contains the following fields:
256 * discrim - '1'; this is not used for data placement
257 * position - The xdr stream offset (the same for every chunk)
258 * handle - RMR for client memory region
259 * length - data transfer length
260 * offset - 64 bit tagged offset in remote memory region
261 *
262 * On our side, we need to read into a pagelist. The first page immediately
263 * follows the RPC header.
264 *
265 * This function returns 1 to indicate success. The data is not yet in
266 * the pagelist and therefore the RPC request must be deferred. The
267 * I/O completion will enqueue the transport again and
268 * svc_rdma_recvfrom will complete the request.
269 *
270 * NOTE: The ctxt must not be touched after the last WR has been posted
271 * because the I/O completion processing may occur on another
272 * processor and free or modify the context. Do not touch it!
273 */
274static int rdma_read_xdr(struct svcxprt_rdma *xprt,
275 struct rpcrdma_msg *rmsgp,
276 struct svc_rqst *rqstp,
277 struct svc_rdma_op_ctxt *hdr_ctxt)
278{
279 struct ib_send_wr read_wr;
280 int err = 0;
281 int ch_no;
282 struct ib_sge *sge;
283 int ch_count;
284 int byte_count;
285 int sge_count;
286 u64 sgl_offset;
287 struct rpcrdma_read_chunk *ch;
288 struct svc_rdma_op_ctxt *ctxt = NULL;
289 struct svc_rdma_op_ctxt *head;
290 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
291 struct svc_rdma_op_ctxt *tmp_ch_ctxt;
292 struct chunk_sge *ch_sge_ary;
293
294 /* If no read list is present, return 0 */
295 ch = svc_rdma_get_read_chunk(rmsgp);
296 if (!ch)
297 return 0;
298
299 /* Allocate temporary contexts to keep SGE */
300 BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
301 tmp_sge_ctxt = svc_rdma_get_context(xprt);
302 sge = tmp_sge_ctxt->sge;
303 tmp_ch_ctxt = svc_rdma_get_context(xprt);
304 ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
305
306 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
307 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
308 sge, ch_sge_ary,
309 ch_count, byte_count);
310 head = svc_rdma_get_context(xprt);
311 sgl_offset = 0;
312 ch_no = 0;
313
314 for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
315 ch->rc_discrim != 0; ch++, ch_no++) {
316next_sge:
317 if (!ctxt)
318 ctxt = head;
319 else {
320 ctxt->next = svc_rdma_get_context(xprt);
321 ctxt = ctxt->next;
322 }
323 ctxt->next = NULL;
324 ctxt->direction = DMA_FROM_DEVICE;
325 clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
326 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
327 if ((ch+1)->rc_discrim == 0) {
328 /*
329 * Checked in sq_cq_reap to see if we need to
330 * be enqueued
331 */
332 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
333 ctxt->next = hdr_ctxt;
334 hdr_ctxt->next = head;
335 }
336
337 /* Prepare READ WR */
338 memset(&read_wr, 0, sizeof read_wr);
339 ctxt->wr_op = IB_WR_RDMA_READ;
340 read_wr.wr_id = (unsigned long)ctxt;
341 read_wr.opcode = IB_WR_RDMA_READ;
342 read_wr.send_flags = IB_SEND_SIGNALED;
343 read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
344 read_wr.wr.rdma.remote_addr =
345 get_unaligned(&(ch->rc_target.rs_offset)) +
346 sgl_offset;
347 read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
348 read_wr.num_sge =
349 rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
350 rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
351 &sgl_offset,
352 read_wr.num_sge);
353
354 /* Post the read */
355 err = svc_rdma_send(xprt, &read_wr);
356 if (err) {
357 printk(KERN_ERR "svcrdma: Error posting send = %d\n",
358 err);
359 /*
360			 * Break the circular list so the free loop knows
361			 * when to stop if the error happened to occur on
362			 * the last read
363 */
364 ctxt->next = NULL;
365 goto out;
366 }
367 atomic_inc(&rdma_stat_read);
368
369 if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
370 ch_sge_ary[ch_no].count -= read_wr.num_sge;
371 ch_sge_ary[ch_no].start += read_wr.num_sge;
372 goto next_sge;
373 }
374 sgl_offset = 0;
375 err = 0;
376 }
377
378 out:
379 svc_rdma_put_context(tmp_sge_ctxt, 0);
380 svc_rdma_put_context(tmp_ch_ctxt, 0);
381
382 /* Detach arg pages. svc_recv will replenish them */
383 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
384 rqstp->rq_pages[ch_no] = NULL;
385
386 /*
387	 * Detach res pages. svc_release must see an rq_resused count of
388 * zero or it will attempt to put them.
389 */
390 while (rqstp->rq_resused)
391 rqstp->rq_respages[--rqstp->rq_resused] = NULL;
392
393 if (err) {
394 printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
395 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
396 /* Free the linked list of read contexts */
397 while (head != NULL) {
398 ctxt = head->next;
399 svc_rdma_put_context(head, 1);
400 head = ctxt;
401 }
402 return 0;
403 }
404
405 return 1;
406}
407
408static int rdma_read_complete(struct svc_rqst *rqstp,
409 struct svc_rdma_op_ctxt *data)
410{
411 struct svc_rdma_op_ctxt *head = data->next;
412 int page_no;
413 int ret;
414
415 BUG_ON(!head);
416
417 /* Copy RPC pages */
418 for (page_no = 0; page_no < head->count; page_no++) {
419 put_page(rqstp->rq_pages[page_no]);
420 rqstp->rq_pages[page_no] = head->pages[page_no];
421 }
422 /* Point rq_arg.pages past header */
423 rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
424 rqstp->rq_arg.page_len = head->arg.page_len;
425 rqstp->rq_arg.page_base = head->arg.page_base;
426
427 /* rq_respages starts after the last arg page */
428 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
429 rqstp->rq_resused = 0;
430
431 /* Rebuild rq_arg head and tail. */
432 rqstp->rq_arg.head[0] = head->arg.head[0];
433 rqstp->rq_arg.tail[0] = head->arg.tail[0];
434 rqstp->rq_arg.len = head->arg.len;
435 rqstp->rq_arg.buflen = head->arg.buflen;
436
437 /* XXX: What should this be? */
438 rqstp->rq_prot = IPPROTO_MAX;
439
440 /*
441 * Free the contexts we used to build the RDMA_READ. We have
442 * to be careful here because the context list uses the same
443 * next pointer used to chain the contexts associated with the
444 * RDMA_READ
445 */
446 data->next = NULL; /* terminate circular list */
447 do {
448 data = head->next;
449 svc_rdma_put_context(head, 0);
450 head = data;
451 } while (head != NULL);
452
453 ret = rqstp->rq_arg.head[0].iov_len
454 + rqstp->rq_arg.page_len
455 + rqstp->rq_arg.tail[0].iov_len;
456 dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
457 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
458 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
459 rqstp->rq_arg.head[0].iov_len);
460
461 /* Indicate that we've consumed an RQ credit */
462 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
463 svc_xprt_received(rqstp->rq_xprt);
464 return ret;
465}
466
467/*
468 * Set up the rqstp thread context to point to the RQ buffer. If
469 * necessary, pull additional data from the client with an RDMA_READ
470 * request.
471 */
472int svc_rdma_recvfrom(struct svc_rqst *rqstp)
473{
474 struct svc_xprt *xprt = rqstp->rq_xprt;
475 struct svcxprt_rdma *rdma_xprt =
476 container_of(xprt, struct svcxprt_rdma, sc_xprt);
477 struct svc_rdma_op_ctxt *ctxt = NULL;
478 struct rpcrdma_msg *rmsgp;
479 int ret = 0;
480 int len;
481
482 dprintk("svcrdma: rqstp=%p\n", rqstp);
483
484 /*
485 * The rq_xprt_ctxt indicates if we've consumed an RQ credit
486 * or not. It is used in the rdma xpo_release_rqst function to
487 * determine whether or not to return an RQ WQE to the RQ.
488 */
489 rqstp->rq_xprt_ctxt = NULL;
490
491 spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
492 if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
493 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
494 struct svc_rdma_op_ctxt,
495 dto_q);
496 list_del_init(&ctxt->dto_q);
497 }
498 spin_unlock_bh(&rdma_xprt->sc_read_complete_lock);
499 if (ctxt)
500 return rdma_read_complete(rqstp, ctxt);
501
502 spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
503 if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
504 ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
505 struct svc_rdma_op_ctxt,
506 dto_q);
507 list_del_init(&ctxt->dto_q);
508 } else {
509 atomic_inc(&rdma_stat_rq_starve);
510 clear_bit(XPT_DATA, &xprt->xpt_flags);
511 ctxt = NULL;
512 }
513 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
514 if (!ctxt) {
515 /* This is the EAGAIN path. The svc_recv routine will
516		 * return -EAGAIN, the nfsd thread will call into
517		 * svc_recv again, and we shouldn't be on the active
518 * transport list
519 */
520 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
521 goto close_out;
522
523 BUG_ON(ret);
524 goto out;
525 }
526 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
527 ctxt, rdma_xprt, rqstp, ctxt->wc_status);
528 BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
529 atomic_inc(&rdma_stat_recv);
530
531 /* Build up the XDR from the receive buffers. */
532 rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
533
534 /* Decode the RDMA header. */
535 len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
536 rqstp->rq_xprt_hlen = len;
537
538 /* If the request is invalid, reply with an error */
539 if (len < 0) {
540 if (len == -ENOSYS)
541 (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
542 goto close_out;
543 }
544
545 /* Read read-list data. If we would need to wait, defer
546	 * it. Note that in this case, we don't return the RQ credit
547 * until after the read completes.
548 */
549 if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
550 svc_xprt_received(xprt);
551 return 0;
552 }
553
554 /* Indicate we've consumed an RQ credit */
555 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
556
557 ret = rqstp->rq_arg.head[0].iov_len
558 + rqstp->rq_arg.page_len
559 + rqstp->rq_arg.tail[0].iov_len;
560 svc_rdma_put_context(ctxt, 0);
561 out:
562 dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
563 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
564 ret, rqstp->rq_arg.len,
565 rqstp->rq_arg.head[0].iov_base,
566 rqstp->rq_arg.head[0].iov_len);
567 rqstp->rq_prot = IPPROTO_MAX;
568 svc_xprt_copy_addrs(rqstp, xprt);
569 svc_xprt_received(xprt);
570 return ret;
571
572 close_out:
573 if (ctxt) {
574 svc_rdma_put_context(ctxt, 1);
575 /* Indicate we've consumed an RQ credit */
576 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
577 }
578 dprintk("svcrdma: transport %p is closing\n", xprt);
579 /*
580 * Set the close bit and enqueue it. svc_recv will see the
581 * close bit and call svc_xprt_delete
582 */
583 set_bit(XPT_CLOSE, &xprt->xpt_flags);
584 svc_xprt_received(xprt);
585 return 0;
586}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 000000000000..3e321949e1dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,520 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/* Encode an XDR as an array of IB SGE
53 *
54 * Assumptions:
55 * - head[0] is physically contiguous.
56 * - tail[0] is physically contiguous.
57 * - pages[] is not physically or virtually contiguous and consists of
58 * PAGE_SIZE elements.
59 *
60 * Output:
61 * SGE[0] reserved for RPCRDMA header
62 * SGE[1] data from xdr->head[]
63 * SGE[2..sge_count-2] data from xdr->pages[]
64 * SGE[sge_count-1] data from xdr->tail.
65 *
66 */
67static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
68 struct xdr_buf *xdr,
69 struct ib_sge *sge,
70 int *sge_count)
71{
72 /* Max we need is the length of the XDR / pagesize + one for
73 * head + one for tail + one for RPCRDMA header
74 */
75 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
76 int sge_no;
77 u32 byte_count = xdr->len;
78 u32 sge_bytes;
79 u32 page_bytes;
80 int page_off;
81 int page_no;
82
83 /* Skip the first sge, this is for the RPCRDMA header */
84 sge_no = 1;
85
86 /* Head SGE */
87 sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
88 xdr->head[0].iov_base,
89 xdr->head[0].iov_len,
90 DMA_TO_DEVICE);
91 sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
92 byte_count -= sge_bytes;
93 sge[sge_no].length = sge_bytes;
94 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
95 sge_no++;
96
97 /* pages SGE */
98 page_no = 0;
99 page_bytes = xdr->page_len;
100 page_off = xdr->page_base;
101 while (byte_count && page_bytes) {
102 sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
103 sge[sge_no].addr =
104 ib_dma_map_page(xprt->sc_cm_id->device,
105 xdr->pages[page_no], page_off,
106 sge_bytes, DMA_TO_DEVICE);
107 sge_bytes = min(sge_bytes, page_bytes);
108 byte_count -= sge_bytes;
109 page_bytes -= sge_bytes;
110 sge[sge_no].length = sge_bytes;
111 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
112
113 sge_no++;
114 page_no++;
115 page_off = 0; /* reset for next time through loop */
116 }
117
118 /* Tail SGE */
119 if (byte_count && xdr->tail[0].iov_len) {
120 sge[sge_no].addr =
121 ib_dma_map_single(xprt->sc_cm_id->device,
122 xdr->tail[0].iov_base,
123 xdr->tail[0].iov_len,
124 DMA_TO_DEVICE);
125 sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
126 byte_count -= sge_bytes;
127 sge[sge_no].length = sge_bytes;
128 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
129 sge_no++;
130 }
131
132 BUG_ON(sge_no > sge_max);
133 BUG_ON(byte_count != 0);
134
135 *sge_count = sge_no;
136 return sge;
137}
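/*
 * A worked example of the layout above, assuming PAGE_SIZE == 4096:
 * an xdr_buf with head[0].iov_len = 100, page_len = 6000 (page_base
 * = 0) and tail[0].iov_len = 50, i.e. xdr->len = 6150, maps to:
 *
 *	sge[1]	 100 bytes from head[0]
 *	sge[2]	4096 bytes from pages[0]
 *	sge[3]	1904 bytes from pages[1]
 *	sge[4]	  50 bytes from tail[0]
 *
 * so *sge_count = 5, which matches sge_max = (6150+4095)/4096 + 3.
 * sge[0] is left for the caller to fill in with the RPCRDMA header.
 */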
138
139
140/* Assumptions:
141 * - The specified write_len must fit within sc_max_sge * PAGE_SIZE
142 */
143static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
144 u32 rmr, u64 to,
145 u32 xdr_off, int write_len,
146 struct ib_sge *xdr_sge, int sge_count)
147{
148 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
149 struct ib_send_wr write_wr;
150 struct ib_sge *sge;
151 int xdr_sge_no;
152 int sge_no;
153 int sge_bytes;
154 int sge_off;
155 int bc;
156 struct svc_rdma_op_ctxt *ctxt;
157 int ret = 0;
158
159 BUG_ON(sge_count >= 32);
160 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
161 "write_len=%d, xdr_sge=%p, sge_count=%d\n",
162 rmr, to, xdr_off, write_len, xdr_sge, sge_count);
163
164 ctxt = svc_rdma_get_context(xprt);
165 ctxt->count = 0;
166 tmp_sge_ctxt = svc_rdma_get_context(xprt);
167 sge = tmp_sge_ctxt->sge;
168
169 /* Find the SGE associated with xdr_off */
170 for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
171 xdr_sge_no++) {
172 if (xdr_sge[xdr_sge_no].length > bc)
173 break;
174 bc -= xdr_sge[xdr_sge_no].length;
175 }
176
177 sge_off = bc;
178 bc = write_len;
179 sge_no = 0;
180
181 /* Copy the remaining SGE */
182 while (bc != 0 && xdr_sge_no < sge_count) {
183 sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
184 sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
185 sge_bytes = min((size_t)bc,
186 (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
187 sge[sge_no].length = sge_bytes;
188
189 sge_off = 0;
190 sge_no++;
191 xdr_sge_no++;
192 bc -= sge_bytes;
193 }
194
195 BUG_ON(bc != 0);
196 BUG_ON(xdr_sge_no > sge_count);
197
198 /* Prepare WRITE WR */
199 memset(&write_wr, 0, sizeof write_wr);
200 ctxt->wr_op = IB_WR_RDMA_WRITE;
201 write_wr.wr_id = (unsigned long)ctxt;
202 write_wr.sg_list = &sge[0];
203 write_wr.num_sge = sge_no;
204 write_wr.opcode = IB_WR_RDMA_WRITE;
205 write_wr.send_flags = IB_SEND_SIGNALED;
206 write_wr.wr.rdma.rkey = rmr;
207 write_wr.wr.rdma.remote_addr = to;
208
209 /* Post It */
210 atomic_inc(&rdma_stat_write);
211 if (svc_rdma_send(xprt, &write_wr)) {
212 svc_rdma_put_context(ctxt, 1);
213 /* Fatal error, close transport */
214 ret = -EIO;
215 }
216 svc_rdma_put_context(tmp_sge_ctxt, 0);
217 return ret;
218}
219
220static int send_write_chunks(struct svcxprt_rdma *xprt,
221 struct rpcrdma_msg *rdma_argp,
222 struct rpcrdma_msg *rdma_resp,
223 struct svc_rqst *rqstp,
224 struct ib_sge *sge,
225 int sge_count)
226{
227 u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
228 int write_len;
229 int max_write;
230 u32 xdr_off;
231 int chunk_off;
232 int chunk_no;
233 struct rpcrdma_write_array *arg_ary;
234 struct rpcrdma_write_array *res_ary;
235 int ret;
236
237 arg_ary = svc_rdma_get_write_array(rdma_argp);
238 if (!arg_ary)
239 return 0;
240 res_ary = (struct rpcrdma_write_array *)
241 &rdma_resp->rm_body.rm_chunks[1];
242
243 max_write = xprt->sc_max_sge * PAGE_SIZE;
244
245 /* Write chunks start at the pagelist */
246 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
247 xfer_len && chunk_no < arg_ary->wc_nchunks;
248 chunk_no++) {
249 struct rpcrdma_segment *arg_ch;
250 u64 rs_offset;
251
252 arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
253 write_len = min(xfer_len, arg_ch->rs_length);
254
255 /* Prepare the response chunk given the length actually
256 * written */
257 rs_offset = get_unaligned(&(arg_ch->rs_offset));
258 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
259 arg_ch->rs_handle,
260 rs_offset,
261 write_len);
262 chunk_off = 0;
263 while (write_len) {
264 int this_write;
265 this_write = min(write_len, max_write);
266 ret = send_write(xprt, rqstp,
267 arg_ch->rs_handle,
268 rs_offset + chunk_off,
269 xdr_off,
270 this_write,
271 sge,
272 sge_count);
273 if (ret) {
274 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
275 ret);
276 return -EIO;
277 }
278 chunk_off += this_write;
279 xdr_off += this_write;
280 xfer_len -= this_write;
281 write_len -= this_write;
282 }
283 }
284 /* Update the req with the number of chunks actually used */
285 svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
286
287 return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
288}
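/*
 * A chunking sketch for the loop above, with assumed example values:
 * if sc_max_sge = 4 and PAGE_SIZE = 4096, max_write is 16384, so a
 * single 40000-byte write chunk goes out as three RDMA_WRITEs of
 * 16384, 16384 and 7232 bytes, each advancing xdr_off into the
 * response and chunk_off into the client-advertised buffer.
 */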
289
290static int send_reply_chunks(struct svcxprt_rdma *xprt,
291 struct rpcrdma_msg *rdma_argp,
292 struct rpcrdma_msg *rdma_resp,
293 struct svc_rqst *rqstp,
294 struct ib_sge *sge,
295 int sge_count)
296{
297 u32 xfer_len = rqstp->rq_res.len;
298 int write_len;
299 int max_write;
300 u32 xdr_off;
301 int chunk_no;
302 int chunk_off;
303 struct rpcrdma_segment *ch;
304 struct rpcrdma_write_array *arg_ary;
305 struct rpcrdma_write_array *res_ary;
306 int ret;
307
308 arg_ary = svc_rdma_get_reply_array(rdma_argp);
309 if (!arg_ary)
310 return 0;
311	/* XXX: need to fix when reply lists occur with read-list and/or
312 * write-list */
313 res_ary = (struct rpcrdma_write_array *)
314 &rdma_resp->rm_body.rm_chunks[2];
315
316 max_write = xprt->sc_max_sge * PAGE_SIZE;
317
318 /* xdr offset starts at RPC message */
319 for (xdr_off = 0, chunk_no = 0;
320 xfer_len && chunk_no < arg_ary->wc_nchunks;
321 chunk_no++) {
322 u64 rs_offset;
323 ch = &arg_ary->wc_array[chunk_no].wc_target;
324 write_len = min(xfer_len, ch->rs_length);
325
327 /* Prepare the reply chunk given the length actually
328 * written */
329 rs_offset = get_unaligned(&(ch->rs_offset));
330 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
331 ch->rs_handle, rs_offset,
332 write_len);
333 chunk_off = 0;
334 while (write_len) {
335 int this_write;
336
337 this_write = min(write_len, max_write);
338 ret = send_write(xprt, rqstp,
339 ch->rs_handle,
340 rs_offset + chunk_off,
341 xdr_off,
342 this_write,
343 sge,
344 sge_count);
345 if (ret) {
346 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
347 ret);
348 return -EIO;
349 }
350 chunk_off += this_write;
351 xdr_off += this_write;
352 xfer_len -= this_write;
353 write_len -= this_write;
354 }
355 }
356 /* Update the req with the number of chunks actually used */
357 svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
358
359 return rqstp->rq_res.len;
360}
361
362/* This function prepares the portion of the RPCRDMA message to be
363 * sent in the RDMA_SEND. This function is called after data sent via
364 * RDMA has already been transmitted. There are three cases:
365 * - The RPCRDMA header, RPC header, and payload are all sent in a
366 * single RDMA_SEND. This is the "inline" case.
367 * - The RPCRDMA header and some portion of the RPC header and data
368 * are sent via this RDMA_SEND and another portion of the data is
369 * sent via RDMA.
370 * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
371 * header and data are all transmitted via RDMA.
372 * In all three cases, this function prepares the RPCRDMA header in
373 * sge[0], the 'type' parameter indicates the type to place in the
374 * RPCRDMA header, and the 'byte_count' field indicates how much of
375 * the XDR to include in this RDMA_SEND.
376 */
377static int send_reply(struct svcxprt_rdma *rdma,
378 struct svc_rqst *rqstp,
379 struct page *page,
380 struct rpcrdma_msg *rdma_resp,
381 struct svc_rdma_op_ctxt *ctxt,
382 int sge_count,
383 int byte_count)
384{
385 struct ib_send_wr send_wr;
386 int sge_no;
387 int sge_bytes;
388 int page_no;
389 int ret;
390
391 /* Prepare the context */
392 ctxt->pages[0] = page;
393 ctxt->count = 1;
394
395 /* Prepare the SGE for the RPCRDMA Header */
396 ctxt->sge[0].addr =
397 ib_dma_map_page(rdma->sc_cm_id->device,
398 page, 0, PAGE_SIZE, DMA_TO_DEVICE);
399 ctxt->direction = DMA_TO_DEVICE;
400 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
401 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
402
403 /* Determine how many of our SGE are to be transmitted */
404 for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
405 sge_bytes = min((size_t)ctxt->sge[sge_no].length,
406 (size_t)byte_count);
407 byte_count -= sge_bytes;
408 }
409 BUG_ON(byte_count != 0);
410
411 /* Save all respages in the ctxt and remove them from the
412 * respages array. They are our pages until the I/O
413 * completes.
414 */
415 for (page_no = 0; page_no < rqstp->rq_resused; page_no++) {
416 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
417 ctxt->count++;
418 rqstp->rq_respages[page_no] = NULL;
419 }
420
421 BUG_ON(sge_no > rdma->sc_max_sge);
422 memset(&send_wr, 0, sizeof send_wr);
423 ctxt->wr_op = IB_WR_SEND;
424 send_wr.wr_id = (unsigned long)ctxt;
425 send_wr.sg_list = ctxt->sge;
426 send_wr.num_sge = sge_no;
427 send_wr.opcode = IB_WR_SEND;
428 send_wr.send_flags = IB_SEND_SIGNALED;
429
430 ret = svc_rdma_send(rdma, &send_wr);
431 if (ret)
432 svc_rdma_put_context(ctxt, 1);
433
434 return ret;
435}
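/*
 * Note on the trimming loop above: only byte_count bytes of the SGE
 * list built by xdr_to_sge() are transmitted inline. As an
 * illustration, if the write list already carried the entire
 * pagelist and tail, byte_count covers just the RPC header in
 * head[0], and num_sge ends up as 2: sge[0] for the RPCRDMA header
 * plus sge[1] for the head.
 */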
436
437void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
438{
439}
440
441/*
442 * Return the start of an xdr buffer.
443 */
444static void *xdr_start(struct xdr_buf *xdr)
445{
446 return xdr->head[0].iov_base -
447 (xdr->len -
448 xdr->page_len -
449 xdr->tail[0].iov_len -
450 xdr->head[0].iov_len);
451}
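/*
 * Sketch of the arithmetic above, assuming the receive path advanced
 * head[0].iov_base past the RPCRDMA header and shrank
 * head[0].iov_len by the same amount while leaving xdr->len counting
 * those bytes: (len - page_len - tail_len - head_len) is then exactly
 * the number of bytes consumed, so subtracting it from the current
 * head base recovers the start of the RPCRDMA header.
 */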
452
453int svc_rdma_sendto(struct svc_rqst *rqstp)
454{
455 struct svc_xprt *xprt = rqstp->rq_xprt;
456 struct svcxprt_rdma *rdma =
457 container_of(xprt, struct svcxprt_rdma, sc_xprt);
458 struct rpcrdma_msg *rdma_argp;
459 struct rpcrdma_msg *rdma_resp;
460 struct rpcrdma_write_array *reply_ary;
461 enum rpcrdma_proc reply_type;
462 int ret;
463 int inline_bytes;
464 struct ib_sge *sge;
465 int sge_count = 0;
466 struct page *res_page;
467 struct svc_rdma_op_ctxt *ctxt;
468
469 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
470
471 /* Get the RDMA request header. */
472 rdma_argp = xdr_start(&rqstp->rq_arg);
473
474 /* Build an SGE for the XDR */
475 ctxt = svc_rdma_get_context(rdma);
476 ctxt->direction = DMA_TO_DEVICE;
477 sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
478
479 inline_bytes = rqstp->rq_res.len;
480
481 /* Create the RDMA response header */
482 res_page = svc_rdma_get_page();
483 rdma_resp = page_address(res_page);
484 reply_ary = svc_rdma_get_reply_array(rdma_argp);
485 if (reply_ary)
486 reply_type = RDMA_NOMSG;
487 else
488 reply_type = RDMA_MSG;
489 svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
490 rdma_resp, reply_type);
491
492 /* Send any write-chunk data and build resp write-list */
493 ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
494 rqstp, sge, sge_count);
495 if (ret < 0) {
496 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
497 ret);
498 goto error;
499 }
500 inline_bytes -= ret;
501
502 /* Send any reply-list data and update resp reply-list */
503 ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
504 rqstp, sge, sge_count);
505 if (ret < 0) {
506 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
507 ret);
508 goto error;
509 }
510 inline_bytes -= ret;
511
512 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
513 inline_bytes);
514 dprintk("svcrdma: send_reply returns %d\n", ret);
515 return ret;
516 error:
517 svc_rdma_put_context(ctxt, 0);
518 put_page(res_page);
519 return ret;
520}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 000000000000..f09444c451bc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1080 @@
1/*
2 * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/svc_xprt.h>
43#include <linux/sunrpc/debug.h>
44#include <linux/sunrpc/rpc_rdma.h>
45#include <linux/spinlock.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
53 struct sockaddr *sa, int salen,
54 int flags);
55static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
56static void svc_rdma_release_rqst(struct svc_rqst *);
57static void rdma_destroy_xprt(struct svcxprt_rdma *xprt);
58static void dto_tasklet_func(unsigned long data);
59static void svc_rdma_detach(struct svc_xprt *xprt);
60static void svc_rdma_free(struct svc_xprt *xprt);
61static int svc_rdma_has_wspace(struct svc_xprt *xprt);
62static void rq_cq_reap(struct svcxprt_rdma *xprt);
63static void sq_cq_reap(struct svcxprt_rdma *xprt);
64
65DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
66static DEFINE_SPINLOCK(dto_lock);
67static LIST_HEAD(dto_xprt_q);
68
69static struct svc_xprt_ops svc_rdma_ops = {
70 .xpo_create = svc_rdma_create,
71 .xpo_recvfrom = svc_rdma_recvfrom,
72 .xpo_sendto = svc_rdma_sendto,
73 .xpo_release_rqst = svc_rdma_release_rqst,
74 .xpo_detach = svc_rdma_detach,
75 .xpo_free = svc_rdma_free,
76 .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
77 .xpo_has_wspace = svc_rdma_has_wspace,
78 .xpo_accept = svc_rdma_accept,
79};
80
81struct svc_xprt_class svc_rdma_class = {
82 .xcl_name = "rdma",
83 .xcl_owner = THIS_MODULE,
84 .xcl_ops = &svc_rdma_ops,
85 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
86};
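/*
 * A minimal registration sketch (the actual call lives with the
 * module setup code elsewhere in this patch):
 *
 *	if (svc_reg_xprt_class(&svc_rdma_class))
 *		dprintk("svcrdma: failed to register rdma class\n");
 *
 * after which a server can create an "rdma" listener with
 * svc_create_xprt(serv, "rdma", port, flags).
 */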
87
88static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
89{
90 int target;
91 int at_least_one = 0;
92 struct svc_rdma_op_ctxt *ctxt;
93
94 target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
95 xprt->sc_ctxt_max);
96
97 spin_lock_bh(&xprt->sc_ctxt_lock);
98 while (xprt->sc_ctxt_cnt < target) {
99 xprt->sc_ctxt_cnt++;
100 spin_unlock_bh(&xprt->sc_ctxt_lock);
101
102 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
103
104 spin_lock_bh(&xprt->sc_ctxt_lock);
105 if (ctxt) {
106 at_least_one = 1;
107 ctxt->next = xprt->sc_ctxt_head;
108 xprt->sc_ctxt_head = ctxt;
109 } else {
110 /* kmalloc failed...give up for now */
111 xprt->sc_ctxt_cnt--;
112 break;
113 }
114 }
115 spin_unlock_bh(&xprt->sc_ctxt_lock);
116 dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
117 xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
118 return at_least_one;
119}
120
121struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
122{
123 struct svc_rdma_op_ctxt *ctxt;
124
125 while (1) {
126 spin_lock_bh(&xprt->sc_ctxt_lock);
127 if (unlikely(xprt->sc_ctxt_head == NULL)) {
128 /* Try to bump my cache. */
129 spin_unlock_bh(&xprt->sc_ctxt_lock);
130
131 if (rdma_bump_context_cache(xprt))
132 continue;
133
134 printk(KERN_INFO "svcrdma: sleeping waiting for "
135 "context memory on xprt=%p\n",
136 xprt);
137 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
138 continue;
139 }
140 ctxt = xprt->sc_ctxt_head;
141 xprt->sc_ctxt_head = ctxt->next;
142 spin_unlock_bh(&xprt->sc_ctxt_lock);
143 ctxt->xprt = xprt;
144 INIT_LIST_HEAD(&ctxt->dto_q);
145 ctxt->count = 0;
146 break;
147 }
148 return ctxt;
149}
150
151void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
152{
153 struct svcxprt_rdma *xprt;
154 int i;
155
156 BUG_ON(!ctxt);
157 xprt = ctxt->xprt;
158 if (free_pages)
159 for (i = 0; i < ctxt->count; i++)
160 put_page(ctxt->pages[i]);
161
162 for (i = 0; i < ctxt->count; i++)
163 dma_unmap_single(xprt->sc_cm_id->device->dma_device,
164 ctxt->sge[i].addr,
165 ctxt->sge[i].length,
166 ctxt->direction);
167 spin_lock_bh(&xprt->sc_ctxt_lock);
168 ctxt->next = xprt->sc_ctxt_head;
169 xprt->sc_ctxt_head = ctxt;
170 spin_unlock_bh(&xprt->sc_ctxt_lock);
171}
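/*
 * Typical get/put pairing, mirroring the senders and completion
 * handlers in this file; free_pages is 1 only when ctxt->pages[]
 * owns its pages (receive buffers, reply headers):
 *
 *	ctxt = svc_rdma_get_context(xprt);
 *	... post a WR with wr_id = (unsigned long)ctxt ...
 *	svc_rdma_put_context(ctxt, 1);	(from the completion path)
 */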
172
173/* ib_cq event handler */
174static void cq_event_handler(struct ib_event *event, void *context)
175{
176 struct svc_xprt *xprt = context;
177 dprintk("svcrdma: received CQ event id=%d, context=%p\n",
178 event->event, context);
179 set_bit(XPT_CLOSE, &xprt->xpt_flags);
180}
181
182/* QP event handler */
183static void qp_event_handler(struct ib_event *event, void *context)
184{
185 struct svc_xprt *xprt = context;
186
187 switch (event->event) {
188 /* These are considered benign events */
189 case IB_EVENT_PATH_MIG:
190 case IB_EVENT_COMM_EST:
191 case IB_EVENT_SQ_DRAINED:
192 case IB_EVENT_QP_LAST_WQE_REACHED:
193 dprintk("svcrdma: QP event %d received for QP=%p\n",
194 event->event, event->element.qp);
195 break;
196 /* These are considered fatal events */
197 case IB_EVENT_PATH_MIG_ERR:
198 case IB_EVENT_QP_FATAL:
199 case IB_EVENT_QP_REQ_ERR:
200 case IB_EVENT_QP_ACCESS_ERR:
201 case IB_EVENT_DEVICE_FATAL:
202 default:
203 dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
204 "closing transport\n",
205 event->event, event->element.qp);
206 set_bit(XPT_CLOSE, &xprt->xpt_flags);
207 break;
208 }
209}
210
211/*
212 * Data Transfer Operation Tasklet
213 *
214 * Walks a list of transports with I/O pending, removing entries as
215 * they are added to the server's I/O pending list. Two bits indicate
216 * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
217 * spinlock that serializes access to the transport list with the RQ
218 * and SQ interrupt handlers.
219 */
220static void dto_tasklet_func(unsigned long data)
221{
222 struct svcxprt_rdma *xprt;
223 unsigned long flags;
224
225 spin_lock_irqsave(&dto_lock, flags);
226 while (!list_empty(&dto_xprt_q)) {
227 xprt = list_entry(dto_xprt_q.next,
228 struct svcxprt_rdma, sc_dto_q);
229 list_del_init(&xprt->sc_dto_q);
230 spin_unlock_irqrestore(&dto_lock, flags);
231
232 if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) {
233 ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
234 rq_cq_reap(xprt);
235 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
236 /*
237 * If data arrived before established event,
238 * don't enqueue. This defers RPC I/O until the
239 * RDMA connection is complete.
240 */
241 if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
242 svc_xprt_enqueue(&xprt->sc_xprt);
243 }
244
245 if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
246 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
247 sq_cq_reap(xprt);
248 }
249
250 spin_lock_irqsave(&dto_lock, flags);
251 }
252 spin_unlock_irqrestore(&dto_lock, flags);
253}
254
255/*
256 * Receive Queue Completion Handler
257 *
258 * Since an RQ completion handler is called on interrupt context, we
259 * need to defer the handling of the I/O to a tasklet
260 */
261static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
262{
263 struct svcxprt_rdma *xprt = cq_context;
264 unsigned long flags;
265
266 /*
267 * Set the bit regardless of whether or not it's on the list
268 * because it may be on the list already due to an SQ
269 * completion.
270 */
271 set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
272
273 /*
274 * If this transport is not already on the DTO transport queue,
275 * add it
276 */
277 spin_lock_irqsave(&dto_lock, flags);
278 if (list_empty(&xprt->sc_dto_q))
279 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
280 spin_unlock_irqrestore(&dto_lock, flags);
281
282 /* Tasklet does all the work to avoid irqsave locks. */
283 tasklet_schedule(&dto_tasklet);
284}
285
286/*
287 * rq_cq_reap - Process the RQ CQ.
288 *
289 * Take all completing WC off the CQE and enqueue the associated DTO
290 * context on the dto_q for the transport.
291 */
292static void rq_cq_reap(struct svcxprt_rdma *xprt)
293{
294 int ret;
295 struct ib_wc wc;
296 struct svc_rdma_op_ctxt *ctxt = NULL;
297
298 atomic_inc(&rdma_stat_rq_poll);
299
300 spin_lock_bh(&xprt->sc_rq_dto_lock);
301 while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
302 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
303 ctxt->wc_status = wc.status;
304 ctxt->byte_len = wc.byte_len;
305 if (wc.status != IB_WC_SUCCESS) {
306 /* Close the transport */
307 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
308 svc_rdma_put_context(ctxt, 1);
309 continue;
310 }
311 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
312 }
313 spin_unlock_bh(&xprt->sc_rq_dto_lock);
314
315 if (ctxt)
316 atomic_inc(&rdma_stat_rq_prod);
317}
318
319/*
320 * Send Queue Completion Handler - potentially called on interrupt context.
321 */
322static void sq_cq_reap(struct svcxprt_rdma *xprt)
323{
324 struct svc_rdma_op_ctxt *ctxt = NULL;
325 struct ib_wc wc;
326 struct ib_cq *cq = xprt->sc_sq_cq;
327 int ret;
328
329 atomic_inc(&rdma_stat_sq_poll);
330 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
331 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
332 xprt = ctxt->xprt;
333
334 if (wc.status != IB_WC_SUCCESS)
335 /* Close the transport */
336 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
337
338 /* Decrement used SQ WR count */
339 atomic_dec(&xprt->sc_sq_count);
340 wake_up(&xprt->sc_send_wait);
341
342 switch (ctxt->wr_op) {
343 case IB_WR_SEND:
344 case IB_WR_RDMA_WRITE:
345 svc_rdma_put_context(ctxt, 1);
346 break;
347
348 case IB_WR_RDMA_READ:
349 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
350 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
351 set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
352 spin_lock_bh(&xprt->sc_read_complete_lock);
353 list_add_tail(&ctxt->dto_q,
354 &xprt->sc_read_complete_q);
355 spin_unlock_bh(&xprt->sc_read_complete_lock);
356 svc_xprt_enqueue(&xprt->sc_xprt);
357 }
358 break;
359
360 default:
361 printk(KERN_ERR "svcrdma: unexpected completion type, "
362 "opcode=%d, status=%d\n",
363 wc.opcode, wc.status);
364 break;
365 }
366 }
367
368 if (ctxt)
369 atomic_inc(&rdma_stat_sq_prod);
370}
371
372static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
373{
374 struct svcxprt_rdma *xprt = cq_context;
375 unsigned long flags;
376
377 /*
378 * Set the bit regardless of whether or not it's on the list
379 * because it may be on the list already due to an RQ
380 * completion.
381 */
382 set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
383
384 /*
385 * If this transport is not already on the DTO transport queue,
386 * add it
387 */
388 spin_lock_irqsave(&dto_lock, flags);
389 if (list_empty(&xprt->sc_dto_q))
390 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
391 spin_unlock_irqrestore(&dto_lock, flags);
392
393 /* Tasklet does all the work to avoid irqsave locks. */
394 tasklet_schedule(&dto_tasklet);
395}
396
397static void create_context_cache(struct svcxprt_rdma *xprt,
398 int ctxt_count, int ctxt_bump, int ctxt_max)
399{
400 struct svc_rdma_op_ctxt *ctxt;
401 int i;
402
403 xprt->sc_ctxt_max = ctxt_max;
404 xprt->sc_ctxt_bump = ctxt_bump;
405 xprt->sc_ctxt_cnt = 0;
406 xprt->sc_ctxt_head = NULL;
407 for (i = 0; i < ctxt_count; i++) {
408 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
409 if (ctxt) {
410 ctxt->next = xprt->sc_ctxt_head;
411 xprt->sc_ctxt_head = ctxt;
412 xprt->sc_ctxt_cnt++;
413 }
414 }
415}
416
417static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt)
418{
419 struct svc_rdma_op_ctxt *next;
420 if (!ctxt)
421 return;
422
423 do {
424 next = ctxt->next;
425 kfree(ctxt);
426 ctxt = next;
427 } while (next);
428}
429
430static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
431 int listener)
432{
433 struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
434
435 if (!cma_xprt)
436 return NULL;
437 svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv);
438 INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
439 INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
440 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
441 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
442 init_waitqueue_head(&cma_xprt->sc_send_wait);
443
444 spin_lock_init(&cma_xprt->sc_lock);
445 spin_lock_init(&cma_xprt->sc_read_complete_lock);
446 spin_lock_init(&cma_xprt->sc_ctxt_lock);
447 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
448
449 cma_xprt->sc_ord = svcrdma_ord;
450
451 cma_xprt->sc_max_req_size = svcrdma_max_req_size;
452 cma_xprt->sc_max_requests = svcrdma_max_requests;
453 cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
454 atomic_set(&cma_xprt->sc_sq_count, 0);
455
456 if (!listener) {
457 int reqs = cma_xprt->sc_max_requests;
458 create_context_cache(cma_xprt,
459 reqs << 1, /* starting size */
460 reqs, /* bump amount */
461 reqs +
462 cma_xprt->sc_sq_depth +
463 RPCRDMA_MAX_THREADS + 1); /* max */
464 if (!cma_xprt->sc_ctxt_head) {
465 kfree(cma_xprt);
466 return NULL;
467 }
468 clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
469 } else
470 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
471
472 return cma_xprt;
473}
474
475struct page *svc_rdma_get_page(void)
476{
477 struct page *page;
478
479 while ((page = alloc_page(GFP_KERNEL)) == NULL) {
480 /* If we can't get memory, wait a bit and try again */
481		printk(KERN_INFO "svcrdma: out of memory...retrying in 1 "
482		       "second.\n");
483 schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
484 }
485 return page;
486}
487
488int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
489{
490 struct ib_recv_wr recv_wr, *bad_recv_wr;
491 struct svc_rdma_op_ctxt *ctxt;
492 struct page *page;
493 unsigned long pa;
494 int sge_no;
495 int buflen;
496 int ret;
497
498 ctxt = svc_rdma_get_context(xprt);
499 buflen = 0;
500 ctxt->direction = DMA_FROM_DEVICE;
501 for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
502 BUG_ON(sge_no >= xprt->sc_max_sge);
503 page = svc_rdma_get_page();
504 ctxt->pages[sge_no] = page;
505 pa = ib_dma_map_page(xprt->sc_cm_id->device,
506 page, 0, PAGE_SIZE,
507 DMA_FROM_DEVICE);
508 ctxt->sge[sge_no].addr = pa;
509 ctxt->sge[sge_no].length = PAGE_SIZE;
510 ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
511 buflen += PAGE_SIZE;
512 }
513 ctxt->count = sge_no;
514 recv_wr.next = NULL;
515 recv_wr.sg_list = &ctxt->sge[0];
516 recv_wr.num_sge = ctxt->count;
517 recv_wr.wr_id = (u64)(unsigned long)ctxt;
518
519 ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
520 return ret;
521}
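/*
 * Sizing note for the loop above: each posted receive covers
 * sc_max_req_size bytes in PAGE_SIZE slices. With assumed example
 * values of a 16KB maximum request and 4KB pages, every RQ WQE
 * therefore pins four pages and consumes four SGEs.
 */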
522
523/*
524 * This function handles the CONNECT_REQUEST event on a listening
525 * endpoint. It is passed the cma_id for the _new_ connection. The context in
526 * this cma_id is inherited from the listening cma_id and is the svc_xprt
527 * structure for the listening endpoint.
528 *
529 * This function creates a new xprt for the new connection and enqueues it on
530 * the accept queue for the listening xprt. When the listen thread is
531 * kicked, it will call the accept method on the listening xprt, which
532 * will accept the new connection.
533 */
534static void handle_connect_req(struct rdma_cm_id *new_cma_id)
535{
536 struct svcxprt_rdma *listen_xprt = new_cma_id->context;
537 struct svcxprt_rdma *newxprt;
538
539 /* Create a new transport */
540 newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
541 if (!newxprt) {
542 dprintk("svcrdma: failed to create new transport\n");
543 return;
544 }
545 newxprt->sc_cm_id = new_cma_id;
546 new_cma_id->context = newxprt;
547 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
548 newxprt, newxprt->sc_cm_id, listen_xprt);
549
550 /*
551 * Enqueue the new transport on the accept queue of the listening
552 * transport
553 */
554 spin_lock_bh(&listen_xprt->sc_lock);
555 list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
556 spin_unlock_bh(&listen_xprt->sc_lock);
557
558 /*
559 * Can't use svc_xprt_received here because we are not on a
560 * rqstp thread
561 */
562 set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
563 svc_xprt_enqueue(&listen_xprt->sc_xprt);
564}
565
566/*
567 * Handles events generated on the listening endpoint. These events will
568 * either be incoming connect requests or adapter removal events.
569 */
570static int rdma_listen_handler(struct rdma_cm_id *cma_id,
571 struct rdma_cm_event *event)
572{
573 struct svcxprt_rdma *xprt = cma_id->context;
574 int ret = 0;
575
576 switch (event->event) {
577 case RDMA_CM_EVENT_CONNECT_REQUEST:
578 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
579 "event=%d\n", cma_id, cma_id->context, event->event);
580 handle_connect_req(cma_id);
581 break;
582
583 case RDMA_CM_EVENT_ESTABLISHED:
584 /* Accept complete */
585 dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
586 "cm_id=%p\n", xprt, cma_id);
587 break;
588
589 case RDMA_CM_EVENT_DEVICE_REMOVAL:
590 dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
591 xprt, cma_id);
592 if (xprt)
593 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
594 break;
595
596 default:
597 dprintk("svcrdma: Unexpected event on listening endpoint %p, "
598 "event=%d\n", cma_id, event->event);
599 break;
600 }
601
602 return ret;
603}
604
605static int rdma_cma_handler(struct rdma_cm_id *cma_id,
606 struct rdma_cm_event *event)
607{
608 struct svc_xprt *xprt = cma_id->context;
609 struct svcxprt_rdma *rdma =
610 container_of(xprt, struct svcxprt_rdma, sc_xprt);
611 switch (event->event) {
612 case RDMA_CM_EVENT_ESTABLISHED:
613 /* Accept complete */
614 dprintk("svcrdma: Connection completed on DTO xprt=%p, "
615 "cm_id=%p\n", xprt, cma_id);
616 clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
617 svc_xprt_enqueue(xprt);
618 break;
619 case RDMA_CM_EVENT_DISCONNECTED:
620 dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
621 xprt, cma_id);
622 if (xprt) {
623 set_bit(XPT_CLOSE, &xprt->xpt_flags);
624 svc_xprt_enqueue(xprt);
625 }
626 break;
627 case RDMA_CM_EVENT_DEVICE_REMOVAL:
628 dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
629 "event=%d\n", cma_id, xprt, event->event);
630 if (xprt) {
631 set_bit(XPT_CLOSE, &xprt->xpt_flags);
632 svc_xprt_enqueue(xprt);
633 }
634 break;
635 default:
636 dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
637 "event=%d\n", cma_id, event->event);
638 break;
639 }
640 return 0;
641}
642
643/*
644 * Create a listening RDMA service endpoint.
645 */
646static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
647 struct sockaddr *sa, int salen,
648 int flags)
649{
650 struct rdma_cm_id *listen_id;
651 struct svcxprt_rdma *cma_xprt;
652 struct svc_xprt *xprt;
653 int ret;
654
655 dprintk("svcrdma: Creating RDMA socket\n");
656
657 cma_xprt = rdma_create_xprt(serv, 1);
658 if (!cma_xprt)
659		return ERR_PTR(-ENOMEM);
660 xprt = &cma_xprt->sc_xprt;
661
662 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
663 if (IS_ERR(listen_id)) {
664 rdma_destroy_xprt(cma_xprt);
665 dprintk("svcrdma: rdma_create_id failed = %ld\n",
666 PTR_ERR(listen_id));
667 return (void *)listen_id;
668 }
669 ret = rdma_bind_addr(listen_id, sa);
670 if (ret) {
671 rdma_destroy_xprt(cma_xprt);
672 rdma_destroy_id(listen_id);
673 dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
674 return ERR_PTR(ret);
675 }
676 cma_xprt->sc_cm_id = listen_id;
677
678 ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
679 if (ret) {
680 rdma_destroy_id(listen_id);
681 rdma_destroy_xprt(cma_xprt);
682		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
		return ERR_PTR(ret);
683	}
684
685 /*
686 * We need to use the address from the cm_id in case the
687 * caller specified 0 for the port number.
688 */
689 sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
690 svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
691
692 return &cma_xprt->sc_xprt;
693}
694
695/*
696 * This is the xpo_accept function for listening endpoints. Its
697 * purpose is to accept incoming connections. The CMA callback handler
698 * has already created a new transport and attached it to the new CMA
699 * ID.
700 *
701 * There is a queue of pending connections hung on the listening
702 * transport. This queue contains the new svc_xprt structure. This
703 * function takes svc_xprt structures off the accept_q and completes
704 * the connection.
705 */
706static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
707{
708 struct svcxprt_rdma *listen_rdma;
709 struct svcxprt_rdma *newxprt = NULL;
710 struct rdma_conn_param conn_param;
711 struct ib_qp_init_attr qp_attr;
712 struct ib_device_attr devattr;
713 struct sockaddr *sa;
714 int ret;
715 int i;
716
717 listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
718 clear_bit(XPT_CONN, &xprt->xpt_flags);
719 /* Get the next entry off the accept list */
720 spin_lock_bh(&listen_rdma->sc_lock);
721 if (!list_empty(&listen_rdma->sc_accept_q)) {
722 newxprt = list_entry(listen_rdma->sc_accept_q.next,
723 struct svcxprt_rdma, sc_accept_q);
724 list_del_init(&newxprt->sc_accept_q);
725 }
726 if (!list_empty(&listen_rdma->sc_accept_q))
727 set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
728 spin_unlock_bh(&listen_rdma->sc_lock);
729 if (!newxprt)
730 return NULL;
731
732 dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
733 newxprt, newxprt->sc_cm_id);
734
735 ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
736 if (ret) {
737 dprintk("svcrdma: could not query device attributes on "
738 "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
739 goto errout;
740 }
741
742 /* Qualify the transport resource defaults with the
743 * capabilities of this particular device */
744 newxprt->sc_max_sge = min((size_t)devattr.max_sge,
745 (size_t)RPCSVC_MAXPAGES);
746 newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
747 (size_t)svcrdma_max_requests);
748 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
749
750 newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom,
751 (size_t)svcrdma_ord);
752
753 newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
754 if (IS_ERR(newxprt->sc_pd)) {
755 dprintk("svcrdma: error creating PD for connect request\n");
756 goto errout;
757 }
758 newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
759 sq_comp_handler,
760 cq_event_handler,
761 newxprt,
762 newxprt->sc_sq_depth,
763 0);
764 if (IS_ERR(newxprt->sc_sq_cq)) {
765 dprintk("svcrdma: error creating SQ CQ for connect request\n");
766 goto errout;
767 }
768 newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
769 rq_comp_handler,
770 cq_event_handler,
771 newxprt,
772 newxprt->sc_max_requests,
773 0);
774 if (IS_ERR(newxprt->sc_rq_cq)) {
775 dprintk("svcrdma: error creating RQ CQ for connect request\n");
776 goto errout;
777 }
778
779 memset(&qp_attr, 0, sizeof qp_attr);
780 qp_attr.event_handler = qp_event_handler;
781 qp_attr.qp_context = &newxprt->sc_xprt;
782 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
783 qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
784 qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
785 qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
786 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
787 qp_attr.qp_type = IB_QPT_RC;
788 qp_attr.send_cq = newxprt->sc_sq_cq;
789 qp_attr.recv_cq = newxprt->sc_rq_cq;
790 dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
791 " cm_id->device=%p, sc_pd->device=%p\n"
792 " cap.max_send_wr = %d\n"
793 " cap.max_recv_wr = %d\n"
794 " cap.max_send_sge = %d\n"
795 " cap.max_recv_sge = %d\n",
796 newxprt->sc_cm_id, newxprt->sc_pd,
797 newxprt->sc_cm_id->device, newxprt->sc_pd->device,
798 qp_attr.cap.max_send_wr,
799 qp_attr.cap.max_recv_wr,
800 qp_attr.cap.max_send_sge,
801 qp_attr.cap.max_recv_sge);
802
803 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
804 if (ret) {
805 /*
806 * XXX: This is a hack. We need a xx_request_qp interface
807 * that will adjust the qp_attr's with a best-effort
808 * number
809 */
810 qp_attr.cap.max_send_sge -= 2;
811 qp_attr.cap.max_recv_sge -= 2;
812 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd,
813 &qp_attr);
814 if (ret) {
815 dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
816 goto errout;
817 }
818		newxprt->sc_max_sge = min(qp_attr.cap.max_send_sge,
819					  qp_attr.cap.max_recv_sge);
820 newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
821 newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
822 }
823 newxprt->sc_qp = newxprt->sc_cm_id->qp;
824
825 /* Register all of physical memory */
826 newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
827 IB_ACCESS_LOCAL_WRITE |
828 IB_ACCESS_REMOTE_WRITE);
829 if (IS_ERR(newxprt->sc_phys_mr)) {
830 dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
831 goto errout;
832 }
833
834 /* Post receive buffers */
835 for (i = 0; i < newxprt->sc_max_requests; i++) {
836 ret = svc_rdma_post_recv(newxprt);
837 if (ret) {
838 dprintk("svcrdma: failure posting receive buffers\n");
839 goto errout;
840 }
841 }
842
843 /* Swap out the handler */
844 newxprt->sc_cm_id->event_handler = rdma_cma_handler;
845
846 /* Accept Connection */
847 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
848 memset(&conn_param, 0, sizeof conn_param);
849 conn_param.responder_resources = 0;
850 conn_param.initiator_depth = newxprt->sc_ord;
851 ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
852 if (ret) {
853 dprintk("svcrdma: failed to accept new connection, ret=%d\n",
854 ret);
855 goto errout;
856 }
857
858 dprintk("svcrdma: new connection %p accepted with the following "
859 "attributes:\n"
860 " local_ip : %d.%d.%d.%d\n"
861 " local_port : %d\n"
862 " remote_ip : %d.%d.%d.%d\n"
863 " remote_port : %d\n"
864 " max_sge : %d\n"
865 " sq_depth : %d\n"
866 " max_requests : %d\n"
867 " ord : %d\n",
868 newxprt,
869 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
870 route.addr.src_addr)->sin_addr.s_addr),
871 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
872 route.addr.src_addr)->sin_port),
873 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
874 route.addr.dst_addr)->sin_addr.s_addr),
875 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
876 route.addr.dst_addr)->sin_port),
877 newxprt->sc_max_sge,
878 newxprt->sc_sq_depth,
879 newxprt->sc_max_requests,
880 newxprt->sc_ord);
881
882 /* Set the local and remote addresses in the transport */
883 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
884 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
885 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
886 svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
887
888 ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
889 ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
890 return &newxprt->sc_xprt;
891
892 errout:
893 dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
894 rdma_destroy_id(newxprt->sc_cm_id);
895 rdma_destroy_xprt(newxprt);
896 return NULL;
897}
898
899/*
900 * Post an RQ WQE to the RQ when the rqst is being released. This
901 * effectively returns an RQ credit to the client. The rq_xprt_ctxt
902 * will be NULL if the request was deferred due to an RDMA_READ or the
903 * transport had no data ready (EAGAIN). Note that an RPC deferred in
904 * svc_process will still return the credit; this is because the data
905 * is copied and no longer consumes a WQE/WC.
906 */
907static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
908{
909 int err;
910 struct svcxprt_rdma *rdma =
911 container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
912 if (rqstp->rq_xprt_ctxt) {
913 BUG_ON(rqstp->rq_xprt_ctxt != rdma);
914 err = svc_rdma_post_recv(rdma);
915 if (err)
916 dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
917 err);
918 }
919 rqstp->rq_xprt_ctxt = NULL;
920}
921
922/* Disable data ready events for this connection */
923static void svc_rdma_detach(struct svc_xprt *xprt)
924{
925 struct svcxprt_rdma *rdma =
926 container_of(xprt, struct svcxprt_rdma, sc_xprt);
927 unsigned long flags;
928
929 dprintk("svc: svc_rdma_detach(%p)\n", xprt);
930 /*
931 * Shutdown the connection. This will ensure we don't get any
932 * more events from the provider.
933 */
934 rdma_disconnect(rdma->sc_cm_id);
935 rdma_destroy_id(rdma->sc_cm_id);
936
937 /* We may already be on the DTO list */
938 spin_lock_irqsave(&dto_lock, flags);
939 if (!list_empty(&rdma->sc_dto_q))
940 list_del_init(&rdma->sc_dto_q);
941 spin_unlock_irqrestore(&dto_lock, flags);
942}
943
944static void svc_rdma_free(struct svc_xprt *xprt)
945{
946 struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt;
947 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
948 rdma_destroy_xprt(rdma);
949 kfree(rdma);
950}
951
952static void rdma_destroy_xprt(struct svcxprt_rdma *xprt)
953{
954 if (xprt->sc_qp && !IS_ERR(xprt->sc_qp))
955 ib_destroy_qp(xprt->sc_qp);
956
957 if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq))
958 ib_destroy_cq(xprt->sc_sq_cq);
959
960 if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq))
961 ib_destroy_cq(xprt->sc_rq_cq);
962
963 if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr))
964 ib_dereg_mr(xprt->sc_phys_mr);
965
966 if (xprt->sc_pd && !IS_ERR(xprt->sc_pd))
967 ib_dealloc_pd(xprt->sc_pd);
968
969 destroy_context_cache(xprt->sc_ctxt_head);
970}
971
972static int svc_rdma_has_wspace(struct svc_xprt *xprt)
973{
974 struct svcxprt_rdma *rdma =
975 container_of(xprt, struct svcxprt_rdma, sc_xprt);
976
977 /*
978 * If there are fewer SQ WR available than required to send a
979 * simple response, return false.
980 */
981 if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
982 return 0;
983
984 /*
985 * ...or there are already waiters on the SQ,
986 * return false.
987 */
988 if (waitqueue_active(&rdma->sc_send_wait))
989 return 0;
990
991 /* Otherwise return true. */
992 return 1;
993}
994
995int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
996{
997 struct ib_send_wr *bad_wr;
998 int ret;
999
1000 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1001 return 0;
1002
1003 BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
1004 BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
1005 wr->opcode);
1006 /* If the SQ is full, wait until an SQ entry is available */
1007 while (1) {
1008 spin_lock_bh(&xprt->sc_lock);
1009 if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
1010 spin_unlock_bh(&xprt->sc_lock);
1011 atomic_inc(&rdma_stat_sq_starve);
1012 /* See if we can reap some SQ WR */
1013 sq_cq_reap(xprt);
1014
1015 /* Wait until SQ WR available if SQ still full */
1016 wait_event(xprt->sc_send_wait,
1017 atomic_read(&xprt->sc_sq_count) <
1018 xprt->sc_sq_depth);
1019 continue;
1020 }
1021		/* Bump the used SQ WR count and post */
1022 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1023 if (!ret)
1024 atomic_inc(&xprt->sc_sq_count);
1025 else
1026 dprintk("svcrdma: failed to post SQ WR rc=%d, "
1027 "sc_sq_count=%d, sc_sq_depth=%d\n",
1028 ret, atomic_read(&xprt->sc_sq_count),
1029 xprt->sc_sq_depth);
1030 spin_unlock_bh(&xprt->sc_lock);
1031 break;
1032 }
1033 return ret;
1034}
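/*
 * Caller sketch, mirroring send_reply() and send_write() above:
 * build a signaled WR whose wr_id carries the op context, then let
 * this routine absorb SQ back-pressure:
 *
 *	memset(&wr, 0, sizeof wr);
 *	ctxt->wr_op = IB_WR_SEND;
 *	wr.wr_id = (unsigned long)ctxt;
 *	wr.opcode = IB_WR_SEND;
 *	wr.send_flags = IB_SEND_SIGNALED;
 *	if (svc_rdma_send(rdma, &wr))
 *		svc_rdma_put_context(ctxt, 1);
 */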
1035
1036int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1037 enum rpcrdma_errcode err)
1038{
1039 struct ib_send_wr err_wr;
1040 struct ib_sge sge;
1041 struct page *p;
1042 struct svc_rdma_op_ctxt *ctxt;
1043 u32 *va;
1044 int length;
1045 int ret;
1046
1047 p = svc_rdma_get_page();
1048 va = page_address(p);
1049
1050 /* XDR encode error */
1051 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
1052
1053 /* Prepare SGE for local address */
1054 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
1055				   p, 0, PAGE_SIZE, DMA_TO_DEVICE);
1056 sge.lkey = xprt->sc_phys_mr->lkey;
1057 sge.length = length;
1058
1059	ctxt = svc_rdma_get_context(xprt);
1060	ctxt->direction = DMA_TO_DEVICE;
1061	ctxt->count = 1;
	ctxt->pages[0] = p;
1062
1063 /* Prepare SEND WR */
1064 memset(&err_wr, 0, sizeof err_wr);
1065 ctxt->wr_op = IB_WR_SEND;
1066 err_wr.wr_id = (unsigned long)ctxt;
1067 err_wr.sg_list = &sge;
1068 err_wr.num_sge = 1;
1069 err_wr.opcode = IB_WR_SEND;
1070 err_wr.send_flags = IB_SEND_SIGNALED;
1071
1072 /* Post It */
1073 ret = svc_rdma_send(xprt, &err_wr);
1074 if (ret) {
1075 dprintk("svcrdma: Error posting send = %d\n", ret);
1076 svc_rdma_put_context(ctxt, 1);
1077 }
1078
1079 return ret;
1080}