2 files changed, 993 insertions, 0 deletions
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
new file mode 100644
index 00000000000..dbd3b16fd13
--- /dev/null
+++ b/drivers/xen/xenfs/privcmd.c
@@ -0,0 +1,400 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); /* forward declaration; the definition follows privcmd_mmap() later in this file */
+#endif
+/*
+ * IOCTL_PRIVCMD_HYPERCALL: read a struct privcmd_hypercall from
+ * userspace and pass its op code and five arguments to privcmd_call().
+ * Returns -EFAULT if the request cannot be copied in, otherwise the
+ * value produced by privcmd_call().
+ */
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+        struct privcmd_hypercall hc;
+        long result;
+        if (copy_from_user(&hc, udata, sizeof(hc)) != 0)
+                return -EFAULT;
+        result = privcmd_call(hc.op,
+                              hc.arg[0], hc.arg[1], hc.arg[2],
+                              hc.arg[3], hc.arg[4]);
+        return result;
+}
+/*
+ * Free every page linked on "pages" through its lru member, then
+ * reinitialise the head so the list is left empty.
+ */
+static void free_page_list(struct list_head *pages)
+{
+        struct list_head *cur, *next;
+        list_for_each_safe(cur, next, pages)
+                __free_page(list_entry(cur, struct page, lru));
+        INIT_LIST_HEAD(pages);
+}
+/*
+ * Copy an array of "nelem" items of "size" bytes each from userspace
+ * into freshly allocated pages appended to "pagelist".  Items never
+ * straddle a page boundary.
+ *
+ * Returns 0 on success, -ENOMEM if a page cannot be allocated and
+ * -EFAULT if user memory cannot be read.  On failure the pages
+ * gathered so far stay on the list; it's up to the caller to dispose
+ * of the partial list.  An element size larger than a page gathers
+ * nothing and returns 0.
+ */
+static int gather_array(struct list_head *pagelist,
+                        unsigned nelem, size_t size,
+                        void __user *data)
+{
+        unsigned offset = PAGE_SIZE;    /* forces an allocation for the first item */
+        void *base = NULL;              /* assigned before first use */
+        if (size > PAGE_SIZE)
+                return 0;
+        for (; nelem != 0; nelem--) {
+                if (offset > PAGE_SIZE-size) {
+                        /* No room left for a whole item: start a new page. */
+                        struct page *page = alloc_page(GFP_KERNEL);
+                        if (page == NULL)
+                                return -ENOMEM;
+                        base = page_address(page);
+                        list_add_tail(&page->lru, pagelist);
+                        offset = 0;
+                }
+                if (copy_from_user(base + offset, data, size))
+                        return -EFAULT;
+                data += size;
+                offset += size;
+        }
+        return 0;
+}
+/*
+ * Call "fn" on each of the "nelem" elements of "size" bytes stored
+ * across a list of pages.  "pos" is the list head; the first page
+ * examined is pos->next.  Iteration stops at the first non-zero
+ * return from "fn", which is then returned; otherwise returns 0.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+                          struct list_head *pos,
+                          int (*fn)(void *data, void *state),
+                          void *state)
+{
+        void *base = NULL;              /* assigned before first use */
+        unsigned offset = PAGE_SIZE;    /* forces a step to the first page */
+        BUG_ON(size > PAGE_SIZE);
+        for (; nelem != 0; nelem--) {
+                int rc;
+                if (offset > PAGE_SIZE-size) {
+                        /* No whole element left in this page: move on. */
+                        pos = pos->next;
+                        base = page_address(list_entry(pos, struct page, lru));
+                        offset = 0;
+                }
+                rc = (*fn)(base + offset, state);
+                if (rc)
+                        return rc;
+                offset += size;
+        }
+        return 0;
+}
+struct mmap_mfn_state {         /* cursor handed to mmap_mfn_range() through traverse_pages() */
+        unsigned long va;       /* address the next entry must start at; advanced after each mapped range */
+        struct vm_area_struct *vma;     /* VMA the ranges are mapped into */
+        domid_t domain;         /* domain argument forwarded to xen_remap_domain_mfn_range() */
+};
+static int mmap_mfn_range(void *data, void *state)      /* traverse_pages() callback used by privcmd_ioctl_mmap() */
+{
+        struct privcmd_mmap_entry *msg = data;          /* one entry gathered from userspace */
+        struct mmap_mfn_state *st = state;              /* running cursor shared across entries */
+        struct vm_area_struct *vma = st->vma;
+        int rc;
+        /* Do not allow range to wrap the address space: the page count
+         * must be small enough to shift into a byte count, and that
+         * byte count must be smaller than -st->va, i.e. the unsigned
+         * distance from st->va to the top of the address space. */
+        if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+            ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+                return -EINVAL;
+        /* Range chunks must be contiguous in va space: each entry has
+         * to start exactly where the previous one ended and must not
+         * extend past the end of the VMA. */
+        if ((msg->va != st->va) ||
+            ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+                return -EINVAL;
+        rc = xen_remap_domain_mfn_range(vma,
+                                        msg->va & PAGE_MASK,
+                                        msg->mfn, msg->npages,
+                                        vma->vm_page_prot,
+                                        st->domain);
+        if (rc < 0)
+                return rc;      /* a non-zero return stops traverse_pages() */
+        st->va += msg->npages << PAGE_SHIFT;    /* next entry must begin here */
+        return 0;
+}
+/*
+ * Tentative declaration of the privcmd VMA operations table, needed
+ * here so the ioctl can recognise VMAs created through this driver.
+ */
+static struct vm_operations_struct privcmd_vm_ops;
+/*
+ * IOCTL_PRIVCMD_MMAP: map a list of machine-frame ranges into a VMA
+ * of the calling process.
+ *
+ * The array of struct privcmd_mmap_entry is gathered into kernel
+ * pages first; the first entry selects the VMA, which must
+ *   - exist and start exactly at that entry's va,
+ *   - use privcmd_vm_ops (same requirement as the batch ioctl), and
+ *   - not have been populated before (single-shot mapping).
+ * Each entry is then mapped by mmap_mfn_range() under mmap_sem.
+ *
+ * Returns 0 on success (including when no entries were gathered),
+ * -EPERM outside the initial domain, -EFAULT/-ENOMEM from gathering,
+ * -EINVAL for an unsuitable VMA, or the error of the first entry
+ * that failed to map.
+ */
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+        struct privcmd_mmap mmapcmd;
+        struct mm_struct *mm = current->mm;
+        struct vm_area_struct *vma;
+        struct privcmd_mmap_entry *first;
+        struct mmap_mfn_state state;
+        int rc;
+        LIST_HEAD(pagelist);
+        if (!xen_initial_domain())
+                return -EPERM;
+        if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+                return -EFAULT;
+        rc = gather_array(&pagelist,
+                          mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+                          mmapcmd.entry);
+        if (rc || list_empty(&pagelist))
+                goto out;
+        down_write(&mm->mmap_sem);
+        first = page_address(list_first_entry(&pagelist, struct page, lru));
+        vma = find_vma(mm, first->va);
+        rc = -EINVAL;
+        /*
+         * Refuse VMAs that were not set up by this driver before
+         * touching them: the single-shot check below overwrites
+         * vm_private_data, which must not happen to a foreign VMA.
+         * The single-shot test stays last because it has a side effect.
+         */
+        if (!vma ||
+            vma->vm_ops != &privcmd_vm_ops ||
+            (first->va != vma->vm_start) ||
+            !privcmd_enforce_singleshot_mapping(vma))
+                goto out_up;
+        state.va = vma->vm_start;
+        state.vma = vma;
+        state.domain = mmapcmd.dom;
+        rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+                            &pagelist,
+                            mmap_mfn_range, &state);
+out_up:
+        up_write(&mm->mmap_sem);
+out:
+        free_page_list(&pagelist);
+        return rc;
+}
+struct mmap_batch_state {       /* state shared by mmap_batch_fn() and mmap_return_errors() */
+        domid_t domain;         /* domain argument forwarded to xen_remap_domain_mfn_range() */
+        unsigned long va;       /* address of the next page to map; advanced by PAGE_SIZE per frame */
+        struct vm_area_struct *vma;     /* VMA being populated */
+        int err;                /* number of frames that failed to map */
+        xen_pfn_t __user *user; /* next slot of the user array to write back; set only before the error pass */
+};
+/*
+ * traverse_pages() callback for the batch ioctl: map the single frame
+ * *data at the current address.  A failed mapping is recorded by
+ * setting the 0xf0000000 bits in the frame number and bumping the
+ * error count; the address always advances by one page and the
+ * callback always returns 0 so the traversal continues.
+ */
+static int mmap_batch_fn(void *data, void *state)
+{
+        struct mmap_batch_state *st = state;
+        xen_pfn_t *mfnp = data;
+        int rc;
+        rc = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK,
+                                        *mfnp, 1,
+                                        st->vma->vm_page_prot, st->domain);
+        if (rc < 0) {
+                st->err++;
+                *mfnp |= 0xf0000000U;
+        }
+        st->va += PAGE_SIZE;
+        return 0;
+}
+/*
+ * traverse_pages() callback: write one (possibly error-marked) frame
+ * number back to the next slot of the user array.  The user cursor
+ * advances whether or not the store succeeds; the put_user() result
+ * is returned.
+ */
+static int mmap_return_errors(void *data, void *state)
+{
+        struct mmap_batch_state *st = state;
+        xen_pfn_t *mfnp = data;
+        xen_pfn_t __user *slot = st->user;
+        st->user = slot + 1;
+        return put_user(*mfnp, slot);
+}
+static struct vm_operations_struct privcmd_vm_ops;
+/*
+ * IOCTL_PRIVCMD_MMAPBATCH: map an array of single machine frames into
+ * a privcmd VMA, one page per frame.
+ *
+ * The VMA must use privcmd_vm_ops, span exactly
+ * [addr, addr + num pages) and never have been populated before.
+ * Frames that fail to map are marked by mmap_batch_fn(); if any did,
+ * the whole array is copied back to userspace so the caller can see
+ * which ones.
+ *
+ * Returns -EPERM outside the initial domain, -EFAULT/-ENOMEM from
+ * gathering, -EINVAL for a bad count or VMA, otherwise the result of
+ * the last traversal.
+ */
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+        struct privcmd_mmapbatch req;
+        struct mmap_batch_state st;
+        struct mm_struct *mm = current->mm;
+        struct vm_area_struct *vma;
+        unsigned long npages;
+        int ret;
+        LIST_HEAD(pagelist);
+        if (!xen_initial_domain())
+                return -EPERM;
+        if (copy_from_user(&req, udata, sizeof(req)))
+                return -EFAULT;
+        npages = req.num;
+        if ((req.num <= 0) || (npages > (LONG_MAX >> PAGE_SHIFT)))
+                return -EINVAL;
+        ret = gather_array(&pagelist, req.num, sizeof(xen_pfn_t),
+                           req.arr);
+        if (ret || list_empty(&pagelist))
+                goto out;
+        down_write(&mm->mmap_sem);
+        vma = find_vma(mm, req.addr);
+        /* The single-shot test has a side effect, so it is checked last. */
+        if (!vma ||
+            vma->vm_ops != &privcmd_vm_ops ||
+            (req.addr != vma->vm_start) ||
+            ((req.addr + (npages << PAGE_SHIFT)) != vma->vm_end) ||
+            !privcmd_enforce_singleshot_mapping(vma)) {
+                ret = -EINVAL;
+                up_write(&mm->mmap_sem);
+                goto out;
+        }
+        st.domain = req.dom;
+        st.vma = vma;
+        st.va = req.addr;
+        st.err = 0;
+        ret = traverse_pages(req.num, sizeof(xen_pfn_t),
+                             &pagelist, mmap_batch_fn, &st);
+        up_write(&mm->mmap_sem);
+        if (st.err > 0) {
+                /* Report the marked frame numbers back to the caller. */
+                st.user = req.arr;
+                ret = traverse_pages(req.num, sizeof(xen_pfn_t),
+                                     &pagelist,
+                                     mmap_return_errors, &st);
+        }
+out:
+        free_page_list(&pagelist);
+        return ret;
+}
+/*
+ * unlocked_ioctl entry point: dispatch on "cmd" to the hypercall,
+ * mmap or mmap-batch handler, passing "data" as a user pointer.
+ * Unknown commands yield -EINVAL.
+ */
+static long privcmd_ioctl(struct file *file,
+                          unsigned int cmd, unsigned long data)
+{
+        void __user *arg = (void __user *) data;
+        int rc;
+        if (cmd == IOCTL_PRIVCMD_HYPERCALL)
+                rc = privcmd_ioctl_hypercall(arg);
+        else if (cmd == IOCTL_PRIVCMD_MMAP)
+                rc = privcmd_ioctl_mmap(arg);
+        else if (cmd == IOCTL_PRIVCMD_MMAPBATCH)
+                rc = privcmd_ioctl_mmap_batch(arg);
+        else
+                rc = -EINVAL;
+        return rc;
+}
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+/*
+ * Fault handler for privcmd VMAs: log the VMA bounds, page offset and
+ * faulting address at debug level, then fail the access with
+ * VM_FAULT_SIGBUS.
+ */
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        printk(KERN_DEBUG
+               "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+               vma,
+               vma->vm_start, vma->vm_end,
+               vmf->pgoff,
+               vmf->virtual_address);
+        return VM_FAULT_SIGBUS;
+}
+static struct vm_operations_struct privcmd_vm_ops = {   /* installed on VMAs by privcmd_mmap(); also compared against in the batch ioctl */
+        .fault = privcmd_fault  /* every fault on such a VMA returns VM_FAULT_SIGBUS */
+};
+/*
+ * mmap handler: prepare an empty privcmd VMA that the mmap ioctls
+ * will populate later.  Returns -ENOSYS for auto-translated guests,
+ * 0 otherwise.
+ */
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        /* Unsupported for auto-translate guests. */
+        if (xen_feature(XENFEAT_auto_translated_physmap))
+                return -ENOSYS;
+        /* NULL marks the VMA as not yet populated (see the single-shot check). */
+        vma->vm_private_data = NULL;
+        vma->vm_ops = &privcmd_vm_ops;
+        /* DONTCOPY is essential for Xen because copy_page_range doesn't know
+         * how to recreate these mappings */
+        vma->vm_flags |= VM_PFNMAP | VM_DONTCOPY | VM_IO | VM_RESERVED;
+        return 0;
+}
+/*
+ * Atomically mark the VMA as populated by swapping (void *)1 into
+ * vm_private_data.  Returns non-zero only for the first caller, i.e.
+ * when the previous value was NULL.
+ */
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+        void *previous = xchg(&vma->vm_private_data, (void *)1);
+        return previous == NULL;
+}
+#endif
+const struct file_operations privcmd_file_ops = {       /* file operations exported for the privcmd node */
+        .unlocked_ioctl = privcmd_ioctl,        /* dispatches the IOCTL_PRIVCMD_* commands */
+        .mmap = privcmd_mmap,   /* in this file only defined when HAVE_ARCH_PRIVCMD_MMAP is not set */
+};
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
new file mode 100644
index 00000000000..bbd000f88af
--- /dev/null
+++ b/drivers/xen/xenfs/xenbus.c
@@ -0,0 +1,593 @@
+/*
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Changes:
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "xenfs.h"
+#include "../xenbus/xenbus_comms.h"
+#include <xen/xenbus.h>
+#include <asm/xen/hypervisor.h>
+/*
+ * An element of a list of outstanding transactions, for which we're
+ * still waiting a reply.
+ */
+struct xenbus_transaction_holder {
+        struct list_head list;          /* link in xenbus_file_priv.transactions */
+        struct xenbus_transaction handle;       /* handle.id is parsed from the XS_TRANSACTION_START reply */
+};
+/*
+ * A buffer of data on the queue.
+ */
+struct read_buffer {
+        struct list_head list;  /* link in a reply queue such as xenbus_file_priv.read_buffers */
+        unsigned int cons;      /* bytes of msg[] already copied out to the reader */
+        unsigned int len;       /* total number of bytes stored in msg[] */
+        char msg[];             /* payload, allocated together with the header by queue_reply() */
+};
+struct xenbus_file_priv {       /* per-open state, allocated in xenbus_file_open() and stored in filp->private_data */
+        /*
+         * msgbuffer_mutex is held while partial requests are built up
+         * and complete requests are acted on.  It therefore protects
+         * the "transactions" and "watches" lists, and the partial
+         * request length and buffer.
+         *
+         * reply_mutex protects the reply being built up to return to
+         * usermode.  It nests inside msgbuffer_mutex but may be held
+         * alone during a watch callback.
+         */
+        struct mutex msgbuffer_mutex;
+        /* In-progress transactions */
+        struct list_head transactions;
+        /* Active watches. */
+        struct list_head watches;
+        /* Partial request. */
+        unsigned int len;       /* number of request bytes currently held in u.buffer */
+        union {
+                struct xsd_sockmsg msg;         /* header view of the buffered request */
+                char buffer[PAGE_SIZE];         /* raw bytes: header followed by the payload */
+        } u;
+        /* Response queue. */
+        struct mutex reply_mutex;
+        struct list_head read_buffers;  /* queued struct read_buffer entries awaiting read() */
+        wait_queue_head_t read_waitq;   /* woken whenever replies are added to read_buffers */
+};
+/*
+ * Read out any raw xenbus messages queued up.
+ *
+ * Copies as many queued reply bytes as fit into the user buffer,
+ * freeing each read_buffer once it has been fully consumed.  Blocks
+ * until data is available unless the file is O_NONBLOCK.
+ *
+ * Returns the number of bytes copied, 0 for a zero-length request,
+ * -EAGAIN when non-blocking and nothing is queued, the error from an
+ * interrupted wait, or -EFAULT if nothing at all could be copied to
+ * the user buffer.
+ */
+static ssize_t xenbus_file_read(struct file *filp,
+                               char __user *ubuf,
+                               size_t len, loff_t *ppos)
+{
+        struct xenbus_file_priv *u = filp->private_data;
+        struct read_buffer *rb;
+        ssize_t i;      /* signed so that -EFAULT survives the return */
+        int ret;
+        /*
+         * A zero-length read can never make progress; without this
+         * check the retry below would spin forever with the mutex held
+         * as soon as a reply is queued.
+         */
+        if (len == 0)
+                return 0;
+        mutex_lock(&u->reply_mutex);
+again:
+        while (list_empty(&u->read_buffers)) {
+                mutex_unlock(&u->reply_mutex);
+                if (filp->f_flags & O_NONBLOCK)
+                        return -EAGAIN;
+                ret = wait_event_interruptible(u->read_waitq,
+                                               !list_empty(&u->read_buffers));
+                if (ret)
+                        return ret;
+                mutex_lock(&u->reply_mutex);
+        }
+        rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+        i = 0;
+        while ((size_t)i < len) {
+                size_t sz = min_t(size_t, len - i, rb->len - rb->cons);
+                unsigned long left;
+                /* copy_to_user() returns the number of bytes NOT copied. */
+                left = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
+                i += sz - left;
+                rb->cons += sz - left;
+                if (left != 0) {
+                        /* Report a short read if anything was delivered. */
+                        if (i == 0)
+                                i = -EFAULT;
+                        goto out;
+                }
+                /* Clear out buffer if it has been consumed */
+                if (rb->cons == rb->len) {
+                        list_del(&rb->list);
+                        kfree(rb);
+                        if (list_empty(&u->read_buffers))
+                                break;
+                        rb = list_entry(u->read_buffers.next,
+                                        struct read_buffer, list);
+                }
+        }
+        if (i == 0)
+                goto again;
+out:
+        mutex_unlock(&u->reply_mutex);
+        return i;
+}
+/*
+ * Append a copy of "data" ("len" bytes) to "queue" as a new
+ * read_buffer.  Caller must hold the appropriate lock if the queue is
+ * not local.  (Commonly the caller builds up several buffers on a
+ * temporary local list and splices it onto the shared list under lock
+ * once all the buffers have been successfully allocated.)
+ *
+ * A zero-length request queues nothing.  Returns 0 on success or
+ * -ENOMEM if the buffer cannot be allocated.
+ */
+static int queue_reply(struct list_head *queue, const void *data, size_t len)
+{
+        struct read_buffer *buf;
+        if (!len)
+                return 0;
+        buf = kmalloc(sizeof(*buf) + len, GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+        memcpy(buf->msg, data, len);
+        buf->len = len;
+        buf->cons = 0;
+        list_add_tail(&buf->list, queue);
+        return 0;
+}
+/*
+ * Unlink and free every read_buffer on a list, leaving it empty.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+        struct read_buffer *buf, *next;
+        list_for_each_entry_safe(buf, next, list, list) {
+                list_del(&buf->list);
+                kfree(buf);
+        }
+}
+struct watch_adapter {          /* ties a registered xenbus watch to the open file that requested it */
+        struct list_head list;          /* link in xenbus_file_priv.watches */
+        struct xenbus_watch watch;      /* embedded watch; watch.node is a kstrdup()ed copy of the path */
+        struct xenbus_file_priv *dev_data;      /* owning file state; watch events are queued to its read_buffers */
+        char *token;                    /* kstrdup()ed token echoed back in watch events */
+};
+/*
+ * Free a watch adapter together with its duplicated token and node
+ * path.  Either string may be NULL (partially constructed adapter).
+ */
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+        kfree(watch->token);
+        kfree(watch->watch.node);
+        kfree(watch);
+}
+/*
+ * Allocate a zeroed watch adapter holding private copies of "path"
+ * and "token".  Returns NULL, with nothing left allocated, if any of
+ * the allocations fails.
+ */
+static struct watch_adapter *alloc_watch_adapter(const char *path,
+                                                 const char *token)
+{
+        struct watch_adapter *adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+        if (adap == NULL)
+                return NULL;
+        adap->watch.node = kstrdup(path, GFP_KERNEL);
+        /* Only duplicate the token once the path copy has succeeded. */
+        if (adap->watch.node != NULL)
+                adap->token = kstrdup(token, GFP_KERNEL);
+        if (adap->watch.node == NULL || adap->token == NULL) {
+                free_watch_adapter(adap);
+                return NULL;
+        }
+        return adap;
+}
+static void watch_fired(struct xenbus_watch *watch,     /* watch callback installed by xenbus_write_watch() */
+                        const char **vec,
+                        unsigned int len)
+{
+        struct watch_adapter *adap;
+        struct xsd_sockmsg hdr;
+        const char *path, *token;
+        int path_len, tok_len, body_len, data_len = 0;
+        int ret;
+        LIST_HEAD(staging_q);           /* local list, spliced onto the reader's queue only if complete */
+        adap = container_of(watch, struct watch_adapter, watch);
+        path = vec[XS_WATCH_PATH];
+        token = adap->token;            /* the token stored at registration is reported back */
+        path_len = strlen(path) + 1;    /* lengths include the terminating NUL */
+        tok_len = strlen(token) + 1;
+        if (len > 2)
+                data_len = vec[len] - vec[2] + 1;       /* NOTE(review): depends on what the watch dispatcher stores in vec[len] - confirm */
+        body_len = path_len + tok_len + data_len;
+        hdr.type = XS_WATCH_EVENT;      /* only type and len of the header are set here */
+        hdr.len = body_len;
+        mutex_lock(&adap->dev_data->reply_mutex);
+        ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
+        if (!ret)
+                ret = queue_reply(&staging_q, path, path_len);
+        if (!ret)
+                ret = queue_reply(&staging_q, token, tok_len);
+        if (!ret && len > 2)
+                ret = queue_reply(&staging_q, vec[2], data_len);
+        if (!ret) {
+                /* success: pass reply list onto watcher */
+                list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
+                wake_up(&adap->dev_data->read_waitq);
+        } else
+                queue_cleanup(&staging_q);      /* allocation failed: the event is dropped */
+        mutex_unlock(&adap->dev_data->reply_mutex);
+}
+/*
+ * Forward the complete request buffered in u->u to xenstore and queue
+ * the reply (header followed by body) for the reader.
+ *
+ * XS_TRANSACTION_START additionally records the new transaction on
+ * u->transactions; XS_TRANSACTION_END removes the matching record if
+ * this file handle holds one.  The tx_id of an END request comes
+ * straight from userspace, so an unknown id is tolerated rather than
+ * treated as a kernel bug.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * encoded in the pointer returned by xenbus_dev_request_and_reply().
+ */
+static int xenbus_write_transaction(unsigned msg_type,
+                                    struct xenbus_file_priv *u)
+{
+        int rc;
+        void *reply;
+        struct xenbus_transaction_holder *trans = NULL;
+        LIST_HEAD(staging_q);
+        if (msg_type == XS_TRANSACTION_START) {
+                /* Allocate up front so a started transaction is never lost. */
+                trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+                if (!trans) {
+                        rc = -ENOMEM;
+                        goto out;
+                }
+        }
+        reply = xenbus_dev_request_and_reply(&u->u.msg);
+        if (IS_ERR(reply)) {
+                kfree(trans);
+                rc = PTR_ERR(reply);
+                goto out;
+        }
+        if (msg_type == XS_TRANSACTION_START) {
+                trans->handle.id = simple_strtoul(reply, NULL, 0);
+                list_add(&trans->list, &u->transactions);
+        } else if (msg_type == XS_TRANSACTION_END) {
+                struct xenbus_transaction_holder *pos, *found = NULL;
+                list_for_each_entry(pos, &u->transactions, list) {
+                        if (pos->handle.id == u->u.msg.tx_id) {
+                                found = pos;
+                                break;
+                        }
+                }
+                /*
+                 * Userspace may end a transaction this handle never
+                 * started; there is then nothing to forget, and the
+                 * reply is still passed back to the caller.
+                 */
+                if (found) {
+                        list_del(&found->list);
+                        kfree(found);
+                }
+        }
+        mutex_lock(&u->reply_mutex);
+        rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
+        if (!rc)
+                rc = queue_reply(&staging_q, reply, u->u.msg.len);
+        if (!rc) {
+                list_splice_tail(&staging_q, &u->read_buffers);
+                wake_up(&u->read_waitq);
+        } else {
+                queue_cleanup(&staging_q);
+        }
+        mutex_unlock(&u->reply_mutex);
+        kfree(reply);
+out:
+        return rc;
+}
+/*
+ * Handle a buffered XS_WATCH or XS_UNWATCH request.
+ *
+ * The payload must consist of two NUL-terminated strings, path then
+ * token, both lying entirely inside the u->u.msg.len payload bytes.
+ * XS_WATCH registers a new watch and records it on u->watches;
+ * XS_UNWATCH removes the first recorded watch with the same path and
+ * token (no match is not an error).  A synthetic "OK" reply is then
+ * queued for the reader.
+ *
+ * Returns 0 on success, -EILSEQ for a malformed payload, -ENOMEM on
+ * allocation failure, or the error from register_xenbus_watch().
+ */
+static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+{
+        struct watch_adapter *watch, *tmp_watch;
+        char *path, *token;
+        int err, rc;
+        path = u->u.buffer + sizeof(u->u.msg);
+        token = memchr(path, 0, u->u.msg.len);
+        if (token == NULL) {
+                rc = -EILSEQ;
+                goto out;
+        }
+        token++;
+        /*
+         * The token must be terminated inside the payload as well;
+         * otherwise the string functions below would run on into stale
+         * bytes of the request buffer or beyond it.
+         */
+        if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) {
+                rc = -EILSEQ;
+                goto out;
+        }
+        if (msg_type == XS_WATCH) {
+                watch = alloc_watch_adapter(path, token);
+                if (watch == NULL) {
+                        rc = -ENOMEM;
+                        goto out;
+                }
+                watch->watch.callback = watch_fired;
+                watch->dev_data = u;
+                err = register_xenbus_watch(&watch->watch);
+                if (err) {
+                        free_watch_adapter(watch);
+                        rc = err;
+                        goto out;
+                }
+                list_add(&watch->list, &u->watches);
+        } else {
+                list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+                        if (!strcmp(watch->token, token) &&
+                            !strcmp(watch->watch.node, path)) {
+                                unregister_xenbus_watch(&watch->watch);
+                                list_del(&watch->list);
+                                free_watch_adapter(watch);
+                                break;
+                        }
+                }
+        }
+        /* Success.  Synthesize a reply to say all is OK. */
+        {
+                struct {
+                        struct xsd_sockmsg hdr;
+                        char body[3];
+                } __packed reply = {
+                        {
+                                .type = msg_type,
+                                .len = sizeof(reply.body)
+                        },
+                        "OK"
+                };
+                mutex_lock(&u->reply_mutex);
+                rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+                wake_up(&u->read_waitq);
+                mutex_unlock(&u->reply_mutex);
+        }
+out:
+        return rc;
+}
+/*
+ * Accept raw xenbus request bytes from userspace.
+ *
+ * We're expecting usermode to be writing properly formed xenbus
+ * messages.  If they write an incomplete message we buffer it up.
+ * Once it is complete, we act on it: XS_WATCH/XS_UNWATCH are handled
+ * locally, everything else is sent to xenstore.
+ *
+ * Returns the number of bytes accepted, -EINVAL if the write does not
+ * fit in the request buffer, -EFAULT if user memory cannot be read,
+ * -E2BIG if the header announces a payload larger than the buffer, or
+ * the error returned by the message handler.
+ */
+static ssize_t xenbus_file_write(struct file *filp,
+                                const char __user *ubuf,
+                                size_t len, loff_t *ppos)
+{
+        struct xenbus_file_priv *u = filp->private_data;
+        uint32_t msg_type;
+        int rc = len;
+        int ret;
+        /*
+         * Make sure concurrent writers can't stomp all over each
+         * other's messages and make a mess of our partial message
+         * buffer.  We don't make any attempt to stop multiple
+         * writers from making a mess of each other's incomplete
+         * messages; we're just trying to guarantee our own internal
+         * consistency and make sure that single writes are handled
+         * atomically.
+         */
+        mutex_lock(&u->msgbuffer_mutex);
+        /* Get this out of the way early to avoid confusion */
+        if (len == 0)
+                goto out;
+        /*
+         * Can't write a xenbus message larger than we can buffer.
+         * u->len never exceeds the buffer size, so the subtraction
+         * cannot wrap.
+         */
+        if (len > sizeof(u->u.buffer) - u->len) {
+                /* On error, dump existing buffer */
+                u->len = 0;
+                rc = -EINVAL;
+                goto out;
+        }
+        if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) {
+                rc = -EFAULT;
+                goto out;
+        }
+        u->len += len;
+        /* Return if we haven't got a full message yet */
+        if (u->len < sizeof(u->u.msg))
+                goto out;       /* not even the header yet */
+        /*
+         * If we're expecting a message that's larger than we can
+         * possibly send, dump what we have and return an error.  The
+         * payload length is compared against the remaining space so
+         * that a huge user-supplied length cannot wrap the sum where
+         * size_t is 32 bits wide.
+         */
+        if (u->u.msg.len > sizeof(u->u.buffer) - sizeof(u->u.msg)) {
+                rc = -E2BIG;
+                u->len = 0;
+                goto out;
+        }
+        if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+                goto out;       /* incomplete data portion */
+        /*
+         * OK, now we have a complete message.  Do something with it.
+         */
+        msg_type = u->u.msg.type;
+        switch (msg_type) {
+        case XS_WATCH:
+        case XS_UNWATCH:
+                /* (Un)Ask for some path to be watched for changes */
+                ret = xenbus_write_watch(msg_type, u);
+                break;
+        default:
+                /* Send out a transaction */
+                ret = xenbus_write_transaction(msg_type, u);
+                break;
+        }
+        if (ret != 0)
+                rc = ret;
+        /* Buffered message consumed */
+        u->len = 0;
+ out:
+        mutex_unlock(&u->msgbuffer_mutex);
+        return rc;
+}
+/*
+ * Open handler: allocate and initialise the per-file state and attach
+ * it to filp->private_data.  Returns -ENOENT when xen_store_evtchn is
+ * zero and -ENOMEM if the state cannot be allocated.
+ */
+static int xenbus_file_open(struct inode *inode, struct file *filp)
+{
+        struct xenbus_file_priv *priv;
+        if (xen_store_evtchn == 0)
+                return -ENOENT;
+        nonseekable_open(inode, filp);
+        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+        if (!priv)
+                return -ENOMEM;
+        mutex_init(&priv->msgbuffer_mutex);
+        mutex_init(&priv->reply_mutex);
+        init_waitqueue_head(&priv->read_waitq);
+        INIT_LIST_HEAD(&priv->read_buffers);
+        INIT_LIST_HEAD(&priv->watches);
+        INIT_LIST_HEAD(&priv->transactions);
+        filp->private_data = priv;
+        return 0;
+}
+/*
+ * Release handler: end every transaction still open on this file,
+ * unregister its watches, drop unread replies and free the per-file
+ * state.
+ *
+ * No need for locking here because there are no other users, by
+ * definition.
+ */
+static int xenbus_file_release(struct inode *inode, struct file *filp)
+{
+        struct xenbus_file_priv *priv = filp->private_data;
+        while (!list_empty(&priv->transactions)) {
+                struct xenbus_transaction_holder *trans;
+                trans = list_first_entry(&priv->transactions,
+                                         struct xenbus_transaction_holder,
+                                         list);
+                xenbus_transaction_end(trans->handle, 1);
+                list_del(&trans->list);
+                kfree(trans);
+        }
+        while (!list_empty(&priv->watches)) {
+                struct watch_adapter *adap;
+                adap = list_first_entry(&priv->watches,
+                                        struct watch_adapter, list);
+                unregister_xenbus_watch(&adap->watch);
+                list_del(&adap->list);
+                free_watch_adapter(adap);
+        }
+        /* Unlinks and frees every queued read_buffer. */
+        queue_cleanup(&priv->read_buffers);
+        kfree(priv);
+        return 0;
+}
+/*
+ * Poll handler: register on the read wait queue and report the file
+ * as readable whenever at least one reply buffer is queued.
+ */
+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+        struct xenbus_file_priv *priv = file->private_data;
+        unsigned int mask = 0;
+        poll_wait(file, &priv->read_waitq, wait);
+        if (!list_empty(&priv->read_buffers))
+                mask = POLLIN | POLLRDNORM;
+        return mask;
+}
+const struct file_operations xenbus_file_ops = {        /* file operations exported for the xenbus node */
+        .read = xenbus_file_read,       /* drains queued replies and watch events */
+        .write = xenbus_file_write,     /* buffers and dispatches xenbus requests */
+        .open = xenbus_file_open,       /* allocates the per-file state */
+        .release = xenbus_file_release, /* ends transactions, removes watches, frees state */
+        .poll = xenbus_file_poll,       /* readable when replies are queued */
+        .llseek = no_llseek,
+};