aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2007-10-21 21:03:40 -0400
committerRusty Russell <rusty@rustcorp.com.au>2007-10-23 01:49:55 -0400
commit0a8a69dd77ddbd4513b21363021ecde7e1025502 (patch)
treeed6d8f0756835390b4c0d9a172422f2e42a65523
parentb01d9f2863349b0e041b90c3c86a998ee0fed2b0 (diff)
Virtio helper routines for a descriptor ringbuffer implementation
These helper routines supply most of the virtqueue_ops for hypervisors which want to use a ring for virtio. Unlike the previous lguest implementation: 1) The rings are variable sized (2^n-1 elements). 2) They have an unfortunate limit of 65535 bytes per sg element. 3) The page numbers are always 64 bit (PAE anyone?) 4) They no longer place used[] on a separate page, just a separate cacheline. 5) We do a modulo on a variable. We could be tricky if we cared. 6) Interrupts and notifies are suppressed using flags within the rings. Users need only get the ring pages and provide a notify hook (KVM wants the guest to allocate the rings, lguest does it sanely). Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Cc: Dor Laor <dor.laor@qumranet.com>
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--drivers/virtio/Kconfig5
-rw-r--r--drivers/virtio/Makefile1
-rw-r--r--drivers/virtio/virtio_ring.c313
-rw-r--r--include/linux/virtio_ring.h119
5 files changed, 439 insertions, 0 deletions
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 0fabf87db998..44dccfd845f8 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on !X86_PAE 4 depends on !X86_PAE
5 select VIRTIO_RING
5 help 6 help
6 Lguest is a tiny in-kernel hypervisor. Selecting this will 7 Lguest is a tiny in-kernel hypervisor. Selecting this will
7 allow your kernel to boot under lguest. This option will increase 8 allow your kernel to boot under lguest. This option will increase
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index bce84b56a659..9e33fc4da875 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -1,3 +1,8 @@
1# Virtio always gets selected by whoever wants it. 1# Virtio always gets selected by whoever wants it.
2config VIRTIO 2config VIRTIO
3 bool 3 bool
4
5# Similarly the virtio ring implementation.
6config VIRTIO_RING
7 bool
8 depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index af0d57dad3eb..f70e40971dd9 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1 +1,2 @@
1obj-$(CONFIG_VIRTIO) += virtio.o 1obj-$(CONFIG_VIRTIO) += virtio.o
2obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
new file mode 100644
index 000000000000..0e4baca21b8f
--- /dev/null
+++ b/drivers/virtio/virtio_ring.c
@@ -0,0 +1,313 @@
1/* Virtio ring implementation.
2 *
3 * Copyright 2007 Rusty Russell IBM Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include <linux/virtio.h>
20#include <linux/virtio_ring.h>
21#include <linux/device.h>
22
/*
 * Ring sanity helpers.
 *
 * The macro parameter is named _vq (not vq) so that it is not substituted
 * into the ".vq" member access inside the expansion; that lets callers pass
 * any expression, not just a variable that happens to be called "vq".
 */
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt...) \
	do { dev_err(&(_vq)->vq.vdev->dev, fmt); BUG(); } while (0)
/* Record the source line that entered the queue; panic if it is already
 * marked in use. */
#define START_USE(_vq) \
	do { \
		if ((_vq)->in_use) \
			panic("in_use = %i\n", (_vq)->in_use); \
		(_vq)->in_use = __LINE__; \
		mb(); \
	} while (0)
/* Clear the in-use marker; it must have been set by START_USE. */
#define END_USE(_vq) \
	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; mb(); } while (0)
#else
/* In production, log the problem and mark the queue broken. */
#define BAD_RING(_vq, fmt...) \
	do { dev_err(&(_vq)->vq.vdev->dev, fmt); (_vq)->broken = true; } while (0)
/* No-ops, but still proper statements so "START_USE(vq);" is safe anywhere. */
#define START_USE(_vq) do { } while (0)
#define END_USE(_vq) do { } while (0)
#endif
37
38struct vring_virtqueue
39{
40 struct virtqueue vq;
41
42 /* Actual memory layout for this queue */
43 struct vring vring;
44
45 /* Other side has made a mess, don't try any more. */
46 bool broken;
47
48 /* Number of free buffers */
49 unsigned int num_free;
50 /* Head of free buffer list. */
51 unsigned int free_head;
52 /* Number we've added since last sync. */
53 unsigned int num_added;
54
55 /* Last used index we've seen. */
56 unsigned int last_used_idx;
57
58 /* How to notify other side. FIXME: commonalize hcalls! */
59 void (*notify)(struct virtqueue *vq);
60
61#ifdef DEBUG
62 /* They're supposed to lock for us. */
63 unsigned int in_use;
64#endif
65
66 /* Tokens for callbacks. */
67 void *data[];
68};
69
70#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
71
72static int vring_add_buf(struct virtqueue *_vq,
73 struct scatterlist sg[],
74 unsigned int out,
75 unsigned int in,
76 void *data)
77{
78 struct vring_virtqueue *vq = to_vvq(_vq);
79 unsigned int i, avail, head, uninitialized_var(prev);
80
81 BUG_ON(data == NULL);
82 BUG_ON(out + in > vq->vring.num);
83 BUG_ON(out + in == 0);
84
85 START_USE(vq);
86
87 if (vq->num_free < out + in) {
88 pr_debug("Can't add buf len %i - avail = %i\n",
89 out + in, vq->num_free);
90 END_USE(vq);
91 return -ENOSPC;
92 }
93
94 /* We're about to use some buffers from the free list. */
95 vq->num_free -= out + in;
96
97 head = vq->free_head;
98 for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
99 vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
100 vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
101 + sg->offset;
102 vq->vring.desc[i].len = sg->length;
103 prev = i;
104 sg++;
105 }
106 for (; in; i = vq->vring.desc[i].next, in--) {
107 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
108 vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
109 + sg->offset;
110 vq->vring.desc[i].len = sg->length;
111 prev = i;
112 sg++;
113 }
114 /* Last one doesn't continue. */
115 vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
116
117 /* Update free pointer */
118 vq->free_head = i;
119
120 /* Set token. */
121 vq->data[head] = data;
122
123 /* Put entry in available array (but don't update avail->idx until they
124 * do sync). FIXME: avoid modulus here? */
125 avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
126 vq->vring.avail->ring[avail] = head;
127
128 pr_debug("Added buffer head %i to %p\n", head, vq);
129 END_USE(vq);
130 return 0;
131}
132
133static void vring_kick(struct virtqueue *_vq)
134{
135 struct vring_virtqueue *vq = to_vvq(_vq);
136 START_USE(vq);
137 /* Descriptors and available array need to be set before we expose the
138 * new available array entries. */
139 wmb();
140
141 vq->vring.avail->idx += vq->num_added;
142 vq->num_added = 0;
143
144 /* Need to update avail index before checking if we should notify */
145 mb();
146
147 if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
148 /* Prod other side to tell it about changes. */
149 vq->notify(&vq->vq);
150
151 END_USE(vq);
152}
153
154static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
155{
156 unsigned int i;
157
158 /* Clear data ptr. */
159 vq->data[head] = NULL;
160
161 /* Put back on free list: find end */
162 i = head;
163 while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
164 i = vq->vring.desc[i].next;
165 vq->num_free++;
166 }
167
168 vq->vring.desc[i].next = vq->free_head;
169 vq->free_head = head;
170 /* Plus final descriptor */
171 vq->num_free++;
172}
173
174/* FIXME: We need to tell other side about removal, to synchronize. */
175static void vring_shutdown(struct virtqueue *_vq)
176{
177 struct vring_virtqueue *vq = to_vvq(_vq);
178 unsigned int i;
179
180 for (i = 0; i < vq->vring.num; i++)
181 detach_buf(vq, i);
182}
183
184static inline bool more_used(const struct vring_virtqueue *vq)
185{
186 return vq->last_used_idx != vq->vring.used->idx;
187}
188
189static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
190{
191 struct vring_virtqueue *vq = to_vvq(_vq);
192 void *ret;
193 unsigned int i;
194
195 START_USE(vq);
196
197 if (!more_used(vq)) {
198 pr_debug("No more buffers in queue\n");
199 END_USE(vq);
200 return NULL;
201 }
202
203 i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
204 *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
205
206 if (unlikely(i >= vq->vring.num)) {
207 BAD_RING(vq, "id %u out of range\n", i);
208 return NULL;
209 }
210 if (unlikely(!vq->data[i])) {
211 BAD_RING(vq, "id %u is not a head!\n", i);
212 return NULL;
213 }
214
215 /* detach_buf clears data, so grab it now. */
216 ret = vq->data[i];
217 detach_buf(vq, i);
218 vq->last_used_idx++;
219 END_USE(vq);
220 return ret;
221}
222
223static bool vring_restart(struct virtqueue *_vq)
224{
225 struct vring_virtqueue *vq = to_vvq(_vq);
226
227 START_USE(vq);
228 BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT));
229
230 /* We optimistically turn back on interrupts, then check if there was
231 * more to do. */
232 vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
233 mb();
234 if (unlikely(more_used(vq))) {
235 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
236 END_USE(vq);
237 return false;
238 }
239
240 END_USE(vq);
241 return true;
242}
243
244irqreturn_t vring_interrupt(int irq, void *_vq)
245{
246 struct vring_virtqueue *vq = to_vvq(_vq);
247
248 if (!more_used(vq)) {
249 pr_debug("virtqueue interrupt with no work for %p\n", vq);
250 return IRQ_NONE;
251 }
252
253 if (unlikely(vq->broken))
254 return IRQ_HANDLED;
255
256 pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
257 if (vq->vq.callback && !vq->vq.callback(&vq->vq))
258 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
259
260 return IRQ_HANDLED;
261}
262
263static struct virtqueue_ops vring_vq_ops = {
264 .add_buf = vring_add_buf,
265 .get_buf = vring_get_buf,
266 .kick = vring_kick,
267 .restart = vring_restart,
268 .shutdown = vring_shutdown,
269};
270
271struct virtqueue *vring_new_virtqueue(unsigned int num,
272 struct virtio_device *vdev,
273 void *pages,
274 void (*notify)(struct virtqueue *),
275 bool (*callback)(struct virtqueue *))
276{
277 struct vring_virtqueue *vq;
278 unsigned int i;
279
280 vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
281 if (!vq)
282 return NULL;
283
284 vring_init(&vq->vring, num, pages);
285 vq->vq.callback = callback;
286 vq->vq.vdev = vdev;
287 vq->vq.vq_ops = &vring_vq_ops;
288 vq->notify = notify;
289 vq->broken = false;
290 vq->last_used_idx = 0;
291 vq->num_added = 0;
292#ifdef DEBUG
293 vq->in_use = false;
294#endif
295
296 /* No callback? Tell other side not to bother us. */
297 if (!callback)
298 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
299
300 /* Put everything in free lists. */
301 vq->num_free = num;
302 vq->free_head = 0;
303 for (i = 0; i < num-1; i++)
304 vq->vring.desc[i].next = i+1;
305
306 return &vq->vq;
307}
308
/* Free a virtqueue allocated by vring_new_virtqueue(). */
void vring_del_virtqueue(struct virtqueue *vq)
{
	struct vring_virtqueue *vvq = to_vvq(vq);

	kfree(vvq);
}
313
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
new file mode 100644
index 000000000000..ac69e7bb5a14
--- /dev/null
+++ b/include/linux/virtio_ring.h
@@ -0,0 +1,119 @@
1#ifndef _LINUX_VIRTIO_RING_H
2#define _LINUX_VIRTIO_RING_H
3/* An interface for efficient virtio implementation, currently for use by KVM
4 * and lguest, but hopefully others soon. Do NOT change this since it will
5 * break existing servers and clients.
6 *
7 * This header is BSD licensed so anyone can use the definitions to implement
8 * compatible drivers/servers.
9 *
10 * Copyright Rusty Russell IBM Corporation 2007. */
11#include <linux/types.h>
12
13/* This marks a buffer as continuing via the next field. */
14#define VRING_DESC_F_NEXT 1
15/* This marks a buffer as write-only (otherwise read-only). */
16#define VRING_DESC_F_WRITE 2
17
18/* This means don't notify other side when buffer added. */
19#define VRING_USED_F_NO_NOTIFY 1
20/* This means don't interrupt guest when buffer consumed. */
21#define VRING_AVAIL_F_NO_INTERRUPT 1
22
23/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
24struct vring_desc
25{
26 /* Address (guest-physical). */
27 __u64 addr;
28 /* Length. */
29 __u32 len;
30 /* The flags as indicated above. */
31 __u16 flags;
32 /* We chain unused descriptors via this, too */
33 __u16 next;
34};
35
36struct vring_avail
37{
38 __u16 flags;
39 __u16 idx;
40 __u16 ring[];
41};
42
43/* u32 is used here for ids for padding reasons. */
44struct vring_used_elem
45{
46 /* Index of start of used descriptor chain. */
47 __u32 id;
48 /* Total length of the descriptor chain which was used (written to) */
49 __u32 len;
50};
51
52struct vring_used
53{
54 __u16 flags;
55 __u16 idx;
56 struct vring_used_elem ring[];
57};
58
/* Handle onto a ring: its size plus pointers to its three parts, as set up
 * by vring_init(). */
struct vring {
	/* Number of descriptors. */
	unsigned int num;

	/* Descriptor table. */
	struct vring_desc *desc;

	/* Available ring. */
	struct vring_avail *avail;

	/* Used ring. */
	struct vring_used *used;
};
68
69/* The standard layout for the ring is a continuous chunk of memory which looks
70 * like this. The used fields will be aligned to a "num+1" boundary.
71 *
72 * struct vring
73 * {
74 * // The actual descriptors (16 bytes each)
75 * struct vring_desc desc[num];
76 *
77 * // A ring of available descriptor heads with free-running index.
78 * __u16 avail_flags;
79 * __u16 avail_idx;
80 * __u16 available[num];
81 *
82 * // Padding so a correctly-chosen num value will cache-align used_idx.
83 * char pad[sizeof(struct vring_desc) - sizeof(avail_flags)];
84 *
85 * // A ring of used descriptor heads with free-running index.
86 * __u16 used_flags;
87 * __u16 used_idx;
88 * struct vring_used_elem used[num];
89 * };
90 */
91static inline void vring_init(struct vring *vr, unsigned int num, void *p)
92{
93 vr->num = num;
94 vr->desc = p;
95 vr->avail = p + num*sizeof(struct vring);
96 vr->used = p + (num+1)*(sizeof(struct vring) + sizeof(__u16));
97}
98
99static inline unsigned vring_size(unsigned int num)
100{
101 return (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16))
102 + sizeof(__u32) + num * sizeof(struct vring_used_elem);
103}
104
105#ifdef __KERNEL__
106#include <linux/irqreturn.h>
107struct virtio_device;
108struct virtqueue;
109
110struct virtqueue *vring_new_virtqueue(unsigned int num,
111 struct virtio_device *vdev,
112 void *pages,
113 void (*notify)(struct virtqueue *vq),
114 bool (*callback)(struct virtqueue *vq));
115void vring_del_virtqueue(struct virtqueue *vq);
116
117irqreturn_t vring_interrupt(int irq, void *_vq);
118#endif /* __KERNEL__ */
119#endif /* _LINUX_VIRTIO_RING_H */