author     Rusty Russell <rusty@rustcorp.com.au>   2007-10-21 21:03:40 -0400
committer  Rusty Russell <rusty@rustcorp.com.au>   2007-10-23 01:49:55 -0400
commit     0a8a69dd77ddbd4513b21363021ecde7e1025502
tree       ed6d8f0756835390b4c0d9a172422f2e42a65523
parent     b01d9f2863349b0e041b90c3c86a998ee0fed2b0

Virtio helper routines for a descriptor ringbuffer implementation
These helper routines supply most of the virtqueue_ops for hypervisors
which want to use a ring for virtio. Unlike the previous lguest
implementation:
1) The rings are variable sized (2^n-1 elements).
2) They have an unfortunate limit of 65535 bytes per sg element.
3) The page numbers are always 64 bit (PAE anyone?)
4) They no longer place used[] on a separate page, just a separate
cacheline.
5) We do a modulo on a variable. We could be tricky if we cared.
6) Interrupts and notifies are suppressed using flags within the rings.
Users need only get the ring pages and provide a notify hook (KVM
wants the guest to allocate the rings, lguest does it sanely).
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Dor Laor <dor.laor@qumranet.com>
---
 arch/x86/lguest/Kconfig      |   1
 drivers/virtio/Kconfig       |   5
 drivers/virtio/Makefile      |   1
 drivers/virtio/virtio_ring.c | 313
 include/linux/virtio_ring.h  | 119
 5 files changed, 439 insertions, 0 deletions
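
As the message above notes, a user of these helpers only has to obtain the ring pages and supply a notify hook. The following is a rough, hedged sketch of that calling convention against the API added by this patch; the my_notify()/my_callback() hooks, my_find_vq() and the page allocation are hypothetical and not part of the patch:

/* Hypothetical transport glue, sketched against the API added below. */
#include <linux/gfp.h>
#include <linux/virtio.h>
#include <linux/virtio_ring.h>

static void my_notify(struct virtqueue *vq)
{
	/* Tell the other side that new buffers are available,
	 * e.g. via a hypercall or an I/O write. */
}

static bool my_callback(struct virtqueue *vq)
{
	/* Consume buffers with vq->vq_ops->get_buf(); returning true keeps
	 * interrupts enabled for this queue. */
	return true;
}

static struct virtqueue *my_find_vq(struct virtio_device *vdev, unsigned int num)
{
	/* The transport supplies the ring memory; vring_size() says how much. */
	void *pages = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					       get_order(vring_size(num)));
	if (!pages)
		return NULL;

	/* vring_interrupt() would be wired into the transport's IRQ handler. */
	return vring_new_virtqueue(num, vdev, pages, my_notify, my_callback);
}
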
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 0fabf87db998..44dccfd845f8 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
 	bool "Lguest guest support"
 	select PARAVIRT
 	depends on !X86_PAE
+	select VIRTIO_RING
 	help
 	  Lguest is a tiny in-kernel hypervisor. Selecting this will
 	  allow your kernel to boot under lguest. This option will increase
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index bce84b56a659..9e33fc4da875 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -1,3 +1,8 @@
 # Virtio always gets selected by whoever wants it.
 config VIRTIO
 	bool
+
+# Similarly the virtio ring implementation.
+config VIRTIO_RING
+	bool
+	depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index af0d57dad3eb..f70e40971dd9 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_VIRTIO) += virtio.o
+obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
new file mode 100644
index 000000000000..0e4baca21b8f
--- /dev/null
+++ b/drivers/virtio/virtio_ring.c
@@ -0,0 +1,313 @@
+/* Virtio ring implementation.
+ *
+ *  Copyright 2007 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ */
+#include <linux/virtio.h>
+#include <linux/virtio_ring.h>
+#include <linux/device.h>
+
+#ifdef DEBUG
+/* For development, we want to crash whenever the ring is screwed. */
+#define BAD_RING(vq, fmt...) \
+	do { dev_err(&vq->vq.vdev->dev, fmt); BUG(); } while(0)
+#define START_USE(vq) \
+	do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0)
+#define END_USE(vq) \
+	do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0)
+#else
+#define BAD_RING(vq, fmt...) \
+	do { dev_err(&vq->vq.vdev->dev, fmt); (vq)->broken = true; } while(0)
+#define START_USE(vq)
+#define END_USE(vq)
+#endif
+
+struct vring_virtqueue
+{
+	struct virtqueue vq;
+
+	/* Actual memory layout for this queue */
+	struct vring vring;
+
+	/* Other side has made a mess, don't try any more. */
+	bool broken;
+
+	/* Number of free buffers */
+	unsigned int num_free;
+	/* Head of free buffer list. */
+	unsigned int free_head;
+	/* Number we've added since last sync. */
+	unsigned int num_added;
+
+	/* Last used index we've seen. */
+	unsigned int last_used_idx;
+
+	/* How to notify other side. FIXME: commonalize hcalls! */
+	void (*notify)(struct virtqueue *vq);
+
+#ifdef DEBUG
+	/* They're supposed to lock for us. */
+	unsigned int in_use;
+#endif
+
+	/* Tokens for callbacks. */
+	void *data[];
+};
+
+#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+
+static int vring_add_buf(struct virtqueue *_vq,
+			 struct scatterlist sg[],
+			 unsigned int out,
+			 unsigned int in,
+			 void *data)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i, avail, head, uninitialized_var(prev);
+
+	BUG_ON(data == NULL);
+	BUG_ON(out + in > vq->vring.num);
+	BUG_ON(out + in == 0);
+
+	START_USE(vq);
+
+	if (vq->num_free < out + in) {
+		pr_debug("Can't add buf len %i - avail = %i\n",
+			 out + in, vq->num_free);
+		END_USE(vq);
+		return -ENOSPC;
+	}
+
+	/* We're about to use some buffers from the free list. */
+	vq->num_free -= out + in;
+
+	head = vq->free_head;
+	for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
+		vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
+		vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
+			+ sg->offset;
+		vq->vring.desc[i].len = sg->length;
+		prev = i;
+		sg++;
+	}
+	for (; in; i = vq->vring.desc[i].next, in--) {
+		vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+		vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
+			+ sg->offset;
+		vq->vring.desc[i].len = sg->length;
+		prev = i;
+		sg++;
+	}
+	/* Last one doesn't continue. */
+	vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
+
+	/* Update free pointer */
+	vq->free_head = i;
+
+	/* Set token. */
+	vq->data[head] = data;
+
+	/* Put entry in available array (but don't update avail->idx until they
+	 * do sync).  FIXME: avoid modulus here? */
+	avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
+	vq->vring.avail->ring[avail] = head;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+	return 0;
+}
+
+static void vring_kick(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	START_USE(vq);
+	/* Descriptors and available array need to be set before we expose the
+	 * new available array entries. */
+	wmb();
+
+	vq->vring.avail->idx += vq->num_added;
+	vq->num_added = 0;
+
+	/* Need to update avail index before checking if we should notify */
+	mb();
+
+	if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
+		/* Prod other side to tell it about changes. */
+		vq->notify(&vq->vq);
+
+	END_USE(vq);
+}
+
+static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
+{
+	unsigned int i;
+
+	/* Clear data ptr. */
+	vq->data[head] = NULL;
+
+	/* Put back on free list: find end */
+	i = head;
+	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
+		i = vq->vring.desc[i].next;
+		vq->num_free++;
+	}
+
+	vq->vring.desc[i].next = vq->free_head;
+	vq->free_head = head;
+	/* Plus final descriptor */
+	vq->num_free++;
+}
+
+/* FIXME: We need to tell other side about removal, to synchronize. */
+static void vring_shutdown(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i;
+
+	for (i = 0; i < vq->vring.num; i++)
+		detach_buf(vq, i);
+}
+
+static inline bool more_used(const struct vring_virtqueue *vq)
+{
+	return vq->last_used_idx != vq->vring.used->idx;
+}
+
+static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+
+	START_USE(vq);
+
+	if (!more_used(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
+	*len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
+
+	if (unlikely(i >= vq->vring.num)) {
+		BAD_RING(vq, "id %u out of range\n", i);
+		return NULL;
+	}
+	if (unlikely(!vq->data[i])) {
+		BAD_RING(vq, "id %u is not a head!\n", i);
+		return NULL;
+	}
+
+	/* detach_buf clears data, so grab it now. */
+	ret = vq->data[i];
+	detach_buf(vq, i);
+	vq->last_used_idx++;
+	END_USE(vq);
+	return ret;
+}
+
+static bool vring_restart(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	START_USE(vq);
+	BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT));
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+	mb();
+	if (unlikely(more_used(vq))) {
+		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
+}
+
+irqreturn_t vring_interrupt(int irq, void *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (!more_used(vq)) {
+		pr_debug("virtqueue interrupt with no work for %p\n", vq);
+		return IRQ_NONE;
+	}
+
+	if (unlikely(vq->broken))
+		return IRQ_HANDLED;
+
+	pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
+	if (vq->vq.callback && !vq->vq.callback(&vq->vq))
+		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+
+	return IRQ_HANDLED;
+}
+
+static struct virtqueue_ops vring_vq_ops = {
+	.add_buf = vring_add_buf,
+	.get_buf = vring_get_buf,
+	.kick = vring_kick,
+	.restart = vring_restart,
+	.shutdown = vring_shutdown,
+};
+
+struct virtqueue *vring_new_virtqueue(unsigned int num,
+				      struct virtio_device *vdev,
+				      void *pages,
+				      void (*notify)(struct virtqueue *),
+				      bool (*callback)(struct virtqueue *))
+{
+	struct vring_virtqueue *vq;
+	unsigned int i;
+
+	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+	if (!vq)
+		return NULL;
+
+	vring_init(&vq->vring, num, pages);
+	vq->vq.callback = callback;
+	vq->vq.vdev = vdev;
+	vq->vq.vq_ops = &vring_vq_ops;
+	vq->notify = notify;
+	vq->broken = false;
+	vq->last_used_idx = 0;
+	vq->num_added = 0;
+#ifdef DEBUG
+	vq->in_use = false;
+#endif
+
+	/* No callback?  Tell other side not to bother us. */
+	if (!callback)
+		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+
+	/* Put everything in free lists. */
+	vq->num_free = num;
+	vq->free_head = 0;
+	for (i = 0; i < num-1; i++)
+		vq->vring.desc[i].next = i+1;
+
+	return &vq->vq;
+}
+
+void vring_del_virtqueue(struct virtqueue *vq)
+{
+	kfree(to_vvq(vq));
+}
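
For context, a virtio driver reaches the ops table registered above through struct virtqueue. A hedged sketch of submitting one read-only buffer and reaping completions follows; example_submit_and_reap(), the buffer and its lifetime handling are illustrative only and not part of the patch:

/* Hypothetical driver-side usage of vring_vq_ops via struct virtqueue. */
#include <linux/scatterlist.h>
#include <linux/virtio.h>

static void example_submit_and_reap(struct virtqueue *vq, void *buf,
				    unsigned int len)
{
	struct scatterlist sg[1];
	unsigned int used_len;
	void *token;

	sg_init_one(sg, buf, len);

	/* One "out" descriptor (the other side reads buf), no "in"
	 * descriptors; buf doubles as the token returned by get_buf. */
	if (vq->vq_ops->add_buf(vq, sg, 1, 0, buf) == 0)
		vq->vq_ops->kick(vq);

	/* Later, typically from the virtqueue callback: reap used buffers. */
	while ((token = vq->vq_ops->get_buf(vq, &used_len)) != NULL)
		pr_debug("buffer %p used, %u bytes consumed\n",
			 token, used_len);
}
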
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
new file mode 100644
index 000000000000..ac69e7bb5a14
--- /dev/null
+++ b/include/linux/virtio_ring.h
@@ -0,0 +1,119 @@
+#ifndef _LINUX_VIRTIO_RING_H
+#define _LINUX_VIRTIO_RING_H
+/* An interface for efficient virtio implementation, currently for use by KVM
+ * and lguest, but hopefully others soon.  Do NOT change this since it will
+ * break existing servers and clients.
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Copyright Rusty Russell IBM Corporation 2007. */
+#include <linux/types.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT	1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE	2
+
+/* This means don't notify other side when buffer added. */
+#define VRING_USED_F_NO_NOTIFY	1
+/* This means don't interrupt guest when buffer consumed. */
+#define VRING_AVAIL_F_NO_INTERRUPT	1
+
+/* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
+struct vring_desc
+{
+	/* Address (guest-physical). */
+	__u64 addr;
+	/* Length. */
+	__u32 len;
+	/* The flags as indicated above. */
+	__u16 flags;
+	/* We chain unused descriptors via this, too */
+	__u16 next;
+};
+
+struct vring_avail
+{
+	__u16 flags;
+	__u16 idx;
+	__u16 ring[];
+};
+
+/* u32 is used here for ids for padding reasons. */
+struct vring_used_elem
+{
+	/* Index of start of used descriptor chain. */
+	__u32 id;
+	/* Total length of the descriptor chain which was used (written to) */
+	__u32 len;
+};
+
+struct vring_used
+{
+	__u16 flags;
+	__u16 idx;
+	struct vring_used_elem ring[];
+};
+
+struct vring {
+	unsigned int num;
+
+	struct vring_desc *desc;
+
+	struct vring_avail *avail;
+
+	struct vring_used *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which looks
+ * like this.  The used fields will be aligned to a "num+1" boundary.
+ *
+ * struct vring
+ * {
+ *	// The actual descriptors (16 bytes each)
+ *	struct vring_desc desc[num];
+ *
+ *	// A ring of available descriptor heads with free-running index.
+ *	__u16 avail_flags;
+ *	__u16 avail_idx;
+ *	__u16 available[num];
+ *
+ *	// Padding so a correctly-chosen num value will cache-align used_idx.
+ *	char pad[sizeof(struct vring_desc) - sizeof(avail_flags)];
+ *
+ *	// A ring of used descriptor heads with free-running index.
+ *	__u16 used_flags;
+ *	__u16 used_idx;
+ *	struct vring_used_elem used[num];
+ * };
+ */
+static inline void vring_init(struct vring *vr, unsigned int num, void *p)
+{
+	vr->num = num;
+	vr->desc = p;
+	vr->avail = p + num*sizeof(struct vring_desc);
+	vr->used = p + (num+1)*(sizeof(struct vring_desc) + sizeof(__u16));
+}
+
+static inline unsigned vring_size(unsigned int num)
+{
+	return (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16))
+		+ sizeof(__u32) + num * sizeof(struct vring_used_elem);
+}
+
+#ifdef __KERNEL__
+#include <linux/irqreturn.h>
+struct virtio_device;
+struct virtqueue;
+
+struct virtqueue *vring_new_virtqueue(unsigned int num,
+				      struct virtio_device *vdev,
+				      void *pages,
+				      void (*notify)(struct virtqueue *vq),
+				      bool (*callback)(struct virtqueue *vq));
+void vring_del_virtqueue(struct virtqueue *vq);
+
+irqreturn_t vring_interrupt(int irq, void *_vq);
+#endif /* __KERNEL__ */
+#endif /* _LINUX_VIRTIO_RING_H */
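
To make the layout comment and vring_size() above concrete, here is a small worked example for a ring of 128 entries (plain userspace C, not part of the patch), using the sizes documented in the header:

/* Worked layout example for num = 128 (descriptor = 16 bytes, u16 = 2, u32 = 4). */
#include <stdio.h>

int main(void)
{
	unsigned long num   = 128;
	unsigned long desc  = num * 16;		/* vring_desc[num]         = 2048 */
	unsigned long avail = 2 + 2 + 2 * num;	/* flags + idx + ring[num] =  260 */
	unsigned long pad   = 16 - 2;		/* cache-aligns used_idx   =   14 */
	unsigned long used  = 2 + 2 + 8 * num;	/* flags + idx + elem[num] = 1028 */

	/* vring_size(128) = (128 + 1) * (16 + 2) + 4 + 128 * 8 = 3350 bytes. */
	printf("layout total = %lu\n", desc + avail + pad + used);	/* 3350 */
	return 0;
}

The two numbers agree because the (num+1)*(sizeof(struct vring_desc) + sizeof(__u16)) term in vring_size() covers the descriptors, the available ring and the padding, while the remaining terms cover the used ring.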