-rw-r--r--  Documentation/ia64/paravirt_ops.txt     |  137
-rw-r--r--  Documentation/virtual/00-INDEX          |    3
-rw-r--r--  Documentation/virtual/paravirt_ops.txt  |   32
-rw-r--r--  MAINTAINERS                             |    2
-rw-r--r--  arch/mn10300/unit-asb2305/pci-iomap.c   |   35
-rw-r--r--  arch/s390/include/asm/pci_io.h          |    1
-rw-r--r--  arch/s390/pci/pci.c                     |   34
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h     |    1
-rw-r--r--  arch/x86/lguest/boot.c                  |  173
-rw-r--r--  drivers/block/virtio_blk.c              |   12
-rw-r--r--  drivers/char/virtio_console.c           |    5
-rw-r--r--  drivers/lguest/Makefile                 |    3
-rw-r--r--  drivers/lguest/core.c                   |   29
-rw-r--r--  drivers/lguest/hypercalls.c             |    7
-rw-r--r--  drivers/lguest/lg.h                     |   26
-rw-r--r--  drivers/lguest/lguest_device.c          |  540
-rw-r--r--  drivers/lguest/lguest_user.c            |  221
-rw-r--r--  drivers/lguest/page_tables.c            |   75
-rw-r--r--  drivers/lguest/x86/core.c               |  198
-rw-r--r--  drivers/net/virtio_net.c                |    6
-rw-r--r--  drivers/scsi/virtio_scsi.c              |    6
-rw-r--r--  drivers/virtio/Kconfig                  |   24
-rw-r--r--  drivers/virtio/Makefile                 |    3
-rw-r--r--  drivers/virtio/virtio.c                 |    5
-rw-r--r--  drivers/virtio/virtio_balloon.c         |    9
-rw-r--r--  drivers/virtio/virtio_mmio.c            |  131
-rw-r--r--  drivers/virtio/virtio_pci_common.c      |   94
-rw-r--r--  drivers/virtio/virtio_pci_common.h      |   43
-rw-r--r--  drivers/virtio/virtio_pci_legacy.c      |   76
-rw-r--r--  drivers/virtio/virtio_pci_modern.c      |  695
-rw-r--r--  drivers/virtio/virtio_ring.c            |    9
-rw-r--r--  include/asm-generic/pci_iomap.h         |   10
-rw-r--r--  include/linux/lguest_launcher.h         |   61
-rw-r--r--  include/linux/virtio_mmio.h             |   44
-rw-r--r--  include/uapi/linux/virtio_balloon.h     |    3
-rw-r--r--  include/uapi/linux/virtio_blk.h         |   17
-rw-r--r--  include/uapi/linux/virtio_config.h      |    2
-rw-r--r--  include/uapi/linux/virtio_net.h         |   42
-rw-r--r--  include/uapi/linux/virtio_pci.h         |   93
-rw-r--r--  lib/pci_iomap.c                         |   35
-rw-r--r--  net/9p/trans_virtio.c                   |    6
-rw-r--r--  tools/lguest/Makefile                   |    8
-rw-r--r--  tools/lguest/lguest.c                   | 2016
43 files changed, 3367 insertions(+), 1605 deletions(-)
diff --git a/Documentation/ia64/paravirt_ops.txt b/Documentation/ia64/paravirt_ops.txt
deleted file mode 100644
index 39ded02ec33f..000000000000
--- a/Documentation/ia64/paravirt_ops.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-Paravirt_ops on IA64
-====================
-          21 May 2008, Isaku Yamahata <yamahata@valinux.co.jp>
-
-
-Introduction
-------------
-The aim of this documentation is to help with maintainability and/or to
-encourage people to use paravirt_ops/IA64.
-
-paravirt_ops (pv_ops in short) is a way for virtualization support of
-Linux kernel on x86. Several ways for virtualization support were
-proposed, paravirt_ops is the winner.
-On the other hand, now there are also several IA64 virtualization
-technologies like kvm/IA64, xen/IA64 and many other academic IA64
-hypervisors so that it is good to add generic virtualization
-infrastructure on Linux/IA64.
-
-
-What is paravirt_ops?
----------------------
-It has been developed on x86 as virtualization support via API, not ABI.
-It allows each hypervisor to override operations which are important for
-hypervisors at API level. And it allows a single kernel binary to run on
-all supported execution environments including native machine.
-Essentially paravirt_ops is a set of function pointers which represent
-operations corresponding to low level sensitive instructions and high
-level functionalities in various area. But one significant difference
-from usual function pointer table is that it allows optimization with
-binary patch. It is because some of these operations are very
-performance sensitive and indirect call overhead is not negligible.
-With binary patch, indirect C function call can be transformed into
-direct C function call or in-place execution to eliminate the overhead.
-
-Thus, operations of paravirt_ops are classified into three categories.
-- simple indirect call
-  These operations correspond to high level functionality so that the
-  overhead of indirect call isn't very important.
-
-- indirect call which allows optimization with binary patch
-  Usually these operations correspond to low level instructions. They
-  are called frequently and performance critical. So the overhead is
-  very important.
-
-- a set of macros for hand written assembly code
-  Hand written assembly codes (.S files) also need paravirtualization
-  because they include sensitive instructions or some of code paths in
-  them are very performance critical.
-
-
-The relation to the IA64 machine vector
----------------------------------------
-Linux/IA64 has the IA64 machine vector functionality which allows the
-kernel to switch implementations (e.g. initialization, ipi, dma api...)
-depending on executing platform.
-We can replace some implementations very easily defining a new machine
-vector. Thus another approach for virtualization support would be
-enhancing the machine vector functionality.
-But paravirt_ops approach was taken because
-- virtualization support needs wider support than machine vector does.
-  e.g. low level instruction paravirtualization. It must be
-  initialized very early before platform detection.
-
-- virtualization support needs more functionality like binary patch.
-  Probably the calling overhead might not be very large compared to the
-  emulation overhead of virtualization. However in the native case, the
-  overhead should be eliminated completely.
-  A single kernel binary should run on each environment including native,
-  and the overhead of paravirt_ops on native environment should be as
-  small as possible.
-
-- for full virtualization technology, e.g. KVM/IA64 or
-  Xen/IA64 HVM domain, the result would be
-  (the emulated platform machine vector. probably dig) + (pv_ops).
-  This means that the virtualization support layer should be under
-  the machine vector layer.
-
-Possibly it might be better to move some function pointers from
-paravirt_ops to machine vector. In fact, Xen domU case utilizes both
-pv_ops and machine vector.
-
-
-IA64 paravirt_ops
------------------
-In this section, the concrete paravirt_ops will be discussed.
-Because of the architecture difference between ia64 and x86, the
-resulting set of functions is very different from x86 pv_ops.
-
-- C function pointer tables
-They are not very performance critical so that simple C indirect
-function call is acceptable. The following structures are defined at
-this moment. For details see linux/include/asm-ia64/paravirt.h
-  - struct pv_info
-    This structure describes the execution environment.
-  - struct pv_init_ops
-    This structure describes the various initialization hooks.
-  - struct pv_iosapic_ops
-    This structure describes hooks to iosapic operations.
-  - struct pv_irq_ops
-    This structure describes hooks to irq related operations
-  - struct pv_time_op
-    This structure describes hooks to steal time accounting.
-
-- a set of indirect calls which need optimization
-Currently this class of functions correspond to a subset of IA64
-intrinsics. At this moment the optimization with binary patch isn't
-implemented yet.
-struct pv_cpu_op is defined. For details see
-linux/include/asm-ia64/paravirt_privop.h
-Mostly they correspond to ia64 intrinsics 1-to-1.
-Caveat: Now they are defined as C indirect function pointers, but in
-order to support binary patch optimization, they will be changed
-using GCC extended inline assembly code.
-
-- a set of macros for hand written assembly code (.S files)
-For maintenance purpose, the taken approach for .S files is single
-source code and compile multiple times with different macros definitions.
-Each pv_ops instance must define those macros to compile.
-The important thing here is that sensitive, but non-privileged
-instructions must be paravirtualized and that some privileged
-instructions also need paravirtualization for reasonable performance.
-Developers who modify .S files must be aware of that. At this moment
-an easy checker is implemented to detect paravirtualization breakage.
-But it doesn't cover all the cases.
-
-Sometimes this set of macros is called pv_cpu_asm_op. But there is no
-corresponding structure in the source code.
-Those macros mostly 1:1 correspond to a subset of privileged
-instructions. See linux/include/asm-ia64/native/inst.h.
-And some functions written in assembly also need to be overrided so
-that each pv_ops instance have to define some macros. Again see
-linux/include/asm-ia64/native/inst.h.
-
-
-Those structures must be initialized very early before start_kernel.
-Probably initialized in head.S using multi entry point or some other trick.
-For native case implementation see linux/arch/ia64/kernel/paravirt.c.
diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX
index e952d30bbf0f..af0d23968ee7 100644
--- a/Documentation/virtual/00-INDEX
+++ b/Documentation/virtual/00-INDEX
@@ -2,6 +2,9 @@ Virtualization support in the Linux kernel.
 
 00-INDEX
 	- this file.
+
+paravirt_ops.txt
+	- Describes the Linux kernel pv_ops to support different hypervisors
 kvm/
 	- Kernel Virtual Machine.  See also http://linux-kvm.org
 uml/
diff --git a/Documentation/virtual/paravirt_ops.txt b/Documentation/virtual/paravirt_ops.txt
new file mode 100644
index 000000000000..d4881c00e339
--- /dev/null
+++ b/Documentation/virtual/paravirt_ops.txt
@@ -0,0 +1,32 @@
+Paravirt_ops
+============
+
+Linux provides support for different hypervisor virtualization technologies.
+Historically different binary kernels would be required in order to support
+different hypervisors, this restriction was removed with pv_ops.
+Linux pv_ops is a virtualization API which enables support for different
+hypervisors. It allows each hypervisor to override critical operations and
+allows a single kernel binary to run on all supported execution environments
+including native machine -- without any hypervisors.
+
+pv_ops provides a set of function pointers which represent operations
+corresponding to low level critical instructions and high level
+functionalities in various areas. pv-ops allows for optimizations at run
+time by enabling binary patching of the low-ops critical operations
+at boot time.
+
+pv_ops operations are classified into three categories:
+
+- simple indirect call
+  These operations correspond to high level functionality where it is
+  known that the overhead of indirect call isn't very important.
+
+- indirect call which allows optimization with binary patch
+  Usually these operations correspond to low level critical instructions. They
+  are called frequently and are performance critical. The overhead is
+  very important.
+
+- a set of macros for hand written assembly code
+  Hand written assembly codes (.S files) also need paravirtualization
+  because they include sensitive instructions or some of code paths in
+  them are very performance critical.
diff --git a/MAINTAINERS b/MAINTAINERS
index 4f4915cbeab9..1921ed58d1a0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7302,7 +7302,7 @@ M: Alok Kataria <akataria@vmware.com>
 M:	Rusty Russell <rusty@rustcorp.com.au>
 L:	virtualization@lists.linux-foundation.org
 S:	Supported
-F:	Documentation/ia64/paravirt_ops.txt
+F:	Documentation/virtual/paravirt_ops.txt
 F:	arch/*/kernel/paravirt*
 F:	arch/*/include/asm/paravirt.h
 
diff --git a/arch/mn10300/unit-asb2305/pci-iomap.c b/arch/mn10300/unit-asb2305/pci-iomap.c
deleted file mode 100644
index bd65dae17f32..000000000000
--- a/arch/mn10300/unit-asb2305/pci-iomap.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/* ASB2305 PCI I/O mapping handler
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-#include <linux/pci.h>
-#include <linux/module.h>
-
-/*
- * Create a virtual mapping cookie for a PCI BAR (memory or IO)
- */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (!len || !start)
-		return NULL;
-
-	if ((flags & IORESOURCE_IO) || (flags & IORESOURCE_MEM)) {
-		if (flags & IORESOURCE_CACHEABLE && !(flags & IORESOURCE_IO))
-			return ioremap(start, len);
-		else
-			return ioremap_nocache(start, len);
-	}
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h
index f664e96f48c7..1a9a98de5bde 100644
--- a/arch/s390/include/asm/pci_io.h
+++ b/arch/s390/include/asm/pci_io.h
@@ -16,6 +16,7 @@
 struct zpci_iomap_entry {
 	u32 fh;
 	u8 bar;
+	u16 count;
 };
 
 extern struct zpci_iomap_entry *zpci_iomap_start;
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 3290f11ae1d9..753a56731951 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -259,7 +259,10 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
 }
 
 /* Create a virtual mapping cookie for a PCI BAR */
-void __iomem *pci_iomap(struct pci_dev *pdev, int bar, unsigned long max)
+void __iomem *pci_iomap_range(struct pci_dev *pdev,
+			      int bar,
+			      unsigned long offset,
+			      unsigned long max)
 {
 	struct zpci_dev *zdev = get_zdev(pdev);
 	u64 addr;
@@ -270,14 +273,27 @@ void __iomem *pci_iomap(struct pci_dev *pdev, int bar, unsigned long max)
 
 	idx = zdev->bars[bar].map_idx;
 	spin_lock(&zpci_iomap_lock);
-	zpci_iomap_start[idx].fh = zdev->fh;
-	zpci_iomap_start[idx].bar = bar;
+	if (zpci_iomap_start[idx].count++) {
+		BUG_ON(zpci_iomap_start[idx].fh != zdev->fh ||
+		       zpci_iomap_start[idx].bar != bar);
+	} else {
+		zpci_iomap_start[idx].fh = zdev->fh;
+		zpci_iomap_start[idx].bar = bar;
+	}
+	/* Detect overrun */
+	BUG_ON(!zpci_iomap_start[idx].count);
 	spin_unlock(&zpci_iomap_lock);
 
 	addr = ZPCI_IOMAP_ADDR_BASE | ((u64) idx << 48);
-	return (void __iomem *) addr;
+	return (void __iomem *) addr + offset;
 }
-EXPORT_SYMBOL_GPL(pci_iomap);
+EXPORT_SYMBOL_GPL(pci_iomap_range);
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	return pci_iomap_range(dev, bar, 0, maxlen);
+}
+EXPORT_SYMBOL(pci_iomap);
 
 void pci_iounmap(struct pci_dev *pdev, void __iomem *addr)
 {
@@ -285,8 +301,12 @@ void pci_iounmap(struct pci_dev *pdev, void __iomem *addr)
 
 	idx = (((__force u64) addr) & ~ZPCI_IOMAP_ADDR_BASE) >> 48;
 	spin_lock(&zpci_iomap_lock);
-	zpci_iomap_start[idx].fh = 0;
-	zpci_iomap_start[idx].bar = 0;
+	/* Detect underrun */
+	BUG_ON(!zpci_iomap_start[idx].count);
+	if (!--zpci_iomap_start[idx].count) {
+		zpci_iomap_start[idx].fh = 0;
+		zpci_iomap_start[idx].bar = 0;
+	}
 	spin_unlock(&zpci_iomap_lock);
 }
 EXPORT_SYMBOL_GPL(pci_iounmap);
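The refcounted zpci_iomap_entry above exists so that several mappings can share one iomap slot. For illustration, a hypothetical caller showing the new offset/length arguments of pci_iomap_range() (added by this series in lib/pci_iomap.c; error handling trimmed, offsets made up):

    #include <linux/pci.h>

    static void __iomem *map_ctrl_window(struct pci_dev *pdev)
    {
            /*
             * Map only a 0x200-byte control window starting 0x1000 into
             * BAR 0, instead of the whole BAR.
             */
            return pci_iomap_range(pdev, 0, 0x1000, 0x200);
    }

    static void unmap_ctrl_window(struct pci_dev *pdev, void __iomem *regs)
    {
            pci_iounmap(pdev, regs); /* drops the per-slot refcount on s390 */
    }

On s390 the returned cookie encodes the iomap index in its high bits, which is how pci_iounmap() recovers the slot and decrements count.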
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 879fd7d33877..ef01fef3eebc 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -16,7 +16,6 @@
 #define LHCALL_SET_PTE		14
 #define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
-#define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
 #define LHCALL_SEND_INTERRUPTS	19
 
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index c1c1544b8485..ac4453d8520e 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -56,6 +56,9 @@
 #include <linux/virtio_console.h>
 #include <linux/pm.h>
 #include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+#include <asm/acpi.h>
 #include <asm/apic.h>
 #include <asm/lguest.h>
 #include <asm/paravirt.h>
@@ -71,6 +74,8 @@
 #include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 #include <asm/kvm_para.h>
+#include <asm/pci_x86.h>
+#include <asm/pci-direct.h>
 
 /*G:010
  * Welcome to the Guest!
@@ -831,6 +836,24 @@ static struct irq_chip lguest_irq_controller = {
 	.irq_unmask	= enable_lguest_irq,
 };
 
+static int lguest_enable_irq(struct pci_dev *dev)
+{
+	u8 line = 0;
+
+	/* We literally use the PCI interrupt line as the irq number. */
+	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
+	irq_set_chip_and_handler_name(line, &lguest_irq_controller,
+				      handle_level_irq, "level");
+	dev->irq = line;
+	return 0;
+}
+
+/* We don't do hotplug PCI, so this shouldn't be called. */
+static void lguest_disable_irq(struct pci_dev *dev)
+{
+	WARN_ON(1);
+}
+
 /*
  * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
  * interrupt (except 128, which is used for system calls), and then tells the
@@ -1181,25 +1204,136 @@ static __init char *lguest_memory_setup(void)
 	return "LGUEST";
 }
 
+/* Offset within PCI config space of BAR access capability. */
+static int console_cfg_offset = 0;
+static int console_access_cap;
+
+/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
+static void set_cfg_window(u32 cfg_offset, u32 off)
+{
+	write_pci_config_byte(0, 1, 0,
+			      cfg_offset + offsetof(struct virtio_pci_cap, bar),
+			      0);
+	write_pci_config(0, 1, 0,
+			 cfg_offset + offsetof(struct virtio_pci_cap, length),
+			 4);
+	write_pci_config(0, 1, 0,
+			 cfg_offset + offsetof(struct virtio_pci_cap, offset),
+			 off);
+}
+
+static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
+{
+	/*
+	 * We could set this up once, then leave it; nothing else in the
+	 * kernel should touch these registers.  But if it went wrong, that
+	 * would be a horrible bug to find.
+	 */
+	set_cfg_window(cfg_offset, off);
+	write_pci_config(0, 1, 0,
+			 cfg_offset + sizeof(struct virtio_pci_cap), val);
+}
+
+static void probe_pci_console(void)
+{
+	u8 cap, common_cap = 0, device_cap = 0;
+	/* Offset within BAR0 */
+	u32 device_offset;
+	u32 device_len;
+
+	/* Avoid recursive printk into here. */
+	console_cfg_offset = -1;
+
+	if (!early_pci_allowed()) {
+		printk(KERN_ERR "lguest: early PCI access not allowed!\n");
+		return;
+	}
+
+	/* We expect a console PCI device at BUS0, slot 1. */
+	if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
+		printk(KERN_ERR "lguest: PCI device is %#x!\n",
+		       read_pci_config(0, 1, 0, 0));
+		return;
+	}
+
+	/* Find the capabilities we need (must be in bar0) */
+	cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
+	while (cap) {
+		u8 vndr = read_pci_config_byte(0, 1, 0, cap);
+		if (vndr == PCI_CAP_ID_VNDR) {
+			u8 type, bar;
+			u32 offset, length;
+
+			type = read_pci_config_byte(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, cfg_type));
+			bar = read_pci_config_byte(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, bar));
+			offset = read_pci_config(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, offset));
+			length = read_pci_config(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, length));
+
+			switch (type) {
+			case VIRTIO_PCI_CAP_DEVICE_CFG:
+				if (bar == 0) {
+					device_cap = cap;
+					device_offset = offset;
+					device_len = length;
+				}
+				break;
+			case VIRTIO_PCI_CAP_PCI_CFG:
+				console_access_cap = cap;
+				break;
+			}
+		}
+		cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
+	}
+	if (!device_cap || !console_access_cap) {
+		printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
+		       common_cap, device_cap, console_access_cap);
+		return;
+	}
+
+	/*
+	 * Note that we can't check features, until we've set the DRIVER
+	 * status bit.  We don't want to do that until we have a real driver,
+	 * so we just check that the device-specific config has room for
+	 * emerg_wr.  If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
+	 * it should ignore the access.
+	 */
+	if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
+			  + sizeof(u32))) {
+		printk(KERN_ERR "lguest: console missing emerg_wr field\n");
+		return;
+	}
+
+	console_cfg_offset = device_offset;
+	printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
+}
+
 /*
  * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
- * console output.
+ * but before that is set up we use the virtio PCI console's backdoor mmio
+ * access and the "emergency" write facility (which is legal even before the
+ * device is configured).
  */
 static __init int early_put_chars(u32 vtermno, const char *buf, int count)
 {
-	char scratch[17];
-	unsigned int len = count;
+	/* If we couldn't find PCI console, forget it. */
+	if (console_cfg_offset < 0)
+		return count;
 
-	/* We use a nul-terminated string, so we make a copy.  Icky, huh? */
-	if (len > sizeof(scratch) - 1)
-		len = sizeof(scratch) - 1;
-	scratch[len] = '\0';
-	memcpy(scratch, buf, len);
-	hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0);
+	if (unlikely(!console_cfg_offset)) {
+		probe_pci_console();
+		if (console_cfg_offset < 0)
+			return count;
+	}
 
-	/* This routine returns the number of bytes actually written. */
-	return len;
+	write_bar_via_cfg(console_access_cap,
+			  console_cfg_offset
+			  + offsetof(struct virtio_console_config, emerg_wr),
+			  buf[0]);
+	return 1;
 }
 
 /*
@@ -1400,14 +1534,6 @@ __init void lguest_init(void)
 	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
 	/*
-	 * The IDE code spends about 3 seconds probing for disks: if we reserve
-	 * all the I/O ports up front it can't get them and so doesn't probe.
-	 * Other device drivers are similar (but less severe).  This cuts the
-	 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
-	 */
-	paravirt_disable_iospace();
-
-	/*
 	 * This is messy CPU setup stuff which the native boot code does before
 	 * start_kernel, so we have to do, too:
 	 */
@@ -1436,6 +1562,13 @@ __init void lguest_init(void)
 	/* Register our very early console. */
 	virtio_cons_early_init(early_put_chars);
 
+	/* Don't let ACPI try to control our PCI interrupts. */
+	disable_acpi();
+
+	/* We control them ourselves, by overriding these two hooks. */
+	pcibios_enable_irq = lguest_enable_irq;
+	pcibios_disable_irq = lguest_disable_irq;
+
 	/*
 	 * Last of all, we set the power management poweroff hook to point to
 	 * the Guest routine to power off, and the reboot hook to our restart
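The early console path above leans on the virtio_pci_cap "window" mechanism: set_cfg_window() points the VIRTIO_PCI_CAP_PCI_CFG capability's bar/offset/length fields at the device config in BAR0, and the 32-bit data register that follows the capability header then accesses that window. A small host-side C sketch of the offsets involved (the struct mirrors include/uapi/linux/virtio_pci.h from this series; the demo only prints which config-space offsets the probe code pokes):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    struct virtio_pci_cap {
            uint8_t  cap_vndr;   /* PCI_CAP_ID_VNDR */
            uint8_t  cap_next;
            uint8_t  cap_len;
            uint8_t  cfg_type;   /* VIRTIO_PCI_CAP_* */
            uint8_t  bar;        /* which BAR the window points into */
            uint8_t  padding[3];
            uint32_t offset;     /* offset of the window within the BAR */
            uint32_t length;     /* length of the window */
    };

    int main(void)
    {
            /* These are the config-space offsets set_cfg_window() writes. */
            printf("bar    @ cap+%zu\n", offsetof(struct virtio_pci_cap, bar));
            printf("offset @ cap+%zu\n", offsetof(struct virtio_pci_cap, offset));
            printf("length @ cap+%zu\n", offsetof(struct virtio_pci_cap, length));
            printf("data   @ cap+%zu\n", sizeof(struct virtio_pci_cap));
            return 0;
    }

write_bar_via_cfg() is exactly this sequence: program bar/length/offset, then hit the data register at cap + sizeof(struct virtio_pci_cap).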
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cdfbd21e3597..655e570b9b31 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -28,8 +28,7 @@ struct virtio_blk_vq {
 	char name[VQ_NAME_LEN];
 } ____cacheline_aligned_in_smp;
 
-struct virtio_blk
-{
+struct virtio_blk {
 	struct virtio_device *vdev;
 
 	/* The disk structure for the kernel. */
@@ -52,8 +51,7 @@ struct virtio_blk
 	struct virtio_blk_vq *vqs;
 };
 
-struct virtblk_req
-{
+struct virtblk_req {
 	struct request *req;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
@@ -575,6 +573,12 @@ static int virtblk_probe(struct virtio_device *vdev)
 	u16 min_io_size;
 	u8 physical_block_exp, alignment_offset;
 
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "%s failure: config access disabled\n",
+			__func__);
+		return -EINVAL;
+	}
+
 	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
 			     GFP_KERNEL);
 	if (err < 0)
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 26afb56a8073..fae2dbbf5745 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1986,7 +1986,10 @@ static int virtcons_probe(struct virtio_device *vdev)
 	bool multiport;
 	bool early = early_put_chars != NULL;
 
-	if (!vdev->config->get) {
+	/* We only need a config space if features are offered */
+	if (!vdev->config->get &&
+	    (virtio_has_feature(vdev, VIRTIO_CONSOLE_F_SIZE)
+	     || virtio_has_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT))) {
 		dev_err(&vdev->dev, "%s failure: config access disabled\n",
 			__func__);
 		return -EINVAL;
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index c4197503900e..16f52ee73994 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,6 +1,3 @@
-# Guest requires the device configuration and probing code.
-obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
-
 # Host requires the other files, which can be a module.
 obj-$(CONFIG_LGUEST)	+= lg.o
 lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 6590558d1d31..7dc93aa004c8 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
  */
 int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
+	/* If the launcher asked for a register with LHREQ_GETREG */
+	if (cpu->reg_read) {
+		if (put_user(*cpu->reg_read, user))
+			return -EFAULT;
+		cpu->reg_read = NULL;
+		return sizeof(*cpu->reg_read);
+	}
+
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
 		unsigned int irq;
@@ -217,21 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		if (cpu->hcall)
 			do_hypercalls(cpu);
 
-		/*
-		 * It's possible the Guest did a NOTIFY hypercall to the
-		 * Launcher.
-		 */
-		if (cpu->pending_notify) {
-			/*
-			 * Does it just needs to write to a registered
-			 * eventfd (ie. the appropriate virtqueue thread)?
-			 */
-			if (!send_notify_to_eventfd(cpu)) {
-				/* OK, we tell the main Launcher. */
-				if (put_user(cpu->pending_notify, user))
-					return -EFAULT;
-				return sizeof(cpu->pending_notify);
-			}
+		/* Do we have to tell the Launcher about a trap? */
+		if (cpu->pending.trap) {
+			if (copy_to_user(user, &cpu->pending,
+					 sizeof(cpu->pending)))
+				return -EFAULT;
+			return sizeof(cpu->pending);
 		}
 
 		/*
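The reg_read hand-off above defines a tiny protocol with the Launcher: write an LHREQ_GETREG request, and the next read() on the lguest fd returns the register value instead of running the Guest. A hypothetical Launcher-side fragment -- the request layout here is a sketch; the authoritative ABI is in include/linux/lguest_launcher.h and tools/lguest/lguest.c:

    #include <linux/lguest_launcher.h> /* LHREQ_GETREG */
    #include <unistd.h>

    static int get_guest_reg(int lguest_fd, unsigned long ptrace_offset,
                             unsigned long *val)
    {
            unsigned long args[2] = { LHREQ_GETREG, ptrace_offset };

            if (write(lguest_fd, args, sizeof(args)) != sizeof(args))
                    return -1;
            /* run_guest() now returns the register, not a Guest event. */
            if (read(lguest_fd, val, sizeof(*val)) != sizeof(*val))
                    return -1;
            return 0;
    }

getreg_setup() (in lguest_user.c below) stores the register pointer; the early-return block added to run_guest() is what services the subsequent read().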
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 83511eb0923d..1219af493c0f 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -117,9 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		/* Similarly, this sets the halted flag for run_guest(). */
 		cpu->halted = 1;
 		break;
-	case LHCALL_NOTIFY:
-		cpu->pending_notify = args->arg1;
-		break;
 	default:
 		/* It should be an architecture-specific hypercall. */
 		if (lguest_arch_do_hcall(cpu, args))
@@ -189,7 +186,7 @@ static void do_async_hcalls(struct lg_cpu *cpu)
 		 * Stop doing hypercalls if they want to notify the Launcher:
 		 * it needs to service this first.
 		 */
-		if (cpu->pending_notify)
+		if (cpu->pending.trap)
 			break;
 	}
 }
@@ -280,7 +277,7 @@ void do_hypercalls(struct lg_cpu *cpu)
 	 * NOTIFY to the Launcher, we want to return now.  Otherwise we do
 	 * the hypercall.
 	 */
-	if (!cpu->pending_notify) {
+	if (!cpu->pending.trap) {
 		do_hcall(cpu, cpu->hcall);
 		/*
 		 * Tricky point: we reset the hcall pointer to mark the
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 2eef40be4c04..307e8b39e7d1 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -50,7 +50,10 @@ struct lg_cpu {
 	/* Bitmap of what has changed: see CHANGED_* above. */
 	int changed;
 
-	unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+	/* Pending operation. */
+	struct lguest_pending pending;
+
+	unsigned long *reg_read; /* register from LHREQ_GETREG */
 
 	/* At end of a page shared mapped over lguest_pages in guest. */
 	unsigned long regs_page;
@@ -78,24 +81,18 @@ struct lg_cpu {
 	struct lg_cpu_arch arch;
 };
 
-struct lg_eventfd {
-	unsigned long addr;
-	struct eventfd_ctx *event;
-};
-
-struct lg_eventfd_map {
-	unsigned int num;
-	struct lg_eventfd map[];
-};
-
 /* The private info the thread maintains about the guest. */
 struct lguest {
 	struct lguest_data __user *lguest_data;
 	struct lg_cpu cpus[NR_CPUS];
 	unsigned int nr_cpus;
 
+	/* Valid guest memory pages must be < this. */
 	u32 pfn_limit;
 
+	/* Device memory is >= pfn_limit and < device_limit. */
+	u32 device_limit;
+
 	/*
 	 * This provides the offset to the base of guest-physical memory in the
 	 * Launcher.
@@ -110,8 +107,6 @@ struct lguest {
 	unsigned int stack_pages;
 	u32 tsc_khz;
 
-	struct lg_eventfd_map *eventfds;
-
 	/* Dead? */
 	const char *dead;
 };
@@ -197,8 +192,10 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
 		   unsigned long vaddr, pte_t val);
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
+		 unsigned long *iomem);
 void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
+bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
 unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
 void page_table_guest_data_init(struct lg_cpu *cpu);
 
@@ -210,6 +207,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu);
 int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
 int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
 void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
+unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
 
 /* <arch>/switcher.S: */
 extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
deleted file mode 100644
index 89088d6538fd..000000000000
--- a/drivers/lguest/lguest_device.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/*P:050
- * Lguest guests use a very simple method to describe devices.  It's a
- * series of device descriptors contained just above the top of normal Guest
- * memory.
- *
- * We use the standard "virtio" device infrastructure, which provides us with a
- * console, a network and a block driver.  Each one expects some configuration
- * information and a "virtqueue" or two to send and receive data.
-:*/
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio.h>
-#include <linux/virtio_config.h>
-#include <linux/interrupt.h>
-#include <linux/virtio_ring.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <asm/paravirt.h>
-#include <asm/lguest_hcall.h>
-
-/* The pointer to our (page) of device descriptions. */
-static void *lguest_devices;
-
-/*
- * For Guests, device memory can be used as normal memory, so we cast away the
- * __iomem to quieten sparse.
- */
-static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
-{
-	return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages);
-}
-
-static inline void lguest_unmap(void *addr)
-{
-	iounmap((__force void __iomem *)addr);
-}
-
-/*D:100
- * Each lguest device is just a virtio device plus a pointer to its entry
- * in the lguest_devices page.
- */
-struct lguest_device {
-	struct virtio_device vdev;
-
-	/* The entry in the lguest_devices page for this device. */
-	struct lguest_device_desc *desc;
-};
-
-/*
- * Since the virtio infrastructure hands us a pointer to the virtio_device all
- * the time, it helps to have a curt macro to get a pointer to the struct
- * lguest_device it's enclosed in.
- */
-#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev)
-
-/*D:130
- * Device configurations
- *
- * The configuration information for a device consists of one or more
- * virtqueues, a feature bitmap, and some configuration bytes.  The
- * configuration bytes don't really matter to us: the Launcher sets them up, and
- * the driver will look at them during setup.
- *
- * A convenient routine to return the device's virtqueue config array:
- * immediately after the descriptor.
- */
-static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
-{
-	return (void *)(desc + 1);
-}
-
-/* The features come immediately after the virtqueues. */
-static u8 *lg_features(const struct lguest_device_desc *desc)
-{
-	return (void *)(lg_vq(desc) + desc->num_vq);
-}
-
-/* The config space comes after the two feature bitmasks. */
-static u8 *lg_config(const struct lguest_device_desc *desc)
-{
-	return lg_features(desc) + desc->feature_len * 2;
-}
-
-/* The total size of the config page used by this device (incl. desc) */
-static unsigned desc_size(const struct lguest_device_desc *desc)
-{
-	return sizeof(*desc)
-		+ desc->num_vq * sizeof(struct lguest_vqconfig)
-		+ desc->feature_len * 2
-		+ desc->config_len;
-}
-
-/* This gets the device's feature bits. */
-static u64 lg_get_features(struct virtio_device *vdev)
-{
-	unsigned int i;
-	u32 features = 0;
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-	u8 *in_features = lg_features(desc);
-
-	/* We do this the slow but generic way. */
-	for (i = 0; i < min(desc->feature_len * 8, 32); i++)
-		if (in_features[i / 8] & (1 << (i % 8)))
-			features |= (1 << i);
-
-	return features;
-}
-
-/*
- * To notify on reset or feature finalization, we (ab)use the NOTIFY
- * hypercall, with the descriptor address of the device.
- */
-static void status_notify(struct virtio_device *vdev)
-{
-	unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
-
-	hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
-}
-
-/*
- * The virtio core takes the features the Host offers, and copies the ones
- * supported by the driver into the vdev->features array.  Once that's all
- * sorted out, this routine is called so we can tell the Host which features we
- * understand and accept.
- */
-static int lg_finalize_features(struct virtio_device *vdev)
-{
-	unsigned int i, bits;
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-	/* Second half of bitmap is features we accept. */
-	u8 *out_features = lg_features(desc) + desc->feature_len;
-
-	/* Give virtio_ring a chance to accept features. */
-	vring_transport_features(vdev);
-
-	/* Make sure we don't have any features > 32 bits! */
-	BUG_ON((u32)vdev->features != vdev->features);
-
-	/*
-	 * Since lguest is currently x86-only, we're little-endian.  That
-	 * means we could just memcpy.  But it's not time critical, and in
-	 * case someone copies this code, we do it the slow, obvious way.
-	 */
-	memset(out_features, 0, desc->feature_len);
-	bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
-	for (i = 0; i < bits; i++) {
-		if (__virtio_test_bit(vdev, i))
-			out_features[i / 8] |= (1 << (i % 8));
-	}
-
-	/* Tell Host we've finished with this device's feature negotiation */
-	status_notify(vdev);
-
-	return 0;
-}
-
-/* Once they've found a field, getting a copy of it is easy. */
-static void lg_get(struct virtio_device *vdev, unsigned int offset,
-		   void *buf, unsigned len)
-{
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-
-	/* Check they didn't ask for more than the length of the config! */
-	BUG_ON(offset + len > desc->config_len);
-	memcpy(buf, lg_config(desc) + offset, len);
-}
-
-/* Setting the contents is also trivial. */
-static void lg_set(struct virtio_device *vdev, unsigned int offset,
-		   const void *buf, unsigned len)
-{
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-
-	/* Check they didn't ask for more than the length of the config! */
-	BUG_ON(offset + len > desc->config_len);
-	memcpy(lg_config(desc) + offset, buf, len);
-}
-
-/*
- * The operations to get and set the status word just access the status field
- * of the device descriptor.
- */
-static u8 lg_get_status(struct virtio_device *vdev)
-{
-	return to_lgdev(vdev)->desc->status;
-}
-
-static void lg_set_status(struct virtio_device *vdev, u8 status)
-{
-	BUG_ON(!status);
-	to_lgdev(vdev)->desc->status = status;
-
-	/* Tell Host immediately if we failed. */
-	if (status & VIRTIO_CONFIG_S_FAILED)
-		status_notify(vdev);
-}
-
-static void lg_reset(struct virtio_device *vdev)
-{
-	/* 0 status means "reset" */
-	to_lgdev(vdev)->desc->status = 0;
-	status_notify(vdev);
-}
-
-/*
- * Virtqueues
- *
- * The other piece of infrastructure virtio needs is a "virtqueue": a way of
- * the Guest device registering buffers for the other side to read from or
- * write into (ie. send and receive buffers).  Each device can have multiple
- * virtqueues: for example the console driver uses one queue for sending and
- * another for receiving.
- *
- * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
- * already exists in virtio_ring.c.  We just need to connect it up.
- *
- * We start with the information we need to keep about each virtqueue.
- */
-
-/*D:140 This is the information we remember about each virtqueue. */
-struct lguest_vq_info {
-	/* A copy of the information contained in the device config. */
-	struct lguest_vqconfig config;
-
-	/* The address where we mapped the virtio ring, so we can unmap it. */
-	void *pages;
-};
-
-/*
- * When the virtio_ring code wants to prod the Host, it calls us here and we
- * make a hypercall.  We hand the physical address of the virtqueue so the Host
- * knows which virtqueue we're talking about.
- */
-static bool lg_notify(struct virtqueue *vq)
-{
-	/*
-	 * We store our virtqueue information in the "priv" pointer of the
-	 * virtqueue structure.
-	 */
-	struct lguest_vq_info *lvq = vq->priv;
-
-	hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
-	return true;
-}
-
-/* An extern declaration inside a C file is bad form.  Don't do it. */
-extern int lguest_setup_irq(unsigned int irq);
-
-/*
- * This routine finds the Nth virtqueue described in the configuration of
- * this device and sets it up.
- *
- * This is kind of an ugly duckling.  It'd be nicer to have a standard
- * representation of a virtqueue in the configuration space, but it seems that
- * everyone wants to do it differently.  The KVM coders want the Guest to
- * allocate its own pages and tell the Host where they are, but for lguest it's
- * simpler for the Host to simply tell us where the pages are.
- */
-static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
-				    unsigned index,
-				    void (*callback)(struct virtqueue *vq),
-				    const char *name)
-{
-	struct lguest_device *ldev = to_lgdev(vdev);
-	struct lguest_vq_info *lvq;
-	struct virtqueue *vq;
-	int err;
-
-	if (!name)
-		return NULL;
-
-	/* We must have this many virtqueues. */
-	if (index >= ldev->desc->num_vq)
-		return ERR_PTR(-ENOENT);
-
-	lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
-	if (!lvq)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Make a copy of the "struct lguest_vqconfig" entry, which sits after
-	 * the descriptor.  We need a copy because the config space might not
-	 * be aligned correctly.
-	 */
-	memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
-
-	printk("Mapping virtqueue %i addr %lx\n", index,
-	       (unsigned long)lvq->config.pfn << PAGE_SHIFT);
-	/* Figure out how many pages the ring will take, and map that memory */
-	lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
-				DIV_ROUND_UP(vring_size(lvq->config.num,
-							LGUEST_VRING_ALIGN),
-					     PAGE_SIZE));
-	if (!lvq->pages) {
-		err = -ENOMEM;
-		goto free_lvq;
-	}
-
-	/*
-	 * OK, tell virtio_ring.c to set up a virtqueue now we know its size
-	 * and we've got a pointer to its pages.  Note that we set weak_barriers
-	 * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
-	 * barriers.
-	 */
-	vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
-				 true, lvq->pages, lg_notify, callback, name);
-	if (!vq) {
-		err = -ENOMEM;
-		goto unmap;
-	}
-
-	/* Make sure the interrupt is allocated. */
-	err = lguest_setup_irq(lvq->config.irq);
-	if (err)
-		goto destroy_vring;
-
-	/*
-	 * Tell the interrupt for this virtqueue to go to the virtio_ring
-	 * interrupt handler.
-	 *
-	 * FIXME: We used to have a flag for the Host to tell us we could use
-	 * the interrupt as a source of randomness: it'd be nice to have that
-	 * back.
-	 */
-	err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
-			  dev_name(&vdev->dev), vq);
-	if (err)
-		goto free_desc;
-
-	/*
-	 * Last of all we hook up our 'struct lguest_vq_info" to the
-	 * virtqueue's priv pointer.
-	 */
-	vq->priv = lvq;
-	return vq;
-
-free_desc:
-	irq_free_desc(lvq->config.irq);
-destroy_vring:
-	vring_del_virtqueue(vq);
-unmap:
-	lguest_unmap(lvq->pages);
-free_lvq:
-	kfree(lvq);
-	return ERR_PTR(err);
-}
-/*:*/
-
-/* Cleaning up a virtqueue is easy */
-static void lg_del_vq(struct virtqueue *vq)
-{
-	struct lguest_vq_info *lvq = vq->priv;
-
-	/* Release the interrupt */
-	free_irq(lvq->config.irq, vq);
-	/* Tell virtio_ring.c to free the virtqueue. */
-	vring_del_virtqueue(vq);
-	/* Unmap the pages containing the ring. */
-	lguest_unmap(lvq->pages);
-	/* Free our own queue information. */
-	kfree(lvq);
-}
-
-static void lg_del_vqs(struct virtio_device *vdev)
-{
-	struct virtqueue *vq, *n;
-
-	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
-		lg_del_vq(vq);
-}
-
-static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-		       struct virtqueue *vqs[],
-		       vq_callback_t *callbacks[],
-		       const char *names[])
-{
-	struct lguest_device *ldev = to_lgdev(vdev);
-	int i;
-
-	/* We must have this many virtqueues. */
-	if (nvqs > ldev->desc->num_vq)
-		return -ENOENT;
-
-	for (i = 0; i < nvqs; ++i) {
-		vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
-		if (IS_ERR(vqs[i]))
-			goto error;
-	}
-	return 0;
-
-error:
-	lg_del_vqs(vdev);
-	return PTR_ERR(vqs[i]);
-}
-
-static const char *lg_bus_name(struct virtio_device *vdev)
-{
-	return "";
-}
-
-/* The ops structure which hooks everything together. */
-static const struct virtio_config_ops lguest_config_ops = {
-	.get_features = lg_get_features,
-	.finalize_features = lg_finalize_features,
-	.get = lg_get,
-	.set = lg_set,
-	.get_status = lg_get_status,
-	.set_status = lg_set_status,
-	.reset = lg_reset,
-	.find_vqs = lg_find_vqs,
-	.del_vqs = lg_del_vqs,
-	.bus_name = lg_bus_name,
-};
-
-/*
- * The root device for the lguest virtio devices.  This makes them appear as
- * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2.
- */
-static struct device *lguest_root;
-
-/*D:120
- * This is the core of the lguest bus: actually adding a new device.
- * It's a separate function because it's neater that way, and because an
- * earlier version of the code supported hotplug and unplug.  They were removed
- * early on because they were never used.
- *
- * As Andrew Tridgell says, "Untested code is buggy code".
- *
- * It's worth reading this carefully: we start with a pointer to the new device
- * descriptor in the "lguest_devices" page, and the offset into the device
- * descriptor page so we can uniquely identify it if things go badly wrong.
- */
-static void add_lguest_device(struct lguest_device_desc *d,
-			      unsigned int offset)
-{
-	struct lguest_device *ldev;
-
-	/* Start with zeroed memory; Linux's device layer counts on it. */
-	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
-	if (!ldev) {
-		printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n",
-		       offset, d->type);
-		return;
-	}
-
-	/* This devices' parent is the lguest/ dir. */
-	ldev->vdev.dev.parent = lguest_root;
-	/*
-	 * The device type comes straight from the descriptor.  There's also a
-	 * device vendor field in the virtio_device struct, which we leave as
-	 * 0.
-	 */
-	ldev->vdev.id.device = d->type;
-	/*
-	 * We have a simple set of routines for querying the device's
-	 * configuration information and setting its status.
-	 */
-	ldev->vdev.config = &lguest_config_ops;
-	/* And we remember the device's descriptor for lguest_config_ops. */
-	ldev->desc = d;
-
-	/*
-	 * register_virtio_device() sets up the generic fields for the struct
-	 * virtio_device and calls device_register().  This makes the bus
-	 * infrastructure look for a matching driver.
-	 */
-	if (register_virtio_device(&ldev->vdev) != 0) {
-		printk(KERN_ERR "Failed to register lguest dev %u type %u\n",
-		       offset, d->type);
-		kfree(ldev);
-	}
-}
-
-/*D:110
- * scan_devices() simply iterates through the device page.  The type 0 is
- * reserved to mean "end of devices".
- */
-static void scan_devices(void)
-{
-	unsigned int i;
-	struct lguest_device_desc *d;
-
-	/* We start at the page beginning, and skip over each entry. */
-	for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
-		d = lguest_devices + i;
-
-		/* Once we hit a zero, stop. */
-		if (d->type == 0)
-			break;
-
-		printk("Device at %i has size %u\n", i, desc_size(d));
-		add_lguest_device(d, i);
-	}
-}
-
-/*D:105
- * Fairly early in boot, lguest_devices_init() is called to set up the
- * lguest device infrastructure.  We check that we are a Guest by checking
- * pv_info.name: there are other ways of checking, but this seems most
- * obvious to me.
- *
- * So we can access the "struct lguest_device_desc"s easily, we map that memory
- * and store the pointer in the global "lguest_devices".  Then we register a
- * root device from which all our devices will hang (this seems to be the
- * correct sysfs incantation).
- *
- * Finally we call scan_devices() which adds all the devices found in the
- * lguest_devices page.
- */
-static int __init lguest_devices_init(void)
-{
-	if (strcmp(pv_info.name, "lguest") != 0)
-		return 0;
-
-	lguest_root = root_device_register("lguest");
-	if (IS_ERR(lguest_root))
-		panic("Could not register lguest root");
-
-	/* Devices are in a single page above top of "normal" mem */
-	lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
-
-	scan_devices();
-	return 0;
-}
-/* We do this after core stuff, but before the drivers. */
-postcore_initcall(lguest_devices_init);
-
-/*D:150
- * At this point in the journey we used to now wade through the lguest
- * devices themselves: net, block and console.  Since they're all now virtio
- * devices rather than lguest-specific, I've decided to ignore them.  Mostly,
- * they're kind of boring.  But this does mean you'll never experience the
- * thrill of reading the forbidden love scene buried deep in the block driver.
- *
- * "make Launcher" beckons, where we answer questions like "Where do Guests
- * come from?", and "What do you do when someone asks for optimization?".
- */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 4263f4cc8c55..c4c6113eb9a6 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -2,175 +2,62 @@
2 * launcher controls and communicates with the Guest. For example, 2 * launcher controls and communicates with the Guest. For example,
3 * the first write will tell us the Guest's memory layout and entry 3 * the first write will tell us the Guest's memory layout and entry
4 * point. A read will run the Guest until something happens, such as 4 * point. A read will run the Guest until something happens, such as
5 * a signal or the Guest doing a NOTIFY out to the Launcher. There is 5 * a signal or the Guest accessing a device.
6 * also a way for the Launcher to attach eventfds to particular NOTIFY
7 * values instead of returning from the read() call.
8:*/ 6:*/
9#include <linux/uaccess.h> 7#include <linux/uaccess.h>
10#include <linux/miscdevice.h> 8#include <linux/miscdevice.h>
11#include <linux/fs.h> 9#include <linux/fs.h>
12#include <linux/sched.h> 10#include <linux/sched.h>
13#include <linux/eventfd.h>
14#include <linux/file.h> 11#include <linux/file.h>
15#include <linux/slab.h> 12#include <linux/slab.h>
16#include <linux/export.h> 13#include <linux/export.h>
17#include "lg.h" 14#include "lg.h"
18 15
19/*L:056 16/*L:052
20 * Before we move on, let's jump ahead and look at what the kernel does when 17 The Launcher can get the registers, and also set some of them.
21 * it needs to look up the eventfds. That will complete our picture of how we 18*/
22 * use RCU. 19static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
23 *
24 * The notification value is in cpu->pending_notify: we return true if it went
25 * to an eventfd.
26 */
27bool send_notify_to_eventfd(struct lg_cpu *cpu)
28{
29 unsigned int i;
30 struct lg_eventfd_map *map;
31
32 /*
33 * This "rcu_read_lock()" helps track when someone is still looking at
34 * the (RCU-using) eventfds array. It's not actually a lock at all;
35 * indeed it's a noop in many configurations. (You didn't expect me to
36 * explain all the RCU secrets here, did you?)
37 */
38 rcu_read_lock();
39 /*
40 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
41 * makes sure we don't access the memory pointed to by
42 * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy,
43 * but Alpha allows this! Paul McKenney points out that a really
44 * aggressive compiler could have the same effect:
45 * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
46 *
47 * So play safe, use rcu_dereference to get the rcu-protected pointer:
48 */
49 map = rcu_dereference(cpu->lg->eventfds);
50 /*
51 * Simple array search: even if they add an eventfd while we do this,
52 * we'll continue to use the old array and just won't see the new one.
53 */
54 for (i = 0; i < map->num; i++) {
55 if (map->map[i].addr == cpu->pending_notify) {
56 eventfd_signal(map->map[i].event, 1);
57 cpu->pending_notify = 0;
58 break;
59 }
60 }
61 /* We're done with the rcu-protected variable cpu->lg->eventfds. */
62 rcu_read_unlock();
63
64 /* If we cleared the notification, it's because we found a match. */
65 return cpu->pending_notify == 0;
66}
67
68/*L:055
69 * One of the more tricksy tricks in the Linux Kernel is a technique called
70 * Read Copy Update. Since one point of lguest is to teach lguest journeyers
71 * about kernel coding, I use it here. (In case you're curious, other purposes
72 * include learning about virtualization and instilling a deep appreciation for
73 * simplicity and puppies).
74 *
75 * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
76 * add new eventfds without ever blocking readers from accessing the array.
77 * The current Launcher only does this during boot, so that never happens. But
78 * Read Copy Update is cool, and adding a lock risks damaging even more puppies
79 * than this code does.
80 *
81 * We allocate a brand new one-larger array, copy the old one and add our new
82 * element. Then we make the lg eventfd pointer point to the new array.
83 * That's the easy part: now we need to free the old one, but we need to make
84 * sure no slow CPU somewhere is still looking at it. That's what
85 * synchronize_rcu does for us: waits until every CPU has indicated that it has
 86 * moved on, so we know it's no longer using the old one.
87 *
88 * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
89 */
90static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
91{ 20{
92 struct lg_eventfd_map *new, *old = lg->eventfds; 21 unsigned long which;
93
94 /*
95 * We don't allow notifications on value 0 anyway (pending_notify of
96 * 0 means "nothing pending").
97 */
98 if (!addr)
99 return -EINVAL;
100
101 /*
102 * Replace the old array with the new one, carefully: others can
103 * be accessing it at the same time.
104 */
105 new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
106 GFP_KERNEL);
107 if (!new)
108 return -ENOMEM;
109 22
110 /* First make identical copy. */ 23 /* We re-use the ptrace structure to specify which register to read. */
111 memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); 24 if (get_user(which, input) != 0)
112 new->num = old->num; 25 return -EFAULT;
113
114 /* Now append new entry. */
115 new->map[new->num].addr = addr;
116 new->map[new->num].event = eventfd_ctx_fdget(fd);
117 if (IS_ERR(new->map[new->num].event)) {
118 int err = PTR_ERR(new->map[new->num].event);
119 kfree(new);
120 return err;
121 }
122 new->num++;
123 26
124 /* 27 /*
125 * Now put new one in place: rcu_assign_pointer() is a fancy way of 28 * We set up the cpu register pointer, and their next read will
126 * doing "lg->eventfds = new", but it uses memory barriers to make 29 * actually get the value (instead of running the guest).
127 * absolutely sure that the contents of "new" written above is nailed
128 * down before we actually do the assignment.
129 * 30 *
130 * We have to think about these kinds of things when we're operating on 31 * The last argument 'true' says we can access any register.
131 * live data without locks.
132 */ 32 */
133 rcu_assign_pointer(lg->eventfds, new); 33 cpu->reg_read = lguest_arch_regptr(cpu, which, true);
34 if (!cpu->reg_read)
35 return -ENOENT;
134 36
135 /* 37 /* And because this is a write() call, we return the length used. */
 136 * We're not in a big hurry. Wait until no one's looking at the old 38 return sizeof(unsigned long) * 2;
137 * version, then free it.
138 */
139 synchronize_rcu();
140 kfree(old);
141
142 return 0;
143} 39}
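The update side, reduced the same way: build the larger copy in private, publish it with rcu_assign_pointer(), wait out the pre-existing readers, then free. Again a hedged sketch reusing the made-up struct my_map from the reader fragment above; grow_map() is hypothetical.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>

static int grow_map(struct my_map __rcu **slot, unsigned long val)
{
	/* Safe: the caller holds the mutex serializing updaters. */
	struct my_map *old = rcu_dereference_protected(*slot, 1);
	struct my_map *new;

	new = kmalloc(sizeof(*new) + sizeof(new->vals[0]) * (old->num + 1),
		      GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* Identical copy, plus the appended element. */
	memcpy(new->vals, old->vals, sizeof(old->vals[0]) * old->num);
	new->vals[old->num] = val;
	new->num = old->num + 1;

	rcu_assign_pointer(*slot, new);	/* barrier: *new is complete first */
	synchronize_rcu();		/* every pre-existing reader has left */
	kfree(old);
	return 0;
}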
144 40
145/*L:052 41static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
146 * Receiving notifications from the Guest is usually done by attaching a
147 * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will
148 * become readable when the Guest does an LHCALL_NOTIFY with that value.
149 *
150 * This is really convenient for processing each virtqueue in a separate
151 * thread.
152 */
153static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
154{ 42{
155 unsigned long addr, fd; 43 unsigned long which, value, *reg;
156 int err;
157 44
 158 if (get_user(addr, input) != 0) 45 /* We re-use the ptrace structure to specify which register to set. */
46 if (get_user(which, input) != 0)
159 return -EFAULT; 47 return -EFAULT;
160 input++; 48 input++;
161 if (get_user(fd, input) != 0) 49 if (get_user(value, input) != 0)
162 return -EFAULT; 50 return -EFAULT;
163 51
164 /* 52 /* The last argument 'false' means we can't access all registers. */
165 * Just make sure two callers don't add eventfds at once. We really 53 reg = lguest_arch_regptr(cpu, which, false);
166 * only need to lock against callers adding to the same Guest, so using 54 if (!reg)
167 * the Big Lguest Lock is overkill. But this is setup, not a fast path. 55 return -ENOENT;
168 */
169 mutex_lock(&lguest_lock);
170 err = add_eventfd(lg, addr, fd);
171 mutex_unlock(&lguest_lock);
172 56
173 return err; 57 *reg = value;
58
59 /* And because this is a write() call, we return the length used. */
60 return sizeof(unsigned long) * 3;
174} 61}
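From the Launcher's side these become plain write()s on the /dev/lguest fd: a two-word GETREG write arms the value and the following read() returns it instead of running the Guest, while SETREG is a three-word write. A hypothetical userspace sketch, assuming the LHREQ_* constants from linux/lguest_launcher.h and a register offset matching the kernel's struct pt_regs layout:

#include <unistd.h>
#include <linux/lguest_launcher.h>

/* which = offsetof(struct pt_regs, <reg>) in the kernel's layout. */
static int launcher_getreg(int lguest_fd, unsigned long which,
			   unsigned long *val)
{
	unsigned long args[2] = { LHREQ_GETREG, which };

	if (write(lguest_fd, args, sizeof(args)) != sizeof(args))
		return -1;
	/* The next read() yields the register, not a Guest run. */
	if (read(lguest_fd, val, sizeof(*val)) != sizeof(*val))
		return -1;
	return 0;
}

static int launcher_setreg(int lguest_fd, unsigned long which,
			   unsigned long val)
{
	unsigned long args[3] = { LHREQ_SETREG, which, val };

	return write(lguest_fd, args, sizeof(args)) == sizeof(args) ? 0 : -1;
}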
175 62
176/*L:050 63/*L:050
@@ -194,6 +81,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
194 return 0; 81 return 0;
195} 82}
196 83
84/*L:053
85 * Deliver a trap: this is used by the Launcher if it can't emulate
86 * an instruction.
87 */
88static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
89{
90 unsigned long trapnum;
91
92 if (get_user(trapnum, input) != 0)
93 return -EFAULT;
94
95 if (!deliver_trap(cpu, trapnum))
96 return -EINVAL;
97
98 return 0;
99}
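The matching Launcher-side call is another two-word write; e.g. after failing to emulate an instruction it can hand the Guest a fault. A sketch under the same assumptions as the register helpers above:

#include <unistd.h>
#include <linux/lguest_launcher.h>

static int launcher_reflect_trap(int lguest_fd, unsigned long trapnum)
{
	unsigned long args[2] = { LHREQ_TRAP, trapnum };

	return write(lguest_fd, args, sizeof(args)) == sizeof(args) ? 0 : -1;
}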
100
197/*L:040 101/*L:040
198 * Once our Guest is initialized, the Launcher makes it run by reading 102 * Once our Guest is initialized, the Launcher makes it run by reading
199 * from /dev/lguest. 103 * from /dev/lguest.
@@ -237,8 +141,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
237 * If we returned from read() last time because the Guest sent I/O, 141 * If we returned from read() last time because the Guest sent I/O,
238 * clear the flag. 142 * clear the flag.
239 */ 143 */
240 if (cpu->pending_notify) 144 if (cpu->pending.trap)
241 cpu->pending_notify = 0; 145 cpu->pending.trap = 0;
242 146
243 /* Run the Guest until something interesting happens. */ 147 /* Run the Guest until something interesting happens. */
244 return run_guest(cpu, (unsigned long __user *)user); 148 return run_guest(cpu, (unsigned long __user *)user);
@@ -319,7 +223,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
319 /* "struct lguest" contains all we (the Host) know about a Guest. */ 223 /* "struct lguest" contains all we (the Host) know about a Guest. */
320 struct lguest *lg; 224 struct lguest *lg;
321 int err; 225 int err;
322 unsigned long args[3]; 226 unsigned long args[4];
323 227
324 /* 228 /*
325 * We grab the Big Lguest lock, which protects against multiple 229 * We grab the Big Lguest lock, which protects against multiple
@@ -343,21 +247,15 @@ static int initialize(struct file *file, const unsigned long __user *input)
343 goto unlock; 247 goto unlock;
344 } 248 }
345 249
346 lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
347 if (!lg->eventfds) {
348 err = -ENOMEM;
349 goto free_lg;
350 }
351 lg->eventfds->num = 0;
352
353 /* Populate the easy fields of our "struct lguest" */ 250 /* Populate the easy fields of our "struct lguest" */
354 lg->mem_base = (void __user *)args[0]; 251 lg->mem_base = (void __user *)args[0];
355 lg->pfn_limit = args[1]; 252 lg->pfn_limit = args[1];
253 lg->device_limit = args[3];
356 254
357 /* This is the first cpu (cpu 0) and it will start booting at args[2] */ 255 /* This is the first cpu (cpu 0) and it will start booting at args[2] */
358 err = lg_cpu_start(&lg->cpus[0], 0, args[2]); 256 err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
359 if (err) 257 if (err)
360 goto free_eventfds; 258 goto free_lg;
361 259
362 /* 260 /*
363 * Initialize the Guest's shadow page tables. This allocates 261 * Initialize the Guest's shadow page tables. This allocates
@@ -378,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
378free_regs: 276free_regs:
379 /* FIXME: This should be in free_vcpu */ 277 /* FIXME: This should be in free_vcpu */
380 free_page(lg->cpus[0].regs_page); 278 free_page(lg->cpus[0].regs_page);
381free_eventfds:
382 kfree(lg->eventfds);
383free_lg: 279free_lg:
384 kfree(lg); 280 kfree(lg);
385unlock: 281unlock:
@@ -432,8 +328,12 @@ static ssize_t write(struct file *file, const char __user *in,
432 return initialize(file, input); 328 return initialize(file, input);
433 case LHREQ_IRQ: 329 case LHREQ_IRQ:
434 return user_send_irq(cpu, input); 330 return user_send_irq(cpu, input);
435 case LHREQ_EVENTFD: 331 case LHREQ_GETREG:
436 return attach_eventfd(lg, input); 332 return getreg_setup(cpu, input);
333 case LHREQ_SETREG:
334 return setreg(cpu, input);
335 case LHREQ_TRAP:
336 return trap(cpu, input);
437 default: 337 default:
438 return -EINVAL; 338 return -EINVAL;
439 } 339 }
@@ -478,11 +378,6 @@ static int close(struct inode *inode, struct file *file)
478 mmput(lg->cpus[i].mm); 378 mmput(lg->cpus[i].mm);
479 } 379 }
480 380
481 /* Release any eventfds they registered. */
482 for (i = 0; i < lg->eventfds->num; i++)
483 eventfd_ctx_put(lg->eventfds->map[i].event);
484 kfree(lg->eventfds);
485
486 /* 381 /*
487 * If lg->dead doesn't contain an error code it will be NULL or a 382 * If lg->dead doesn't contain an error code it will be NULL or a
488 * kmalloc()ed string, either of which is ok to hand to kfree(). 383 * kmalloc()ed string, either of which is ok to hand to kfree().
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index e8b55c3a6170..e3abebc912c0 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -250,6 +250,16 @@ static void release_pte(pte_t pte)
250} 250}
251/*:*/ 251/*:*/
252 252
253static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
254{
255 /* We don't handle large pages. */
256 if (pte_flags(gpte) & _PAGE_PSE)
257 return false;
258
259 return (pte_pfn(gpte) >= cpu->lg->pfn_limit
260 && pte_pfn(gpte) < cpu->lg->device_limit);
261}
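To make the window concrete: everything below pfn_limit is ordinary guest RAM, and [pfn_limit, device_limit) is the device region whose accesses get bounced to the Launcher. A toy userspace model of the same check, with illustrative numbers that do not come from the patch:

#include <stdbool.h>
#include <stdio.h>

static bool pfn_in_iomem(unsigned long pfn,
			 unsigned long pfn_limit, unsigned long device_limit)
{
	return pfn >= pfn_limit && pfn < device_limit;
}

int main(void)
{
	/* Say 256 MiB of RAM then a 1 MiB device window (4 KiB pages). */
	unsigned long pfn_limit = 0x10000, device_limit = 0x10100;

	printf("%d\n", pfn_in_iomem(0x0fff0, pfn_limit, device_limit)); /* 0: RAM */
	printf("%d\n", pfn_in_iomem(0x10004, pfn_limit, device_limit)); /* 1: iomem */
	return 0;
}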
262
253static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) 263static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
254{ 264{
255 if ((pte_flags(gpte) & _PAGE_PSE) || 265 if ((pte_flags(gpte) & _PAGE_PSE) ||
@@ -374,8 +384,14 @@ static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
374 * 384 *
375 * If we fixed up the fault (ie. we mapped the address), this routine returns 385 * If we fixed up the fault (ie. we mapped the address), this routine returns
376 * true. Otherwise, it was a real fault and we need to tell the Guest. 386 * true. Otherwise, it was a real fault and we need to tell the Guest.
387 *
388 * There's a corner case: they're trying to access memory between
389 * pfn_limit and device_limit, which is I/O memory. In this case, we
 390 * return false and set @iomem to the physical address, so the
391 * Launcher can handle the instruction manually.
377 */ 392 */
378bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 393bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
394 unsigned long *iomem)
379{ 395{
380 unsigned long gpte_ptr; 396 unsigned long gpte_ptr;
381 pte_t gpte; 397 pte_t gpte;
@@ -383,6 +399,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
383 pmd_t gpmd; 399 pmd_t gpmd;
384 pgd_t gpgd; 400 pgd_t gpgd;
385 401
402 *iomem = 0;
403
386 /* We never demand page the Switcher, so trying is a mistake. */ 404 /* We never demand page the Switcher, so trying is a mistake. */
387 if (vaddr >= switcher_addr) 405 if (vaddr >= switcher_addr)
388 return false; 406 return false;
@@ -459,6 +477,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
459 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 477 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
460 return false; 478 return false;
461 479
 480 /* If they're accessing I/O memory, we expect a fault. */
481 if (gpte_in_iomem(cpu, gpte)) {
482 *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
483 return false;
484 }
485
462 /* 486 /*
463 * Check that the Guest PTE flags are OK, and the page number is below 487 * Check that the Guest PTE flags are OK, and the page number is below
464 * the pfn_limit (ie. not mapping the Launcher binary). 488 * the pfn_limit (ie. not mapping the Launcher binary).
@@ -553,7 +577,9 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
553 */ 577 */
554void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 578void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
555{ 579{
556 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 580 unsigned long iomem;
581
582 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
557 kill_guest(cpu, "bad stack page %#lx", vaddr); 583 kill_guest(cpu, "bad stack page %#lx", vaddr);
558} 584}
559/*:*/ 585/*:*/
@@ -647,7 +673,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu)
647/*:*/ 673/*:*/
648 674
649/* We walk down the guest page tables to get a guest-physical address */ 675/* We walk down the guest page tables to get a guest-physical address */
650unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) 676bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
651{ 677{
652 pgd_t gpgd; 678 pgd_t gpgd;
653 pte_t gpte; 679 pte_t gpte;
@@ -656,31 +682,47 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
656#endif 682#endif
657 683
658 /* Still not set up? Just map 1:1. */ 684 /* Still not set up? Just map 1:1. */
659 if (unlikely(cpu->linear_pages)) 685 if (unlikely(cpu->linear_pages)) {
660 return vaddr; 686 *paddr = vaddr;
687 return true;
688 }
661 689
662 /* First step: get the top-level Guest page table entry. */ 690 /* First step: get the top-level Guest page table entry. */
663 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 691 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
664 /* Toplevel not present? We can't map it in. */ 692 /* Toplevel not present? We can't map it in. */
665 if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) { 693 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
666 kill_guest(cpu, "Bad address %#lx", vaddr); 694 goto fail;
667 return -1UL;
668 }
669 695
670#ifdef CONFIG_X86_PAE 696#ifdef CONFIG_X86_PAE
671 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 697 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
672 if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) { 698 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
673 kill_guest(cpu, "Bad address %#lx", vaddr); 699 goto fail;
674 return -1UL;
675 }
676 gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); 700 gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
677#else 701#else
678 gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); 702 gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
679#endif 703#endif
680 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 704 if (!(pte_flags(gpte) & _PAGE_PRESENT))
681 kill_guest(cpu, "Bad address %#lx", vaddr); 705 goto fail;
706
707 *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
708 return true;
709
710fail:
711 *paddr = -1UL;
712 return false;
713}
682 714
683 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 715/*
716 * This is the version we normally use: kills the Guest if it uses a
717 * bad address
718 */
719unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
720{
721 unsigned long paddr;
722
723 if (!__guest_pa(cpu, vaddr, &paddr))
724 kill_guest(cpu, "Bad address %#lx", vaddr);
725 return paddr;
684} 726}
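The successful path composes the physical address from the PTE's frame number plus the offset within the page; a quick userspace rerun of that last composition line, with invented values:

#include <stdio.h>
#define PAGE_SIZE 0x1000UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	/* Illustrative: PTE frame 0x1234, looked-up vaddr 0xc0105abc. */
	unsigned long pfn = 0x1234, vaddr = 0xc0105abcUL;
	unsigned long paddr = pfn * PAGE_SIZE | (vaddr & ~PAGE_MASK);

	printf("paddr = %#lx\n", paddr);	/* 0x1234abc */
	return 0;
}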
685 727
686/* 728/*
@@ -912,7 +954,8 @@ static void __guest_set_pte(struct lg_cpu *cpu, int idx,
912 * now. This shaves 10% off a copy-on-write 954 * now. This shaves 10% off a copy-on-write
913 * micro-benchmark. 955 * micro-benchmark.
914 */ 956 */
915 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 957 if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
958 && !gpte_in_iomem(cpu, gpte)) {
916 if (!check_gpte(cpu, gpte)) 959 if (!check_gpte(cpu, gpte))
917 return; 960 return;
918 set_pte(spte, 961 set_pte(spte,
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 6adfd7ba4c97..30f2aef69d78 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -182,6 +182,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
182} 182}
183/*:*/ 183/*:*/
184 184
185unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
186{
187 switch (reg_off) {
188 case offsetof(struct pt_regs, bx):
189 return &cpu->regs->ebx;
190 case offsetof(struct pt_regs, cx):
191 return &cpu->regs->ecx;
192 case offsetof(struct pt_regs, dx):
193 return &cpu->regs->edx;
194 case offsetof(struct pt_regs, si):
195 return &cpu->regs->esi;
196 case offsetof(struct pt_regs, di):
197 return &cpu->regs->edi;
198 case offsetof(struct pt_regs, bp):
199 return &cpu->regs->ebp;
200 case offsetof(struct pt_regs, ax):
201 return &cpu->regs->eax;
202 case offsetof(struct pt_regs, ip):
203 return &cpu->regs->eip;
204 case offsetof(struct pt_regs, sp):
205 return &cpu->regs->esp;
206 }
207
208 /* Launcher can read these, but we don't allow any setting. */
209 if (any) {
210 switch (reg_off) {
211 case offsetof(struct pt_regs, ds):
212 return &cpu->regs->ds;
213 case offsetof(struct pt_regs, es):
214 return &cpu->regs->es;
215 case offsetof(struct pt_regs, fs):
216 return &cpu->regs->fs;
217 case offsetof(struct pt_regs, gs):
218 return &cpu->regs->gs;
219 case offsetof(struct pt_regs, cs):
220 return &cpu->regs->cs;
221 case offsetof(struct pt_regs, flags):
222 return &cpu->regs->eflags;
223 case offsetof(struct pt_regs, ss):
224 return &cpu->regs->ss;
225 }
226 }
227
228 return NULL;
229}
230
185/*M:002 231/*M:002
186 * There are hooks in the scheduler which we can register to tell when we 232 * There are hooks in the scheduler which we can register to tell when we
187 * get kicked off the CPU (preempt_notifier_register()). This would allow us 233 * get kicked off the CPU (preempt_notifier_register()). This would allow us
@@ -269,110 +315,73 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
269 * usually attached to a PC. 315 * usually attached to a PC.
270 * 316 *
271 * When the Guest uses one of these instructions, we get a trap (General 317 * When the Guest uses one of these instructions, we get a trap (General
272 * Protection Fault) and come here. We see if it's one of those troublesome 318 * Protection Fault) and come here. We queue this to be sent out to the
273 * instructions and skip over it. We return true if we did. 319 * Launcher to handle.
274 */ 320 */
275static int emulate_insn(struct lg_cpu *cpu)
276{
277 u8 insn;
278 unsigned int insnlen = 0, in = 0, small_operand = 0;
279 /*
280 * The eip contains the *virtual* address of the Guest's instruction:
281 * walk the Guest's page tables to find the "physical" address.
282 */
283 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
284
285 /*
286 * This must be the Guest kernel trying to do something, not userspace!
287 * The bottom two bits of the CS segment register are the privilege
288 * level.
289 */
290 if ((cpu->regs->cs & 3) != GUEST_PL)
291 return 0;
292
293 /* Decoding x86 instructions is icky. */
294 insn = lgread(cpu, physaddr, u8);
295 321
296 /* 322/*
297 * Around 2.6.33, the kernel started using an emulation for the 323 * The eip contains the *virtual* address of the Guest's instruction:
298 * cmpxchg8b instruction in early boot on many configurations. This 324 * we copy the instruction here so the Launcher doesn't have to walk
299 * code isn't paravirtualized, and it tries to disable interrupts. 325 * the page tables to decode it. We handle the case (eg. in a kernel
300 * Ignore it, which will Mostly Work. 326 * module) where the instruction is over two pages, and the pages are
301 */ 327 * virtually but not physically contiguous.
302 if (insn == 0xfa) { 328 *
303 /* "cli", or Clear Interrupt Enable instruction. Skip it. */ 329 * The longest possible x86 instruction is 15 bytes, but we don't handle
304 cpu->regs->eip++; 330 * anything that strange.
305 return 1; 331 */
332static void copy_from_guest(struct lg_cpu *cpu,
333 void *dst, unsigned long vaddr, size_t len)
334{
335 size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
336 unsigned long paddr;
337
338 BUG_ON(len > PAGE_SIZE);
339
340 /* If it goes over a page, copy in two parts. */
341 if (len > to_page_end) {
342 /* But make sure the next page is mapped! */
343 if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
344 copy_from_guest(cpu, dst + to_page_end,
345 vaddr + to_page_end,
346 len - to_page_end);
347 else
348 /* Otherwise fill with zeroes. */
349 memset(dst + to_page_end, 0, len - to_page_end);
350 len = to_page_end;
306 } 351 }
307 352
308 /* 353 /* This will kill the guest if it isn't mapped, but that
309 * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. 354 * shouldn't happen. */
310 */ 355 __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
311 if (insn == 0x66) { 356}
312 small_operand = 1;
313 /* The instruction is 1 byte so far, read the next byte. */
314 insnlen = 1;
315 insn = lgread(cpu, physaddr + insnlen, u8);
316 }
317 357
318 /*
319 * We can ignore the lower bit for the moment and decode the 4 opcodes
320 * we need to emulate.
321 */
322 switch (insn & 0xFE) {
323 case 0xE4: /* in <next byte>,%al */
324 insnlen += 2;
325 in = 1;
326 break;
327 case 0xEC: /* in (%dx),%al */
328 insnlen += 1;
329 in = 1;
330 break;
331 case 0xE6: /* out %al,<next byte> */
332 insnlen += 2;
333 break;
334 case 0xEE: /* out %al,(%dx) */
335 insnlen += 1;
336 break;
337 default:
338 /* OK, we don't know what this is, can't emulate. */
339 return 0;
340 }
341 358
342 /* 359static void setup_emulate_insn(struct lg_cpu *cpu)
343 * If it was an "IN" instruction, they expect the result to be read 360{
344 * into %eax, so we change %eax. We always return all-ones, which 361 cpu->pending.trap = 13;
345 * traditionally means "there's nothing there". 362 copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
346 */ 363 sizeof(cpu->pending.insn));
347 if (in) { 364}
 348 /* Lower bit means it's a 32/16 bit access */
349 if (insn & 0x1) { 366static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
350 if (small_operand) 367{
351 cpu->regs->eax |= 0xFFFF; 368 cpu->pending.trap = 14;
352 else 369 cpu->pending.addr = iomem_addr;
353 cpu->regs->eax = 0xFFFFFFFF; 370 copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
354 } else 371 sizeof(cpu->pending.insn));
355 cpu->regs->eax |= 0xFF;
356 }
357 /* Finally, we've "done" the instruction, so move past it. */
358 cpu->regs->eip += insnlen;
359 /* Success! */
360 return 1;
361} 372}
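The page-straddle split in copy_from_guest() above is easiest to see with numbers; a tiny userspace rerun of the same arithmetic, values invented for illustration:

#include <stddef.h>
#include <stdio.h>
#define PAGE_SIZE 0x1000UL

int main(void)
{
	unsigned long vaddr = 0x3ffa;	/* 6 bytes short of a page boundary */
	size_t len = 15;		/* longest x86 instruction */
	size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);

	if (len > to_page_end)	/* 15 > 6: two-part copy */
		printf("copy %zu bytes, then %zu from the next page\n",
		       to_page_end, len - to_page_end);	/* 6, then 9 */
	return 0;
}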
362 373
363/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ 374/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
364void lguest_arch_handle_trap(struct lg_cpu *cpu) 375void lguest_arch_handle_trap(struct lg_cpu *cpu)
365{ 376{
377 unsigned long iomem_addr;
378
366 switch (cpu->regs->trapnum) { 379 switch (cpu->regs->trapnum) {
367 case 13: /* We've intercepted a General Protection Fault. */ 380 case 13: /* We've intercepted a General Protection Fault. */
368 /* 381 /* Hand to Launcher to emulate those pesky IN and OUT insns */
369 * Check if this was one of those annoying IN or OUT
370 * instructions which we need to emulate. If so, we just go
371 * back into the Guest after we've done it.
372 */
373 if (cpu->regs->errcode == 0) { 382 if (cpu->regs->errcode == 0) {
374 if (emulate_insn(cpu)) 383 setup_emulate_insn(cpu);
375 return; 384 return;
376 } 385 }
377 break; 386 break;
378 case 14: /* We've intercepted a Page Fault. */ 387 case 14: /* We've intercepted a Page Fault. */
@@ -387,9 +396,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
387 * whether kernel or userspace code. 396 * whether kernel or userspace code.
388 */ 397 */
389 if (demand_page(cpu, cpu->arch.last_pagefault, 398 if (demand_page(cpu, cpu->arch.last_pagefault,
390 cpu->regs->errcode)) 399 cpu->regs->errcode, &iomem_addr))
391 return; 400 return;
392 401
402 /* Was this an access to memory mapped IO? */
403 if (iomem_addr) {
404 /* Tell Launcher, let it handle it. */
405 setup_iomem_insn(cpu, iomem_addr);
406 return;
407 }
408
393 /* 409 /*
394 * OK, it's really not there (or not OK): the Guest needs to 410 * OK, it's really not there (or not OK): the Guest needs to
395 * know. We write out the cr2 value so it knows where the 411 * know. We write out the cr2 value so it knows where the
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 110a2cf67244..f1ff3666f090 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1710,6 +1710,12 @@ static int virtnet_probe(struct virtio_device *vdev)
1710 struct virtnet_info *vi; 1710 struct virtnet_info *vi;
1711 u16 max_queue_pairs; 1711 u16 max_queue_pairs;
1712 1712
1713 if (!vdev->config->get) {
1714 dev_err(&vdev->dev, "%s failure: config access disabled\n",
1715 __func__);
1716 return -EINVAL;
1717 }
1718
1713 if (!virtnet_validate_features(vdev)) 1719 if (!virtnet_validate_features(vdev))
1714 return -EINVAL; 1720 return -EINVAL;
1715 1721
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index c52bb5dfaedb..f164f24a4a55 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -950,6 +950,12 @@ static int virtscsi_probe(struct virtio_device *vdev)
950 u32 num_queues; 950 u32 num_queues;
951 struct scsi_host_template *hostt; 951 struct scsi_host_template *hostt;
952 952
953 if (!vdev->config->get) {
954 dev_err(&vdev->dev, "%s failure: config access disabled\n",
955 __func__);
956 return -EINVAL;
957 }
958
953 /* We need to know how many queues before we allocate. */ 959 /* We need to know how many queues before we allocate. */
954 num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; 960 num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
955 961
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 00b228638274..b546da5d8ea3 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -12,16 +12,32 @@ config VIRTIO_PCI
12 depends on PCI 12 depends on PCI
13 select VIRTIO 13 select VIRTIO
14 ---help--- 14 ---help---
15 This drivers provides support for virtio based paravirtual device 15 This driver provides support for virtio based paravirtual device
16 drivers over PCI. This requires that your VMM has appropriate PCI 16 drivers over PCI. This requires that your VMM has appropriate PCI
17 virtio backends. Most QEMU based VMMs should support these devices 17 virtio backends. Most QEMU based VMMs should support these devices
18 (like KVM or Xen). 18 (like KVM or Xen).
19 19
20 Currently, the ABI is not considered stable so there is no guarantee
21 that this version of the driver will work with your VMM.
22
23 If unsure, say M. 20 If unsure, say M.
24 21
22config VIRTIO_PCI_LEGACY
23 bool "Support for legacy virtio draft 0.9.X and older devices"
24 default y
25 depends on VIRTIO_PCI
26 ---help---
27 Virtio PCI Card 0.9.X Draft (circa 2014) and older device support.
28
29 This option enables building a transitional driver, supporting
30 both devices conforming to Virtio 1 specification, and legacy devices.
31 If disabled, you get a slightly smaller, non-transitional driver,
32 with no legacy compatibility.
33
34 So look out into your driveway. Do you have a flying car? If
35 so, you can happily disable this option and virtio will not
36 break. Otherwise, leave it set. Unless you're testing what
37 life will be like in The Future.
38
39 If unsure, say Y.
40
25config VIRTIO_BALLOON 41config VIRTIO_BALLOON
26 tristate "Virtio balloon driver" 42 tristate "Virtio balloon driver"
27 depends on VIRTIO 43 depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index bf5104b56894..d85565b8ea46 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,6 @@
1obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o 1obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
2obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o 2obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
3obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o 3obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
4virtio_pci-y := virtio_pci_legacy.o virtio_pci_common.o 4virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
5virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
5obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o 6obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index b9f70dfc4751..5ce2aa48fc6e 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -236,7 +236,10 @@ static int virtio_dev_probe(struct device *_d)
236 if (err) 236 if (err)
237 goto err; 237 goto err;
238 238
239 add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); 239 /* If probe didn't do it, mark device DRIVER_OK ourselves. */
240 if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK))
241 virtio_device_ready(dev);
242
240 if (drv->scan) 243 if (drv->scan)
241 drv->scan(dev); 244 drv->scan(dev);
242 245
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 50c5f42d7a9f..0413157f3b49 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -44,8 +44,7 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
44module_param(oom_pages, int, S_IRUSR | S_IWUSR); 44module_param(oom_pages, int, S_IRUSR | S_IWUSR);
45MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); 45MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
46 46
47struct virtio_balloon 47struct virtio_balloon {
48{
49 struct virtio_device *vdev; 48 struct virtio_device *vdev;
50 struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; 49 struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
51 50
@@ -466,6 +465,12 @@ static int virtballoon_probe(struct virtio_device *vdev)
466 struct virtio_balloon *vb; 465 struct virtio_balloon *vb;
467 int err; 466 int err;
468 467
468 if (!vdev->config->get) {
469 dev_err(&vdev->dev, "%s failure: config access disabled\n",
470 __func__);
471 return -EINVAL;
472 }
473
469 vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); 474 vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
470 if (!vb) { 475 if (!vb) {
471 err = -ENOMEM; 476 err = -ENOMEM;
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 00d115b22bd8..cad569890908 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Virtio memory mapped device driver 2 * Virtio memory mapped device driver
3 * 3 *
4 * Copyright 2011, ARM Ltd. 4 * Copyright 2011-2014, ARM Ltd.
5 * 5 *
6 * This module allows virtio devices to be used over a virtual, memory mapped 6 * This module allows virtio devices to be used over a virtual, memory mapped
7 * platform device. 7 * platform device.
@@ -50,36 +50,6 @@
50 * 50 *
51 * 51 *
52 * 52 *
53 * Registers layout (all 32-bit wide):
54 *
55 * offset d. name description
56 * ------ -- ---------------- -----------------
57 *
58 * 0x000 R MagicValue Magic value "virt"
59 * 0x004 R Version Device version (current max. 1)
60 * 0x008 R DeviceID Virtio device ID
61 * 0x00c R VendorID Virtio vendor ID
62 *
63 * 0x010 R HostFeatures Features supported by the host
64 * 0x014 W HostFeaturesSel Set of host features to access via HostFeatures
65 *
66 * 0x020 W GuestFeatures Features activated by the guest
67 * 0x024 W GuestFeaturesSel Set of activated features to set via GuestFeatures
68 * 0x028 W GuestPageSize Size of guest's memory page in bytes
69 *
70 * 0x030 W QueueSel Queue selector
71 * 0x034 R QueueNumMax Maximum size of the currently selected queue
72 * 0x038 W QueueNum Queue size for the currently selected queue
73 * 0x03c W QueueAlign Used Ring alignment for the current queue
74 * 0x040 RW QueuePFN PFN for the currently selected queue
75 *
76 * 0x050 W QueueNotify Queue notifier
77 * 0x060 R InterruptStatus Interrupt status register
78 * 0x064 W InterruptACK Interrupt acknowledge register
79 * 0x070 RW Status Device status register
80 *
81 * 0x100+ RW Device-specific configuration space
82 *
83 * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 53 * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007
84 * 54 *
85 * This work is licensed under the terms of the GNU GPL, version 2 or later. 55 * This work is licensed under the terms of the GNU GPL, version 2 or later.
@@ -145,11 +115,16 @@ struct virtio_mmio_vq_info {
145static u64 vm_get_features(struct virtio_device *vdev) 115static u64 vm_get_features(struct virtio_device *vdev)
146{ 116{
147 struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); 117 struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
118 u64 features;
119
120 writel(1, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL);
121 features = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES);
122 features <<= 32;
148 123
149 /* TODO: Features > 32 bits */ 124 writel(0, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL);
150 writel(0, vm_dev->base + VIRTIO_MMIO_HOST_FEATURES_SEL); 125 features |= readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES);
151 126
152 return readl(vm_dev->base + VIRTIO_MMIO_HOST_FEATURES); 127 return features;
153} 128}
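Both this and the PCI-modern transport later in the patch use the same banked pattern for 64-bit feature words: write a 32-bit bank select, then read or write half the value. Distilled into sketch helpers (read_features64/write_features64 are made-up names; writel/readl are the real accessors):

#include <linux/io.h>
#include <linux/types.h>

static u64 read_features64(void __iomem *sel, void __iomem *data)
{
	u64 f;

	writel(1, sel);			/* bank 1 = high 32 bits */
	f = (u64)readl(data) << 32;
	writel(0, sel);			/* bank 0 = low 32 bits */
	f |= readl(data);
	return f;
}

static void write_features64(u64 f, void __iomem *sel, void __iomem *data)
{
	writel(1, sel);
	writel((u32)(f >> 32), data);
	writel(0, sel);
	writel((u32)f, data);
}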
154 129
155static int vm_finalize_features(struct virtio_device *vdev) 130static int vm_finalize_features(struct virtio_device *vdev)
@@ -159,11 +134,20 @@ static int vm_finalize_features(struct virtio_device *vdev)
159 /* Give virtio_ring a chance to accept features. */ 134 /* Give virtio_ring a chance to accept features. */
160 vring_transport_features(vdev); 135 vring_transport_features(vdev);
161 136
 162 /* Make sure we don't have any features > 32 bits! */ 137 /* Make sure there are no mixed devices */
163 BUG_ON((u32)vdev->features != vdev->features); 138 if (vm_dev->version == 2 &&
139 !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) {
140 dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n");
141 return -EINVAL;
142 }
143
144 writel(1, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL);
145 writel((u32)(vdev->features >> 32),
146 vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES);
164 147
165 writel(0, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES_SEL); 148 writel(0, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL);
166 writel(vdev->features, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES); 149 writel((u32)vdev->features,
150 vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES);
167 151
168 return 0; 152 return 0;
169} 153}
@@ -275,7 +259,12 @@ static void vm_del_vq(struct virtqueue *vq)
275 259
276 /* Select and deactivate the queue */ 260 /* Select and deactivate the queue */
277 writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); 261 writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
278 writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); 262 if (vm_dev->version == 1) {
263 writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
264 } else {
265 writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
266 WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
267 }
279 268
280 size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); 269 size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN));
281 free_pages_exact(info->queue, size); 270 free_pages_exact(info->queue, size);
@@ -312,7 +301,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
312 writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); 301 writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
313 302
314 /* Queue shouldn't already be set up. */ 303 /* Queue shouldn't already be set up. */
315 if (readl(vm_dev->base + VIRTIO_MMIO_QUEUE_PFN)) { 304 if (readl(vm_dev->base + (vm_dev->version == 1 ?
305 VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) {
316 err = -ENOENT; 306 err = -ENOENT;
317 goto error_available; 307 goto error_available;
318 } 308 }
@@ -356,13 +346,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
356 info->num /= 2; 346 info->num /= 2;
357 } 347 }
358 348
359 /* Activate the queue */
360 writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
361 writel(VIRTIO_MMIO_VRING_ALIGN,
362 vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN);
363 writel(virt_to_phys(info->queue) >> PAGE_SHIFT,
364 vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
365
366 /* Create the vring */ 349 /* Create the vring */
367 vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, 350 vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
368 true, info->queue, vm_notify, callback, name); 351 true, info->queue, vm_notify, callback, name);
@@ -371,6 +354,33 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
371 goto error_new_virtqueue; 354 goto error_new_virtqueue;
372 } 355 }
373 356
357 /* Activate the queue */
358 writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
359 if (vm_dev->version == 1) {
360 writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN);
361 writel(virt_to_phys(info->queue) >> PAGE_SHIFT,
362 vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
363 } else {
364 u64 addr;
365
366 addr = virt_to_phys(info->queue);
367 writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW);
368 writel((u32)(addr >> 32),
369 vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH);
370
371 addr = virt_to_phys(virtqueue_get_avail(vq));
372 writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW);
373 writel((u32)(addr >> 32),
374 vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH);
375
376 addr = virt_to_phys(virtqueue_get_used(vq));
377 writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW);
378 writel((u32)(addr >> 32),
379 vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH);
380
381 writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
382 }
383
374 vq->priv = info; 384 vq->priv = info;
375 info->vq = vq; 385 info->vq = vq;
376 386
@@ -381,7 +391,12 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
381 return vq; 391 return vq;
382 392
383error_new_virtqueue: 393error_new_virtqueue:
384 writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); 394 if (vm_dev->version == 1) {
395 writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
396 } else {
397 writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
398 WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
399 }
385 free_pages_exact(info->queue, size); 400 free_pages_exact(info->queue, size);
386error_alloc_pages: 401error_alloc_pages:
387 kfree(info); 402 kfree(info);
@@ -476,16 +491,32 @@ static int virtio_mmio_probe(struct platform_device *pdev)
476 491
477 /* Check device version */ 492 /* Check device version */
478 vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION); 493 vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION);
479 if (vm_dev->version != 1) { 494 if (vm_dev->version < 1 || vm_dev->version > 2) {
480 dev_err(&pdev->dev, "Version %ld not supported!\n", 495 dev_err(&pdev->dev, "Version %ld not supported!\n",
481 vm_dev->version); 496 vm_dev->version);
482 return -ENXIO; 497 return -ENXIO;
483 } 498 }
484 499
485 vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); 500 vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID);
501 if (vm_dev->vdev.id.device == 0) {
502 /*
503 * virtio-mmio device with an ID 0 is a (dummy) placeholder
504 * with no function. End probing now with no error reported.
505 */
506 return -ENODEV;
507 }
486 vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); 508 vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID);
487 509
488 writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); 510 /* Reject legacy-only IDs for version 2 devices */
511 if (vm_dev->version == 2 &&
512 virtio_device_is_legacy_only(vm_dev->vdev.id)) {
513 dev_err(&pdev->dev, "Version 2 not supported for devices %u!\n",
514 vm_dev->vdev.id.device);
515 return -ENODEV;
516 }
517
518 if (vm_dev->version == 1)
519 writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE);
489 520
490 platform_set_drvdata(pdev, vm_dev); 521 platform_set_drvdata(pdev, vm_dev);
491 522
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 9756f21b809e..e894eb278d83 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -19,6 +19,14 @@
19 19
20#include "virtio_pci_common.h" 20#include "virtio_pci_common.h"
21 21
22static bool force_legacy = false;
23
24#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY)
25module_param(force_legacy, bool, 0444);
26MODULE_PARM_DESC(force_legacy,
27 "Force legacy mode for transitional virtio 1 devices");
28#endif
29
22/* wait for pending irq handlers */ 30/* wait for pending irq handlers */
23void vp_synchronize_vectors(struct virtio_device *vdev) 31void vp_synchronize_vectors(struct virtio_device *vdev)
24{ 32{
@@ -464,15 +472,97 @@ static const struct pci_device_id virtio_pci_id_table[] = {
464 472
465MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); 473MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);
466 474
475static void virtio_pci_release_dev(struct device *_d)
476{
477 struct virtio_device *vdev = dev_to_virtio(_d);
478 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
479
480 /* As struct device is a kobject, it's not safe to
481 * free the memory (including the reference counter itself)
 482 * until its release callback. */
483 kfree(vp_dev);
484}
485
467static int virtio_pci_probe(struct pci_dev *pci_dev, 486static int virtio_pci_probe(struct pci_dev *pci_dev,
468 const struct pci_device_id *id) 487 const struct pci_device_id *id)
469{ 488{
470 return virtio_pci_legacy_probe(pci_dev, id); 489 struct virtio_pci_device *vp_dev;
490 int rc;
491
492 /* allocate our structure and fill it out */
493 vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
494 if (!vp_dev)
495 return -ENOMEM;
496
497 pci_set_drvdata(pci_dev, vp_dev);
498 vp_dev->vdev.dev.parent = &pci_dev->dev;
499 vp_dev->vdev.dev.release = virtio_pci_release_dev;
500 vp_dev->pci_dev = pci_dev;
501 INIT_LIST_HEAD(&vp_dev->virtqueues);
502 spin_lock_init(&vp_dev->lock);
503
504 /* Disable MSI/MSIX to bring device to a known good state. */
505 pci_msi_off(pci_dev);
506
507 /* enable the device */
508 rc = pci_enable_device(pci_dev);
509 if (rc)
510 goto err_enable_device;
511
512 rc = pci_request_regions(pci_dev, "virtio-pci");
513 if (rc)
514 goto err_request_regions;
515
516 if (force_legacy) {
517 rc = virtio_pci_legacy_probe(vp_dev);
518 /* Also try modern mode if we can't map BAR0 (no IO space). */
519 if (rc == -ENODEV || rc == -ENOMEM)
520 rc = virtio_pci_modern_probe(vp_dev);
521 if (rc)
522 goto err_probe;
523 } else {
524 rc = virtio_pci_modern_probe(vp_dev);
525 if (rc == -ENODEV)
526 rc = virtio_pci_legacy_probe(vp_dev);
527 if (rc)
528 goto err_probe;
529 }
530
531 pci_set_master(pci_dev);
532
533 rc = register_virtio_device(&vp_dev->vdev);
534 if (rc)
535 goto err_register;
536
537 return 0;
538
539err_register:
540 if (vp_dev->ioaddr)
541 virtio_pci_legacy_remove(vp_dev);
542 else
543 virtio_pci_modern_remove(vp_dev);
544err_probe:
545 pci_release_regions(pci_dev);
546err_request_regions:
547 pci_disable_device(pci_dev);
548err_enable_device:
549 kfree(vp_dev);
550 return rc;
471} 551}
472 552
473static void virtio_pci_remove(struct pci_dev *pci_dev) 553static void virtio_pci_remove(struct pci_dev *pci_dev)
474{ 554{
475 virtio_pci_legacy_remove(pci_dev); 555 struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
556
557 unregister_virtio_device(&vp_dev->vdev);
558
559 if (vp_dev->ioaddr)
560 virtio_pci_legacy_remove(vp_dev);
561 else
562 virtio_pci_modern_remove(vp_dev);
563
564 pci_release_regions(pci_dev);
565 pci_disable_device(pci_dev);
476} 566}
477 567
478static struct pci_driver virtio_pci_driver = { 568static struct pci_driver virtio_pci_driver = {
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 5a497289b7e9..28ee4e56badf 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -53,12 +53,32 @@ struct virtio_pci_device {
53 struct virtio_device vdev; 53 struct virtio_device vdev;
54 struct pci_dev *pci_dev; 54 struct pci_dev *pci_dev;
55 55
56 /* In legacy mode, these two point to within ->legacy. */
57 /* Where to read and clear interrupt */
58 u8 __iomem *isr;
59
60 /* Modern only fields */
61 /* The IO mapping for the PCI config space (non-legacy mode) */
62 struct virtio_pci_common_cfg __iomem *common;
63 /* Device-specific data (non-legacy mode) */
64 void __iomem *device;
65 /* Base of vq notifications (non-legacy mode). */
66 void __iomem *notify_base;
67
68 /* So we can sanity-check accesses. */
69 size_t notify_len;
70 size_t device_len;
71
72 /* Capability for when we need to map notifications per-vq. */
73 int notify_map_cap;
74
 75 /* Multiply queue_notify_off by this value (non-legacy mode). */
76 u32 notify_offset_multiplier;
77
78 /* Legacy only field */
56 /* the IO mapping for the PCI config space */ 79 /* the IO mapping for the PCI config space */
57 void __iomem *ioaddr; 80 void __iomem *ioaddr;
58 81
59 /* the IO mapping for ISR operation */
60 void __iomem *isr;
61
62 /* a list of queues so we can dispatch IRQs */ 82 /* a list of queues so we can dispatch IRQs */
63 spinlock_t lock; 83 spinlock_t lock;
64 struct list_head virtqueues; 84 struct list_head virtqueues;
@@ -127,8 +147,19 @@ const char *vp_bus_name(struct virtio_device *vdev);
127 */ 147 */
128int vp_set_vq_affinity(struct virtqueue *vq, int cpu); 148int vp_set_vq_affinity(struct virtqueue *vq, int cpu);
129 149
130int virtio_pci_legacy_probe(struct pci_dev *pci_dev, 150#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY)
131 const struct pci_device_id *id); 151int virtio_pci_legacy_probe(struct virtio_pci_device *);
132void virtio_pci_legacy_remove(struct pci_dev *pci_dev); 152void virtio_pci_legacy_remove(struct virtio_pci_device *);
153#else
154static inline int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
155{
156 return -ENODEV;
157}
158static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev)
159{
160}
161#endif
162int virtio_pci_modern_probe(struct virtio_pci_device *);
163void virtio_pci_modern_remove(struct virtio_pci_device *);
133 164
134#endif 165#endif
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index a5486e65e04b..256a5278a515 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -211,23 +211,10 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
211 .set_vq_affinity = vp_set_vq_affinity, 211 .set_vq_affinity = vp_set_vq_affinity,
212}; 212};
213 213
214static void virtio_pci_release_dev(struct device *_d)
215{
216 struct virtio_device *vdev = dev_to_virtio(_d);
217 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
218
219 /* As struct device is a kobject, it's not safe to
220 * free the memory (including the reference counter itself)
 221 * until its release callback. */
222 kfree(vp_dev);
223}
224
225/* the PCI probing function */ 214/* the PCI probing function */
226int virtio_pci_legacy_probe(struct pci_dev *pci_dev, 215int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
227 const struct pci_device_id *id)
228{ 216{
229 struct virtio_pci_device *vp_dev; 217 struct pci_dev *pci_dev = vp_dev->pci_dev;
230 int err;
231 218
232 /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */ 219 /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
233 if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) 220 if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
@@ -239,41 +226,12 @@ int virtio_pci_legacy_probe(struct pci_dev *pci_dev,
239 return -ENODEV; 226 return -ENODEV;
240 } 227 }
241 228
242 /* allocate our structure and fill it out */
243 vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
244 if (vp_dev == NULL)
245 return -ENOMEM;
246
247 vp_dev->vdev.dev.parent = &pci_dev->dev;
248 vp_dev->vdev.dev.release = virtio_pci_release_dev;
249 vp_dev->vdev.config = &virtio_pci_config_ops;
250 vp_dev->pci_dev = pci_dev;
251 INIT_LIST_HEAD(&vp_dev->virtqueues);
252 spin_lock_init(&vp_dev->lock);
253
254 /* Disable MSI/MSIX to bring device to a known good state. */
255 pci_msi_off(pci_dev);
256
257 /* enable the device */
258 err = pci_enable_device(pci_dev);
259 if (err)
260 goto out;
261
262 err = pci_request_regions(pci_dev, "virtio-pci");
263 if (err)
264 goto out_enable_device;
265
266 vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0); 229 vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
267 if (vp_dev->ioaddr == NULL) { 230 if (!vp_dev->ioaddr)
268 err = -ENOMEM; 231 return -ENOMEM;
269 goto out_req_regions;
270 }
271 232
272 vp_dev->isr = vp_dev->ioaddr + VIRTIO_PCI_ISR; 233 vp_dev->isr = vp_dev->ioaddr + VIRTIO_PCI_ISR;
273 234
274 pci_set_drvdata(pci_dev, vp_dev);
275 pci_set_master(pci_dev);
276
277 /* we use the subsystem vendor/device id as the virtio vendor/device 235 /* we use the subsystem vendor/device id as the virtio vendor/device
278 * id. this allows us to use the same PCI vendor/device id for all 236 * id. this allows us to use the same PCI vendor/device id for all
279 * virtio devices and to identify the particular virtio driver by 237 * virtio devices and to identify the particular virtio driver by
@@ -281,36 +239,18 @@ int virtio_pci_legacy_probe(struct pci_dev *pci_dev,
281 vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; 239 vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
282 vp_dev->vdev.id.device = pci_dev->subsystem_device; 240 vp_dev->vdev.id.device = pci_dev->subsystem_device;
283 241
242 vp_dev->vdev.config = &virtio_pci_config_ops;
243
284 vp_dev->config_vector = vp_config_vector; 244 vp_dev->config_vector = vp_config_vector;
285 vp_dev->setup_vq = setup_vq; 245 vp_dev->setup_vq = setup_vq;
286 vp_dev->del_vq = del_vq; 246 vp_dev->del_vq = del_vq;
287 247
288 /* finally register the virtio device */
289 err = register_virtio_device(&vp_dev->vdev);
290 if (err)
291 goto out_set_drvdata;
292
293 return 0; 248 return 0;
294
295out_set_drvdata:
296 pci_iounmap(pci_dev, vp_dev->ioaddr);
297out_req_regions:
298 pci_release_regions(pci_dev);
299out_enable_device:
300 pci_disable_device(pci_dev);
301out:
302 kfree(vp_dev);
303 return err;
304} 249}
305 250
306void virtio_pci_legacy_remove(struct pci_dev *pci_dev) 251void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev)
307{ 252{
308 struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); 253 struct pci_dev *pci_dev = vp_dev->pci_dev;
309
310 unregister_virtio_device(&vp_dev->vdev);
311 254
312 vp_del_vqs(&vp_dev->vdev);
313 pci_iounmap(pci_dev, vp_dev->ioaddr); 255 pci_iounmap(pci_dev, vp_dev->ioaddr);
314 pci_release_regions(pci_dev);
315 pci_disable_device(pci_dev);
316} 256}
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
new file mode 100644
index 000000000000..2aa38e59db2e
--- /dev/null
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -0,0 +1,695 @@
1/*
2 * Virtio PCI driver - modern (virtio 1.0) device support
3 *
4 * This module allows virtio devices to be used over a virtual PCI device.
5 * This can be used with QEMU based VMMs like KVM or Xen.
6 *
7 * Copyright IBM Corp. 2007
8 * Copyright Red Hat, Inc. 2014
9 *
10 * Authors:
11 * Anthony Liguori <aliguori@us.ibm.com>
12 * Rusty Russell <rusty@rustcorp.com.au>
13 * Michael S. Tsirkin <mst@redhat.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2 or later.
16 * See the COPYING file in the top-level directory.
17 *
18 */
19
20#define VIRTIO_PCI_NO_LEGACY
21#include "virtio_pci_common.h"
22
23static void __iomem *map_capability(struct pci_dev *dev, int off,
24 size_t minlen,
25 u32 align,
26 u32 start, u32 size,
27 size_t *len)
28{
29 u8 bar;
30 u32 offset, length;
31 void __iomem *p;
32
33 pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
34 bar),
35 &bar);
36 pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset),
37 &offset);
38 pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length),
39 &length);
40
41 if (length <= start) {
42 dev_err(&dev->dev,
43 "virtio_pci: bad capability len %u (>%u expected)\n",
44 length, start);
45 return NULL;
46 }
47
48 if (length - start < minlen) {
49 dev_err(&dev->dev,
50 "virtio_pci: bad capability len %u (>=%zu expected)\n",
51 length, minlen);
52 return NULL;
53 }
54
55 length -= start;
56
57 if (start + offset < offset) {
58 dev_err(&dev->dev,
59 "virtio_pci: map wrap-around %u+%u\n",
60 start, offset);
61 return NULL;
62 }
63
64 offset += start;
65
66 if (offset & (align - 1)) {
67 dev_err(&dev->dev,
68 "virtio_pci: offset %u not aligned to %u\n",
69 offset, align);
70 return NULL;
71 }
72
73 if (length > size)
74 length = size;
75
76 if (len)
77 *len = length;
78
79 if (minlen + offset < minlen ||
80 minlen + offset > pci_resource_len(dev, bar)) {
81 dev_err(&dev->dev,
82 "virtio_pci: map virtio %zu@%u "
83 "out of range on bar %i length %lu\n",
84 minlen, offset,
85 bar, (unsigned long)pci_resource_len(dev, bar));
86 return NULL;
87 }
88
89 p = pci_iomap_range(dev, bar, offset, length);
90 if (!p)
91 dev_err(&dev->dev,
92 "virtio_pci: unable to map virtio %u@%u on bar %i\n",
93 length, offset, bar);
94 return p;
95}
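The "map wrap-around" guard above is the standard unsigned-overflow idiom: for u32 operands, start + offset overflowed exactly when the sum came out smaller than an addend. A two-line userspace check with invented values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t offset = 0xffffff00u, start = 0x200u;
	uint32_t sum = start + offset;	/* wraps to 0x100 */

	printf("wrapped: %d\n", sum < offset);	/* 1: caught */
	return 0;
}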
96
97static void iowrite64_twopart(u64 val, __le32 __iomem *lo, __le32 __iomem *hi)
98{
99 iowrite32((u32)val, lo);
100 iowrite32(val >> 32, hi);
101}
102
103/* virtio config->get_features() implementation */
104static u64 vp_get_features(struct virtio_device *vdev)
105{
106 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
107 u64 features;
108
109 iowrite32(0, &vp_dev->common->device_feature_select);
110 features = ioread32(&vp_dev->common->device_feature);
111 iowrite32(1, &vp_dev->common->device_feature_select);
112 features |= ((u64)ioread32(&vp_dev->common->device_feature) << 32);
113
114 return features;
115}
116
117/* virtio config->finalize_features() implementation */
118static int vp_finalize_features(struct virtio_device *vdev)
119{
120 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
121
122 /* Give virtio_ring a chance to accept features. */
123 vring_transport_features(vdev);
124
125 if (!__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) {
126 dev_err(&vdev->dev, "virtio: device uses modern interface "
127 "but does not have VIRTIO_F_VERSION_1\n");
128 return -EINVAL;
129 }
130
131 iowrite32(0, &vp_dev->common->guest_feature_select);
132 iowrite32((u32)vdev->features, &vp_dev->common->guest_feature);
133 iowrite32(1, &vp_dev->common->guest_feature_select);
134 iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature);
135
136 return 0;
137}
138
139/* virtio config->get() implementation */
140static void vp_get(struct virtio_device *vdev, unsigned offset,
141 void *buf, unsigned len)
142{
143 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
144 u8 b;
145 __le16 w;
146 __le32 l;
147
148 BUG_ON(offset + len > vp_dev->device_len);
149
150 switch (len) {
151 case 1:
152 b = ioread8(vp_dev->device + offset);
153 memcpy(buf, &b, sizeof b);
154 break;
155 case 2:
156 w = cpu_to_le16(ioread16(vp_dev->device + offset));
157 memcpy(buf, &w, sizeof w);
158 break;
159 case 4:
160 l = cpu_to_le32(ioread32(vp_dev->device + offset));
161 memcpy(buf, &l, sizeof l);
162 break;
163 case 8:
164 l = cpu_to_le32(ioread32(vp_dev->device + offset));
165 memcpy(buf, &l, sizeof l);
166 l = cpu_to_le32(ioread32(vp_dev->device + offset + sizeof l));
167 memcpy(buf + sizeof l, &l, sizeof l);
168 break;
169 default:
170 BUG();
171 }
172}
173
174/* the config->set() implementation. it's symmetric to the config->get()
175 * implementation */
176static void vp_set(struct virtio_device *vdev, unsigned offset,
177 const void *buf, unsigned len)
178{
179 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
180 u8 b;
181 __le16 w;
182 __le32 l;
183
184 BUG_ON(offset + len > vp_dev->device_len);
185
186 switch (len) {
187 case 1:
188 memcpy(&b, buf, sizeof b);
189 iowrite8(b, vp_dev->device + offset);
190 break;
191 case 2:
192 memcpy(&w, buf, sizeof w);
193 iowrite16(le16_to_cpu(w), vp_dev->device + offset);
194 break;
195 case 4:
196 memcpy(&l, buf, sizeof l);
197 iowrite32(le32_to_cpu(l), vp_dev->device + offset);
198 break;
199 case 8:
200 memcpy(&l, buf, sizeof l);
201 iowrite32(le32_to_cpu(l), vp_dev->device + offset);
202 memcpy(&l, buf + sizeof l, sizeof l);
203 iowrite32(le32_to_cpu(l), vp_dev->device + offset + sizeof l);
204 break;
205 default:
206 BUG();
207 }
208}
209
210static u32 vp_generation(struct virtio_device *vdev)
211{
212 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
213 return ioread8(&vp_dev->common->config_generation);
214}
215
216/* config->{get,set}_status() implementations */
217static u8 vp_get_status(struct virtio_device *vdev)
218{
219 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
220 return ioread8(&vp_dev->common->device_status);
221}
222
223static void vp_set_status(struct virtio_device *vdev, u8 status)
224{
225 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
226 /* We should never be setting status to 0. */
227 BUG_ON(status == 0);
228 iowrite8(status, &vp_dev->common->device_status);
229}
230
231static void vp_reset(struct virtio_device *vdev)
232{
233 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
234 /* 0 status means a reset. */
235 iowrite8(0, &vp_dev->common->device_status);
236 /* Flush out the status write, and flush in device writes,
237 * including MSI-X interrupts, if any. */
238 ioread8(&vp_dev->common->device_status);
239 /* Flush pending VQ/configuration callbacks. */
240 vp_synchronize_vectors(vdev);
241}
242
243static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
244{
245 /* Setup the vector used for configuration events */
246 iowrite16(vector, &vp_dev->common->msix_config);
247 /* Verify we had enough resources to assign the vector */
248 /* Will also flush the write out to device */
249 return ioread16(&vp_dev->common->msix_config);
250}
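
The read-back both flushes the posted write and reports rejection: a device that is out of MSI-X resources answers with VIRTIO_MSI_NO_VECTOR. A sketch of the caller-side check (hypothetical; the per-queue vectors in setup_vq() below are verified the same way):

	if (vp_config_vector(vp_dev, msix_vec) == VIRTIO_MSI_NO_VECTOR) {
		/* The device refused the config vector. */
		return -EBUSY;
	}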
251
252static size_t vring_pci_size(u16 num)
253{
254 /* We only need a cacheline separation. */
255 return PAGE_ALIGN(vring_size(num, SMP_CACHE_BYTES));
256}
257
258static void *alloc_virtqueue_pages(int *num)
259{
260 void *pages;
261
262 /* TODO: allocate each queue chunk individually */
263 for (; *num && vring_pci_size(*num) > PAGE_SIZE; *num /= 2) {
264 pages = alloc_pages_exact(vring_pci_size(*num),
265 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
266 if (pages)
267 return pages;
268 }
269
270 if (!*num)
271 return NULL;
272
273 /* Try to get a single page. You are my only hope! */
274 return alloc_pages_exact(vring_pci_size(*num), GFP_KERNEL|__GFP_ZERO);
275}
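
To see the halving strategy at work, a worked example (approximate sizes, assuming 4KB pages and 64-byte cachelines):

/*
 * num = 256: vring_size() is about 6.7KB, so vring_pci_size() rounds up
 *            to 8KB and the loop tries an 8KB exact allocation.
 * num = 128: vring_size() is about 3.4KB, vring_pci_size() is 4KB, which
 *            is no longer > PAGE_SIZE, so the single-page fallback below
 *            takes over.
 */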
276
277static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
278 struct virtio_pci_vq_info *info,
279 unsigned index,
280 void (*callback)(struct virtqueue *vq),
281 const char *name,
282 u16 msix_vec)
283{
284 struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common;
285 struct virtqueue *vq;
286 u16 num, off;
287 int err;
288
289 if (index >= ioread16(&cfg->num_queues))
290 return ERR_PTR(-ENOENT);
291
292 /* Select the queue we're interested in */
293 iowrite16(index, &cfg->queue_select);
294
295 /* Check if queue is either not available or already active. */
296 num = ioread16(&cfg->queue_size);
297 if (!num || ioread16(&cfg->queue_enable))
298 return ERR_PTR(-ENOENT);
299
300 if (num & (num - 1)) {
301 dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num);
302 return ERR_PTR(-EINVAL);
303 }
304
305 /* get offset of notification word for this vq */
306 off = ioread16(&cfg->queue_notify_off);
307
308 info->num = num;
309 info->msix_vector = msix_vec;
310
311 info->queue = alloc_virtqueue_pages(&info->num);
312 if (info->queue == NULL)
313 return ERR_PTR(-ENOMEM);
314
315 /* create the vring */
316 vq = vring_new_virtqueue(index, info->num,
317 SMP_CACHE_BYTES, &vp_dev->vdev,
318 true, info->queue, vp_notify, callback, name);
319 if (!vq) {
320 err = -ENOMEM;
321 goto err_new_queue;
322 }
323
324 /* activate the queue */
325 iowrite16(num, &cfg->queue_size);
326 iowrite64_twopart(virt_to_phys(info->queue),
327 &cfg->queue_desc_lo, &cfg->queue_desc_hi);
328 iowrite64_twopart(virt_to_phys(virtqueue_get_avail(vq)),
329 &cfg->queue_avail_lo, &cfg->queue_avail_hi);
330 iowrite64_twopart(virt_to_phys(virtqueue_get_used(vq)),
331 &cfg->queue_used_lo, &cfg->queue_used_hi);
332
333 if (vp_dev->notify_base) {
334 /* offset should not wrap */
335 if ((u64)off * vp_dev->notify_offset_multiplier + 2
336 > vp_dev->notify_len) {
337 dev_warn(&vp_dev->pci_dev->dev,
338 "bad notification offset %u (x %u) "
339 "for queue %u > %zd",
340 off, vp_dev->notify_offset_multiplier,
341 index, vp_dev->notify_len);
342 err = -EINVAL;
343 goto err_map_notify;
344 }
345 vq->priv = (void __force *)vp_dev->notify_base +
346 off * vp_dev->notify_offset_multiplier;
347 } else {
348 vq->priv = (void __force *)map_capability(vp_dev->pci_dev,
349 vp_dev->notify_map_cap, 2, 2,
350 off * vp_dev->notify_offset_multiplier, 2,
351 NULL);
352 }
353
354 if (!vq->priv) {
355 err = -ENOMEM;
356 goto err_map_notify;
357 }
358
359 if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
360 iowrite16(msix_vec, &cfg->queue_msix_vector);
361 msix_vec = ioread16(&cfg->queue_msix_vector);
362 if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
363 err = -EBUSY;
364 goto err_assign_vector;
365 }
366 }
367
368 return vq;
369
370err_assign_vector:
371 if (!vp_dev->notify_base)
372 pci_iounmap(vp_dev->pci_dev, (void __iomem __force *)vq->priv);
373err_map_notify:
374 vring_del_virtqueue(vq);
375err_new_queue:
376 free_pages_exact(info->queue, vring_pci_size(info->num));
377 return ERR_PTR(err);
378}
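
The doorbell address computed above is pure arithmetic on two read-only fields; a worked example with illustrative numbers:

/*
 * queue_notify_off = 3, notify_off_multiplier = 4:
 *   doorbell = notify_base + 3 * 4 = notify_base + 12
 * The "+ 2" in the wrap check is the width of the 16-bit queue-index
 * write that the notify path performs at that address.
 */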
379
380static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs,
381 struct virtqueue *vqs[],
382 vq_callback_t *callbacks[],
383 const char *names[])
384{
385 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
386 struct virtqueue *vq;
387 int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names);
388
389 if (rc)
390 return rc;
391
392 /* Select and activate all queues. Has to be done last: once we do
393 * this, there's no way to go back except reset.
394 */
395 list_for_each_entry(vq, &vdev->vqs, list) {
396 iowrite16(vq->index, &vp_dev->common->queue_select);
397 iowrite16(1, &vp_dev->common->queue_enable);
398 }
399
400 return 0;
401}
402
403static void del_vq(struct virtio_pci_vq_info *info)
404{
405 struct virtqueue *vq = info->vq;
406 struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
407
408 iowrite16(vq->index, &vp_dev->common->queue_select);
409
410 if (vp_dev->msix_enabled) {
411 iowrite16(VIRTIO_MSI_NO_VECTOR,
412 &vp_dev->common->queue_msix_vector);
413 /* Flush the write out to device */
414 ioread16(&vp_dev->common->queue_msix_vector);
415 }
416
417 if (!vp_dev->notify_base)
418 pci_iounmap(vp_dev->pci_dev, (void __force __iomem *)vq->priv);
419
420 vring_del_virtqueue(vq);
421
422 free_pages_exact(info->queue, vring_pci_size(info->num));
423}
424
425static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
426 .get = NULL,
427 .set = NULL,
428 .generation = vp_generation,
429 .get_status = vp_get_status,
430 .set_status = vp_set_status,
431 .reset = vp_reset,
432 .find_vqs = vp_modern_find_vqs,
433 .del_vqs = vp_del_vqs,
434 .get_features = vp_get_features,
435 .finalize_features = vp_finalize_features,
436 .bus_name = vp_bus_name,
437 .set_vq_affinity = vp_set_vq_affinity,
438};
439
440static const struct virtio_config_ops virtio_pci_config_ops = {
441 .get = vp_get,
442 .set = vp_set,
443 .generation = vp_generation,
444 .get_status = vp_get_status,
445 .set_status = vp_set_status,
446 .reset = vp_reset,
447 .find_vqs = vp_modern_find_vqs,
448 .del_vqs = vp_del_vqs,
449 .get_features = vp_get_features,
450 .finalize_features = vp_finalize_features,
451 .bus_name = vp_bus_name,
452 .set_vq_affinity = vp_set_vq_affinity,
453};
454
455/**
456 * virtio_pci_find_capability - walk capabilities to find device info.
457 * @dev: the pci device
458 * @cfg_type: the VIRTIO_PCI_CAP_* value we seek
459 * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO.
460 *
461 * Returns offset of the capability, or 0.
462 */
463static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
464 u32 ioresource_types)
465{
466 int pos;
467
468 for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
469 pos > 0;
470 pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
471 u8 type, bar;
472 pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
473 cfg_type),
474 &type);
475 pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
476 bar),
477 &bar);
478
479 /* Ignore structures with reserved BAR values */
480 if (bar > 0x5)
481 continue;
482
483 if (type == cfg_type) {
484 if (pci_resource_len(dev, bar) &&
485 pci_resource_flags(dev, bar) & ioresource_types)
486 return pos;
487 }
488 }
489 return 0;
490}
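
A usage sketch for the capability walk (hypothetical caller, restricting the match to memory BARs only):

	int pos = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
					     IORESOURCE_MEM);
	if (!pos)
		dev_info(&pci_dev->dev, "no memory-mapped ISR capability\n");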
491
492/* This is part of the ABI. Don't screw with it. */
493static inline void check_offsets(void)
494{
495 /* Note: disk space was harmed in compilation of this function. */
496 BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
497 offsetof(struct virtio_pci_cap, cap_vndr));
498 BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
499 offsetof(struct virtio_pci_cap, cap_next));
500 BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
501 offsetof(struct virtio_pci_cap, cap_len));
502 BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
503 offsetof(struct virtio_pci_cap, cfg_type));
504 BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
505 offsetof(struct virtio_pci_cap, bar));
506 BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
507 offsetof(struct virtio_pci_cap, offset));
508 BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
509 offsetof(struct virtio_pci_cap, length));
510 BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
511 offsetof(struct virtio_pci_notify_cap,
512 notify_off_multiplier));
513 BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
514 offsetof(struct virtio_pci_common_cfg,
515 device_feature_select));
516 BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
517 offsetof(struct virtio_pci_common_cfg, device_feature));
518 BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
519 offsetof(struct virtio_pci_common_cfg,
520 guest_feature_select));
521 BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
522 offsetof(struct virtio_pci_common_cfg, guest_feature));
523 BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
524 offsetof(struct virtio_pci_common_cfg, msix_config));
525 BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
526 offsetof(struct virtio_pci_common_cfg, num_queues));
527 BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
528 offsetof(struct virtio_pci_common_cfg, device_status));
529 BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
530 offsetof(struct virtio_pci_common_cfg, config_generation));
531 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
532 offsetof(struct virtio_pci_common_cfg, queue_select));
533 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
534 offsetof(struct virtio_pci_common_cfg, queue_size));
535 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
536 offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
537 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
538 offsetof(struct virtio_pci_common_cfg, queue_enable));
539 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
540 offsetof(struct virtio_pci_common_cfg, queue_notify_off));
541 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
542 offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
543 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
544 offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
545 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
546 offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
547 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
548 offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
549 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
550 offsetof(struct virtio_pci_common_cfg, queue_used_lo));
551 BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
552 offsetof(struct virtio_pci_common_cfg, queue_used_hi));
553}
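
A worked example of what these checks pin down: VIRTIO_PCI_COMMON_STATUS is 20 because device_status sits behind four __le32 fields (4 * 4 = 16 bytes) plus two __le16 fields (2 * 2 = 4 bytes) in struct virtio_pci_common_cfg. Any reordering of the struct turns into a compile error here rather than a silent ABI break.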
554
555/* the PCI probing function */
556int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
557{
558 struct pci_dev *pci_dev = vp_dev->pci_dev;
559 int err, common, isr, notify, device;
560 u32 notify_length;
561 u32 notify_offset;
562
563 check_offsets();
564
565 /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */
566 if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
567 return -ENODEV;
568
569 if (pci_dev->device < 0x1040) {
570 /* Transitional devices: use the PCI subsystem device id as
571 * virtio device id, same as legacy driver always did.
572 */
573 vp_dev->vdev.id.device = pci_dev->subsystem_device;
574 } else {
575 /* Modern devices: simply use PCI device id, but start from 0x1040. */
576 vp_dev->vdev.id.device = pci_dev->device - 0x1040;
577 }
578 vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
579
580 if (virtio_device_is_legacy_only(vp_dev->vdev.id))
581 return -ENODEV;
582
583 /* check for a common config: if not, use legacy mode (bar 0). */
584 common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG,
585 IORESOURCE_IO | IORESOURCE_MEM);
586 if (!common) {
587 dev_info(&pci_dev->dev,
588 "virtio_pci: leaving for legacy driver\n");
589 return -ENODEV;
590 }
591
592 /* If common is there, these should be too... */
593 isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
594 IORESOURCE_IO | IORESOURCE_MEM);
595 notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG,
596 IORESOURCE_IO | IORESOURCE_MEM);
597 if (!isr || !notify) {
598 dev_err(&pci_dev->dev,
599 "virtio_pci: missing capabilities %i/%i/%i\n",
600 common, isr, notify);
601 return -EINVAL;
602 }
603
604 /* Device capability is only mandatory for devices that have
605 * device-specific configuration.
606 */
607 device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG,
608 IORESOURCE_IO | IORESOURCE_MEM);
609
610 err = -EINVAL;
611 vp_dev->common = map_capability(pci_dev, common,
612 sizeof(struct virtio_pci_common_cfg), 4,
613 0, sizeof(struct virtio_pci_common_cfg),
614 NULL);
615 if (!vp_dev->common)
616 goto err_map_common;
617 vp_dev->isr = map_capability(pci_dev, isr, sizeof(u8), 1,
618 0, 1,
619 NULL);
620 if (!vp_dev->isr)
621 goto err_map_isr;
622
623 /* Read notify_off_multiplier from config space. */
624 pci_read_config_dword(pci_dev,
625 notify + offsetof(struct virtio_pci_notify_cap,
626 notify_off_multiplier),
627 &vp_dev->notify_offset_multiplier);
628 /* Read notify length and offset from config space. */
629 pci_read_config_dword(pci_dev,
630 notify + offsetof(struct virtio_pci_notify_cap,
631 cap.length),
632 &notify_length);
633
634 pci_read_config_dword(pci_dev,
635 notify + offsetof(struct virtio_pci_notify_cap,
636						cap.offset),
637 &notify_offset);
638
639	/* We don't know how many VQs we'll map ahead of time.
640 * If notify length is small, map it all now.
641 * Otherwise, map each VQ individually later.
642 */
643 if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) {
644 vp_dev->notify_base = map_capability(pci_dev, notify, 2, 2,
645 0, notify_length,
646 &vp_dev->notify_len);
647 if (!vp_dev->notify_base)
648 goto err_map_notify;
649 } else {
650 vp_dev->notify_map_cap = notify;
651 }
652
653 /* Again, we don't know how much we should map, but PAGE_SIZE
654 * is more than enough for all existing devices.
655 */
656 if (device) {
657 vp_dev->device = map_capability(pci_dev, device, 0, 4,
658 0, PAGE_SIZE,
659 &vp_dev->device_len);
660 if (!vp_dev->device)
661 goto err_map_device;
662
663 vp_dev->vdev.config = &virtio_pci_config_ops;
664 } else {
665 vp_dev->vdev.config = &virtio_pci_config_nodev_ops;
666 }
667
668 vp_dev->config_vector = vp_config_vector;
669 vp_dev->setup_vq = setup_vq;
670 vp_dev->del_vq = del_vq;
671
672 return 0;
673
674err_map_device:
675 if (vp_dev->notify_base)
676 pci_iounmap(pci_dev, vp_dev->notify_base);
677err_map_notify:
678 pci_iounmap(pci_dev, vp_dev->isr);
679err_map_isr:
680 pci_iounmap(pci_dev, vp_dev->common);
681err_map_common:
682 return err;
683}
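
A worked example of the ID mapping at the top of this function: a modern virtio-net device shows up with PCI device id 0x1041, so its virtio device id is 0x1041 - 0x1040 = 1 (virtio-net), while a transitional device at 0x1000 keeps advertising its type through the PCI subsystem device id, exactly as the legacy driver expects.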
684
685void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev)
686{
687 struct pci_dev *pci_dev = vp_dev->pci_dev;
688
689 if (vp_dev->device)
690 pci_iounmap(pci_dev, vp_dev->device);
691 if (vp_dev->notify_base)
692 pci_iounmap(pci_dev, vp_dev->notify_base);
693 pci_iounmap(pci_dev, vp_dev->isr);
694 pci_iounmap(pci_dev, vp_dev->common);
695}
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 00ec6b3f96b2..096b857e7b75 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -54,8 +54,7 @@
 #define END_USE(vq)
 #endif
 
-struct vring_virtqueue
-{
+struct vring_virtqueue {
 	struct virtqueue vq;
 
 	/* Actual memory layout for this queue */
@@ -245,14 +244,14 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) + 1);
 	vq->num_added++;
 
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+
 	/* This is very unlikely, but theoretically possible.  Kick
 	 * just in case. */
 	if (unlikely(vq->num_added == (1 << 16) - 1))
 		virtqueue_kick(_vq);
 
-	pr_debug("Added buffer head %i to %p\n", head, vq);
-	END_USE(vq);
-
 	return 0;
 }
 
diff --git a/include/asm-generic/pci_iomap.h b/include/asm-generic/pci_iomap.h
index ce37349860fe..7389c87116a0 100644
--- a/include/asm-generic/pci_iomap.h
+++ b/include/asm-generic/pci_iomap.h
@@ -15,6 +15,9 @@ struct pci_dev;
 #ifdef CONFIG_PCI
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+				     unsigned long offset,
+				     unsigned long maxlen);
 /* Create a virtual mapping cookie for a port on a given PCI device.
  * Do not call this directly, it exists to make it easier for architectures
  * to override */
@@ -30,6 +33,13 @@ static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned lon
 {
 	return NULL;
 }
+
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+					    unsigned long offset,
+					    unsigned long maxlen)
+{
+	return NULL;
+}
 #endif
 
 #endif /* __ASM_GENERIC_IO_H */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index 495203ff221c..acd5b12565cc 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -8,52 +8,13 @@
  *
  * The Guest needs devices to do anything useful.  Since we don't let it touch
  * real devices (think of the damage it could do!) we provide virtual devices.
- * We could emulate a PCI bus with various devices on it, but that is a fairly
- * complex burden for the Host and suboptimal for the Guest, so we have our own
- * simple lguest bus and we use "virtio" drivers.  These drivers need a set of
- * routines from us which will actually do the virtual I/O, but they handle all
- * the net/block/console stuff themselves.  This means that if we want to add
- * a new device, we simply need to write a new virtio driver and create support
- * for it in the Launcher: this code won't need to change.
+ * We emulate a PCI bus with virtio devices on it; we used to have our own
+ * lguest bus which was far simpler, but this tests the virtio 1.0 standard.
  *
  * Virtio devices are also used by kvm, so we can simply reuse their optimized
  * device drivers.  And one day when everyone uses virtio, my plan will be
  * complete.  Bwahahahah!
- *
- * Devices are described by a simplified ID, a status byte, and some "config"
- * bytes which describe this device's configuration.  This is placed by the
- * Launcher just above the top of physical memory:
- */
-struct lguest_device_desc {
-	/* The device type: console, network, disk etc.  Type 0 terminates. */
-	__u8 type;
-	/* The number of virtqueues (first in config array) */
-	__u8 num_vq;
-	/*
-	 * The number of bytes of feature bits.  Multiply by 2: one for host
-	 * features and one for Guest acknowledgements.
-	 */
-	__u8 feature_len;
-	/* The number of bytes of the config array after virtqueues. */
-	__u8 config_len;
-	/* A status byte, written by the Guest. */
-	__u8 status;
-	__u8 config[0];
-};
-
-/*D:135
- * This is how we expect the device configuration field for a virtqueue
- * to be laid out in config space.
  */
-struct lguest_vqconfig {
-	/* The number of entries in the virtio_ring */
-	__u16 num;
-	/* The interrupt we get when something happens. */
-	__u16 irq;
-	/* The page number of the virtio ring for this device. */
-	__u32 pfn;
-};
-/*:*/
 
 /* Write command first word is a request. */
 enum lguest_req
@@ -62,12 +23,22 @@ enum lguest_req
 	LHREQ_GETDMA, /* No longer used */
 	LHREQ_IRQ, /* + irq */
 	LHREQ_BREAK, /* No longer used */
-	LHREQ_EVENTFD, /* + address, fd. */
+	LHREQ_EVENTFD, /* No longer used. */
+	LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */
+	LHREQ_SETREG, /* + offset within struct pt_regs, value. */
+	LHREQ_TRAP, /* + trap number to deliver to guest. */
 };
 
 /*
- * The alignment to use between consumer and producer parts of vring.
- * x86 pagesize for historical reasons.
+ * This is what read() of the lguest fd populates.  trap ==
+ * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the
+ * argument), 14 for a page fault in the MMIO region (addr is
+ * the trap address, insn is the instruction), or 13 for a GPF
+ * (insn is the instruction).
 */
-#define LGUEST_VRING_ALIGN	4096
+struct lguest_pending {
+	__u8 trap;
+	__u8 insn[7];
+	__u32 addr;
+};
 #endif /* _LINUX_LGUEST_LAUNCHER */
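
With the dedicated lguest bus gone, the Launcher learns about guest traps by read()ing the lguest fd, which fills in the struct above. A hedged user-space sketch (handle_mmio() is a hypothetical handler; the trap numbers follow the comment above):

	struct lguest_pending pending;

	if (read(lguest_fd, &pending, sizeof(pending)) == sizeof(pending)) {
		if (pending.trap == 14)
			/* Page fault in the MMIO region: emulate the
			 * instruction in pending.insn at pending.addr. */
			handle_mmio(&pending);
	}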
diff --git a/include/linux/virtio_mmio.h b/include/linux/virtio_mmio.h
index 5c7b6f0daef8..c4b09689ab64 100644
--- a/include/linux/virtio_mmio.h
+++ b/include/linux/virtio_mmio.h
@@ -51,23 +51,29 @@
 /* Virtio vendor ID - Read Only */
 #define VIRTIO_MMIO_VENDOR_ID		0x00c
 
-/* Bitmask of the features supported by the host
+/* Bitmask of the features supported by the device (host)
  * (32 bits per set) - Read Only */
-#define VIRTIO_MMIO_HOST_FEATURES	0x010
+#define VIRTIO_MMIO_DEVICE_FEATURES	0x010
 
-/* Host features set selector - Write Only */
-#define VIRTIO_MMIO_HOST_FEATURES_SEL	0x014
+/* Device (host) features set selector - Write Only */
+#define VIRTIO_MMIO_DEVICE_FEATURES_SEL	0x014
 
-/* Bitmask of features activated by the guest
+/* Bitmask of features activated by the driver (guest)
  * (32 bits per set) - Write Only */
-#define VIRTIO_MMIO_GUEST_FEATURES	0x020
+#define VIRTIO_MMIO_DRIVER_FEATURES	0x020
 
 /* Activated features set selector - Write Only */
-#define VIRTIO_MMIO_GUEST_FEATURES_SEL	0x024
+#define VIRTIO_MMIO_DRIVER_FEATURES_SEL	0x024
+
+
+#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */
 
 /* Guest's memory page size in bytes - Write Only */
 #define VIRTIO_MMIO_GUEST_PAGE_SIZE	0x028
 
+#endif
+
+
 /* Queue selector - Write Only */
 #define VIRTIO_MMIO_QUEUE_SEL		0x030
 
@@ -77,12 +83,21 @@
 /* Queue size for the currently selected queue - Write Only */
 #define VIRTIO_MMIO_QUEUE_NUM		0x038
 
+
+#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */
+
 /* Used Ring alignment for the currently selected queue - Write Only */
 #define VIRTIO_MMIO_QUEUE_ALIGN		0x03c
 
 /* Guest's PFN for the currently selected queue - Read Write */
 #define VIRTIO_MMIO_QUEUE_PFN		0x040
 
+#endif
+
+
+/* Ready bit for the currently selected queue - Read Write */
+#define VIRTIO_MMIO_QUEUE_READY		0x044
+
 /* Queue notifier - Write Only */
 #define VIRTIO_MMIO_QUEUE_NOTIFY	0x050
 
@@ -95,6 +110,21 @@
 /* Device status register - Read Write */
 #define VIRTIO_MMIO_STATUS		0x070
 
+/* Selected queue's Descriptor Table address, 64 bits in two halves */
+#define VIRTIO_MMIO_QUEUE_DESC_LOW	0x080
+#define VIRTIO_MMIO_QUEUE_DESC_HIGH	0x084
+
+/* Selected queue's Available Ring address, 64 bits in two halves */
+#define VIRTIO_MMIO_QUEUE_AVAIL_LOW	0x090
+#define VIRTIO_MMIO_QUEUE_AVAIL_HIGH	0x094
+
+/* Selected queue's Used Ring address, 64 bits in two halves */
+#define VIRTIO_MMIO_QUEUE_USED_LOW	0x0a0
+#define VIRTIO_MMIO_QUEUE_USED_HIGH	0x0a4
+
+/* Configuration atomicity value */
+#define VIRTIO_MMIO_CONFIG_GENERATION	0x0fc
+
 /* The config space is defined by each driver as
  * the per-driver configuration space - Read Write */
 #define VIRTIO_MMIO_CONFIG		0x100
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index be40f7059e93..4b0488f20b2e 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -36,8 +36,7 @@
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
-struct virtio_balloon_config
-{
+struct virtio_balloon_config {
 	/* Number of pages host wants Guest to give up. */
 	__le32 num_pages;
 	/* Number of pages we've actually got in balloon. */
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 247c8ba8544a..3c53eec4ae22 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -31,22 +31,25 @@
 #include <linux/virtio_types.h>
 
 /* Feature bits */
-#define VIRTIO_BLK_F_BARRIER	0	/* Does host support barriers? */
 #define VIRTIO_BLK_F_SIZE_MAX	1	/* Indicates maximum segment size */
 #define VIRTIO_BLK_F_SEG_MAX	2	/* Indicates maximum # of segments */
 #define VIRTIO_BLK_F_GEOMETRY	4	/* Legacy geometry available  */
 #define VIRTIO_BLK_F_RO		5	/* Disk is read-only */
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
-#define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
-#define VIRTIO_BLK_F_WCE	9	/* Writeback mode enabled after reset */
 #define VIRTIO_BLK_F_TOPOLOGY	10	/* Topology information is available */
-#define VIRTIO_BLK_F_CONFIG_WCE	11	/* Writeback mode available in config */
 #define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
 
+/* Legacy feature bits */
+#ifndef VIRTIO_BLK_NO_LEGACY
+#define VIRTIO_BLK_F_BARRIER	0	/* Does host support barriers? */
+#define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
+#define VIRTIO_BLK_F_WCE	9	/* Writeback mode enabled after reset */
+#define VIRTIO_BLK_F_CONFIG_WCE	11	/* Writeback mode available in config */
 #ifndef __KERNEL__
 /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */
 #define VIRTIO_BLK_F_FLUSH VIRTIO_BLK_F_WCE
 #endif
+#endif /* !VIRTIO_BLK_NO_LEGACY */
 
 #define VIRTIO_BLK_ID_BYTES	20	/* ID string length */
 
@@ -100,8 +103,10 @@ struct virtio_blk_config {
 #define VIRTIO_BLK_T_IN		0
 #define VIRTIO_BLK_T_OUT	1
 
+#ifndef VIRTIO_BLK_NO_LEGACY
 /* This bit says it's a scsi command, not an actual read or write. */
 #define VIRTIO_BLK_T_SCSI_CMD	2
+#endif /* VIRTIO_BLK_NO_LEGACY */
 
 /* Cache flush command */
 #define VIRTIO_BLK_T_FLUSH	4
@@ -109,8 +114,10 @@ struct virtio_blk_config {
 /* Get device ID command */
 #define VIRTIO_BLK_T_GET_ID	8
 
+#ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
+#endif /* !VIRTIO_BLK_NO_LEGACY */
 
 /* This is the first element of the read scatter-gather list. */
 struct virtio_blk_outhdr {
@@ -122,12 +129,14 @@ struct virtio_blk_outhdr {
 	__virtio64 sector;
 };
 
+#ifndef VIRTIO_BLK_NO_LEGACY
 struct virtio_scsi_inhdr {
 	__virtio32 errors;
 	__virtio32 data_len;
 	__virtio32 sense_len;
 	__virtio32 residual;
 };
+#endif /* !VIRTIO_BLK_NO_LEGACY */
 
 /* And this is the final byte of the write scatter-gather list. */
 #define VIRTIO_BLK_S_OK		0
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index a6d0cdeaacd4..c18264df9504 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -49,12 +49,14 @@
 #define VIRTIO_TRANSPORT_F_START	28
 #define VIRTIO_TRANSPORT_F_END		33
 
+#ifndef VIRTIO_CONFIG_NO_LEGACY
 /* Do we get callbacks when the ring is completely used, even if we've
  * suppressed them? */
 #define VIRTIO_F_NOTIFY_ON_EMPTY	24
 
 /* Can the device handle any descriptor layout? */
 #define VIRTIO_F_ANY_LAYOUT		27
+#endif /* VIRTIO_CONFIG_NO_LEGACY */
 
 /* v1.0 compliant. */
 #define VIRTIO_F_VERSION_1		32
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index b5f1677b291c..7bbee79ca293 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -35,7 +35,6 @@
 #define VIRTIO_NET_F_CSUM	0	/* Host handles pkts w/ partial csum */
 #define VIRTIO_NET_F_GUEST_CSUM	1	/* Guest handles pkts w/ partial csum */
 #define VIRTIO_NET_F_MAC	5	/* Host has given MAC address. */
-#define VIRTIO_NET_F_GSO	6	/* Host handles pkts w/ any GSO type */
 #define VIRTIO_NET_F_GUEST_TSO4	7	/* Guest can handle TSOv4 in. */
 #define VIRTIO_NET_F_GUEST_TSO6	8	/* Guest can handle TSOv6 in. */
 #define VIRTIO_NET_F_GUEST_ECN	9	/* Guest can handle TSO[6] w/ ECN in. */
@@ -56,6 +55,10 @@
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
 
+#ifndef VIRTIO_NET_NO_LEGACY
+#define VIRTIO_NET_F_GSO	6	/* Host handles pkts w/ any GSO type */
+#endif /* VIRTIO_NET_NO_LEGACY */
+
 #define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
 
@@ -71,19 +74,39 @@ struct virtio_net_config {
 	__u16 max_virtqueue_pairs;
 } __attribute__((packed));
 
+/*
+ * This header comes first in the scatter-gather list.  If you don't
+ * specify GSO or CSUM features, you can simply ignore the header.
+ *
+ * This is bitwise-equivalent to the legacy struct virtio_net_hdr_mrg_rxbuf,
+ * only flattened.
+ */
+struct virtio_net_hdr_v1 {
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* Use csum_start, csum_offset */
+#define VIRTIO_NET_HDR_F_DATA_VALID	2	/* Csum is valid */
+	__u8 flags;
+#define VIRTIO_NET_HDR_GSO_NONE		0	/* Not a GSO frame */
+#define VIRTIO_NET_HDR_GSO_TCPV4	1	/* GSO frame, IPv4 TCP (TSO) */
+#define VIRTIO_NET_HDR_GSO_UDP		3	/* GSO frame, IPv4 UDP (UFO) */
+#define VIRTIO_NET_HDR_GSO_TCPV6	4	/* GSO frame, IPv6 TCP */
+#define VIRTIO_NET_HDR_GSO_ECN		0x80	/* TCP has ECN set */
+	__u8 gso_type;
+	__virtio16 hdr_len;	/* Ethernet + IP + tcp/udp hdrs */
+	__virtio16 gso_size;	/* Bytes to append to hdr_len per frame */
+	__virtio16 csum_start;	/* Position to start checksumming from */
+	__virtio16 csum_offset;	/* Offset after that to place checksum */
+	__virtio16 num_buffers;	/* Number of merged rx buffers */
+};
+
+#ifndef VIRTIO_NET_NO_LEGACY
 /* This header comes first in the scatter-gather list.
- * If VIRTIO_F_ANY_LAYOUT is not negotiated, it must
+ * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must
  * be the first element of the scatter-gather list.  If you don't
  * specify GSO or CSUM features, you can simply ignore the header. */
 struct virtio_net_hdr {
-#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
-#define VIRTIO_NET_HDR_F_DATA_VALID	2	// Csum is valid
+	/* See VIRTIO_NET_HDR_F_* */
 	__u8 flags;
-#define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
-#define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
-#define VIRTIO_NET_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
-#define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
-#define VIRTIO_NET_HDR_GSO_ECN		0x80	// TCP has ECN set
+	/* See VIRTIO_NET_HDR_GSO_* */
 	__u8 gso_type;
 	__virtio16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
 	__virtio16 gso_size;		/* Bytes to append to hdr_len per frame */
@@ -97,6 +120,7 @@ struct virtio_net_hdr_mrg_rxbuf {
 	struct virtio_net_hdr hdr;
 	__virtio16 num_buffers;	/* Number of merged rx buffers */
 };
+#endif /* ...VIRTIO_NET_NO_LEGACY */
 
 /*
  * Control virtqueue data structures
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 35b552c7f330..75301468359f 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -39,7 +39,7 @@
 #ifndef _LINUX_VIRTIO_PCI_H
 #define _LINUX_VIRTIO_PCI_H
 
-#include <linux/virtio_config.h>
+#include <linux/types.h>
 
 #ifndef VIRTIO_PCI_NO_LEGACY
 
@@ -99,4 +99,95 @@
 /* Vector value used to disable MSI for queue */
 #define VIRTIO_MSI_NO_VECTOR		0xffff
 
+#ifndef VIRTIO_PCI_NO_MODERN
+
+/* IDs for different capabilities.  Must all exist. */
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	__u8 cap_vndr;		/* Generic PCI field: PCI_CAP_ID_VNDR */
+	__u8 cap_next;		/* Generic PCI field: next ptr. */
+	__u8 cap_len;		/* Generic PCI field: capability length */
+	__u8 cfg_type;		/* Identifies the structure. */
+	__u8 bar;		/* Where to find it. */
+	__u8 padding[3];	/* Pad to full dword. */
+	__le32 offset;		/* Offset within bar. */
+	__le32 length;		/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	__le32 notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	__le32 device_feature_select;	/* read-write */
+	__le32 device_feature;		/* read-only */
+	__le32 guest_feature_select;	/* read-write */
+	__le32 guest_feature;		/* read-write */
+	__le16 msix_config;		/* read-write */
+	__le16 num_queues;		/* read-only */
+	__u8 device_status;		/* read-write */
+	__u8 config_generation;		/* read-only */
+
+	/* About a specific virtqueue. */
+	__le16 queue_select;		/* read-write */
+	__le16 queue_size;		/* read-write, power of 2. */
+	__le16 queue_msix_vector;	/* read-write */
+	__le16 queue_enable;		/* read-write */
+	__le16 queue_notify_off;	/* read-only */
+	__le32 queue_desc_lo;		/* read-write */
+	__le32 queue_desc_hi;		/* read-write */
+	__le32 queue_avail_lo;		/* read-write */
+	__le32 queue_avail_hi;		/* read-write */
+	__le32 queue_used_lo;		/* read-write */
+	__le32 queue_used_hi;		/* read-write */
+};
+
+/* Macro versions of offsets for the Old Timers! */
+#define VIRTIO_PCI_CAP_VNDR		0
+#define VIRTIO_PCI_CAP_NEXT		1
+#define VIRTIO_PCI_CAP_LEN		2
+#define VIRTIO_PCI_CAP_CFG_TYPE		3
+#define VIRTIO_PCI_CAP_BAR		4
+#define VIRTIO_PCI_CAP_OFFSET		8
+#define VIRTIO_PCI_CAP_LENGTH		12
+
+#define VIRTIO_PCI_NOTIFY_CAP_MULT	16
+
+#define VIRTIO_PCI_COMMON_DFSELECT	0
+#define VIRTIO_PCI_COMMON_DF		4
+#define VIRTIO_PCI_COMMON_GFSELECT	8
+#define VIRTIO_PCI_COMMON_GF		12
+#define VIRTIO_PCI_COMMON_MSIX		16
+#define VIRTIO_PCI_COMMON_NUMQ		18
+#define VIRTIO_PCI_COMMON_STATUS	20
+#define VIRTIO_PCI_COMMON_CFGGENERATION	21
+#define VIRTIO_PCI_COMMON_Q_SELECT	22
+#define VIRTIO_PCI_COMMON_Q_SIZE	24
+#define VIRTIO_PCI_COMMON_Q_MSIX	26
+#define VIRTIO_PCI_COMMON_Q_ENABLE	28
+#define VIRTIO_PCI_COMMON_Q_NOFF	30
+#define VIRTIO_PCI_COMMON_Q_DESCLO	32
+#define VIRTIO_PCI_COMMON_Q_DESCHI	36
+#define VIRTIO_PCI_COMMON_Q_AVAILLO	40
+#define VIRTIO_PCI_COMMON_Q_AVAILHI	44
+#define VIRTIO_PCI_COMMON_Q_USEDLO	48
+#define VIRTIO_PCI_COMMON_Q_USEDHI	52
+
+#endif /* VIRTIO_PCI_NO_MODERN */
+
 #endif
diff --git a/lib/pci_iomap.c b/lib/pci_iomap.c
index 0d83ea8a9605..bcce5f149310 100644
--- a/lib/pci_iomap.c
+++ b/lib/pci_iomap.c
@@ -10,10 +10,11 @@
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
@@ -21,16 +22,21 @@
 * you expect from them in the correct way.
 *
 * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
 * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev,
+			      int bar,
+			      unsigned long offset,
+			      unsigned long maxlen)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
 		return NULL;
+	len -= offset;
+	start += offset;
 	if (maxlen && len > maxlen)
 		len = maxlen;
 	if (flags & IORESOURCE_IO)
@@ -43,6 +49,25 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	/* What? */
 	return NULL;
 }
+EXPORT_SYMBOL(pci_iomap_range);
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	return pci_iomap_range(dev, bar, 0, maxlen);
+}
 EXPORT_SYMBOL(pci_iomap);
 #endif /* CONFIG_PCI */
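
pci_iomap_range() is what lets the modern virtio-PCI probe above map just a capability's window rather than a whole BAR. A minimal usage sketch (hypothetical driver, error handling trimmed):

	void __iomem *regs;

	/* Map 256 bytes starting at offset 0x100 of BAR 2. */
	regs = pci_iomap_range(pdev, 2, 0x100, 256);
	if (regs) {
		u32 v = ioread32(regs);	/* accesses BAR2 + 0x100 */
		pci_iounmap(pdev, regs);
	}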
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index daa749c8b3fb..d8e376a5f0f1 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -524,6 +524,12 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 	int err;
 	struct virtio_chan *chan;
 
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "%s failure: config access disabled\n",
+			__func__);
+		return -EINVAL;
+	}
+
 	chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL);
 	if (!chan) {
 		pr_err("Failed to allocate virtio 9P channel\n");
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile
index 97bca4871ea3..a107b5e4da13 100644
--- a/tools/lguest/Makefile
+++ b/tools/lguest/Makefile
@@ -1,7 +1,13 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE
+CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
 
 all: lguest
 
+include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
+	mkdir -p include/linux 2>&1 || true
+	ln -sf ../../../../include/uapi/linux/virtio_types.h $@
+
+lguest: include/linux/virtio_types.h
+
 clean:
 	rm -f lguest
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 32cf2ce15d69..e44052483ed9 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -41,6 +41,8 @@
 #include <signal.h>
 #include <pwd.h>
 #include <grp.h>
+#include <sys/user.h>
+#include <linux/pci_regs.h>
 
 #ifndef VIRTIO_F_ANY_LAYOUT
 #define VIRTIO_F_ANY_LAYOUT		27
@@ -61,12 +63,19 @@ typedef uint16_t u16;
 typedef uint8_t u8;
 /*:*/
 
-#include <linux/virtio_config.h>
-#include <linux/virtio_net.h>
-#include <linux/virtio_blk.h>
-#include <linux/virtio_console.h>
-#include <linux/virtio_rng.h>
+#define VIRTIO_CONFIG_NO_LEGACY
+#define VIRTIO_PCI_NO_LEGACY
+#define VIRTIO_BLK_NO_LEGACY
+#define VIRTIO_NET_NO_LEGACY
+
+/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */
+#include "../../include/uapi/linux/virtio_config.h"
+#include "../../include/uapi/linux/virtio_net.h"
+#include "../../include/uapi/linux/virtio_blk.h"
+#include "../../include/uapi/linux/virtio_console.h"
+#include "../../include/uapi/linux/virtio_rng.h"
 #include <linux/virtio_ring.h>
+#include "../../include/uapi/linux/virtio_pci.h"
 #include <asm/bootparam.h>
 #include "../../include/linux/lguest_launcher.h"
 
@@ -91,13 +100,16 @@ static bool verbose;
 /* The pointer to the start of guest memory. */
 static void *guest_base;
 /* The maximum guest physical address allowed, and maximum possible. */
-static unsigned long guest_limit, guest_max;
+static unsigned long guest_limit, guest_max, guest_mmio;
 /* The /dev/lguest file descriptor. */
 static int lguest_fd;
 
 /* a per-cpu variable indicating whose vcpu is currently running */
 static unsigned int __thread cpu_id;
 
+/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
+#define MAX_PCI_DEVICES 32
+
 /* This is our list of devices. */
 struct device_list {
 	/* Counter to assign interrupt numbers. */
@@ -106,30 +118,50 @@ struct device_list {
 	/* Counter to print out convenient device numbers. */
 	unsigned int device_num;
 
-	/* The descriptor page for the devices. */
-	u8 *descpage;
-
-	/* A single linked list of devices. */
-	struct device *dev;
-	/* And a pointer to the last device for easy append. */
-	struct device *lastdev;
+	/* PCI devices. */
+	struct device *pci[MAX_PCI_DEVICES];
 };
 
 /* The list of Guest devices, based on command line arguments. */
 static struct device_list devices;
 
-/* The device structure describes a single device. */
-struct device {
-	/* The linked-list pointer. */
-	struct device *next;
+struct virtio_pci_cfg_cap {
+	struct virtio_pci_cap cap;
+	u32 pci_cfg_data; /* Data for BAR access. */
+};
 
-	/* The device's descriptor, as mapped into the Guest. */
-	struct lguest_device_desc *desc;
+struct virtio_pci_mmio {
+	struct virtio_pci_common_cfg cfg;
+	u16 notify;
+	u8 isr;
+	u8 padding;
+	/* Device-specific configuration follows this. */
+};
 
-	/* We can't trust desc values once Guest has booted: we use these. */
-	unsigned int feature_len;
-	unsigned int num_vq;
+/* This is the layout (little-endian) of the PCI config space. */
+struct pci_config {
+	u16 vendor_id, device_id;
+	u16 command, status;
+	u8 revid, prog_if, subclass, class;
+	u8 cacheline_size, lat_timer, header_type, bist;
+	u32 bar[6];
+	u32 cardbus_cis_ptr;
+	u16 subsystem_vendor_id, subsystem_device_id;
+	u32 expansion_rom_addr;
+	u8 capabilities, reserved1[3];
+	u32 reserved2;
+	u8 irq_line, irq_pin, min_grant, max_latency;
+
+	/* Now, this is the linked capability list. */
+	struct virtio_pci_cap common;
+	struct virtio_pci_notify_cap notify;
+	struct virtio_pci_cap isr;
+	struct virtio_pci_cap device;
+	struct virtio_pci_cfg_cap cfg_access;
+};
 
+/* The device structure describes a single device. */
+struct device {
 	/* The name of this device, for --verbose. */
 	const char *name;
 
@@ -139,6 +171,25 @@ struct device {
 	/* Is it operational */
 	bool running;
 
+	/* Has it written FEATURES_OK but not re-checked it? */
+	bool wrote_features_ok;
+
+	/* PCI configuration */
+	union {
+		struct pci_config config;
+		u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
+	};
+
+	/* Features we offer, and those accepted. */
+	u64 features, features_accepted;
+
+	/* Device-specific config hangs off the end of this. */
+	struct virtio_pci_mmio *mmio;
+
+	/* PCI MMIO resources (all in BAR0) */
+	size_t mmio_size;
+	u32 mmio_addr;
+
 	/* Device-specific data. */
 	void *priv;
 };
@@ -150,12 +201,15 @@ struct virtqueue {
 	/* Which device owns me. */
 	struct device *dev;
 
-	/* The configuration for this queue. */
-	struct lguest_vqconfig config;
+	/* Name for printing errors. */
+	const char *name;
 
 	/* The actual ring of buffers. */
 	struct vring vring;
 
+	/* The information about this virtqueue (we only use queue_size on) */
+	struct virtio_pci_common_cfg pci_config;
+
 	/* Last available index we saw. */
 	u16 last_avail_idx;
 
@@ -199,6 +253,16 @@ static struct termios orig_term;
 #define le32_to_cpu(v32) (v32)
 #define le64_to_cpu(v64) (v64)
 
+/*
+ * A real device would ignore weird/non-compliant driver behaviour.  We
+ * stop and flag it, to help debugging Linux problems.
+ */
+#define bad_driver(d, fmt, ...) \
+	errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
+#define bad_driver_vq(vq, fmt, ...)			       \
+	errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
+	     vq->name, ## __VA_ARGS__)
+
 /* Is this iovec empty? */
 static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
 {
@@ -211,7 +275,8 @@ static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
 }
 
 /* Take len bytes from the front of this iovec. */
-static void iov_consume(struct iovec iov[], unsigned num_iov,
+static void iov_consume(struct device *d,
+			struct iovec iov[], unsigned num_iov,
 			void *dest, unsigned len)
 {
 	unsigned int i;
@@ -229,14 +294,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov,
 		len -= used;
 	}
 	if (len != 0)
-		errx(1, "iovec too short!");
-}
-
-/* The device virtqueue descriptors are followed by feature bitmasks. */
-static u8 *get_feature_bits(struct device *dev)
-{
-	return (u8 *)(dev->desc + 1)
-		+ dev->num_vq * sizeof(struct lguest_vqconfig);
+		bad_driver(d, "iovec too short!");
 }
 
 /*L:100
@@ -309,14 +367,20 @@ static void *map_zeroed_pages(unsigned int num)
 	return addr + getpagesize();
 }
 
-/* Get some more pages for a device. */
-static void *get_pages(unsigned int num)
+/* Get some bytes which won't be mapped into the guest. */
+static unsigned long get_mmio_region(size_t size)
 {
-	void *addr = from_guest_phys(guest_limit);
+	unsigned long addr = guest_mmio;
+	size_t i;
+
+	if (!size)
+		return addr;
+
+	/* Size has to be a power of 2 (and multiple of 16) */
+	for (i = 1; i < size; i <<= 1);
+
+	guest_mmio += i;
 
-	guest_limit += num * getpagesize();
-	if (guest_limit > guest_max)
-		errx(1, "Not enough memory for devices");
 	return addr;
 }
 
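
A worked example of the rounding loop in get_mmio_region() above: a request for 0x30 bytes doubles i through 1, 2, 4, 8, 16, 32, 64 and stops at 64 >= 0x30, so each device's MMIO slot is padded to a power of two, matching how PCI BARs are sized.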
@@ -547,9 +611,11 @@ static void tell_kernel(unsigned long start)
547{ 611{
548 unsigned long args[] = { LHREQ_INITIALIZE, 612 unsigned long args[] = { LHREQ_INITIALIZE,
549 (unsigned long)guest_base, 613 (unsigned long)guest_base,
550 guest_limit / getpagesize(), start }; 614 guest_limit / getpagesize(), start,
551 verbose("Guest: %p - %p (%#lx)\n", 615 (guest_mmio+getpagesize()-1) / getpagesize() };
552 guest_base, guest_base + guest_limit, guest_limit); 616 verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
617 guest_base, guest_base + guest_limit,
618 guest_limit, guest_mmio);
553 lguest_fd = open_or_die("/dev/lguest", O_RDWR); 619 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
554 if (write(lguest_fd, args, sizeof(args)) < 0) 620 if (write(lguest_fd, args, sizeof(args)) < 0)
555 err(1, "Writing to /dev/lguest"); 621 err(1, "Writing to /dev/lguest");
@@ -564,7 +630,8 @@ static void tell_kernel(unsigned long start)
564 * we have a convenient routine which checks it and exits with an error message 630 * we have a convenient routine which checks it and exits with an error message
565 * if something funny is going on: 631 * if something funny is going on:
566 */ 632 */
567static void *_check_pointer(unsigned long addr, unsigned int size, 633static void *_check_pointer(struct device *d,
634 unsigned long addr, unsigned int size,
568 unsigned int line) 635 unsigned int line)
569{ 636{
570 /* 637 /*
@@ -572,7 +639,8 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
572 * or addr + size wraps around. 639 * or addr + size wraps around.
573 */ 640 */
574 if ((addr + size) > guest_limit || (addr + size) < addr) 641 if ((addr + size) > guest_limit || (addr + size) < addr)
575 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 642 bad_driver(d, "%s:%i: Invalid address %#lx",
643 __FILE__, line, addr);
576 /* 644 /*
577 * We return a pointer for the caller's convenience, now we know it's 645 * We return a pointer for the caller's convenience, now we know it's
578 * safe to use. 646 * safe to use.
@@ -580,14 +648,14 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
580 return from_guest_phys(addr); 648 return from_guest_phys(addr);
581} 649}
582/* A macro which transparently hands the line number to the real function. */ 650/* A macro which transparently hands the line number to the real function. */
583#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 651#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
584 652
585/* 653/*
586 * Each buffer in the virtqueues is actually a chain of descriptors. This 654 * Each buffer in the virtqueues is actually a chain of descriptors. This
587 * function returns the next descriptor in the chain, or vq->vring.num if we're 655 * function returns the next descriptor in the chain, or vq->vring.num if we're
588 * at the end. 656 * at the end.
589 */ 657 */
590static unsigned next_desc(struct vring_desc *desc, 658static unsigned next_desc(struct device *d, struct vring_desc *desc,
591 unsigned int i, unsigned int max) 659 unsigned int i, unsigned int max)
592{ 660{
593 unsigned int next; 661 unsigned int next;
@@ -602,7 +670,7 @@ static unsigned next_desc(struct vring_desc *desc,
602 wmb(); 670 wmb();
603 671
604 if (next >= max) 672 if (next >= max)
605 errx(1, "Desc next is %u", next); 673 bad_driver(d, "Desc next is %u", next);
606 674
607 return next; 675 return next;
608} 676}
@@ -613,21 +681,48 @@ static unsigned next_desc(struct vring_desc *desc,
613 */ 681 */
614static void trigger_irq(struct virtqueue *vq) 682static void trigger_irq(struct virtqueue *vq)
615{ 683{
616 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 684 unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
617 685
618 /* Don't inform them if nothing used. */ 686 /* Don't inform them if nothing used. */
619 if (!vq->pending_used) 687 if (!vq->pending_used)
620 return; 688 return;
621 vq->pending_used = 0; 689 vq->pending_used = 0;
622 690
623 /* If they don't want an interrupt, don't send one... */ 691 /*
692 * 2.4.7.1:
693 *
694 * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
695 * The driver MUST set flags to 0 or 1.
696 */
697 if (vq->vring.avail->flags > 1)
698 bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
699
700 /*
701 * 2.4.7.2:
702 *
703 * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
704 *
705 * - The device MUST ignore the used_event value.
706 * - After the device writes a descriptor index into the used ring:
707 * - If flags is 1, the device SHOULD NOT send an interrupt.
708 * - If flags is 0, the device MUST send an interrupt.
709 */
624 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { 710 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
625 return; 711 return;
626 } 712 }
627 713
714 /*
715 * 4.1.4.5.1:
716 *
717 * If MSI-X capability is disabled, the device MUST set the Queue
718 * Interrupt bit in ISR status before sending a virtqueue notification
719 * to the driver.
720 */
721 vq->dev->mmio->isr = 0x1;
722
628 /* Send the Guest an interrupt to tell them we used something up. */ 723
629 if (write(lguest_fd, buf, sizeof(buf)) != 0) 724 if (write(lguest_fd, buf, sizeof(buf)) != 0)
630 err(1, "Triggering irq %i", vq->config.irq); 725 err(1, "Triggering irq %i", vq->dev->config.irq_line);
631} 726}
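The ISR write above pairs with a single read on the driver side: per 4.1.4.5.1 the read both fetches and clears the status, so a driver has to cache the value locally. A hedged sketch of that driver half (the register layout follows the struct virtio_pci_mmio used in this file):

	/* Driver-side view (sketch): one 8-bit ISR read fetches and clears. */
	static int virtqueue_interrupt_pending(volatile unsigned char *isr)
	{
		unsigned char status = *isr;	/* device resets ISR to 0 here */

		return status & 0x1;		/* bit 0: virtqueue activity */
	}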
632 727
633/* 728/*
@@ -646,6 +741,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
646 struct vring_desc *desc; 741 struct vring_desc *desc;
647 u16 last_avail = lg_last_avail(vq); 742 u16 last_avail = lg_last_avail(vq);
648 743
744 /*
745 * 2.4.7.1:
746 *
747 * The driver MUST handle spurious interrupts from the device.
748 *
749 * That's why this is a while loop.
750 */
751
649 /* There's nothing available? */ 752 /* There's nothing available? */
650 while (last_avail == vq->vring.avail->idx) { 753 while (last_avail == vq->vring.avail->idx) {
651 u64 event; 754 u64 event;
@@ -679,8 +782,8 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
679 782
680 /* Check it isn't doing very strange things with descriptor numbers. */ 783 /* Check it isn't doing very strange things with descriptor numbers. */
681 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 784 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
682 errx(1, "Guest moved used index from %u to %u", 785 bad_driver_vq(vq, "Guest moved used index from %u to %u",
683 last_avail, vq->vring.avail->idx); 786 last_avail, vq->vring.avail->idx);
684 787
685 /* 788 /*
686 * Make sure we read the descriptor number *after* we read the ring 789 * Make sure we read the descriptor number *after* we read the ring
@@ -697,7 +800,7 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
697 800
698 /* If their number is silly, that's a fatal mistake. */ 801 /* If their number is silly, that's a fatal mistake. */
699 if (head >= vq->vring.num) 802 if (head >= vq->vring.num)
700 errx(1, "Guest says index %u is available", head); 803 bad_driver_vq(vq, "Guest says index %u is available", head);
701 804
702 /* When we start there are none of either input nor output. */ 805 /* When we start there are none of either input nor output. */
703 *out_num = *in_num = 0; 806 *out_num = *in_num = 0;
@@ -712,24 +815,73 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
712 * that: no rmb() required. 815 * that: no rmb() required.
713 */ 816 */
714 817
715 /* 818 do {
716 * If this is an indirect entry, then this buffer contains a descriptor 819 /*
717 * table which we handle as if it's any normal descriptor chain. 820 * If this is an indirect entry, then this buffer contains a
718 */ 821 * descriptor table which we handle as if it's any normal
719 if (desc[i].flags & VRING_DESC_F_INDIRECT) { 822 * descriptor chain.
720 if (desc[i].len % sizeof(struct vring_desc)) 823 */
721 errx(1, "Invalid size for indirect buffer table"); 824 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
825 /* 2.4.5.3.1:
826 *
827 * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
828 * flag unless the VIRTIO_F_INDIRECT_DESC feature was
829 * negotiated.
830 */
831 if (!(vq->dev->features_accepted &
832 (1<<VIRTIO_RING_F_INDIRECT_DESC)))
833 bad_driver_vq(vq, "vq indirect not negotiated");
722 834
723 max = desc[i].len / sizeof(struct vring_desc); 835 /*
724 desc = check_pointer(desc[i].addr, desc[i].len); 836 * 2.4.5.3.1:
725 i = 0; 837 *
726 } 838 * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
839 * flag within an indirect descriptor (ie. only one
840 * table per descriptor).
841 */
842 if (desc != vq->vring.desc)
843 bad_driver_vq(vq, "Indirect within indirect");
844
845 /*
846 * Proposed update VIRTIO-134 spells this out:
847 *
848 * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
849 * and VIRTQ_DESC_F_NEXT in flags.
850 */
851 if (desc[i].flags & VRING_DESC_F_NEXT)
852 bad_driver_vq(vq, "indirect and next together");
853
854 if (desc[i].len % sizeof(struct vring_desc))
855 bad_driver_vq(vq,
856 "Invalid size for indirect table");
857 /*
858 * 2.4.5.3.2:
859 *
860 * The device MUST ignore the write-only flag
861 * (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
862 * refers to an indirect table.
863 *
864 * We ignore it here: :)
865 */
866
867 max = desc[i].len / sizeof(struct vring_desc);
868 desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
869 i = 0;
870
871 /* 2.4.5.3.1:
872 *
873 * A driver MUST NOT create a descriptor chain longer
874 * than the Queue Size of the device.
875 */
876 if (max > vq->pci_config.queue_size)
877 bad_driver_vq(vq,
878 "indirect has too many entries");
879 }
727 880
728 do {
729 /* Grab the first descriptor, and check it's OK. */ 881 /* Grab the first descriptor, and check it's OK. */
730 iov[*out_num + *in_num].iov_len = desc[i].len; 882 iov[*out_num + *in_num].iov_len = desc[i].len;
731 iov[*out_num + *in_num].iov_base 883 iov[*out_num + *in_num].iov_base
732 = check_pointer(desc[i].addr, desc[i].len); 884 = check_pointer(vq->dev, desc[i].addr, desc[i].len);
733 /* If this is an input descriptor, increment that count. */ 885 /* If this is an input descriptor, increment that count. */
734 if (desc[i].flags & VRING_DESC_F_WRITE) 886 if (desc[i].flags & VRING_DESC_F_WRITE)
735 (*in_num)++; 887 (*in_num)++;
@@ -739,14 +891,15 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
739 * to come before any input descriptors. 891 * to come before any input descriptors.
740 */ 892 */
741 if (*in_num) 893 if (*in_num)
742 errx(1, "Descriptor has out after in"); 894 bad_driver_vq(vq,
895 "Descriptor has out after in");
743 (*out_num)++; 896 (*out_num)++;
744 } 897 }
745 898
746 /* If we've got too many, that implies a descriptor loop. */ 899 /* If we've got too many, that implies a descriptor loop. */
747 if (*out_num + *in_num > max) 900 if (*out_num + *in_num > max)
748 errx(1, "Looped descriptor"); 901 bad_driver_vq(vq, "Looped descriptor");
749 } while ((i = next_desc(desc, i, max)) != max); 902 } while ((i = next_desc(vq->dev, desc, i, max)) != max);
750 903
751 return head; 904 return head;
752} 905}
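When the indirect branch above re-bases the walk, three things change at once: desc points into the guest-supplied table, max becomes the table's entry count, and i restarts at 0. A condensed sketch of just that re-basing, with check_ptr as a hypothetical stand-in for check_pointer():

	#include <stdint.h>
	#include <stddef.h>

	struct vring_desc {
		uint64_t addr;		/* guest-physical address */
		uint32_t len;
		uint16_t flags;
		uint16_t next;
	};

	/* Hypothetical bounds-checked translation, like check_pointer(). */
	extern void *check_ptr(uint64_t addr, uint32_t len);

	/* Re-base the chain walk onto an indirect table (2.4.5.3.1). */
	static struct vring_desc *enter_indirect(const struct vring_desc *d,
						 unsigned int *max,
						 unsigned int *i)
	{
		*max = d->len / sizeof(struct vring_desc);
		*i = 0;
		return check_ptr(d->addr, d->len);
	}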
@@ -803,7 +956,7 @@ static void console_input(struct virtqueue *vq)
803 /* Make sure there's a descriptor available. */ 956 /* Make sure there's a descriptor available. */
804 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 957 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
805 if (out_num) 958 if (out_num)
806 errx(1, "Output buffers in console in queue?"); 959 bad_driver_vq(vq, "Output buffers in console in queue?");
807 960
808 /* Read into it. This is where we usually wait. */ 961 /* Read into it. This is where we usually wait. */
809 len = readv(STDIN_FILENO, iov, in_num); 962 len = readv(STDIN_FILENO, iov, in_num);
@@ -856,7 +1009,7 @@ static void console_output(struct virtqueue *vq)
856 /* We usually wait in here, for the Guest to give us something. */ 1009 /* We usually wait in here, for the Guest to give us something. */
857 head = wait_for_vq_desc(vq, iov, &out, &in); 1010 head = wait_for_vq_desc(vq, iov, &out, &in);
858 if (in) 1011 if (in)
859 errx(1, "Input buffers in console output queue?"); 1012 bad_driver_vq(vq, "Input buffers in console output queue?");
860 1013
861 /* writev can return a partial write, so we loop here. */ 1014 /* writev can return a partial write, so we loop here. */
862 while (!iov_empty(iov, out)) { 1015 while (!iov_empty(iov, out)) {
@@ -865,7 +1018,7 @@ static void console_output(struct virtqueue *vq)
865 warn("Write to stdout gave %i (%d)", len, errno); 1018 warn("Write to stdout gave %i (%d)", len, errno);
866 break; 1019 break;
867 } 1020 }
868 iov_consume(iov, out, NULL, len); 1021 iov_consume(vq->dev, iov, out, NULL, len);
869 } 1022 }
870 1023
871 /* 1024 /*
@@ -894,7 +1047,7 @@ static void net_output(struct virtqueue *vq)
894 /* We usually wait in here for the Guest to give us a packet. */ 1047 /* We usually wait in here for the Guest to give us a packet. */
895 head = wait_for_vq_desc(vq, iov, &out, &in); 1048 head = wait_for_vq_desc(vq, iov, &out, &in);
896 if (in) 1049 if (in)
897 errx(1, "Input buffers in net output queue?"); 1050 bad_driver_vq(vq, "Input buffers in net output queue?");
898 /* 1051 /*
899 * Send the whole thing through to /dev/net/tun. It expects the exact 1052 * Send the whole thing through to /dev/net/tun. It expects the exact
900 * same format: what a coincidence! 1053 * same format: what a coincidence!
@@ -942,7 +1095,7 @@ static void net_input(struct virtqueue *vq)
942 */ 1095 */
943 head = wait_for_vq_desc(vq, iov, &out, &in); 1096 head = wait_for_vq_desc(vq, iov, &out, &in);
944 if (out) 1097 if (out)
945 errx(1, "Output buffers in net input queue?"); 1098 bad_driver_vq(vq, "Output buffers in net input queue?");
946 1099
947 /* 1100 /*
948 * If it looks like we'll block reading from the tun device, send them 1101 * If it looks like we'll block reading from the tun device, send them
@@ -986,6 +1139,12 @@ static void kill_launcher(int signal)
986 kill(0, SIGTERM); 1139 kill(0, SIGTERM);
987} 1140}
988 1141
1142static void reset_vq_pci_config(struct virtqueue *vq)
1143{
1144 vq->pci_config.queue_size = VIRTQUEUE_NUM;
1145 vq->pci_config.queue_enable = 0;
1146}
1147
989static void reset_device(struct device *dev) 1148static void reset_device(struct device *dev)
990{ 1149{
991 struct virtqueue *vq; 1150 struct virtqueue *vq;
@@ -993,53 +1152,705 @@ static void reset_device(struct device *dev)
993 verbose("Resetting device %s\n", dev->name); 1152 verbose("Resetting device %s\n", dev->name);
994 1153
995 /* Clear any features they've acked. */ 1154 /* Clear any features they've acked. */
996 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); 1155 dev->features_accepted = 0;
997 1156
998 /* We're going to be explicitly killing threads, so ignore them. */ 1157 /* We're going to be explicitly killing threads, so ignore them. */
999 signal(SIGCHLD, SIG_IGN); 1158 signal(SIGCHLD, SIG_IGN);
1000 1159
1001 /* Zero out the virtqueues, get rid of their threads */ 1160 /*
1161 * 4.1.4.3.1:
1162 *
1163 * The device MUST present a 0 in queue_enable on reset.
1164 *
1165 * This means we set it here, and reset the saved ones in every vq.
1166 */
1167 dev->mmio->cfg.queue_enable = 0;
1168
1169 /* Get rid of the virtqueue threads */
1002 for (vq = dev->vq; vq; vq = vq->next) { 1170 for (vq = dev->vq; vq; vq = vq->next) {
1171 vq->last_avail_idx = 0;
1172 reset_vq_pci_config(vq);
1003 if (vq->thread != (pid_t)-1) { 1173 if (vq->thread != (pid_t)-1) {
1004 kill(vq->thread, SIGTERM); 1174 kill(vq->thread, SIGTERM);
1005 waitpid(vq->thread, NULL, 0); 1175 waitpid(vq->thread, NULL, 0);
1006 vq->thread = (pid_t)-1; 1176 vq->thread = (pid_t)-1;
1007 } 1177 }
1008 memset(vq->vring.desc, 0,
1009 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
1010 lg_last_avail(vq) = 0;
1011 } 1178 }
1012 dev->running = false; 1179 dev->running = false;
1180 dev->wrote_features_ok = false;
1013 1181
1014 /* Now we care if threads die. */ 1182 /* Now we care if threads die. */
1015 signal(SIGCHLD, (void *)kill_launcher); 1183 signal(SIGCHLD, (void *)kill_launcher);
1016} 1184}
1017 1185
1186static void cleanup_devices(void)
1187{
1188 unsigned int i;
1189
1190 for (i = 1; i < MAX_PCI_DEVICES; i++) {
1191 struct device *d = devices.pci[i];
1192 if (!d)
1193 continue;
1194 reset_device(d);
1195 }
1196
1197 /* If we saved off the original terminal settings, restore them now. */
1198 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
1199 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
1200}
1201
1202/*L:217
1203 * We do PCI. This is mainly done to let us test the kernel virtio PCI
1204 * code.
1205 */
1206
1207/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
1208static struct device pci_host_bridge;
1209
1210static void init_pci_host_bridge(void)
1211{
1212 pci_host_bridge.name = "PCI Host Bridge";
1213 pci_host_bridge.config.class = 0x06; /* bridge */
1214 pci_host_bridge.config.subclass = 0; /* host bridge */
1215 devices.pci[0] = &pci_host_bridge;
1216}
1217
1218/* The IO ports used to read the PCI config space. */
1219#define PCI_CONFIG_ADDR 0xCF8
1220#define PCI_CONFIG_DATA 0xCFC
1221
1222/*
1223 * Not really portable, but does help readability: this is what the Guest
1224 * writes to the PCI_CONFIG_ADDR IO port.
1225 */
1226union pci_config_addr {
1227 struct {
1228 unsigned mbz: 2;
1229 unsigned offset: 6;
1230 unsigned funcnum: 3;
1231 unsigned devnum: 5;
1232 unsigned busnum: 8;
1233 unsigned reserved: 7;
1234 unsigned enabled : 1;
1235 } bits;
1236 u32 val;
1237};
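For readers who prefer shifts to bitfields, the same CONFIG_ADDRESS split can be recovered by hand; a sketch assuming the standard type-1 encoding (which the union above mirrors):

	#include <stdint.h>

	/* Decode a type-1 PCI CONFIG_ADDRESS dword written to 0xCF8. */
	static void decode_config_addr(uint32_t val, unsigned *bus,
				       unsigned *dev, unsigned *func,
				       unsigned *reg, unsigned *enabled)
	{
		*reg     = (val >> 2) & 0x3F;	/* dword-aligned register */
		*func    = (val >> 8) & 0x7;
		*dev     = (val >> 11) & 0x1F;
		*bus     = (val >> 16) & 0xFF;
		*enabled = (val >> 31) & 0x1;	/* bits 24-30 are reserved */
	}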
1238
1239/*
1240 * We cache what they wrote to the address port, so we know what they're
1241 * talking about when they access the data port.
1242 */
1243static union pci_config_addr pci_config_addr;
1244
1245static struct device *find_pci_device(unsigned int index)
1246{
1247 return devices.pci[index];
1248}
1249
1250/* PCI can do 1, 2 and 4 byte reads; we handle that here. */
1251static void ioread(u16 off, u32 v, u32 mask, u32 *val)
1252{
1253 assert(off < 4);
1254 assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
1255 *val = (v >> (off * 8)) & mask;
1256}
1257
1258/* PCI can do 1, 2 and 4 byte writes; we handle that here. */
1259static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
1260{
1261 assert(off < 4);
1262 assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
1263 *dst &= ~(mask << (off * 8));
1264 *dst |= (v & mask) << (off * 8);
1265}
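A quick usage sketch of the pair, patching one byte of a 32-bit register and reading it back (u32 and assert as in the rest of this file):

	static void subword_access_demo(void)
	{
		u32 reg = 0x11223344, out;

		/* Replace byte 2: reg becomes 0x11AB3344. */
		iowrite(2, 0xAB, 0xFF, &reg);

		/* Read the same byte back through an 8-bit mask. */
		ioread(2, reg, 0xFF, &out);
		assert(out == 0xAB);
	}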
1266
1267/*
1268 * Where PCI_CONFIG_DATA accesses depends on the previous write to
1269 * PCI_CONFIG_ADDR.
1270 */
1271static struct device *dev_and_reg(u32 *reg)
1272{
1273 if (!pci_config_addr.bits.enabled)
1274 return NULL;
1275
1276 if (pci_config_addr.bits.funcnum != 0)
1277 return NULL;
1278
1279 if (pci_config_addr.bits.busnum != 0)
1280 return NULL;
1281
1282 if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
1283 return NULL;
1284
1285 *reg = pci_config_addr.bits.offset;
1286 return find_pci_device(pci_config_addr.bits.devnum);
1287}
1288
1289/*
1290 * We can get invalid combinations of values while they're writing, so we
1291 * only fault if they try to write with some invalid bar/offset/length.
1292 */
1293static bool valid_bar_access(struct device *d,
1294 struct virtio_pci_cfg_cap *cfg_access)
1295{
1296 /* We only have 1 bar (BAR0) */
1297 if (cfg_access->cap.bar != 0)
1298 return false;
1299
1300 /* Check it's within BAR0. */
1301 if (cfg_access->cap.offset >= d->mmio_size
1302 || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
1303 return false;
1304
1305 /* Check length is 1, 2 or 4. */
1306 if (cfg_access->cap.length != 1
1307 && cfg_access->cap.length != 2
1308 && cfg_access->cap.length != 4)
1309 return false;
1310
1311 /*
1312 * 4.1.4.7.2:
1313 *
1314 * The driver MUST NOT write a cap.offset which is not a multiple of
1315 * cap.length (ie. all accesses MUST be aligned).
1316 */
1317 if (cfg_access->cap.offset % cfg_access->cap.length != 0)
1318 return false;
1319
1320 /* All checks passed: this is a valid window access. */
1321 return true;
1322}
1323
1324 /* Is this accessing the PCI config address port? */
1325static bool is_pci_addr_port(u16 port)
1326{
1327 return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
1328}
1329
1330static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
1331{
1332 iowrite(port - PCI_CONFIG_ADDR, val, mask,
1333 &pci_config_addr.val);
1334 verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
1335 pci_config_addr.bits.enabled ? "" : " DISABLED",
1336 val, mask,
1337 pci_config_addr.bits.busnum,
1338 pci_config_addr.bits.devnum,
1339 pci_config_addr.bits.funcnum,
1340 pci_config_addr.bits.offset);
1341 return true;
1342}
1343
1344static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
1345{
1346 ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
1347}
1348
1349 /* Is this accessing the PCI config data port? */
1350static bool is_pci_data_port(u16 port)
1351{
1352 return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
1353}
1354
1355static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);
1356
1357static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
1358{
1359 u32 reg, portoff;
1360 struct device *d = dev_and_reg(&reg);
1361
1362 /* Complain if they don't belong to a device. */
1363 if (!d)
1364 return false;
1365
1366 /* They can do 1 byte writes, etc. */
1367 portoff = port - PCI_CONFIG_DATA;
1368
1369 /*
1370 * PCI uses a weird way to determine the BAR size: the OS
1371 * writes all 1's, and sees which ones stick.
1372 */
1373 if (&d->config_words[reg] == &d->config.bar[0]) {
1374 int i;
1375
1376 iowrite(portoff, val, mask, &d->config.bar[0]);
1377 for (i = 0; (1 << i) < d->mmio_size; i++)
1378 d->config.bar[0] &= ~(1 << i);
1379 return true;
1380 } else if ((&d->config_words[reg] > &d->config.bar[0]
1381 && &d->config_words[reg] <= &d->config.bar[6])
1382 || &d->config_words[reg] == &d->config.expansion_rom_addr) {
1383 /* Allow writing to any other BAR, or expansion ROM */
1384 iowrite(portoff, val, mask, &d->config_words[reg]);
1385 return true;
1386 /* We let them override the latency timer and cacheline size */
1387 } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
1388 /* Only let them change the first two fields. */
1389 if (mask == 0xFFFFFFFF)
1390 mask = 0xFFFF;
1391 iowrite(portoff, val, mask, &d->config_words[reg]);
1392 return true;
1393 } else if (&d->config_words[reg] == (void *)&d->config.command
1394 && mask == 0xFFFF) {
1395 /* Ignore command writes. */
1396 return true;
1397 } else if (&d->config_words[reg]
1398 == (void *)&d->config.cfg_access.cap.bar
1399 || &d->config_words[reg]
1400 == &d->config.cfg_access.cap.length
1401 || &d->config_words[reg]
1402 == &d->config.cfg_access.cap.offset) {
1403
1404 /*
1405 * The VIRTIO_PCI_CAP_PCI_CFG capability
1406 * provides a backdoor to access the MMIO
1407 * regions without mapping them. Weird, but
1408 * useful.
1409 */
1410 iowrite(portoff, val, mask, &d->config_words[reg]);
1411 return true;
1412 } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
1413 u32 write_mask;
1414
1415 /*
1416 * 4.1.4.7.1:
1417 *
1418 * Upon detecting driver write access to pci_cfg_data, the
1419 * device MUST execute a write access at offset cap.offset at
1420 * BAR selected by cap.bar using the first cap.length bytes
1421 * from pci_cfg_data.
1422 */
1423
1424 /* Must be bar 0 */
1425 if (!valid_bar_access(d, &d->config.cfg_access))
1426 return false;
1427
1428 iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);
1429
1430 /*
1431 * Now emulate a write. The mask we use is set by
1432 * len, *not* this write!
1433 */
1434 write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
1435 verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
1436 d->config.cfg_access.pci_cfg_data, write_mask,
1437 d->config.cfg_access.cap.bar,
1438 d->config.cfg_access.cap.offset,
1439 d->config.cfg_access.cap.length);
1440
1441 emulate_mmio_write(d, d->config.cfg_access.cap.offset,
1442 d->config.cfg_access.pci_cfg_data,
1443 write_mask);
1444 return true;
1445 }
1446
1447 /*
1448 * 4.1.4.1:
1449 *
1450 * The driver MUST NOT write into any field of the capability
1451 * structure, with the exception of those with cap_type
1452 * VIRTIO_PCI_CAP_PCI_CFG...
1453 */
1454 return false;
1455}
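The all-ones BAR write handled above is the device half of a classic probe; a hedged sketch of the guest half, using hypothetical outl()/inl() port helpers and assuming BAR0 is a memory BAR:

	#include <stdint.h>

	extern void outl(uint32_t val, uint16_t port);	/* hypothetical */
	extern uint32_t inl(uint16_t port);		/* hypothetical */

	/* Size a memory BAR: write all-ones, read back, restore. */
	static uint32_t probe_bar_size(uint32_t config_addr /* 0xCF8 value */)
	{
		uint32_t orig, readback;

		outl(config_addr, 0xCF8);	/* latch bus/dev/func/reg */
		orig = inl(0xCFC);

		outl(0xFFFFFFFF, 0xCFC);	/* device zeroes the size-1 bits */
		readback = inl(0xCFC);

		outl(orig, 0xCFC);		/* put the address back */

		return ~(readback & ~0xFU) + 1;	/* low 4 bits are flag bits */
	}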
1456
1457static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
1458
1459static void pci_data_ioread(u16 port, u32 mask, u32 *val)
1460{
1461 u32 reg;
1462 struct device *d = dev_and_reg(&reg);
1463
1464 if (!d)
1465 return;
1466
1467 /* Read through the PCI MMIO access window is special */
1468 if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
1469 u32 read_mask;
1470
1471 /*
1472 * 4.1.4.7.1:
1473 *
1474 * Upon detecting driver read access to pci_cfg_data, the
1475 * device MUST execute a read access of length cap.length at
1476 * offset cap.offset at BAR selected by cap.bar and store the
1477 * first cap.length bytes in pci_cfg_data.
1478 */
1479 /* Must be bar 0 */
1480 if (!valid_bar_access(d, &d->config.cfg_access))
1481 bad_driver(d,
1482 "Invalid cfg_access to bar%u, offset %u len %u",
1483 d->config.cfg_access.cap.bar,
1484 d->config.cfg_access.cap.offset,
1485 d->config.cfg_access.cap.length);
1486
1487 /*
1488 * Read into the window. The mask we use is set by
1489 * len, *not* this read!
1490 */
1491 read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
1492 d->config.cfg_access.pci_cfg_data
1493 = emulate_mmio_read(d,
1494 d->config.cfg_access.cap.offset,
1495 read_mask);
1496 verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
1497 d->config.cfg_access.pci_cfg_data, read_mask,
1498 d->config.cfg_access.cap.bar,
1499 d->config.cfg_access.cap.offset,
1500 d->config.cfg_access.cap.length);
1501 }
1502 ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
1503}
1504
1018/*L:216 1505/*L:216
1019 * This actually creates the thread which services the virtqueue for a device. 1506 * This is where we emulate a handful of Guest instructions. It's ugly
1507 * and we used to do it in the kernel but it grew over time.
1508 */
1509
1510/*
1511 * We use the ptrace syscall's pt_regs struct to talk about registers
1512 * to lguest: these macros convert the names to the offsets.
1513 */
1514#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
1515#define setreg(name, val) \
1516 setreg_off(offsetof(struct user_regs_struct, name), (val))
1517
1518static u32 getreg_off(size_t offset)
1519{
1520 u32 r;
1521 unsigned long args[] = { LHREQ_GETREG, offset };
1522
1523 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1524 err(1, "Getting register %u", offset);
1525 if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
1526 err(1, "Reading register %u", offset);
1527
1528 return r;
1529}
1530
1531static void setreg_off(size_t offset, u32 val)
1532{
1533 unsigned long args[] = { LHREQ_SETREG, offset, val };
1534
1535 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1536 err(1, "Setting register %u", offset);
1537}
1538
1539/* Get register by instruction encoding */
1540static u32 getreg_num(unsigned regnum, u32 mask)
1541{
1542 /* 8 bit ops use regnums 4-7 for high parts of word */
1543 if (mask == 0xFF && (regnum & 0x4))
1544 return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
1545
1546 switch (regnum) {
1547 case 0: return getreg(eax) & mask;
1548 case 1: return getreg(ecx) & mask;
1549 case 2: return getreg(edx) & mask;
1550 case 3: return getreg(ebx) & mask;
1551 case 4: return getreg(esp) & mask;
1552 case 5: return getreg(ebp) & mask;
1553 case 6: return getreg(esi) & mask;
1554 case 7: return getreg(edi) & mask;
1555 }
1556 abort();
1557}
1558
1559/* Set register by instruction encoding */
1560static void setreg_num(unsigned regnum, u32 val, u32 mask)
1561{
1562 /* Don't try to set bits out of range */
1563 assert(!(val & ~mask));
1564
1565 /* 8 bit ops use regnums 4-7 for high parts of word */
1566 if (mask == 0xFF && (regnum & 0x4)) {
1567 /* Construct the 16 bits we want. */
1568 val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
1569 setreg_num(regnum & 0x3, val, 0xFFFF);
1570 return;
1571 }
1572
1573 switch (regnum) {
1574 case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
1575 case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
1576 case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
1577 case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
1578 case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
1579 case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
1580 case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
1581 case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
1582 }
1583 abort();
1584}
1585
1586/* Get bytes of displacement appended to instruction, from r/m encoding */
1587static u32 insn_displacement_len(u8 mod_reg_rm)
1588{
1589 /* Switch on the mod bits */
1590 switch (mod_reg_rm >> 6) {
1591 case 0:
1592 /* If mod == 0 and r/m == 101, a 32-bit displacement follows */
1593 if ((mod_reg_rm & 0x7) == 0x5)
1594 return 4;
1595 /* Normally, mod == 0 means no literal displacement */
1596 return 0;
1597 case 1:
1598 /* One byte displacement */
1599 return 1;
1600 case 2:
1601 /* Four byte displacement */
1602 return 4;
1603 case 3:
1604 /* Register mode */
1605 return 0;
1606 }
1607 abort();
1608}
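As a worked example of the decode: mov %eax, 0x1234(%ebx) assembles to 89 83 34 12 00 00, so ModRM 0x83 gives mod=10 (disp32), reg=000 (%eax), r/m=011 (%ebx). A sketch checking the arithmetic against the helper above:

	static void modrm_length_demo(void)
	{
		/* mov %eax, 0x1234(%ebx)  =>  89 83 34 12 00 00 */
		const u8 insn[] = { 0x89, 0x83, 0x34, 0x12, 0x00, 0x00 };

		/* opcode + ModRM + 4-byte displacement = 6 bytes */
		assert(2 + insn_displacement_len(insn[1]) == sizeof(insn));
	}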
1609
1610static void emulate_insn(const u8 insn[])
1611{
1612 unsigned long args[] = { LHREQ_TRAP, 13 };
1613 unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
1614 unsigned int eax, port, mask;
1615 /*
1616 * Default is to return all-ones on IO port reads, which traditionally
1617 * means "there's nothing there".
1618 */
1619 u32 val = 0xFFFFFFFF;
1620
1621 /*
1622 * This must be the Guest kernel trying to do something, not userspace!
1623 * The bottom two bits of the CS segment register are the privilege
1624 * level.
1625 */
1626 if ((getreg(xcs) & 3) != 0x1)
1627 goto no_emulate;
1628
1629 /* Decoding x86 instructions is icky. */
1630
1631 /*
1632 * Around 2.6.33, the kernel started using an emulation for the
1633 * cmpxchg8b instruction in early boot on many configurations. This
1634 * code isn't paravirtualized, and it tries to disable interrupts.
1635 * Ignore it, which will Mostly Work.
1636 */
1637 if (insn[insnlen] == 0xfa) {
1638 /* "cli", or Clear Interrupt Enable instruction. Skip it. */
1639 insnlen = 1;
1640 goto skip_insn;
1641 }
1642
1643 /*
1644 * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
1645 */
1646 if (insn[insnlen] == 0x66) {
1647 small_operand = 1;
1648 /* The instruction is 1 byte so far, read the next byte. */
1649 insnlen = 1;
1650 }
1651
1652 /* If the lower bit isn't set, it's a single byte access */
1653 byte_access = !(insn[insnlen] & 1);
1654
1655 /*
1656 * Now we can ignore the lower bit and decode the 4 opcodes
1657 * we need to emulate.
1658 */
1659 switch (insn[insnlen] & 0xFE) {
1660 case 0xE4: /* in <next byte>,%al */
1661 port = insn[insnlen+1];
1662 insnlen += 2;
1663 in = 1;
1664 break;
1665 case 0xEC: /* in (%dx),%al */
1666 port = getreg(edx) & 0xFFFF;
1667 insnlen += 1;
1668 in = 1;
1669 break;
1670 case 0xE6: /* out %al,<next byte> */
1671 port = insn[insnlen+1];
1672 insnlen += 2;
1673 break;
1674 case 0xEE: /* out %al,(%dx) */
1675 port = getreg(edx) & 0xFFFF;
1676 insnlen += 1;
1677 break;
1678 default:
1679 /* OK, we don't know what this is, can't emulate. */
1680 goto no_emulate;
1681 }
1682
1683 /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
1684 if (byte_access)
1685 mask = 0xFF;
1686 else if (small_operand)
1687 mask = 0xFFFF;
1688 else
1689 mask = 0xFFFFFFFF;
1690
1691 /*
1692 * If it was an "IN" instruction, they expect the result to be read
1693 * into %eax, so we change %eax.
1694 */
1695 eax = getreg(eax);
1696
1697 if (in) {
1698 /* This is the PS/2 keyboard status; 1 means ready for output */
1699 if (port == 0x64)
1700 val = 1;
1701 else if (is_pci_addr_port(port))
1702 pci_addr_ioread(port, mask, &val);
1703 else if (is_pci_data_port(port))
1704 pci_data_ioread(port, mask, &val);
1705
1706 /* Clear the bits we're about to read */
1707 eax &= ~mask;
1708 /* Copy bits in from val. */
1709 eax |= val & mask;
1710 /* Now update the register. */
1711 setreg(eax, eax);
1712 } else {
1713 if (is_pci_addr_port(port)) {
1714 if (!pci_addr_iowrite(port, mask, eax))
1715 goto bad_io;
1716 } else if (is_pci_data_port(port)) {
1717 if (!pci_data_iowrite(port, mask, eax))
1718 goto bad_io;
1719 }
1720 /* There are many other ports, eg. CMOS clock, serial
1721 * and parallel ports, so we ignore them all. */
1722 }
1723
1724 verbose("IO %s of %x to %u: %#08x\n",
1725 in ? "IN" : "OUT", mask, port, eax);
1726skip_insn:
1727 /* Finally, we've "done" the instruction, so move past it. */
1728 setreg(eip, getreg(eip) + insnlen);
1729 return;
1730
1731bad_io:
1732 warnx("Attempt to %s port %u (%#x mask)",
1733 in ? "read from" : "write to", port, mask);
1734
1735no_emulate:
1736 /* Inject trap into Guest. */
1737 if (write(lguest_fd, args, sizeof(args)) < 0)
1738 err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
1739}
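For reference, the four port-I/O opcode pairs the switch above decodes, collected in one place (the %al forms have the low opcode bit clear, the %eax/%ax forms have it set):

	/*
	 * Opcode  Mnemonic              Port taken from    Direction
	 * E4/E5   in  $imm8, %al/%eax   immediate byte     in = 1
	 * EC/ED   in  (%dx), %al/%eax   %dx                in = 1
	 * E6/E7   out %al/%eax, $imm8   immediate byte     out
	 * EE/EF   out %al/%eax, (%dx)   %dx                out
	 */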
1740
1741static struct device *find_mmio_region(unsigned long paddr, u32 *off)
1742{
1743 unsigned int i;
1744
1745 for (i = 1; i < MAX_PCI_DEVICES; i++) {
1746 struct device *d = devices.pci[i];
1747
1748 if (!d)
1749 continue;
1750 if (paddr < d->mmio_addr)
1751 continue;
1752 if (paddr >= d->mmio_addr + d->mmio_size)
1753 continue;
1754 *off = paddr - d->mmio_addr;
1755 return d;
1756 }
1757 return NULL;
1758}
1759
1760/* FIXME: Use vq array. */
1761static struct virtqueue *vq_by_num(struct device *d, u32 num)
1762{
1763 struct virtqueue *vq = d->vq;
1764
1765 while (num-- && vq)
1766 vq = vq->next;
1767
1768 return vq;
1769}
1770
1771static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
1772 struct virtqueue *vq)
1773{
1774 vq->pci_config = *cfg;
1775}
1776
1777static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
1778 struct virtqueue *vq)
1779{
1780 /* Only restore the per-vq part */
1781 size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
1782
1783 memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
1784 sizeof(*cfg) - off);
1785}
1786
1787/*
1788 * 4.1.4.3.2:
1789 *
1790 * The driver MUST configure the other virtqueue fields before
1791 * enabling the virtqueue with queue_enable.
1792 *
1793 * When they enable the virtqueue, we check that their setup is valid.
1020 */ 1794 */
1021static void create_thread(struct virtqueue *vq) 1795static void check_virtqueue(struct device *d, struct virtqueue *vq)
1796{
1797 /* Because lguest is 32 bit, all the descriptor high bits must be 0 */
1798 if (vq->pci_config.queue_desc_hi
1799 || vq->pci_config.queue_avail_hi
1800 || vq->pci_config.queue_used_hi)
1801 bad_driver_vq(vq, "invalid 64-bit queue address");
1802
1803 /*
1804 * 2.4.1:
1805 *
1806 * The driver MUST ensure that the physical address of the first byte
1807 * of each virtqueue part is a multiple of the specified alignment
1808 * value in the above table.
1809 */
1810 if (vq->pci_config.queue_desc_lo % 16
1811 || vq->pci_config.queue_avail_lo % 2
1812 || vq->pci_config.queue_used_lo % 4)
1813 bad_driver_vq(vq, "invalid alignment in queue addresses");
1814
1815 /* Initialize the virtqueue and check they're all in range. */
1816 vq->vring.num = vq->pci_config.queue_size;
1817 vq->vring.desc = check_pointer(vq->dev,
1818 vq->pci_config.queue_desc_lo,
1819 sizeof(*vq->vring.desc) * vq->vring.num);
1820 vq->vring.avail = check_pointer(vq->dev,
1821 vq->pci_config.queue_avail_lo,
1822 sizeof(*vq->vring.avail)
1823 + (sizeof(vq->vring.avail->ring[0])
1824 * vq->vring.num));
1825 vq->vring.used = check_pointer(vq->dev,
1826 vq->pci_config.queue_used_lo,
1827 sizeof(*vq->vring.used)
1828 + (sizeof(vq->vring.used->ring[0])
1829 * vq->vring.num));
1830
1831 /*
1832 * 2.4.9.1:
1833 *
1834 * The driver MUST initialize flags in the used ring to 0
1835 * when allocating the used ring.
1836 */
1837 if (vq->vring.used->flags != 0)
1838 bad_driver_vq(vq, "invalid initial used.flags %#x",
1839 vq->vring.used->flags);
1840}
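The three check_pointer() calls above bound each ring part separately; as a cross-check, the byte counts they imply for a queue of num entries (a sketch matching those expressions, ignoring the optional event fields and inter-part padding):

	#include <stddef.h>
	#include <stdio.h>

	/* Split-ring part sizes as checked above: 16-byte descriptors,
	 * flags+idx plus one u16 per avail entry, one 8-byte used element. */
	static void vring_part_sizes(unsigned num)
	{
		size_t desc  = (size_t)num * 16;
		size_t avail = 4 + (size_t)num * 2;
		size_t used  = 4 + (size_t)num * 8;

		printf("num=%u: desc=%zu avail=%zu used=%zu bytes\n",
		       num, desc, avail, used);
	}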
1841
1842static void start_virtqueue(struct virtqueue *vq)
1022{ 1843{
1023 /* 1844 /*
1024 * Create stack for thread. Since the stack grows upwards, we point 1845 * Create stack for thread. Since the stack grows upwards, we point
1025 * the stack pointer to the end of this region. 1846 * the stack pointer to the end of this region.
1026 */ 1847 */
1027 char *stack = malloc(32768); 1848 char *stack = malloc(32768);
1028 unsigned long args[] = { LHREQ_EVENTFD,
1029 vq->config.pfn*getpagesize(), 0 };
1030 1849
1031 /* Create a zero-initialized eventfd. */ 1850 /* Create a zero-initialized eventfd. */
1032 vq->eventfd = eventfd(0, 0); 1851 vq->eventfd = eventfd(0, 0);
1033 if (vq->eventfd < 0) 1852 if (vq->eventfd < 0)
1034 err(1, "Creating eventfd"); 1853 err(1, "Creating eventfd");
1035 args[2] = vq->eventfd;
1036
1037 /*
1038 * Attach an eventfd to this virtqueue: it will go off when the Guest
1039 * does an LHCALL_NOTIFY for this vq.
1040 */
1041 if (write(lguest_fd, &args, sizeof(args)) != 0)
1042 err(1, "Attaching eventfd");
1043 1854
1044 /* 1855 /*
1045 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so 1856 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
@@ -1048,167 +1859,531 @@ static void create_thread(struct virtqueue *vq)
1048 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1859 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
1049 if (vq->thread == (pid_t)-1) 1860 if (vq->thread == (pid_t)-1)
1050 err(1, "Creating clone"); 1861 err(1, "Creating clone");
1051
1052 /* We close our local copy now the child has it. */
1053 close(vq->eventfd);
1054} 1862}
1055 1863
1056static void start_device(struct device *dev) 1864static void start_virtqueues(struct device *d)
1057{ 1865{
1058 unsigned int i;
1059 struct virtqueue *vq; 1866 struct virtqueue *vq;
1060 1867
1061 verbose("Device %s OK: offered", dev->name); 1868 for (vq = d->vq; vq; vq = vq->next) {
1062 for (i = 0; i < dev->feature_len; i++) 1869 if (vq->pci_config.queue_enable)
1063 verbose(" %02x", get_feature_bits(dev)[i]); 1870 start_virtqueue(vq);
1064 verbose(", accepted");
1065 for (i = 0; i < dev->feature_len; i++)
1066 verbose(" %02x", get_feature_bits(dev)
1067 [dev->feature_len+i]);
1068
1069 for (vq = dev->vq; vq; vq = vq->next) {
1070 if (vq->service)
1071 create_thread(vq);
1072 } 1871 }
1073 dev->running = true;
1074} 1872}
1075 1873
1076static void cleanup_devices(void) 1874static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
1077{ 1875{
1078 struct device *dev; 1876 struct virtqueue *vq;
1079 1877
1080 for (dev = devices.dev; dev; dev = dev->next) 1878 switch (off) {
1081 reset_device(dev); 1879 case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
1880 /*
1881 * 4.1.4.3.1:
1882 *
1883 * The device MUST present the feature bits it is offering in
1884 * device_feature, starting at bit device_feature_select ∗ 32
1885 * for any device_feature_select written by the driver
1886 */
1887 if (val == 0)
1888 d->mmio->cfg.device_feature = d->features;
1889 else if (val == 1)
1890 d->mmio->cfg.device_feature = (d->features >> 32);
1891 else
1892 d->mmio->cfg.device_feature = 0;
1893 goto feature_write_through32;
1894 case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
1895 if (val > 1)
1896 bad_driver(d, "Unexpected driver select %u", val);
1897 goto feature_write_through32;
1898 case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
1899 if (d->mmio->cfg.guest_feature_select == 0) {
1900 d->features_accepted &= ~((u64)0xFFFFFFFF);
1901 d->features_accepted |= val;
1902 } else {
1903 assert(d->mmio->cfg.guest_feature_select == 1);
1904 d->features_accepted &= 0xFFFFFFFF;
1905 d->features_accepted |= ((u64)val) << 32;
1906 }
1907 /*
1908 * 2.2.1:
1909 *
1910 * The driver MUST NOT accept a feature which the device did
1911 * not offer
1912 */
1913 if (d->features_accepted & ~d->features)
1914 bad_driver(d, "over-accepted features %#llx of %#llx",
1915 d->features_accepted, d->features);
1916 goto feature_write_through32;
1917 case offsetof(struct virtio_pci_mmio, cfg.device_status): {
1918 u8 prev;
1919
1920 verbose("%s: device status -> %#x\n", d->name, val);
1921 /*
1922 * 4.1.4.3.1:
1923 *
1924 * The device MUST reset when 0 is written to device_status,
1925 * and present a 0 in device_status once that is done.
1926 */
1927 if (val == 0) {
1928 reset_device(d);
1929 goto write_through8;
1930 }
1082 1931
1083 /* If we saved off the original terminal settings, restore them now. */ 1932 /* 2.1.1: The driver MUST NOT clear a device status bit. */
1084 if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) 1933 if (d->mmio->cfg.device_status & ~val)
1085 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); 1934 bad_driver(d, "unset of device status bit %#x -> %#x",
1086} 1935 d->mmio->cfg.device_status, val);
1087 1936
1088/* When the Guest tells us they updated the status field, we handle it. */ 1937 /*
1089static void update_device_status(struct device *dev) 1938 * 2.1.2:
1090{ 1939 *
1091 /* A zero status is a reset, otherwise it's a set of flags. */ 1940 * The device MUST NOT consume buffers or notify the driver
1092 if (dev->desc->status == 0) 1941 * before DRIVER_OK.
1093 reset_device(dev); 1942 */
1094 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 1943 if (val & VIRTIO_CONFIG_S_DRIVER_OK
1095 warnx("Device %s configuration FAILED", dev->name); 1944 && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
1096 if (dev->running) 1945 start_virtqueues(d);
1097 reset_device(dev); 1946
1098 } else { 1947 /*
1099 if (dev->running) 1948 * 3.1.1:
1100 err(1, "Device %s features finalized twice", dev->name); 1949 *
1101 start_device(dev); 1950 * The driver MUST follow this sequence to initialize a device:
1951 * - Reset the device.
1952 * - Set the ACKNOWLEDGE status bit: the guest OS has
1953 * noticed the device.
1954 * - Set the DRIVER status bit: the guest OS knows how
1955 * to drive the device.
1956 * - Read device feature bits, and write the subset
1957 * of feature bits understood by the OS and driver
1958 * to the device. During this step the driver MAY
1959 * read (but MUST NOT write) the device-specific
1960 * configuration fields to check that it can
1961 * support the device before accepting it.
1962 * - Set the FEATURES_OK status bit. The driver
1963 * MUST not accept new feature bits after this
1964 * step.
1965 * - Re-read device status to ensure the FEATURES_OK
1966 * bit is still set: otherwise, the device does
1967 * not support our subset of features and the
1968 * device is unusable.
1969 * - Perform device-specific setup, including
1970 * discovery of virtqueues for the device,
1971 * optional per-bus setup, reading and possibly
1972 * writing the device’s virtio configuration
1973 * space, and population of virtqueues.
1974 * - Set the DRIVER_OK status bit. At this point the
1975 * device is “live”.
1976 */
1977 prev = 0;
1978 switch (val & ~d->mmio->cfg.device_status) {
1979 case VIRTIO_CONFIG_S_DRIVER_OK:
1980 prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
1981 case VIRTIO_CONFIG_S_FEATURES_OK:
1982 prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
1983 case VIRTIO_CONFIG_S_DRIVER:
1984 prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
1985 case VIRTIO_CONFIG_S_ACKNOWLEDGE:
1986 break;
1987 default:
1988 bad_driver(d, "unknown device status bit %#x -> %#x",
1989 d->mmio->cfg.device_status, val);
1990 }
1991 if (d->mmio->cfg.device_status != prev)
1992 bad_driver(d, "unexpected status transition %#x -> %#x",
1993 d->mmio->cfg.device_status, val);
1994
1995 /* If they just wrote FEATURES_OK, we make sure they re-read it. */
1996 switch (val & ~d->mmio->cfg.device_status) {
1997 case VIRTIO_CONFIG_S_FEATURES_OK:
1998 d->wrote_features_ok = true;
1999 break;
2000 case VIRTIO_CONFIG_S_DRIVER_OK:
2001 if (d->wrote_features_ok)
2002 bad_driver(d, "did not re-read FEATURES_OK");
2003 break;
2004 }
2005 goto write_through8;
1102 } 2006 }
1103} 2007 case offsetof(struct virtio_pci_mmio, cfg.queue_select):
2008 vq = vq_by_num(d, val);
2009 /*
2010 * 4.1.4.3.1:
2011 *
2012 * The device MUST present a 0 in queue_size if the virtqueue
2013 * corresponding to the current queue_select is unavailable.
2014 */
2015 if (!vq) {
2016 d->mmio->cfg.queue_size = 0;
2017 goto write_through16;
2018 }
2019 /* Save registers for old vq, if it was a valid vq */
2020 if (d->mmio->cfg.queue_size)
2021 save_vq_config(&d->mmio->cfg,
2022 vq_by_num(d, d->mmio->cfg.queue_select));
2023 /* Restore the registers for the queue they asked for */
2024 restore_vq_config(&d->mmio->cfg, vq);
2025 goto write_through16;
2026 case offsetof(struct virtio_pci_mmio, cfg.queue_size):
2027 /*
2028 * 4.1.4.3.2:
2029 *
2030 * The driver MUST NOT write a value which is not a power of 2
2031 * to queue_size.
2032 */
2033 if (val & (val-1))
2034 bad_driver(d, "invalid queue size %u", val);
2035 if (d->mmio->cfg.queue_enable)
2036 bad_driver(d, "changing queue size on live device");
2037 goto write_through16;
2038 case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
2039 bad_driver(d, "attempt to set MSIX vector to %u", val);
2040 case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
2041 struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
1104 2042
1105/*L:215 2043 /*
1106 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In 2044 * 4.1.4.3.2:
1107 * particular, it's used to notify us of device status changes during boot. 2045 *
1108 */ 2046 * The driver MUST NOT write a 0 to queue_enable.
1109static void handle_output(unsigned long addr) 2047 */
1110{ 2048 if (val != 1)
1111 struct device *i; 2049 bad_driver(d, "setting queue_enable to %u", val);
1112 2050
1113 /* Check each device. */ 2051 /*
1114 for (i = devices.dev; i; i = i->next) { 2052 * 3.1.1:
1115 struct virtqueue *vq; 2053 *
2054 * 7. Perform device-specific setup, including discovery of
2055 * virtqueues for the device, optional per-bus setup,
2056 * reading and possibly writing the device’s virtio
2057 * configuration space, and population of virtqueues.
2058 * 8. Set the DRIVER_OK status bit.
2059 *
2060 * All our devices require all virtqueues to be enabled, so
2061 * they should have done that before setting DRIVER_OK.
2062 */
2063 if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
2064 bad_driver(d, "enabling vq after DRIVER_OK");
1116 2065
2066 d->mmio->cfg.queue_enable = val;
2067 save_vq_config(&d->mmio->cfg, vq);
2068 check_virtqueue(d, vq);
2069 goto write_through16;
2070 }
2071 case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
2072 bad_driver(d, "attempt to write to queue_notify_off");
2073 case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
2074 case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
2075 case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
2076 case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
2077 case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
2078 case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
1117 /* 2079 /*
1118 * Notifications to device descriptors mean they updated the 2080 * 4.1.4.3.2:
1119 * device status. 2081 *
2082 * The driver MUST configure the other virtqueue fields before
2083 * enabling the virtqueue with queue_enable.
1120 */ 2084 */
1121 if (from_guest_phys(addr) == i->desc) { 2085 if (d->mmio->cfg.queue_enable)
1122 update_device_status(i); 2086 bad_driver(d, "changing queue on live device");
1123 return; 2087
1124 } 2088 /*
2089 * 3.1.1:
2090 *
2091 * The driver MUST follow this sequence to initialize a device:
2092 *...
2093 * 5. Set the FEATURES_OK status bit. The driver MUST not
2094 * accept new feature bits after this step.
2095 */
2096 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
2097 bad_driver(d, "setting up vq before FEATURES_OK");
1125 2098
1126 /* Devices should not be used before features are finalized. */ 2099 /*
1127 for (vq = i->vq; vq; vq = vq->next) { 2100 * 6. Re-read device status to ensure the FEATURES_OK bit is
1128 if (addr != vq->config.pfn*getpagesize()) 2101 * still set...
1129 continue; 2102 */
1130 errx(1, "Notification on %s before setup!", i->name); 2103 if (d->wrote_features_ok)
2104 bad_driver(d, "didn't re-read FEATURES_OK before setup");
2105
2106 goto write_through32;
2107 case offsetof(struct virtio_pci_mmio, notify):
2108 vq = vq_by_num(d, val);
2109 if (!vq)
2110 bad_driver(d, "Invalid vq notification on %u", val);
2111 /* Notify the process handling this vq by adding 1 to eventfd */
2112 write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
2113 goto write_through16;
2114 case offsetof(struct virtio_pci_mmio, isr):
2115 bad_driver(d, "Unexpected write to isr");
2116 /* Weird corner case: write to emerg_wr of console */
2117 case sizeof(struct virtio_pci_mmio)
2118 + offsetof(struct virtio_console_config, emerg_wr):
2119 if (strcmp(d->name, "console") == 0) {
2120 char c = val;
2121 write(STDOUT_FILENO, &c, 1);
2122 goto write_through32;
1131 } 2123 }
2124 /* Fall through... */
2125 default:
2126 /*
2127 * 4.1.4.3.2:
2128 *
2129 * The driver MUST NOT write to device_feature, num_queues,
2130 * config_generation or queue_notify_off.
2131 */
2132 bad_driver(d, "Unexpected write to offset %u", off);
1132 } 2133 }
1133 2134
2135feature_write_through32:
1134 /* 2136 /*
1135 * Early console write is done using notify on a nul-terminated string 2137 * 3.1.1:
1136 * in Guest memory. It's also great for hacking debugging messages 2138 *
1137 * into a Guest. 2139 * The driver MUST follow this sequence to initialize a device:
2140 *...
2141 * - Set the DRIVER status bit: the guest OS knows how
2142 * to drive the device.
2143 * - Read device feature bits, and write the subset
2144 * of feature bits understood by the OS and driver
2145 * to the device.
2146 *...
2147 * - Set the FEATURES_OK status bit. The driver MUST not
2148 * accept new feature bits after this step.
1138 */ 2149 */
1139 if (addr >= guest_limit) 2150 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
1140 errx(1, "Bad NOTIFY %#lx", addr); 2151 bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
2152 if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
2153 bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
1141 2154
1142 write(STDOUT_FILENO, from_guest_phys(addr), 2155 /*
1143 strnlen(from_guest_phys(addr), guest_limit - addr)); 2156 * 4.1.3.1:
2157 *
2158 * The driver MUST access each field using the “natural” access
2159 * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
2160 * 16-bit fields and 8-bit accesses for 8-bit fields.
2161 */
2162write_through32:
2163 if (mask != 0xFFFFFFFF) {
2164 bad_driver(d, "non-32-bit write to offset %u (%#x)",
2165 off, getreg(eip));
2166 return;
2167 }
2168 memcpy((char *)d->mmio + off, &val, 4);
2169 return;
2170
2171write_through16:
2172 if (mask != 0xFFFF)
2173 bad_driver(d, "non-16-bit write to offset %u (%#x)",
2174 off, getreg(eip));
2175 memcpy((char *)d->mmio + off, &val, 2);
2176 return;
2177
2178write_through8:
2179 if (mask != 0xFF)
2180 bad_driver(d, "non-8-bit write to offset %u (%#x)",
2181 off, getreg(eip));
2182 memcpy((char *)d->mmio + off, &val, 1);
2183 return;
1144} 2184}
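The write_through labels above enforce the natural-access rule by matching the mask against the field width; the mapping is the same one emulate_insn() produces, sketched as a helper:

	/* Map an access mask back to its width in bytes (4.1.3.1). */
	static unsigned access_width(u32 mask)
	{
		switch (mask) {
		case 0xFF:	 return 1;
		case 0xFFFF:	 return 2;
		case 0xFFFFFFFF: return 4;
		}
		abort();	/* emulate_insn() only generates these three */
	}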
1145 2185
1146/*L:190 2186static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
1147 * Device Setup
1148 *
1149 * All devices need a descriptor so the Guest knows it exists, and a "struct
1150 * device" so the Launcher can keep track of it. We have common helper
1151 * routines to allocate and manage them.
1152 */
1153
1154/*
1155 * The layout of the device page is a "struct lguest_device_desc" followed by a
1156 * number of virtqueue descriptors, then two sets of feature bits, then an
1157 * array of configuration bytes. This routine returns the configuration
1158 * pointer.
1159 */
1160static u8 *device_config(const struct device *dev)
1161{ 2187{
1162 return (void *)(dev->desc + 1) 2188 u8 isr;
1163 + dev->num_vq * sizeof(struct lguest_vqconfig) 2189 u32 val = 0;
1164 + dev->feature_len * 2; 2190
2191 switch (off) {
2192 case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
2193 case offsetof(struct virtio_pci_mmio, cfg.device_feature):
2194 case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
2195 case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
2196 /*
2197 * 3.1.1:
2198 *
2199 * The driver MUST follow this sequence to initialize a device:
2200 *...
2201 * - Set the DRIVER status bit: the guest OS knows how
2202 * to drive the device.
2203 * - Read device feature bits, and write the subset
2204 * of feature bits understood by the OS and driver
2205 * to the device.
2206 */
2207 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
2208 bad_driver(d,
2209 "feature read before VIRTIO_CONFIG_S_DRIVER");
2210 goto read_through32;
2211 case offsetof(struct virtio_pci_mmio, cfg.msix_config):
2212 bad_driver(d, "read of msix_config");
2213 case offsetof(struct virtio_pci_mmio, cfg.num_queues):
2214 goto read_through16;
2215 case offsetof(struct virtio_pci_mmio, cfg.device_status):
2216 /* As they did read, any write of FEATURES_OK is now fine. */
2217 d->wrote_features_ok = false;
2218 goto read_through8;
2219 case offsetof(struct virtio_pci_mmio, cfg.config_generation):
2220 /*
2221 * 4.1.4.3.1:
2222 *
2223 * The device MUST present a changed config_generation after
2224 * the driver has read a device-specific configuration value
2225 * which has changed since any part of the device-specific
2226 * configuration was last read.
2227 *
2228 * This is simple: none of our devices change config, so this
2229 * is always 0.
2230 */
2231 goto read_through8;
2232 case offsetof(struct virtio_pci_mmio, notify):
2233 /*
2234 * 3.1.1:
2235 *
2236 * The driver MUST NOT notify the device before setting
2237 * DRIVER_OK.
2238 */
2239 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
2240 bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
2241 goto read_through16;
2242 case offsetof(struct virtio_pci_mmio, isr):
2243 if (mask != 0xFF)
2244 bad_driver(d, "non-8-bit read from offset %u (%#x)",
2245 off, getreg(eip));
2246 isr = d->mmio->isr;
2247 /*
2248 * 4.1.4.5.1:
2249 *
2250 * The device MUST reset ISR status to 0 on driver read.
2251 */
2252 d->mmio->isr = 0;
2253 return isr;
2254 case offsetof(struct virtio_pci_mmio, padding):
2255 bad_driver(d, "read from padding (%#x)", getreg(eip));
2256 default:
2257 /* Read from device config space, beware unaligned overflow */
2258 if (off > d->mmio_size - 4)
2259 bad_driver(d, "read past end (%#x)", getreg(eip));
2260
2261 /*
2262 * 3.1.1:
2263 * The driver MUST follow this sequence to initialize a device:
2264 *...
2265 * 3. Set the DRIVER status bit: the guest OS knows how to
2266 * drive the device.
2267 * 4. Read device feature bits, and write the subset of
2268 * feature bits understood by the OS and driver to the
2269 * device. During this step the driver MAY read (but MUST NOT
2270 * write) the device-specific configuration fields to check
2271 * that it can support the device before accepting it.
2272 */
2273 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
2274 bad_driver(d,
2275 "config read before VIRTIO_CONFIG_S_DRIVER");
2276
2277 if (mask == 0xFFFFFFFF)
2278 goto read_through32;
2279 else if (mask == 0xFFFF)
2280 goto read_through16;
2281 else
2282 goto read_through8;
2283 }
2284
2285 /*
2286 * 4.1.3.1:
2287 *
2288 * The driver MUST access each field using the “natural” access
2289 * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
2290 * 16-bit fields and 8-bit accesses for 8-bit fields.
2291 */
2292read_through32:
2293 if (mask != 0xFFFFFFFF)
2294 bad_driver(d, "non-32-bit read to offset %u (%#x)",
2295 off, getreg(eip));
2296 memcpy(&val, (char *)d->mmio + off, 4);
2297 return val;
2298
2299read_through16:
2300 if (mask != 0xFFFF)
2301 bad_driver(d, "non-16-bit read to offset %u (%#x)",
2302 off, getreg(eip));
2303 memcpy(&val, (char *)d->mmio + off, 2);
2304 return val;
2305
2306read_through8:
2307 if (mask != 0xFF)
2308 bad_driver(d, "non-8-bit read to offset %u (%#x)",
2309 off, getreg(eip));
2310 memcpy(&val, (char *)d->mmio + off, 1);
2311 return val;
1165} 2312}
1166 2313
1167/* 2314static void emulate_mmio(unsigned long paddr, const u8 *insn)
1168 * This routine allocates a new "struct lguest_device_desc" from descriptor
1169 * table page just above the Guest's normal memory. It returns a pointer to
1170 * that descriptor.
1171 */
1172static struct lguest_device_desc *new_dev_desc(u16 type)
1173{ 2315{
1174 struct lguest_device_desc d = { .type = type }; 2316 u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
1175 void *p; 2317 struct device *d = find_mmio_region(paddr, &off);
2318 unsigned long args[] = { LHREQ_TRAP, 14 };
1176 2319
1177 /* Figure out where the next device config is, based on the last one. */ 2320 if (!d) {
1178 if (devices.lastdev) 2321 warnx("MMIO touching %#08lx (not a device)", paddr);
1179 p = device_config(devices.lastdev) 2322 goto reinject;
1180 + devices.lastdev->desc->config_len; 2323 }
1181 else 2324
1182 p = devices.descpage; 2325 /* Prefix makes it a 16 bit op */
2326 if (insn[0] == 0x66) {
2327 mask = 0xFFFF;
2328 insnlen++;
2329 }
1183 2330
1184 /* We only have one page for all the descriptors. */ 2331 /* iowrite */
1185 if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) 2332 if (insn[insnlen] == 0x89) {
1186 errx(1, "Too many devices"); 2333 /* Next byte is r/m byte: bits 3-5 are register. */
2334 val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
2335 emulate_mmio_write(d, off, val, mask);
2336 insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
2337 } else if (insn[insnlen] == 0x8b) { /* ioread */
2338 /* Next byte is r/m byte: bits 3-5 are register. */
2339 val = emulate_mmio_read(d, off, mask);
2340 setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
2341 insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
2342 } else if (insn[0] == 0x88) { /* 8-bit iowrite */
2343 mask = 0xff;
2344 /* Next byte is r/m byte: bits 3-5 are register. */
2345 val = getreg_num((insn[1] >> 3) & 0x7, mask);
2346 emulate_mmio_write(d, off, val, mask);
2347 insnlen = 2 + insn_displacement_len(insn[1]);
2348 } else if (insn[0] == 0x8a) { /* 8-bit ioread */
2349 mask = 0xff;
2350 val = emulate_mmio_read(d, off, mask);
2351 setreg_num((insn[1] >> 3) & 0x7, val, mask);
2352 insnlen = 2 + insn_displacement_len(insn[1]);
2353 } else {
2354 warnx("Unknown MMIO instruction touching %#08lx:"
2355 " %02x %02x %02x %02x at %u",
2356 paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
2357 reinject:
2358 /* Inject trap into Guest. */
2359 if (write(lguest_fd, args, sizeof(args)) < 0)
2360 err(1, "Reinjecting trap 14 for fault at %#x",
2361 getreg(eip));
2362 return;
2363 }
1187 2364
1188 /* p might not be aligned, so we memcpy in. */ 2365 /* Finally, we've "done" the instruction, so move past it. */
1189 return memcpy(p, &d, sizeof(d)); 2366 setreg(eip, getreg(eip) + insnlen);
1190} 2367}
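And the matching summary for the MMIO side: only four mov encodings (plus the 0x66 operand-size prefix) ever reach this function in practice:

	/*
	 * Opcode  Form                      Width
	 * 88      mov %r8,  mem (iowrite)   1 byte
	 * 8A      mov mem,  %r8 (ioread)    1 byte
	 * 89      mov %r32, mem (iowrite)   4 bytes (2 with 0x66 prefix)
	 * 8B      mov mem,  %r32 (ioread)   4 bytes (2 with 0x66 prefix)
	 */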
1191 2368
-/*
- * Each device descriptor is followed by the description of its virtqueues. We
- * specify how many descriptors the virtqueue is to have.
+/*L:190
+ * Device Setup
+ *
+ * All devices need a descriptor so the Guest knows it exists, and a "struct
+ * device" so the Launcher can keep track of it. We have common helper
+ * routines to allocate and manage them.
  */
-static void add_virtqueue(struct device *dev, unsigned int num_descs,
-			  void (*service)(struct virtqueue *))
+static void add_pci_virtqueue(struct device *dev,
+			      void (*service)(struct virtqueue *),
+			      const char *name)
 {
-	unsigned int pages;
 	struct virtqueue **i, *vq = malloc(sizeof(*vq));
-	void *p;
-
-	/* First we need some memory for this virtqueue. */
-	pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
-		/ getpagesize();
-	p = get_pages(pages);
 
 	/* Initialize the virtqueue */
 	vq->next = NULL;
 	vq->last_avail_idx = 0;
 	vq->dev = dev;
+	vq->name = name;
 
 	/*
 	 * This is the routine the service thread will run, and its Process ID
@@ -1218,25 +2393,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	vq->thread = (pid_t)-1;
 
 	/* Initialize the configuration. */
-	vq->config.num = num_descs;
-	vq->config.irq = devices.next_irq++;
-	vq->config.pfn = to_guest_phys(p) / getpagesize();
-
-	/* Initialize the vring. */
-	vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
-
-	/*
-	 * Append virtqueue to this device's descriptor. We use
-	 * device_config() to get the end of the device's current virtqueues;
-	 * we check that we haven't added any config or feature information
-	 * yet, otherwise we'd be overwriting them.
-	 */
-	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
-	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
-	dev->num_vq++;
-	dev->desc->num_vq++;
+	reset_vq_pci_config(vq);
+	vq->pci_config.queue_notify_off = 0;
 
-	verbose("Virtqueue page %#lx\n", to_guest_phys(p));
+	/* Add one to the number of queues */
+	vq->dev->mmio->cfg.num_queues++;
 
 	/*
 	 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
@@ -1246,73 +2407,239 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	*i = vq;
 }
 
-/*
- * The first half of the feature bitmask is for us to advertise features. The
- * second half is for the Guest to accept features.
- */
-static void add_feature(struct device *dev, unsigned bit)
+/* The Guest accesses the feature bits via the PCI common config MMIO region */
+static void add_pci_feature(struct device *dev, unsigned bit)
 {
-	u8 *features = get_feature_bits(dev);
+	dev->features |= (1ULL << bit);
+}
 
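add_pci_feature() now only sets a bit in one 64-bit feature word per device; the Guest reads it as two 32-bit halves selected through device_feature_select in the common config region. A small sketch of how the halves fall out, using bit 32 (VIRTIO_F_VERSION_1, which every device here advertises) as the example:

    #include <assert.h>
    #include <stdint.h>

    #define VIRTIO_F_VERSION_1 32    /* bit number from the virtio 1.0 spec */

    int main(void)
    {
        uint64_t features = (uint64_t)1 << VIRTIO_F_VERSION_1;

        /* device_feature_select 0 exposes bits 0-31, 1 exposes bits 32-63. */
        uint32_t lo = features & 0xFFFFFFFF;
        uint32_t hi = features >> 32;

        assert(lo == 0 && hi == 1);
        return 0;
    }
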
-	/* We can't extend the feature bits once we've added config bytes */
-	if (dev->desc->feature_len <= bit / CHAR_BIT) {
-		assert(dev->desc->config_len == 0);
-		dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
-	}
+/* For devices with no config. */
+static void no_device_config(struct device *dev)
+{
+	dev->mmio_addr = get_mmio_region(dev->mmio_size);
 
-	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
+	dev->config.bar[0] = dev->mmio_addr;
+	/* Bottom 4 bits must be zero */
+	assert(~(dev->config.bar[0] & 0xF));
+}
+
+/* This puts the device config into BAR0 */
+static void set_device_config(struct device *dev, const void *conf, size_t len)
+{
+	/* Set up BAR 0 */
+	dev->mmio_size += len;
+	dev->mmio = realloc(dev->mmio, dev->mmio_size);
+	memcpy(dev->mmio + 1, conf, len);
+
+	/*
+	 * 4.1.4.6:
+	 *
+	 *  The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
+	 *  capability for any device type which has a device-specific
+	 *  configuration.
+	 */
+	/* Hook up device cfg */
+	dev->config.cfg_access.cap.cap_next
+		= offsetof(struct pci_config, device);
+
+	/*
+	 * 4.1.4.6.1:
+	 *
+	 *  The offset for the device-specific configuration MUST be 4-byte
+	 *  aligned.
+	 */
+	assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
+
+	/* Fix up device cfg field length. */
+	dev->config.device.length = len;
+
+	/* The rest is the same as the no-config case */
+	no_device_config(dev);
+}
+
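Two details above deserve unpacking. The bottom four bits of a 32-bit PCI memory BAR carry type flags rather than address bits, which is what no_device_config() is insisting on, and set_device_config() copies the config bytes to the address immediately past the fixed struct virtio_pci_mmio header (that is the `dev->mmio + 1` pointer arithmetic). A hedged sketch of how a reader of BAR0 recovers both; the header size here is a stand-in, not the real layout:

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in for struct virtio_pci_mmio; only its size matters here. */
    struct mmio_hdr { uint8_t bytes[0x100]; };

    /* Low 4 bits of a 32-bit memory BAR are type flags, not address bits. */
    static uint32_t bar_base(uint32_t bar)
    {
        return bar & ~0xFU;
    }

    /* Device-specific config starts right after the MMIO header in BAR0. */
    static uint32_t device_config_addr(uint32_t bar)
    {
        return bar_base(bar) + sizeof(struct mmio_hdr);
    }
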
+static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
+		     size_t bar_offset, size_t bar_bytes, u8 next)
+{
+	cap->cap_vndr = PCI_CAP_ID_VNDR;
+	cap->cap_next = next;
+	cap->cap_len = caplen;
+	cap->cfg_type = type;
+	cap->bar = 0;
+	memset(cap->padding, 0, sizeof(cap->padding));
+	cap->offset = bar_offset;
+	cap->length = bar_bytes;
 }
 
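init_cap() emits standard PCI vendor-specific capabilities (PCI_CAP_ID_VNDR, 0x09), and init_pci_config() below links them into the chain the Guest walks: the capabilities pointer in config space leads to the common cfg capability, whose cap_next leads to notify, then isr, then cfg_access. A sketch of that walk from the driver's side, with the config-space read left as an assumed helper:

    #include <stdint.h>

    #define PCI_CAPABILITY_LIST 0x34    /* offset of capabilities pointer */
    #define PCI_CAP_ID_VNDR     0x09    /* vendor-specific capability ID */

    extern uint8_t cfg_read8(unsigned offset);    /* assumed accessor */

    /* Follow cap_next links until a vendor cap of the wanted type appears. */
    static unsigned find_virtio_cap(uint8_t want_cfg_type)
    {
        unsigned off = cfg_read8(PCI_CAPABILITY_LIST);

        while (off) {
            uint8_t id = cfg_read8(off);          /* cap_vndr */
            uint8_t next = cfg_read8(off + 1);    /* cap_next */

            /* cfg_type sits at byte 3 of struct virtio_pci_cap. */
            if (id == PCI_CAP_ID_VNDR && cfg_read8(off + 3) == want_cfg_type)
                return off;
            off = next;
        }
        return 0;    /* not found */
    }
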
 /*
- * This routine sets the configuration fields for an existing device's
- * descriptor. It only works for the last device, but that's OK because that's
- * how we use it.
+ * This sets up the pci_config structure, as defined in the virtio 1.0
+ * standard (and PCI standard).
  */
-static void set_config(struct device *dev, unsigned len, const void *conf)
+static void init_pci_config(struct pci_config *pci, u16 type,
+			    u8 class, u8 subclass)
 {
-	/* Check we haven't overflowed our single page. */
-	if (device_config(dev) + len > devices.descpage + getpagesize())
-		errx(1, "Too many devices");
+	size_t bar_offset, bar_len;
+
+	/*
+	 * 4.1.4.4.1:
+	 *
+	 *  The device MUST either present notify_off_multiplier as an even
+	 *  power of 2, or present notify_off_multiplier as 0.
+	 *
+	 * 2.1.2:
+	 *
+	 *  The device MUST initialize device status to 0 upon reset.
+	 */
+	memset(pci, 0, sizeof(*pci));
+
+	/* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
+	pci->vendor_id = 0x1AF4;
+	/* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
+	pci->device_id = 0x1040 + type;
+
+	/*
+	 * PCI have specific codes for different types of devices.
+	 * Linux doesn't care, but it's a good clue for people looking
+	 * at the device.
+	 */
+	pci->class = class;
+	pci->subclass = subclass;
+
+	/*
+	 * 4.1.2.1:
+	 *
+	 *  Non-transitional devices SHOULD have a PCI Revision ID of 1 or
+	 *  higher
+	 */
+	pci->revid = 1;
+
+	/*
+	 * 4.1.2.1:
+	 *
+	 *  Non-transitional devices SHOULD have a PCI Subsystem Device ID of
+	 *  0x40 or higher.
+	 */
+	pci->subsystem_device_id = 0x40;
+
+	/* We use our dummy interrupt controller, and irq_line is the irq */
+	pci->irq_line = devices.next_irq++;
+	pci->irq_pin = 0;
+
+	/* Support for extended capabilities. */
+	pci->status = (1 << 4);
+
+	/* Link them in. */
+	/*
+	 * 4.1.4.3.1:
+	 *
+	 *  The device MUST present at least one common configuration
+	 *  capability.
+	 */
+	pci->capabilities = offsetof(struct pci_config, common);
+
+	/* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
+	assert(pci->capabilities % 4 == 0);
+
+	bar_offset = offsetof(struct virtio_pci_mmio, cfg);
+	bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
+	init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
+		 bar_offset, bar_len,
+		 offsetof(struct pci_config, notify));
+
+	/*
+	 * 4.1.4.4.1:
+	 *
+	 *  The device MUST present at least one notification capability.
+	 */
+	bar_offset += bar_len;
+	bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
+
+	/*
+	 * 4.1.4.4.1:
+	 *
+	 *  The cap.offset MUST be 2-byte aligned.
+	 */
+	assert(pci->common.cap_next % 2 == 0);
+
+	/* FIXME: Use a non-zero notify_off, for per-queue notification? */
+	/*
+	 * 4.1.4.4.1:
+	 *
+	 *  The value cap.length presented by the device MUST be at least 2 and
+	 *  MUST be large enough to support queue notification offsets for all
+	 *  supported queues in all possible configurations.
+	 */
+	assert(bar_len >= 2);
+
+	init_cap(&pci->notify.cap, sizeof(pci->notify),
+		 VIRTIO_PCI_CAP_NOTIFY_CFG,
+		 bar_offset, bar_len,
+		 offsetof(struct pci_config, isr));
+
+	bar_offset += bar_len;
+	bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
+	/*
+	 * 4.1.4.5.1:
+	 *
+	 *  The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
+	 *  capability.
+	 */
+	init_cap(&pci->isr, sizeof(pci->isr),
+		 VIRTIO_PCI_CAP_ISR_CFG,
+		 bar_offset, bar_len,
+		 offsetof(struct pci_config, cfg_access));
+
+	/*
+	 * 4.1.4.7.1:
+	 *
+	 *  The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
+	 *  capability.
+	 */
+	/* This doesn't have any presence in the BAR */
+	init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
+		 VIRTIO_PCI_CAP_PCI_CFG,
+		 0, 0, 0);
 
-	/* Copy in the config information, and store the length. */
-	memcpy(device_config(dev), conf, len);
-	dev->desc->config_len = len;
+	bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
+	assert(bar_offset == sizeof(struct virtio_pci_mmio));
 
-	/* Size must fit in config_len field (8 bits)! */
-	assert(dev->desc->config_len == len);
+	/*
+	 * This gets sewn in and length set in set_device_config().
+	 * Some devices don't have a device configuration interface, so
+	 * we never expose this if we don't call set_device_config().
+	 */
+	init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
+		 bar_offset, 0, 0);
 }
 
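The ID arithmetic set up above is worth a worked example: the vendor is always 0x1AF4, and a modern device ID is 0x1040 plus the virtio device type, so the four device types this Launcher creates come out as follows:

    #include <assert.h>

    /* virtio 1.0, 4.1.2.1: modern (non-transitional) device IDs. */
    #define VIRTIO_DEVID(type) (0x1040 + (type))

    int main(void)
    {
        /* Device types: net=1, block=2, console=3, rng=4. */
        assert(VIRTIO_DEVID(1) == 0x1041);    /* net */
        assert(VIRTIO_DEVID(2) == 0x1042);    /* block */
        assert(VIRTIO_DEVID(3) == 0x1043);    /* console */
        assert(VIRTIO_DEVID(4) == 0x1044);    /* rng */
        return 0;
    }
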
 /*
- * This routine does all the creation and setup of a new device, including
- * calling new_dev_desc() to allocate the descriptor and device memory. We
- * don't actually start the service threads until later.
+ * This routine does all the creation and setup of a new device, but we don't
+ * actually place the MMIO region until we know the size (if any) of the
+ * device-specific config. And we don't actually start the service threads
+ * until later.
  *
  * See what I mean about userspace being boring?
  */
-static struct device *new_device(const char *name, u16 type)
+static struct device *new_pci_device(const char *name, u16 type,
+				     u8 class, u8 subclass)
 {
 	struct device *dev = malloc(sizeof(*dev));
 
 	/* Now we populate the fields one at a time. */
-	dev->desc = new_dev_desc(type);
 	dev->name = name;
 	dev->vq = NULL;
-	dev->feature_len = 0;
-	dev->num_vq = 0;
 	dev->running = false;
-	dev->next = NULL;
+	dev->wrote_features_ok = false;
+	dev->mmio_size = sizeof(struct virtio_pci_mmio);
+	dev->mmio = calloc(1, dev->mmio_size);
+	dev->features = (u64)1 << VIRTIO_F_VERSION_1;
+	dev->features_accepted = 0;
 
-	/*
-	 * Append to device list. Prepending to a single-linked list is
-	 * easier, but the user expects the devices to be arranged on the bus
-	 * in command-line order. The first network device on the command line
-	 * is eth0, the first block device /dev/vda, etc.
-	 */
-	if (devices.lastdev)
-		devices.lastdev->next = dev;
-	else
-		devices.dev = dev;
-	devices.lastdev = dev;
+	if (devices.device_num + 1 >= MAX_PCI_DEVICES)
+		errx(1, "Can only handle 31 PCI devices");
+
+	init_pci_config(&dev->config, type, class, subclass);
+	assert(!devices.pci[devices.device_num+1]);
+	devices.pci[++devices.device_num] = dev;
 
 	return dev;
 }
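new_pci_device() fills slots 1 through 31 of devices.pci[]; slot 0 is taken by the host bridge from init_pci_host_bridge(), which is why the error message says 31 rather than 32. The slot index is the PCI device number the Guest addresses in its config cycles; assuming the classic type-1 mechanism this file emulates elsewhere, that number sits in bits 11-15 of the 0xCF8 config address:

    #include <stdint.h>

    /* Type-1 config address: bits 16-23 bus, 11-15 device, 8-10 function,
     * bits 2-7 the dword-aligned register offset (assumed layout). */
    static unsigned cf8_device(uint32_t cf8)
    {
        return (cf8 >> 11) & 0x1F;
    }

    static unsigned cf8_offset(uint32_t cf8)
    {
        return cf8 & 0xFC;
    }
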
@@ -1324,6 +2651,7 @@ static struct device *new_device(const char *name, u16 type)
 static void setup_console(void)
 {
 	struct device *dev;
+	struct virtio_console_config conf;
 
 	/* If we can save the initial standard input settings... */
 	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
@@ -1336,7 +2664,7 @@ static void setup_console(void)
 		tcsetattr(STDIN_FILENO, TCSANOW, &term);
 	}
 
-	dev = new_device("console", VIRTIO_ID_CONSOLE);
+	dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
 
 	/* We store the console state in dev->priv, and initialize it. */
 	dev->priv = malloc(sizeof(struct console_abort));
@@ -1348,10 +2676,14 @@ static void setup_console(void)
 	 * stdin. When they put something in the output queue, we write it to
 	 * stdout.
 	 */
-	add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
-	add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
+	add_pci_virtqueue(dev, console_input, "input");
+	add_pci_virtqueue(dev, console_output, "output");
+
+	/* We need a configuration area for the emerg_wr early writes. */
+	add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
+	set_device_config(dev, &conf, sizeof(conf));
 
-	verbose("device %u: console\n", ++devices.device_num);
+	verbose("device %u: console\n", devices.device_num);
 }
 /*:*/
 
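The console's one feature, VIRTIO_CONSOLE_F_EMERG_WRITE, lets the Guest log before any virtqueue exists by storing a character into the emerg_wr config field, which is why setup_console() now needs a config area at all. A Guest-side sketch, with the config-write accessor assumed:

    #include <stddef.h>
    #include <stdint.h>

    struct virtio_console_config_v1 {    /* layout per virtio 1.0 (assumed) */
        uint16_t cols, rows;
        uint32_t max_nr_ports;
        uint32_t emerg_wr;
    };

    extern void cfg_write32(size_t offset, uint32_t val);    /* assumed accessor */

    /* Emergency write: one character, no virtqueue needed. */
    static void early_putc(char c)
    {
        cfg_write32(offsetof(struct virtio_console_config_v1, emerg_wr), c);
    }
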
@@ -1449,6 +2781,7 @@ static void configure_device(int fd, const char *tapif, u32 ipaddr)
 static int get_tun_device(char tapif[IFNAMSIZ])
 {
 	struct ifreq ifr;
+	int vnet_hdr_sz;
 	int netfd;
 
 	/* Start with this zeroed. Messy but sure. */
@@ -1476,6 +2809,18 @@ static int get_tun_device(char tapif[IFNAMSIZ])
 	 */
 	ioctl(netfd, TUNSETNOCSUM, 1);
 
+	/*
+	 * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
+	 * field at the end of the network header iff
+	 * VIRTIO_NET_F_MRG_RXBUF was negotiated. For virtio 1.0,
+	 * that became the norm, but we need to tell the tun device
+	 * about our expanded header (which is called
+	 * virtio_net_hdr_mrg_rxbuf in the legacy system).
+	 */
+	vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
+	if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
+		err(1, "Setting tun header size to %u", vnet_hdr_sz);
+
 	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
 	return netfd;
 }
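The negotiation above comes down to two bytes: the legacy struct virtio_net_hdr is 10 bytes, while the mergeable-rxbuf variant and the virtio 1.0 struct virtio_net_hdr_v1 both end in a 16-bit num_buffers field, making 12. A sketch of the two layouts:

    #include <stdint.h>

    struct legacy_net_hdr {      /* struct virtio_net_hdr: 10 bytes */
        uint8_t flags, gso_type;
        uint16_t hdr_len, gso_size, csum_start, csum_offset;
    };

    struct v1_net_hdr {          /* struct virtio_net_hdr_v1: 12 bytes */
        struct legacy_net_hdr hdr;
        uint16_t num_buffers;    /* always present in virtio 1.0 */
    };
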
@@ -1499,12 +2844,12 @@ static void setup_tun_net(char *arg)
 	net_info->tunfd = get_tun_device(tapif);
 
 	/* First we create a new network device. */
-	dev = new_device("net", VIRTIO_ID_NET);
+	dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
 	dev->priv = net_info;
 
 	/* Network devices need a recv and a send queue, just like console. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
-	add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
+	add_pci_virtqueue(dev, net_input, "rx");
+	add_pci_virtqueue(dev, net_output, "tx");
 
 	/*
 	 * We need a socket to perform the magic network ioctls to bring up the
@@ -1524,7 +2869,7 @@ static void setup_tun_net(char *arg)
 	p = strchr(arg, ':');
 	if (p) {
 		str2mac(p+1, conf.mac);
-		add_feature(dev, VIRTIO_NET_F_MAC);
+		add_pci_feature(dev, VIRTIO_NET_F_MAC);
 		*p = '\0';
 	}
 
@@ -1538,25 +2883,21 @@ static void setup_tun_net(char *arg)
 	configure_device(ipfd, tapif, ip);
 
 	/* Expect Guest to handle everything except UFO */
-	add_feature(dev, VIRTIO_NET_F_CSUM);
-	add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
-	add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
-	add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
-	add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
-	add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
-	add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
-	add_feature(dev, VIRTIO_NET_F_HOST_ECN);
+	add_pci_feature(dev, VIRTIO_NET_F_CSUM);
+	add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
+	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
+	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
+	add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
+	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
+	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
+	add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
 	/* We handle indirect ring entries */
-	add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
-	/* We're compliant with the damn spec. */
-	add_feature(dev, VIRTIO_F_ANY_LAYOUT);
-	set_config(dev, sizeof(conf), &conf);
+	add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
+	set_device_config(dev, &conf, sizeof(conf));
 
 	/* We don't need the socket any more; setup is done. */
 	close(ipfd);
 
-	devices.device_num++;
-
 	if (bridging)
 		verbose("device %u: tun %s attached to bridge: %s\n",
 			devices.device_num, tapif, arg);
@@ -1607,7 +2948,7 @@ static void blk_request(struct virtqueue *vq)
 	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 
 	/* Copy the output header from the front of the iov (adjusts iov) */
-	iov_consume(iov, out_num, &out, sizeof(out));
+	iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
 
 	/* Find and trim end of iov input array, for our status byte. */
 	in = NULL;
@@ -1619,7 +2960,7 @@ static void blk_request(struct virtqueue *vq)
 		}
 	}
 	if (!in)
-		errx(1, "Bad virtblk cmd with no room for status");
+		bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
 
 	/*
 	 * For historical reasons, block operations are expressed in 512 byte
@@ -1627,15 +2968,7 @@ static void blk_request(struct virtqueue *vq)
 	 */
 	off = out.sector * 512;
 
-	/*
-	 * In general the virtio block driver is allowed to try SCSI commands.
-	 * It'd be nice if we supported eject, for example, but we don't.
-	 */
-	if (out.type & VIRTIO_BLK_T_SCSI_CMD) {
-		fprintf(stderr, "Scsi commands unsupported\n");
-		*in = VIRTIO_BLK_S_UNSUPP;
-		wlen = sizeof(*in);
-	} else if (out.type & VIRTIO_BLK_T_OUT) {
+	if (out.type & VIRTIO_BLK_T_OUT) {
 		/*
 		 * Write
 		 *
@@ -1657,7 +2990,7 @@ static void blk_request(struct virtqueue *vq)
 			/* Trim it back to the correct length */
 			ftruncate64(vblk->fd, vblk->len);
 			/* Die, bad Guest, die. */
-			errx(1, "Write past end %llu+%u", off, ret);
+			bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
 		}
 
 		wlen = sizeof(*in);
@@ -1699,11 +3032,11 @@ static void setup_block_file(const char *filename)
 	struct vblk_info *vblk;
 	struct virtio_blk_config conf;
 
-	/* Creat the device. */
-	dev = new_device("block", VIRTIO_ID_BLOCK);
+	/* Create the device. */
+	dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
 
 	/* The device has one virtqueue, where the Guest places requests. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
+	add_pci_virtqueue(dev, blk_request, "request");
 
 	/* Allocate the room for our own bookkeeping */
 	vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1712,9 +3045,6 @@ static void setup_block_file(const char *filename)
 	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
 	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
 
-	/* We support FLUSH. */
-	add_feature(dev, VIRTIO_BLK_F_FLUSH);
-
 	/* Tell Guest how many sectors this device has. */
 	conf.capacity = cpu_to_le64(vblk->len / 512);
 
@@ -1722,20 +3052,19 @@ static void setup_block_file(const char *filename)
 	 * Tell Guest not to put in too many descriptors at once: two are used
 	 * for the in and out elements.
 	 */
-	add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
+	add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
 	conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
 
-	/* Don't try to put whole struct: we have 8 bit limit. */
-	set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
+	set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
 
 	verbose("device %u: virtblock %llu sectors\n",
-		++devices.device_num, le64_to_cpu(conf.capacity));
+		devices.device_num, le64_to_cpu(conf.capacity));
 }
 
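Since conf.capacity is always counted in 512-byte sectors regardless of any larger block size, the conversion is a single division; for example, a 1 GiB backing file:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t len = 1ULL << 30;          /* 1 GiB backing file */
        uint64_t capacity = len / 512;      /* in 512-byte sectors */

        assert(capacity == 2097152);
        return 0;
    }
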
 /*L:211
- * Our random number generator device reads from /dev/random into the Guest's
+ * Our random number generator device reads from /dev/urandom into the Guest's
  * input buffers. The usual case is that the Guest doesn't want random numbers
- * and so has no buffers although /dev/random is still readable, whereas
+ * and so has no buffers although /dev/urandom is still readable, whereas
  * console is the reverse.
  *
  * The same logic applies, however.
@@ -1754,7 +3083,7 @@ static void rng_input(struct virtqueue *vq)
 	/* First we need a buffer from the Guests's virtqueue. */
 	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 	if (out_num)
-		errx(1, "Output buffers in rng?");
+		bad_driver_vq(vq, "Output buffers in rng?");
 
 	/*
 	 * Just like the console write, we loop to cover the whole iovec.
@@ -1763,8 +3092,8 @@ static void rng_input(struct virtqueue *vq)
 	while (!iov_empty(iov, in_num)) {
 		len = readv(rng_info->rfd, iov, in_num);
 		if (len <= 0)
-			err(1, "Read from /dev/random gave %i", len);
-		iov_consume(iov, in_num, NULL, len);
+			err(1, "Read from /dev/urandom gave %i", len);
+		iov_consume(vq->dev, iov, in_num, NULL, len);
 		totlen += len;
 	}
 
@@ -1780,17 +3109,20 @@ static void setup_rng(void)
 	struct device *dev;
 	struct rng_info *rng_info = malloc(sizeof(*rng_info));
 
-	/* Our device's privat info simply contains the /dev/random fd. */
-	rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
+	/* Our device's private info simply contains the /dev/urandom fd. */
+	rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
 
 	/* Create the new device. */
-	dev = new_device("rng", VIRTIO_ID_RNG);
+	dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
 	dev->priv = rng_info;
 
 	/* The device has one virtqueue, where the Guest places inbufs. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
+	add_pci_virtqueue(dev, rng_input, "input");
 
-	verbose("device %u: rng\n", devices.device_num++);
+	/* We don't have any configuration space */
+	no_device_config(dev);
+
+	verbose("device %u: rng\n", devices.device_num);
 }
 /* That's the end of device setup. */
 
@@ -1820,17 +3152,23 @@ static void __attribute__((noreturn)) restart_guest(void)
 static void __attribute__((noreturn)) run_guest(void)
 {
 	for (;;) {
-		unsigned long notify_addr;
+		struct lguest_pending notify;
 		int readval;
 
 		/* We read from the /dev/lguest device to run the Guest. */
-		readval = pread(lguest_fd, &notify_addr,
-				sizeof(notify_addr), cpu_id);
-
-		/* One unsigned long means the Guest did HCALL_NOTIFY */
-		if (readval == sizeof(notify_addr)) {
-			verbose("Notify on address %#lx\n", notify_addr);
-			handle_output(notify_addr);
+		readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
+		if (readval == sizeof(notify)) {
+			if (notify.trap == 13) {
+				verbose("Emulating instruction at %#x\n",
+					getreg(eip));
+				emulate_insn(notify.insn);
+			} else if (notify.trap == 14) {
+				verbose("Emulating MMIO at %#x\n",
+					getreg(eip));
+				emulate_mmio(notify.addr, notify.insn);
+			} else
+				errx(1, "Unknown trap %i addr %#08x\n",
+				     notify.trap, notify.addr);
 		/* ENOENT means the Guest died. Reading tells us why. */
 		} else if (errno == ENOENT) {
 			char reason[1024] = { 0 };
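For reference, the pread() above now fills a struct lguest_pending rather than a bare notify address. The shape below is inferred from the usage in this hunk (trap, insn, addr); the authoritative definition is the one this patch adds to include/linux/lguest_launcher.h:

    #include <stdint.h>

    /* Inferred from the code above; see lguest_launcher.h for the real one. */
    struct lguest_pending_sketch {
        uint8_t  trap;       /* 13 = #GP (emulate insn), 14 = #PF (MMIO) */
        uint8_t  insn[7];    /* copy of the faulting instruction bytes */
        uint32_t addr;       /* guest-physical address for MMIO faults */
    };
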
@@ -1893,11 +3231,9 @@ int main(int argc, char *argv[])
 	main_args = argv;
 
 	/*
-	 * First we initialize the device list. We keep a pointer to the last
-	 * device, and the next interrupt number to use for devices (1:
-	 * remember that 0 is used by the timer).
+	 * First we initialize the device list. We remember next interrupt
+	 * number to use for devices (1: remember that 0 is used by the timer).
 	 */
-	devices.lastdev = NULL;
 	devices.next_irq = 1;
 
 	/* We're CPU 0. In fact, that's the only CPU possible right now. */
@@ -1921,12 +3257,14 @@ int main(int argc, char *argv[])
 			guest_base = map_zeroed_pages(mem / getpagesize()
 						      + DEVICE_PAGES);
 			guest_limit = mem;
-			guest_max = mem + DEVICE_PAGES*getpagesize();
-			devices.descpage = get_pages(1);
+			guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
 			break;
 		}
 	}
 
+	/* We always have a console device, and it's always device 1. */
+	setup_console();
+
 	/* The options are fairly straight-forward */
 	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
 		switch (c) {
@@ -1967,8 +3305,8 @@ int main(int argc, char *argv[])
 
 	verbose("Guest base is at %p\n", guest_base);
 
-	/* We always have a console device */
-	setup_console();
+	/* Initialize the (fake) PCI host bridge device. */
+	init_pci_host_bridge();
 
 	/* Now we load the kernel */
 	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));