aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/lguest/lguest.c231
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/block/Kconfig3
-rw-r--r--drivers/block/virtio_blk.c106
-rw-r--r--drivers/char/virtio_console.c4
-rw-r--r--drivers/lguest/lguest_device.c146
-rw-r--r--drivers/net/Kconfig3
-rw-r--r--drivers/net/virtio_net.c155
-rw-r--r--drivers/virtio/Kconfig31
-rw-r--r--drivers/virtio/Makefile2
-rw-r--r--drivers/virtio/virtio.c65
-rw-r--r--drivers/virtio/virtio_balloon.c284
-rw-r--r--drivers/virtio/virtio_pci.c446
-rw-r--r--drivers/virtio/virtio_ring.c51
-rw-r--r--include/linux/lguest_launcher.h9
-rw-r--r--include/linux/skbuff.h1
-rw-r--r--include/linux/virtio.h19
-rw-r--r--include/linux/virtio_balloon.h18
-rw-r--r--include/linux/virtio_blk.h22
-rw-r--r--include/linux/virtio_config.h104
-rw-r--r--include/linux/virtio_net.h32
-rw-r--r--include/linux/virtio_pci.h57
-rw-r--r--include/linux/virtio_ring.h14
-rw-r--r--net/9p/trans_virtio.c8
-rw-r--r--net/core/skbuff.c29
26 files changed, 1422 insertions, 421 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 6c8a2386cd50..0f23d67f958f 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -34,6 +34,8 @@
34#include <zlib.h> 34#include <zlib.h>
35#include <assert.h> 35#include <assert.h>
36#include <sched.h> 36#include <sched.h>
37#include <limits.h>
38#include <stddef.h>
37#include "linux/lguest_launcher.h" 39#include "linux/lguest_launcher.h"
38#include "linux/virtio_config.h" 40#include "linux/virtio_config.h"
39#include "linux/virtio_net.h" 41#include "linux/virtio_net.h"
@@ -99,13 +101,11 @@ struct device_list
99 /* The descriptor page for the devices. */ 101 /* The descriptor page for the devices. */
100 u8 *descpage; 102 u8 *descpage;
101 103
102 /* The tail of the last descriptor. */
103 unsigned int desc_used;
104
105 /* A single linked list of devices. */ 104 /* A single linked list of devices. */
106 struct device *dev; 105 struct device *dev;
107 /* ... And an end pointer so we can easily append new devices */ 106 /* And a pointer to the last device for easy append and also for
108 struct device **lastdev; 107 * configuration appending. */
108 struct device *lastdev;
109}; 109};
110 110
111/* The list of Guest devices, based on command line arguments. */ 111/* The list of Guest devices, based on command line arguments. */
@@ -191,7 +191,14 @@ static void *_convert(struct iovec *iov, size_t size, size_t align,
191#define cpu_to_le64(v64) (v64) 191#define cpu_to_le64(v64) (v64)
192#define le16_to_cpu(v16) (v16) 192#define le16_to_cpu(v16) (v16)
193#define le32_to_cpu(v32) (v32) 193#define le32_to_cpu(v32) (v32)
194#define le64_to_cpu(v32) (v64) 194#define le64_to_cpu(v64) (v64)
195
196/* The device virtqueue descriptors are followed by feature bitmasks. */
197static u8 *get_feature_bits(struct device *dev)
198{
199 return (u8 *)(dev->desc + 1)
200 + dev->desc->num_vq * sizeof(struct lguest_vqconfig);
201}
195 202
196/*L:100 The Launcher code itself takes us out into userspace, that scary place 203/*L:100 The Launcher code itself takes us out into userspace, that scary place
197 * where pointers run wild and free! Unfortunately, like most userspace 204 * where pointers run wild and free! Unfortunately, like most userspace
@@ -914,21 +921,58 @@ static void enable_fd(int fd, struct virtqueue *vq)
914 write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); 921 write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
915} 922}
916 923
924/* Resetting a device is fairly easy. */
925static void reset_device(struct device *dev)
926{
927 struct virtqueue *vq;
928
929 verbose("Resetting device %s\n", dev->name);
930 /* Clear the status. */
931 dev->desc->status = 0;
932
933 /* Clear any features they've acked. */
934 memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
935 dev->desc->feature_len);
936
937 /* Zero out the virtqueues. */
938 for (vq = dev->vq; vq; vq = vq->next) {
939 memset(vq->vring.desc, 0,
940 vring_size(vq->config.num, getpagesize()));
941 vq->last_avail_idx = 0;
942 }
943}
944
917/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 945/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
918static void handle_output(int fd, unsigned long addr) 946static void handle_output(int fd, unsigned long addr)
919{ 947{
920 struct device *i; 948 struct device *i;
921 struct virtqueue *vq; 949 struct virtqueue *vq;
922 950
923 /* Check each virtqueue. */ 951 /* Check each device and virtqueue. */
924 for (i = devices.dev; i; i = i->next) { 952 for (i = devices.dev; i; i = i->next) {
953 /* Notifications to device descriptors reset the device. */
954 if (from_guest_phys(addr) == i->desc) {
955 reset_device(i);
956 return;
957 }
958
959 /* Notifications to virtqueues mean output has occurred. */
925 for (vq = i->vq; vq; vq = vq->next) { 960 for (vq = i->vq; vq; vq = vq->next) {
926 if (vq->config.pfn == addr/getpagesize() 961 if (vq->config.pfn != addr/getpagesize())
927 && vq->handle_output) { 962 continue;
928 verbose("Output to %s\n", vq->dev->name); 963
929 vq->handle_output(fd, vq); 964 /* Guest should acknowledge (and set features!) before
965 * using the device. */
966 if (i->desc->status == 0) {
967 warnx("%s gave early output", i->name);
930 return; 968 return;
931 } 969 }
970
971 if (strcmp(vq->dev->name, "console") != 0)
972 verbose("Output to %s\n", vq->dev->name);
973 if (vq->handle_output)
974 vq->handle_output(fd, vq);
975 return;
932 } 976 }
933 } 977 }
934 978
@@ -986,54 +1030,44 @@ static void handle_input(int fd)
986 * 1030 *
987 * All devices need a descriptor so the Guest knows it exists, and a "struct 1031 * All devices need a descriptor so the Guest knows it exists, and a "struct
988 * device" so the Launcher can keep track of it. We have common helper 1032 * device" so the Launcher can keep track of it. We have common helper
989 * routines to allocate them. 1033 * routines to allocate and manage them. */
990 *
991 * This routine allocates a new "struct lguest_device_desc" from descriptor
992 * table just above the Guest's normal memory. It returns a pointer to that
993 * descriptor. */
994static struct lguest_device_desc *new_dev_desc(u16 type)
995{
996 struct lguest_device_desc *d;
997 1034
998 /* We only have one page for all the descriptors. */ 1035/* The layout of the device page is a "struct lguest_device_desc" followed by a
999 if (devices.desc_used + sizeof(*d) > getpagesize()) 1036 * number of virtqueue descriptors, then two sets of feature bits, then an
1000 errx(1, "Too many devices"); 1037 * array of configuration bytes. This routine returns the configuration
1001 1038 * pointer. */
1002 /* We don't need to set config_len or status: page is 0 already. */ 1039static u8 *device_config(const struct device *dev)
1003 d = (void *)devices.descpage + devices.desc_used; 1040{
1004 d->type = type; 1041 return (void *)(dev->desc + 1)
1005 devices.desc_used += sizeof(*d); 1042 + dev->desc->num_vq * sizeof(struct lguest_vqconfig)
1006 1043 + dev->desc->feature_len * 2;
1007 return d;
1008} 1044}
1009 1045
1010/* Each device descriptor is followed by some configuration information. 1046/* This routine allocates a new "struct lguest_device_desc" from descriptor
1011 * Each configuration field looks like: u8 type, u8 len, [... len bytes...]. 1047 * table page just above the Guest's normal memory. It returns a pointer to
1012 * 1048 * that descriptor. */
1013 * This routine adds a new field to an existing device's descriptor. It only 1049static struct lguest_device_desc *new_dev_desc(u16 type)
1014 * works for the last device, but that's OK because that's how we use it. */
1015static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
1016{ 1050{
1017 /* This is the last descriptor, right? */ 1051 struct lguest_device_desc d = { .type = type };
1018 assert(devices.descpage + devices.desc_used 1052 void *p;
1019 == (u8 *)(dev->desc + 1) + dev->desc->config_len);
1020 1053
1021 /* We only have one page of device descriptions. */ 1054 /* Figure out where the next device config is, based on the last one. */
1022 if (devices.desc_used + 2 + len > getpagesize()) 1055 if (devices.lastdev)
1023 errx(1, "Too many devices"); 1056 p = device_config(devices.lastdev)
1057 + devices.lastdev->desc->config_len;
1058 else
1059 p = devices.descpage;
1024 1060
1025 /* Copy in the new config header: type then length. */ 1061 /* We only have one page for all the descriptors. */
1026 devices.descpage[devices.desc_used++] = type; 1062 if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
1027 devices.descpage[devices.desc_used++] = len; 1063 errx(1, "Too many devices");
1028 memcpy(devices.descpage + devices.desc_used, c, len);
1029 devices.desc_used += len;
1030 1064
1031 /* Update the device descriptor length: two byte head then data. */ 1065 /* p might not be aligned, so we memcpy in. */
1032 dev->desc->config_len += 2 + len; 1066 return memcpy(p, &d, sizeof(d));
1033} 1067}
1034 1068
1035/* This routine adds a virtqueue to a device. We specify how many descriptors 1069/* Each device descriptor is followed by the description of its virtqueues. We
1036 * the virtqueue is to have. */ 1070 * specify how many descriptors the virtqueue is to have. */
1037static void add_virtqueue(struct device *dev, unsigned int num_descs, 1071static void add_virtqueue(struct device *dev, unsigned int num_descs,
1038 void (*handle_output)(int fd, struct virtqueue *me)) 1072 void (*handle_output)(int fd, struct virtqueue *me))
1039{ 1073{
@@ -1059,9 +1093,15 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1059 /* Initialize the vring. */ 1093 /* Initialize the vring. */
1060 vring_init(&vq->vring, num_descs, p, getpagesize()); 1094 vring_init(&vq->vring, num_descs, p, getpagesize());
1061 1095
1062 /* Add the configuration information to this device's descriptor. */ 1096 /* Append virtqueue to this device's descriptor. We use
1063 add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, 1097 * device_config() to get the end of the device's current virtqueues;
1064 sizeof(vq->config), &vq->config); 1098 * we check that we haven't added any config or feature information
1099 * yet, otherwise we'd be overwriting them. */
1100 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1101 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1102 dev->desc->num_vq++;
1103
1104 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
1065 1105
1066 /* Add to tail of list, so dev->vq is first vq, dev->vq->next is 1106 /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
1067 * second. */ 1107 * second. */
@@ -1072,11 +1112,41 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1072 * virtqueue. */ 1112 * virtqueue. */
1073 vq->handle_output = handle_output; 1113 vq->handle_output = handle_output;
1074 1114
1075 /* Set the "Don't Notify Me" flag if we don't have a handler */ 1115 /* As an optimization, set the advisory "Don't Notify Me" flag if we
1116 * don't have a handler */
1076 if (!handle_output) 1117 if (!handle_output)
1077 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; 1118 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1078} 1119}
1079 1120
1121/* The first half of the feature bitmask is for us to advertise features. The
1122 * second half if for the Guest to accept features. */
1123static void add_feature(struct device *dev, unsigned bit)
1124{
1125 u8 *features = get_feature_bits(dev);
1126
1127 /* We can't extend the feature bits once we've added config bytes */
1128 if (dev->desc->feature_len <= bit / CHAR_BIT) {
1129 assert(dev->desc->config_len == 0);
1130 dev->desc->feature_len = (bit / CHAR_BIT) + 1;
1131 }
1132
1133 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
1134}
1135
1136/* This routine sets the configuration fields for an existing device's
1137 * descriptor. It only works for the last device, but that's OK because that's
1138 * how we use it. */
1139static void set_config(struct device *dev, unsigned len, const void *conf)
1140{
1141 /* Check we haven't overflowed our single page. */
1142 if (device_config(dev) + len > devices.descpage + getpagesize())
1143 errx(1, "Too many devices");
1144
1145 /* Copy in the config information, and store the length. */
1146 memcpy(device_config(dev), conf, len);
1147 dev->desc->config_len = len;
1148}
1149
1080/* This routine does all the creation and setup of a new device, including 1150/* This routine does all the creation and setup of a new device, including
1081 * calling new_dev_desc() to allocate the descriptor and device memory. */ 1151 * calling new_dev_desc() to allocate the descriptor and device memory. */
1082static struct device *new_device(const char *name, u16 type, int fd, 1152static struct device *new_device(const char *name, u16 type, int fd,
@@ -1084,14 +1154,6 @@ static struct device *new_device(const char *name, u16 type, int fd,
1084{ 1154{
1085 struct device *dev = malloc(sizeof(*dev)); 1155 struct device *dev = malloc(sizeof(*dev));
1086 1156
1087 /* Append to device list. Prepending to a single-linked list is
1088 * easier, but the user expects the devices to be arranged on the bus
1089 * in command-line order. The first network device on the command line
1090 * is eth0, the first block device /dev/vda, etc. */
1091 *devices.lastdev = dev;
1092 dev->next = NULL;
1093 devices.lastdev = &dev->next;
1094
1095 /* Now we populate the fields one at a time. */ 1157 /* Now we populate the fields one at a time. */
1096 dev->fd = fd; 1158 dev->fd = fd;
1097 /* If we have an input handler for this file descriptor, then we add it 1159 /* If we have an input handler for this file descriptor, then we add it
@@ -1102,6 +1164,17 @@ static struct device *new_device(const char *name, u16 type, int fd,
1102 dev->handle_input = handle_input; 1164 dev->handle_input = handle_input;
1103 dev->name = name; 1165 dev->name = name;
1104 dev->vq = NULL; 1166 dev->vq = NULL;
1167
1168 /* Append to device list. Prepending to a single-linked list is
1169 * easier, but the user expects the devices to be arranged on the bus
1170 * in command-line order. The first network device on the command line
1171 * is eth0, the first block device /dev/vda, etc. */
1172 if (devices.lastdev)
1173 devices.lastdev->next = dev;
1174 else
1175 devices.dev = dev;
1176 devices.lastdev = dev;
1177
1105 return dev; 1178 return dev;
1106} 1179}
1107 1180
@@ -1226,7 +1299,7 @@ static void setup_tun_net(const char *arg)
1226 int netfd, ipfd; 1299 int netfd, ipfd;
1227 u32 ip; 1300 u32 ip;
1228 const char *br_name = NULL; 1301 const char *br_name = NULL;
1229 u8 hwaddr[6]; 1302 struct virtio_net_config conf;
1230 1303
1231 /* We open the /dev/net/tun device and tell it we want a tap device. A 1304 /* We open the /dev/net/tun device and tell it we want a tap device. A
1232 * tap device is like a tun device, only somehow different. To tell 1305 * tap device is like a tun device, only somehow different. To tell
@@ -1265,12 +1338,13 @@ static void setup_tun_net(const char *arg)
1265 ip = str2ip(arg); 1338 ip = str2ip(arg);
1266 1339
1267 /* Set up the tun device, and get the mac address for the interface. */ 1340 /* Set up the tun device, and get the mac address for the interface. */
1268 configure_device(ipfd, ifr.ifr_name, ip, hwaddr); 1341 configure_device(ipfd, ifr.ifr_name, ip, conf.mac);
1269 1342
1270 /* Tell Guest what MAC address to use. */ 1343 /* Tell Guest what MAC address to use. */
1271 add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); 1344 add_feature(dev, VIRTIO_NET_F_MAC);
1345 set_config(dev, sizeof(conf), &conf);
1272 1346
1273 /* We don't seed the socket any more; setup is done. */ 1347 /* We don't need the socket any more; setup is done. */
1274 close(ipfd); 1348 close(ipfd);
1275 1349
1276 verbose("device %u: tun net %u.%u.%u.%u\n", 1350 verbose("device %u: tun net %u.%u.%u.%u\n",
@@ -1458,8 +1532,7 @@ static void setup_block_file(const char *filename)
1458 struct device *dev; 1532 struct device *dev;
1459 struct vblk_info *vblk; 1533 struct vblk_info *vblk;
1460 void *stack; 1534 void *stack;
1461 u64 cap; 1535 struct virtio_blk_config conf;
1462 unsigned int val;
1463 1536
1464 /* This is the pipe the I/O thread will use to tell us I/O is done. */ 1537 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1465 pipe(p); 1538 pipe(p);
@@ -1477,14 +1550,18 @@ static void setup_block_file(const char *filename)
1477 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 1550 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1478 vblk->len = lseek64(vblk->fd, 0, SEEK_END); 1551 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1479 1552
1553 /* We support barriers. */
1554 add_feature(dev, VIRTIO_BLK_F_BARRIER);
1555
1480 /* Tell Guest how many sectors this device has. */ 1556 /* Tell Guest how many sectors this device has. */
1481 cap = cpu_to_le64(vblk->len / 512); 1557 conf.capacity = cpu_to_le64(vblk->len / 512);
1482 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
1483 1558
1484 /* Tell Guest not to put in too many descriptors at once: two are used 1559 /* Tell Guest not to put in too many descriptors at once: two are used
1485 * for the in and out elements. */ 1560 * for the in and out elements. */
1486 val = cpu_to_le32(VIRTQUEUE_NUM - 2); 1561 add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
1487 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); 1562 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
1563
1564 set_config(dev, sizeof(conf), &conf);
1488 1565
1489 /* The I/O thread writes to this end of the pipe when done. */ 1566 /* The I/O thread writes to this end of the pipe when done. */
1490 vblk->done_fd = p[1]; 1567 vblk->done_fd = p[1];
@@ -1505,7 +1582,7 @@ static void setup_block_file(const char *filename)
1505 close(vblk->workpipe[0]); 1582 close(vblk->workpipe[0]);
1506 1583
1507 verbose("device %u: virtblock %llu sectors\n", 1584 verbose("device %u: virtblock %llu sectors\n",
1508 devices.device_num, cap); 1585 devices.device_num, le64_to_cpu(conf.capacity));
1509} 1586}
1510/* That's the end of device setup. :*/ 1587/* That's the end of device setup. :*/
1511 1588
@@ -1610,12 +1687,12 @@ int main(int argc, char *argv[])
1610 /* First we initialize the device list. Since console and network 1687 /* First we initialize the device list. Since console and network
1611 * device receive input from a file descriptor, we keep an fdset 1688 * device receive input from a file descriptor, we keep an fdset
1612 * (infds) and the maximum fd number (max_infd) with the head of the 1689 * (infds) and the maximum fd number (max_infd) with the head of the
1613 * list. We also keep a pointer to the last device, for easy appending 1690 * list. We also keep a pointer to the last device. Finally, we keep
1614 * to the list. Finally, we keep the next interrupt number to hand out 1691 * the next interrupt number to hand out (1: remember that 0 is used by
1615 * (1: remember that 0 is used by the timer). */ 1692 * the timer). */
1616 FD_ZERO(&devices.infds); 1693 FD_ZERO(&devices.infds);
1617 devices.max_infd = -1; 1694 devices.max_infd = -1;
1618 devices.lastdev = &devices.dev; 1695 devices.lastdev = NULL;
1619 devices.next_irq = 1; 1696 devices.next_irq = 1;
1620 1697
1621 cpu_id = 0; 1698 cpu_id = 0;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index c83e1c9b5129..41962e793c0f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -53,5 +53,6 @@ config KVM_AMD
53# OK, it's a little counter-intuitive to do this, but it puts it neatly under 53# OK, it's a little counter-intuitive to do this, but it puts it neatly under
54# the virtualization menu. 54# the virtualization menu.
55source drivers/lguest/Kconfig 55source drivers/lguest/Kconfig
56source drivers/virtio/Kconfig
56 57
57endif # VIRTUALIZATION 58endif # VIRTUALIZATION
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 08d4ae201597..3f8a231fe754 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -91,6 +91,4 @@ source "drivers/dca/Kconfig"
91source "drivers/auxdisplay/Kconfig" 91source "drivers/auxdisplay/Kconfig"
92 92
93source "drivers/uio/Kconfig" 93source "drivers/uio/Kconfig"
94
95source "drivers/virtio/Kconfig"
96endmenu 94endmenu
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f2122855d4ec..64e5148d82bc 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -440,6 +440,7 @@ config VIRTIO_BLK
440 tristate "Virtio block driver (EXPERIMENTAL)" 440 tristate "Virtio block driver (EXPERIMENTAL)"
441 depends on EXPERIMENTAL && VIRTIO 441 depends on EXPERIMENTAL && VIRTIO
442 ---help--- 442 ---help---
443 This is the virtual block driver for lguest. Say Y or M. 443 This is the virtual block driver for virtio. It can be used with
444 lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
444 445
445endif # BLK_DEV 446endif # BLK_DEV
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 924ddd8bccd2..3b1a68d6eddb 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -7,8 +7,10 @@
7#include <linux/scatterlist.h> 7#include <linux/scatterlist.h>
8 8
9#define VIRTIO_MAX_SG (3+MAX_PHYS_SEGMENTS) 9#define VIRTIO_MAX_SG (3+MAX_PHYS_SEGMENTS)
10#define PART_BITS 4
11
12static int major, index;
10 13
11static unsigned char virtblk_index = 'a';
12struct virtio_blk 14struct virtio_blk
13{ 15{
14 spinlock_t lock; 16 spinlock_t lock;
@@ -36,7 +38,7 @@ struct virtblk_req
36 struct virtio_blk_inhdr in_hdr; 38 struct virtio_blk_inhdr in_hdr;
37}; 39};
38 40
39static bool blk_done(struct virtqueue *vq) 41static void blk_done(struct virtqueue *vq)
40{ 42{
41 struct virtio_blk *vblk = vq->vdev->priv; 43 struct virtio_blk *vblk = vq->vdev->priv;
42 struct virtblk_req *vbr; 44 struct virtblk_req *vbr;
@@ -65,7 +67,6 @@ static bool blk_done(struct virtqueue *vq)
65 /* In case queue is stopped waiting for more buffers. */ 67 /* In case queue is stopped waiting for more buffers. */
66 blk_start_queue(vblk->disk->queue); 68 blk_start_queue(vblk->disk->queue);
67 spin_unlock_irqrestore(&vblk->lock, flags); 69 spin_unlock_irqrestore(&vblk->lock, flags);
68 return true;
69} 70}
70 71
71static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 72static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
@@ -153,20 +154,37 @@ static int virtblk_ioctl(struct inode *inode, struct file *filp,
153 (void __user *)data); 154 (void __user *)data);
154} 155}
155 156
157/* We provide getgeo only to please some old bootloader/partitioning tools */
158static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
159{
160 /* some standard values, similar to sd */
161 geo->heads = 1 << 6;
162 geo->sectors = 1 << 5;
163 geo->cylinders = get_capacity(bd->bd_disk) >> 11;
164 return 0;
165}
166
156static struct block_device_operations virtblk_fops = { 167static struct block_device_operations virtblk_fops = {
157 .ioctl = virtblk_ioctl, 168 .ioctl = virtblk_ioctl,
158 .owner = THIS_MODULE, 169 .owner = THIS_MODULE,
170 .getgeo = virtblk_getgeo,
159}; 171};
160 172
173static int index_to_minor(int index)
174{
175 return index << PART_BITS;
176}
177
161static int virtblk_probe(struct virtio_device *vdev) 178static int virtblk_probe(struct virtio_device *vdev)
162{ 179{
163 struct virtio_blk *vblk; 180 struct virtio_blk *vblk;
164 int err, major; 181 int err;
165 void *token;
166 unsigned int len;
167 u64 cap; 182 u64 cap;
168 u32 v; 183 u32 v;
169 184
185 if (index_to_minor(index) >= 1 << MINORBITS)
186 return -ENOSPC;
187
170 vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); 188 vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
171 if (!vblk) { 189 if (!vblk) {
172 err = -ENOMEM; 190 err = -ENOMEM;
@@ -178,7 +196,7 @@ static int virtblk_probe(struct virtio_device *vdev)
178 vblk->vdev = vdev; 196 vblk->vdev = vdev;
179 197
180 /* We expect one virtqueue, for output. */ 198 /* We expect one virtqueue, for output. */
181 vblk->vq = vdev->config->find_vq(vdev, blk_done); 199 vblk->vq = vdev->config->find_vq(vdev, 0, blk_done);
182 if (IS_ERR(vblk->vq)) { 200 if (IS_ERR(vblk->vq)) {
183 err = PTR_ERR(vblk->vq); 201 err = PTR_ERR(vblk->vq);
184 goto out_free_vblk; 202 goto out_free_vblk;
@@ -190,17 +208,11 @@ static int virtblk_probe(struct virtio_device *vdev)
190 goto out_free_vq; 208 goto out_free_vq;
191 } 209 }
192 210
193 major = register_blkdev(0, "virtblk");
194 if (major < 0) {
195 err = major;
196 goto out_mempool;
197 }
198
199 /* FIXME: How many partitions? How long is a piece of string? */ 211 /* FIXME: How many partitions? How long is a piece of string? */
200 vblk->disk = alloc_disk(1 << 4); 212 vblk->disk = alloc_disk(1 << PART_BITS);
201 if (!vblk->disk) { 213 if (!vblk->disk) {
202 err = -ENOMEM; 214 err = -ENOMEM;
203 goto out_unregister_blkdev; 215 goto out_mempool;
204 } 216 }
205 217
206 vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); 218 vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
@@ -209,22 +221,32 @@ static int virtblk_probe(struct virtio_device *vdev)
209 goto out_put_disk; 221 goto out_put_disk;
210 } 222 }
211 223
212 sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); 224 if (index < 26) {
225 sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
226 } else if (index < (26 + 1) * 26) {
227 sprintf(vblk->disk->disk_name, "vd%c%c",
228 'a' + index / 26 - 1, 'a' + index % 26);
229 } else {
230 const unsigned int m1 = (index / 26 - 1) / 26 - 1;
231 const unsigned int m2 = (index / 26 - 1) % 26;
232 const unsigned int m3 = index % 26;
233 sprintf(vblk->disk->disk_name, "vd%c%c%c",
234 'a' + m1, 'a' + m2, 'a' + m3);
235 }
236
213 vblk->disk->major = major; 237 vblk->disk->major = major;
214 vblk->disk->first_minor = 0; 238 vblk->disk->first_minor = index_to_minor(index);
215 vblk->disk->private_data = vblk; 239 vblk->disk->private_data = vblk;
216 vblk->disk->fops = &virtblk_fops; 240 vblk->disk->fops = &virtblk_fops;
241 index++;
217 242
218 /* If barriers are supported, tell block layer that queue is ordered */ 243 /* If barriers are supported, tell block layer that queue is ordered */
219 token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len); 244 if (vdev->config->feature(vdev, VIRTIO_BLK_F_BARRIER))
220 if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER))
221 blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); 245 blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
222 246
223 err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap); 247 /* Host must always specify the capacity. */
224 if (err) { 248 __virtio_config_val(vdev, offsetof(struct virtio_blk_config, capacity),
225 dev_err(&vdev->dev, "Bad/missing capacity in config\n"); 249 &cap);
226 goto out_cleanup_queue;
227 }
228 250
229 /* If capacity is too big, truncate with warning. */ 251 /* If capacity is too big, truncate with warning. */
230 if ((sector_t)cap != cap) { 252 if ((sector_t)cap != cap) {
@@ -234,31 +256,25 @@ static int virtblk_probe(struct virtio_device *vdev)
234 } 256 }
235 set_capacity(vblk->disk, cap); 257 set_capacity(vblk->disk, cap);
236 258
237 err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v); 259 /* Host can optionally specify maximum segment size and number of
260 * segments. */
261 err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX,
262 offsetof(struct virtio_blk_config, size_max),
263 &v);
238 if (!err) 264 if (!err)
239 blk_queue_max_segment_size(vblk->disk->queue, v); 265 blk_queue_max_segment_size(vblk->disk->queue, v);
240 else if (err != -ENOENT) {
241 dev_err(&vdev->dev, "Bad SIZE_MAX in config\n");
242 goto out_cleanup_queue;
243 }
244 266
245 err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v); 267 err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
268 offsetof(struct virtio_blk_config, seg_max),
269 &v);
246 if (!err) 270 if (!err)
247 blk_queue_max_hw_segments(vblk->disk->queue, v); 271 blk_queue_max_hw_segments(vblk->disk->queue, v);
248 else if (err != -ENOENT) {
249 dev_err(&vdev->dev, "Bad SEG_MAX in config\n");
250 goto out_cleanup_queue;
251 }
252 272
253 add_disk(vblk->disk); 273 add_disk(vblk->disk);
254 return 0; 274 return 0;
255 275
256out_cleanup_queue:
257 blk_cleanup_queue(vblk->disk->queue);
258out_put_disk: 276out_put_disk:
259 put_disk(vblk->disk); 277 put_disk(vblk->disk);
260out_unregister_blkdev:
261 unregister_blkdev(major, "virtblk");
262out_mempool: 278out_mempool:
263 mempool_destroy(vblk->pool); 279 mempool_destroy(vblk->pool);
264out_free_vq: 280out_free_vq:
@@ -274,12 +290,16 @@ static void virtblk_remove(struct virtio_device *vdev)
274 struct virtio_blk *vblk = vdev->priv; 290 struct virtio_blk *vblk = vdev->priv;
275 int major = vblk->disk->major; 291 int major = vblk->disk->major;
276 292
293 /* Nothing should be pending. */
277 BUG_ON(!list_empty(&vblk->reqs)); 294 BUG_ON(!list_empty(&vblk->reqs));
295
296 /* Stop all the virtqueues. */
297 vdev->config->reset(vdev);
298
278 blk_cleanup_queue(vblk->disk->queue); 299 blk_cleanup_queue(vblk->disk->queue);
279 put_disk(vblk->disk); 300 put_disk(vblk->disk);
280 unregister_blkdev(major, "virtblk"); 301 unregister_blkdev(major, "virtblk");
281 mempool_destroy(vblk->pool); 302 mempool_destroy(vblk->pool);
282 /* There should be nothing in the queue now, so no need to shutdown */
283 vdev->config->del_vq(vblk->vq); 303 vdev->config->del_vq(vblk->vq);
284 kfree(vblk); 304 kfree(vblk);
285} 305}
@@ -299,11 +319,15 @@ static struct virtio_driver virtio_blk = {
299 319
300static int __init init(void) 320static int __init init(void)
301{ 321{
322 major = register_blkdev(0, "virtblk");
323 if (major < 0)
324 return major;
302 return register_virtio_driver(&virtio_blk); 325 return register_virtio_driver(&virtio_blk);
303} 326}
304 327
305static void __exit fini(void) 328static void __exit fini(void)
306{ 329{
330 unregister_blkdev(major, "virtblk");
307 unregister_virtio_driver(&virtio_blk); 331 unregister_virtio_driver(&virtio_blk);
308} 332}
309module_init(init); 333module_init(init);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index e34da5c97196..dc17fe3a88bc 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -158,13 +158,13 @@ static int __devinit virtcons_probe(struct virtio_device *dev)
158 /* Find the input queue. */ 158 /* Find the input queue. */
159 /* FIXME: This is why we want to wean off hvc: we do nothing 159 /* FIXME: This is why we want to wean off hvc: we do nothing
160 * when input comes in. */ 160 * when input comes in. */
161 in_vq = vdev->config->find_vq(vdev, NULL); 161 in_vq = vdev->config->find_vq(vdev, 0, NULL);
162 if (IS_ERR(in_vq)) { 162 if (IS_ERR(in_vq)) {
163 err = PTR_ERR(in_vq); 163 err = PTR_ERR(in_vq);
164 goto free; 164 goto free;
165 } 165 }
166 166
167 out_vq = vdev->config->find_vq(vdev, NULL); 167 out_vq = vdev->config->find_vq(vdev, 1, NULL);
168 if (IS_ERR(out_vq)) { 168 if (IS_ERR(out_vq)) {
169 err = PTR_ERR(out_vq); 169 err = PTR_ERR(out_vq);
170 goto free_in_vq; 170 goto free_in_vq;
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index e2eec38c83c2..84f85e23cca7 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -52,57 +52,82 @@ struct lguest_device {
52/*D:130 52/*D:130
53 * Device configurations 53 * Device configurations
54 * 54 *
55 * The configuration information for a device consists of a series of fields. 55 * The configuration information for a device consists of one or more
 56 * We don't really care what they are: the Launcher set them up, and the driver 56 * virtqueues, a feature bitmask, and some configuration bytes. The
57 * will look at them during setup. 57 * configuration bytes don't really matter to us: the Launcher sets them up, and
58 * the driver will look at them during setup.
58 * 59 *
59 * For us these fields come immediately after that device's descriptor in the 60 * A convenient routine to return the device's virtqueue config array:
60 * lguest_devices page. 61 * immediately after the descriptor. */
61 * 62static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
62 * Each field starts with a "type" byte, a "length" byte, then that number of 63{
63 * bytes of configuration information. The device descriptor tells us the 64 return (void *)(desc + 1);
64 * total configuration length so we know when we've reached the last field. */ 65}
65 66
66/* type + length bytes */ 67/* The features come immediately after the virtqueues. */
67#define FHDR_LEN 2 68static u8 *lg_features(const struct lguest_device_desc *desc)
69{
70 return (void *)(lg_vq(desc) + desc->num_vq);
71}
68 72
69/* This finds the first field of a given type for a device's configuration. */ 73/* The config space comes after the two feature bitmasks. */
70static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len) 74static u8 *lg_config(const struct lguest_device_desc *desc)
71{ 75{
72 struct lguest_device_desc *desc = to_lgdev(vdev)->desc; 76 return lg_features(desc) + desc->feature_len * 2;
73 int i; 77}
74
75 for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) {
76 if (desc->config[i] == type) {
77 /* Mark it used, so Host can know we looked at it, and
78 * also so we won't find the same one twice. */
79 desc->config[i] |= 0x80;
80 /* Remember, the second byte is the length. */
81 *len = desc->config[i+1];
82 /* We return a pointer to the field header. */
83 return desc->config + i;
84 }
85 }
86 78
87 /* Not found: return NULL for failure. */ 79/* The total size of the config page used by this device (incl. desc) */
88 return NULL; 80static unsigned desc_size(const struct lguest_device_desc *desc)
81{
82 return sizeof(*desc)
83 + desc->num_vq * sizeof(struct lguest_vqconfig)
84 + desc->feature_len * 2
85 + desc->config_len;
86}
87
 88/* This tests (and acknowledges) a feature bit. */
89static bool lg_feature(struct virtio_device *vdev, unsigned fbit)
90{
91 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
92 u8 *features;
93
94 /* Obviously if they ask for a feature off the end of our feature
95 * bitmap, it's not set. */
96 if (fbit / 8 > desc->feature_len)
97 return false;
98
99 /* The feature bitmap comes after the virtqueues. */
100 features = lg_features(desc);
101 if (!(features[fbit / 8] & (1 << (fbit % 8))))
102 return false;
103
104 /* We set the matching bit in the other half of the bitmap to tell the
105 * Host we want to use this feature. We don't use this yet, but we
106 * could in future. */
107 features[desc->feature_len + fbit / 8] |= (1 << (fbit % 8));
108 return true;
89} 109}
90 110
91/* Once they've found a field, getting a copy of it is easy. */ 111/* Once they've found a field, getting a copy of it is easy. */
92static void lg_get(struct virtio_device *vdev, void *token, 112static void lg_get(struct virtio_device *vdev, unsigned int offset,
93 void *buf, unsigned len) 113 void *buf, unsigned len)
94{ 114{
95 /* Check they didn't ask for more than the length of the field! */ 115 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
96 BUG_ON(len > ((u8 *)token)[1]); 116
97 memcpy(buf, token + FHDR_LEN, len); 117 /* Check they didn't ask for more than the length of the config! */
118 BUG_ON(offset + len > desc->config_len);
119 memcpy(buf, lg_config(desc) + offset, len);
98} 120}
99 121
100/* Setting the contents is also trivial. */ 122/* Setting the contents is also trivial. */
101static void lg_set(struct virtio_device *vdev, void *token, 123static void lg_set(struct virtio_device *vdev, unsigned int offset,
102 const void *buf, unsigned len) 124 const void *buf, unsigned len)
103{ 125{
104 BUG_ON(len > ((u8 *)token)[1]); 126 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
105 memcpy(token + FHDR_LEN, buf, len); 127
128 /* Check they didn't ask for more than the length of the config! */
129 BUG_ON(offset + len > desc->config_len);
130 memcpy(lg_config(desc) + offset, buf, len);
106} 131}
107 132
108/* The operations to get and set the status word just access the status field 133/* The operations to get and set the status word just access the status field
@@ -114,9 +139,20 @@ static u8 lg_get_status(struct virtio_device *vdev)
114 139
115static void lg_set_status(struct virtio_device *vdev, u8 status) 140static void lg_set_status(struct virtio_device *vdev, u8 status)
116{ 141{
142 BUG_ON(!status);
117 to_lgdev(vdev)->desc->status = status; 143 to_lgdev(vdev)->desc->status = status;
118} 144}
119 145
146/* To reset the device, we (ab)use the NOTIFY hypercall, with the descriptor
147 * address of the device. The Host will zero the status and all the
148 * features. */
149static void lg_reset(struct virtio_device *vdev)
150{
151 unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
152
153 hcall(LHCALL_NOTIFY, (max_pfn<<PAGE_SHIFT) + offset, 0, 0);
154}
155
120/* 156/*
121 * Virtqueues 157 * Virtqueues
122 * 158 *
@@ -165,39 +201,29 @@ static void lg_notify(struct virtqueue *vq)
165 * 201 *
166 * So we provide devices with a "find virtqueue and set it up" function. */ 202 * So we provide devices with a "find virtqueue and set it up" function. */
167static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 203static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
168 bool (*callback)(struct virtqueue *vq)) 204 unsigned index,
205 void (*callback)(struct virtqueue *vq))
169{ 206{
207 struct lguest_device *ldev = to_lgdev(vdev);
170 struct lguest_vq_info *lvq; 208 struct lguest_vq_info *lvq;
171 struct virtqueue *vq; 209 struct virtqueue *vq;
172 unsigned int len;
173 void *token;
174 int err; 210 int err;
175 211
176 /* Look for a field of the correct type to mark a virtqueue. Note that 212 /* We must have this many virtqueues. */
177 * if this succeeds, then the type will be changed so it won't be found 213 if (index >= ldev->desc->num_vq)
178 * again, and future lg_find_vq() calls will find the next
179 * virtqueue (if any). */
180 token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len);
181 if (!token)
182 return ERR_PTR(-ENOENT); 214 return ERR_PTR(-ENOENT);
183 215
184 lvq = kmalloc(sizeof(*lvq), GFP_KERNEL); 216 lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
185 if (!lvq) 217 if (!lvq)
186 return ERR_PTR(-ENOMEM); 218 return ERR_PTR(-ENOMEM);
187 219
188 /* Note: we could use a configuration space inside here, just like we 220 /* Make a copy of the "struct lguest_vqconfig" entry, which sits after
189 * do for the device. This would allow expansion in future, because 221 * the descriptor. We need a copy because the config space might not
190 * our configuration system is designed to be expansible. But this is 222 * be aligned correctly. */
191 * way easier. */ 223 memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
192 if (len != sizeof(lvq->config)) {
193 dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len);
194 err = -EIO;
195 goto free_lvq;
196 }
197 /* Make a copy of the "struct lguest_vqconfig" field. We need a copy
198 * because the config space might not be aligned correctly. */
199 vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config));
200 224
225 printk("Mapping virtqueue %i addr %lx\n", index,
226 (unsigned long)lvq->config.pfn << PAGE_SHIFT);
201 /* Figure out how many pages the ring will take, and map that memory */ 227 /* Figure out how many pages the ring will take, and map that memory */
202 lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT, 228 lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
203 DIV_ROUND_UP(vring_size(lvq->config.num, 229 DIV_ROUND_UP(vring_size(lvq->config.num,
@@ -259,11 +285,12 @@ static void lg_del_vq(struct virtqueue *vq)
259 285
260/* The ops structure which hooks everything together. */ 286/* The ops structure which hooks everything together. */
261static struct virtio_config_ops lguest_config_ops = { 287static struct virtio_config_ops lguest_config_ops = {
262 .find = lg_find, 288 .feature = lg_feature,
263 .get = lg_get, 289 .get = lg_get,
264 .set = lg_set, 290 .set = lg_set,
265 .get_status = lg_get_status, 291 .get_status = lg_get_status,
266 .set_status = lg_set_status, 292 .set_status = lg_set_status,
293 .reset = lg_reset,
267 .find_vq = lg_find_vq, 294 .find_vq = lg_find_vq,
268 .del_vq = lg_del_vq, 295 .del_vq = lg_del_vq,
269}; 296};
@@ -329,13 +356,14 @@ static void scan_devices(void)
329 struct lguest_device_desc *d; 356 struct lguest_device_desc *d;
330 357
331 /* We start at the page beginning, and skip over each entry. */ 358 /* We start at the page beginning, and skip over each entry. */
332 for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) { 359 for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
333 d = lguest_devices + i; 360 d = lguest_devices + i;
334 361
335 /* Once we hit a zero, stop. */ 362 /* Once we hit a zero, stop. */
336 if (d->type == 0) 363 if (d->type == 0)
337 break; 364 break;
338 365
366 printk("Device at %i has size %u\n", i, desc_size(d));
339 add_lguest_device(d); 367 add_lguest_device(d);
340 } 368 }
341} 369}
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 55d224c8a0b9..f234ba3f0404 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3114,6 +3114,7 @@ config VIRTIO_NET
3114 tristate "Virtio network driver (EXPERIMENTAL)" 3114 tristate "Virtio network driver (EXPERIMENTAL)"
3115 depends on EXPERIMENTAL && VIRTIO 3115 depends on EXPERIMENTAL && VIRTIO
3116 ---help--- 3116 ---help---
3117 This is the virtual network driver for lguest. Say Y or M. 3117 This is the virtual network driver for virtio. It can be used with
3118 lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
3118 3119
3119endif # NETDEVICES 3120endif # NETDEVICES
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5413dbf3d4ac..e66de0c12fc1 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -24,6 +24,13 @@
24#include <linux/virtio_net.h> 24#include <linux/virtio_net.h>
25#include <linux/scatterlist.h> 25#include <linux/scatterlist.h>
26 26
27static int napi_weight = 128;
28module_param(napi_weight, int, 0444);
29
30static int csum = 1, gso = 1;
31module_param(csum, bool, 0444);
32module_param(gso, bool, 0444);
33
27/* FIXME: MTU in config. */ 34/* FIXME: MTU in config. */
28#define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN) 35#define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN)
29 36
@@ -52,13 +59,14 @@ static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb)
52 sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); 59 sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr));
53} 60}
54 61
55static bool skb_xmit_done(struct virtqueue *rvq) 62static void skb_xmit_done(struct virtqueue *svq)
56{ 63{
57 struct virtnet_info *vi = rvq->vdev->priv; 64 struct virtnet_info *vi = svq->vdev->priv;
58 65
59 /* In case we were waiting for output buffers. */ 66 /* Suppress further interrupts. */
67 svq->vq_ops->disable_cb(svq);
68 /* We were waiting for more output buffers. */
60 netif_wake_queue(vi->dev); 69 netif_wake_queue(vi->dev);
61 return true;
62} 70}
63 71
64static void receive_skb(struct net_device *dev, struct sk_buff *skb, 72static void receive_skb(struct net_device *dev, struct sk_buff *skb,
@@ -83,28 +91,16 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
83 91
84 if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { 92 if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
85 pr_debug("Needs csum!\n"); 93 pr_debug("Needs csum!\n");
86 skb->ip_summed = CHECKSUM_PARTIAL; 94 if (!skb_partial_csum_set(skb,hdr->csum_start,hdr->csum_offset))
87 skb->csum_start = hdr->csum_start;
88 skb->csum_offset = hdr->csum_offset;
89 if (skb->csum_start > skb->len - 2
90 || skb->csum_offset > skb->len - 2) {
91 if (net_ratelimit())
92 printk(KERN_WARNING "%s: csum=%u/%u len=%u\n",
93 dev->name, skb->csum_start,
94 skb->csum_offset, skb->len);
95 goto frame_err; 95 goto frame_err;
96 }
97 } 96 }
98 97
99 if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { 98 if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
100 pr_debug("GSO!\n"); 99 pr_debug("GSO!\n");
101 switch (hdr->gso_type) { 100 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
102 case VIRTIO_NET_HDR_GSO_TCPV4: 101 case VIRTIO_NET_HDR_GSO_TCPV4:
103 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; 102 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
104 break; 103 break;
105 case VIRTIO_NET_HDR_GSO_TCPV4_ECN:
106 skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN;
107 break;
108 case VIRTIO_NET_HDR_GSO_UDP: 104 case VIRTIO_NET_HDR_GSO_UDP:
109 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 105 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
110 break; 106 break;
@@ -118,6 +114,9 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
118 goto frame_err; 114 goto frame_err;
119 } 115 }
120 116
117 if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
118 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
119
121 skb_shinfo(skb)->gso_size = hdr->gso_size; 120 skb_shinfo(skb)->gso_size = hdr->gso_size;
122 if (skb_shinfo(skb)->gso_size == 0) { 121 if (skb_shinfo(skb)->gso_size == 0) {
123 if (net_ratelimit()) 122 if (net_ratelimit())
@@ -170,12 +169,14 @@ static void try_fill_recv(struct virtnet_info *vi)
170 vi->rvq->vq_ops->kick(vi->rvq); 169 vi->rvq->vq_ops->kick(vi->rvq);
171} 170}
172 171
173static bool skb_recv_done(struct virtqueue *rvq) 172static void skb_recv_done(struct virtqueue *rvq)
174{ 173{
175 struct virtnet_info *vi = rvq->vdev->priv; 174 struct virtnet_info *vi = rvq->vdev->priv;
 176 netif_rx_schedule(vi->dev, &vi->napi); 175 /* Schedule NAPI, suppress further interrupts if successful. */
177 /* Suppress further interrupts. */ 176 if (netif_rx_schedule_prep(vi->dev, &vi->napi)) {
178 return false; 177 rvq->vq_ops->disable_cb(rvq);
178 __netif_rx_schedule(vi->dev, &vi->napi);
179 }
179} 180}
180 181
181static int virtnet_poll(struct napi_struct *napi, int budget) 182static int virtnet_poll(struct napi_struct *napi, int budget)
@@ -201,7 +202,7 @@ again:
201 /* Out of packets? */ 202 /* Out of packets? */
202 if (received < budget) { 203 if (received < budget) {
203 netif_rx_complete(vi->dev, napi); 204 netif_rx_complete(vi->dev, napi);
204 if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq)) 205 if (unlikely(!vi->rvq->vq_ops->enable_cb(vi->rvq))
205 && netif_rx_reschedule(vi->dev, napi)) 206 && netif_rx_reschedule(vi->dev, napi))
206 goto again; 207 goto again;
207 } 208 }
@@ -236,8 +237,6 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev)
236 237
237 pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest)); 238 pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest));
238 239
239 free_old_xmit_skbs(vi);
240
241 /* Encode metadata header at front. */ 240 /* Encode metadata header at front. */
242 hdr = skb_vnet_hdr(skb); 241 hdr = skb_vnet_hdr(skb);
243 if (skb->ip_summed == CHECKSUM_PARTIAL) { 242 if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -250,10 +249,9 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev)
250 } 249 }
251 250
252 if (skb_is_gso(skb)) { 251 if (skb_is_gso(skb)) {
252 hdr->hdr_len = skb_transport_header(skb) - skb->data;
253 hdr->gso_size = skb_shinfo(skb)->gso_size; 253 hdr->gso_size = skb_shinfo(skb)->gso_size;
254 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) 254 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
255 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN;
256 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
257 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; 255 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
258 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) 256 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
259 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; 257 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
@@ -261,19 +259,34 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev)
261 hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; 259 hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
262 else 260 else
263 BUG(); 261 BUG();
262 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
263 hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
264 } else { 264 } else {
265 hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; 265 hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
266 hdr->gso_size = 0; 266 hdr->gso_size = hdr->hdr_len = 0;
267 } 267 }
268 268
269 vnet_hdr_to_sg(sg, skb); 269 vnet_hdr_to_sg(sg, skb);
270 num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; 270 num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
271 __skb_queue_head(&vi->send, skb); 271 __skb_queue_head(&vi->send, skb);
272
273again:
274 /* Free up any pending old buffers before queueing new ones. */
275 free_old_xmit_skbs(vi);
272 err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); 276 err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb);
273 if (err) { 277 if (err) {
274 pr_debug("%s: virtio not prepared to send\n", dev->name); 278 pr_debug("%s: virtio not prepared to send\n", dev->name);
275 skb_unlink(skb, &vi->send);
276 netif_stop_queue(dev); 279 netif_stop_queue(dev);
280
281 /* Activate callback for using skbs: if this fails it
282 * means some were used in the meantime. */
283 if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
284 printk("Unlikely: restart svq failed\n");
285 netif_start_queue(dev);
286 goto again;
287 }
288 __skb_unlink(skb, &vi->send);
289
277 return NETDEV_TX_BUSY; 290 return NETDEV_TX_BUSY;
278 } 291 }
279 vi->svq->vq_ops->kick(vi->svq); 292 vi->svq->vq_ops->kick(vi->svq);
@@ -285,45 +298,31 @@ static int virtnet_open(struct net_device *dev)
285{ 298{
286 struct virtnet_info *vi = netdev_priv(dev); 299 struct virtnet_info *vi = netdev_priv(dev);
287 300
288 try_fill_recv(vi); 301 napi_enable(&vi->napi);
289 302
290 /* If we didn't even get one input buffer, we're useless. */ 303 /* If all buffers were filled by other side before we napi_enabled, we
291 if (vi->num == 0) 304 * won't get another interrupt, so process any outstanding packets
 292 return -ENOMEM; 305 * now. virtnet_poll wants to re-enable the queue, so we disable here. */
306 vi->rvq->vq_ops->disable_cb(vi->rvq);
307 netif_rx_schedule(vi->dev, &vi->napi);
293 308
294 napi_enable(&vi->napi);
295 return 0; 309 return 0;
296} 310}
297 311
298static int virtnet_close(struct net_device *dev) 312static int virtnet_close(struct net_device *dev)
299{ 313{
300 struct virtnet_info *vi = netdev_priv(dev); 314 struct virtnet_info *vi = netdev_priv(dev);
301 struct sk_buff *skb;
302 315
303 napi_disable(&vi->napi); 316 napi_disable(&vi->napi);
304 317
305 /* networking core has neutered skb_xmit_done/skb_recv_done, so don't
306 * worry about races vs. get(). */
307 vi->rvq->vq_ops->shutdown(vi->rvq);
308 while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
309 kfree_skb(skb);
310 vi->num--;
311 }
312 vi->svq->vq_ops->shutdown(vi->svq);
313 while ((skb = __skb_dequeue(&vi->send)) != NULL)
314 kfree_skb(skb);
315
316 BUG_ON(vi->num != 0);
317 return 0; 318 return 0;
318} 319}
319 320
320static int virtnet_probe(struct virtio_device *vdev) 321static int virtnet_probe(struct virtio_device *vdev)
321{ 322{
322 int err; 323 int err;
323 unsigned int len;
324 struct net_device *dev; 324 struct net_device *dev;
325 struct virtnet_info *vi; 325 struct virtnet_info *vi;
326 void *token;
327 326
328 /* Allocate ourselves a network device with room for our info */ 327 /* Allocate ourselves a network device with room for our info */
329 dev = alloc_etherdev(sizeof(struct virtnet_info)); 328 dev = alloc_etherdev(sizeof(struct virtnet_info));
@@ -331,7 +330,6 @@ static int virtnet_probe(struct virtio_device *vdev)
331 return -ENOMEM; 330 return -ENOMEM;
332 331
333 /* Set up network device as normal. */ 332 /* Set up network device as normal. */
334 ether_setup(dev);
335 dev->open = virtnet_open; 333 dev->open = virtnet_open;
336 dev->stop = virtnet_close; 334 dev->stop = virtnet_close;
337 dev->hard_start_xmit = start_xmit; 335 dev->hard_start_xmit = start_xmit;
@@ -339,42 +337,37 @@ static int virtnet_probe(struct virtio_device *vdev)
339 SET_NETDEV_DEV(dev, &vdev->dev); 337 SET_NETDEV_DEV(dev, &vdev->dev);
340 338
341 /* Do we support "hardware" checksums? */ 339 /* Do we support "hardware" checksums? */
342 token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len); 340 if (csum && vdev->config->feature(vdev, VIRTIO_NET_F_CSUM)) {
343 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) {
344 /* This opens up the world of extra features. */ 341 /* This opens up the world of extra features. */
345 dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; 342 dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
346 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4)) 343 if (gso && vdev->config->feature(vdev, VIRTIO_NET_F_GSO)) {
347 dev->features |= NETIF_F_TSO; 344 dev->features |= NETIF_F_TSO | NETIF_F_UFO
348 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO)) 345 | NETIF_F_TSO_ECN | NETIF_F_TSO6;
349 dev->features |= NETIF_F_UFO; 346 }
350 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN))
351 dev->features |= NETIF_F_TSO_ECN;
352 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6))
353 dev->features |= NETIF_F_TSO6;
354 } 347 }
355 348
356 /* Configuration may specify what MAC to use. Otherwise random. */ 349 /* Configuration may specify what MAC to use. Otherwise random. */
357 token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len); 350 if (vdev->config->feature(vdev, VIRTIO_NET_F_MAC)) {
358 if (token) { 351 vdev->config->get(vdev,
359 dev->addr_len = len; 352 offsetof(struct virtio_net_config, mac),
360 vdev->config->get(vdev, token, dev->dev_addr, len); 353 dev->dev_addr, dev->addr_len);
361 } else 354 } else
362 random_ether_addr(dev->dev_addr); 355 random_ether_addr(dev->dev_addr);
363 356
364 /* Set up our device-specific information */ 357 /* Set up our device-specific information */
365 vi = netdev_priv(dev); 358 vi = netdev_priv(dev);
366 netif_napi_add(dev, &vi->napi, virtnet_poll, 16); 359 netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
367 vi->dev = dev; 360 vi->dev = dev;
368 vi->vdev = vdev; 361 vi->vdev = vdev;
369 362
370 /* We expect two virtqueues, receive then send. */ 363 /* We expect two virtqueues, receive then send. */
371 vi->rvq = vdev->config->find_vq(vdev, skb_recv_done); 364 vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done);
372 if (IS_ERR(vi->rvq)) { 365 if (IS_ERR(vi->rvq)) {
373 err = PTR_ERR(vi->rvq); 366 err = PTR_ERR(vi->rvq);
374 goto free; 367 goto free;
375 } 368 }
376 369
377 vi->svq = vdev->config->find_vq(vdev, skb_xmit_done); 370 vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done);
378 if (IS_ERR(vi->svq)) { 371 if (IS_ERR(vi->svq)) {
379 err = PTR_ERR(vi->svq); 372 err = PTR_ERR(vi->svq);
380 goto free_recv; 373 goto free_recv;
@@ -389,10 +382,22 @@ static int virtnet_probe(struct virtio_device *vdev)
389 pr_debug("virtio_net: registering device failed\n"); 382 pr_debug("virtio_net: registering device failed\n");
390 goto free_send; 383 goto free_send;
391 } 384 }
385
386 /* Last of all, set up some receive buffers. */
387 try_fill_recv(vi);
388
389 /* If we didn't even get one input buffer, we're useless. */
390 if (vi->num == 0) {
391 err = -ENOMEM;
392 goto unregister;
393 }
394
392 pr_debug("virtnet: registered device %s\n", dev->name); 395 pr_debug("virtnet: registered device %s\n", dev->name);
393 vdev->priv = vi; 396 vdev->priv = vi;
394 return 0; 397 return 0;
395 398
399unregister:
400 unregister_netdev(dev);
396free_send: 401free_send:
397 vdev->config->del_vq(vi->svq); 402 vdev->config->del_vq(vi->svq);
398free_recv: 403free_recv:
@@ -405,6 +410,20 @@ free:
405static void virtnet_remove(struct virtio_device *vdev) 410static void virtnet_remove(struct virtio_device *vdev)
406{ 411{
407 struct virtnet_info *vi = vdev->priv; 412 struct virtnet_info *vi = vdev->priv;
413 struct sk_buff *skb;
414
415 /* Stop all the virtqueues. */
416 vdev->config->reset(vdev);
417
418 /* Free our skbs in send and recv queues, if any. */
419 while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
420 kfree_skb(skb);
421 vi->num--;
422 }
423 while ((skb = __skb_dequeue(&vi->send)) != NULL)
424 kfree_skb(skb);
425
426 BUG_ON(vi->num != 0);
408 427
409 vdev->config->del_vq(vi->svq); 428 vdev->config->del_vq(vi->svq);
410 vdev->config->del_vq(vi->rvq); 429 vdev->config->del_vq(vi->rvq);
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 9e33fc4da875..3dd6294d10b6 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -1,8 +1,35 @@
1# Virtio always gets selected by whoever wants it. 1# Virtio always gets selected by whoever wants it.
2config VIRTIO 2config VIRTIO
3 bool 3 tristate
4 4
5# Similarly the virtio ring implementation. 5# Similarly the virtio ring implementation.
6config VIRTIO_RING 6config VIRTIO_RING
7 bool 7 tristate
8 depends on VIRTIO 8 depends on VIRTIO
9
10config VIRTIO_PCI
11 tristate "PCI driver for virtio devices (EXPERIMENTAL)"
12 depends on PCI && EXPERIMENTAL
13 select VIRTIO
14 select VIRTIO_RING
15 ---help---
 16 This driver provides support for virtio based paravirtual device
17 drivers over PCI. This requires that your VMM has appropriate PCI
18 virtio backends. Most QEMU based VMMs should support these devices
19 (like KVM or Xen).
20
21 Currently, the ABI is not considered stable so there is no guarantee
22 that this version of the driver will work with your VMM.
23
24 If unsure, say M.
25
26config VIRTIO_BALLOON
27 tristate "Virtio balloon driver (EXPERIMENTAL)"
28 select VIRTIO
29 select VIRTIO_RING
30 ---help---
31 This driver supports increasing and decreasing the amount
32 of memory within a KVM guest.
33
34 If unsure, say M.
35
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index f70e40971dd9..6738c446c199 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,2 +1,4 @@
1obj-$(CONFIG_VIRTIO) += virtio.o 1obj-$(CONFIG_VIRTIO) += virtio.o
2obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o 2obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
3obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
4obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 69d7ea02cd48..b535483bc556 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -102,9 +102,13 @@ static int virtio_dev_remove(struct device *_d)
102 struct virtio_driver *drv = container_of(dev->dev.driver, 102 struct virtio_driver *drv = container_of(dev->dev.driver,
103 struct virtio_driver, driver); 103 struct virtio_driver, driver);
104 104
105 dev->config->set_status(dev, dev->config->get_status(dev)
106 & ~VIRTIO_CONFIG_S_DRIVER);
107 drv->remove(dev); 105 drv->remove(dev);
106
107 /* Driver should have reset device. */
108 BUG_ON(dev->config->get_status(dev));
109
110 /* Acknowledge the device's existence again. */
111 add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
108 return 0; 112 return 0;
109} 113}
110 114
@@ -130,6 +134,10 @@ int register_virtio_device(struct virtio_device *dev)
130 dev->dev.bus = &virtio_bus; 134 dev->dev.bus = &virtio_bus;
131 sprintf(dev->dev.bus_id, "%u", dev->index); 135 sprintf(dev->dev.bus_id, "%u", dev->index);
132 136
137 /* We always start by resetting the device, in case a previous
138 * driver messed it up. This also tests that code path a little. */
139 dev->config->reset(dev);
140
133 /* Acknowledge that we've seen the device. */ 141 /* Acknowledge that we've seen the device. */
134 add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); 142 add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
135 143
@@ -148,55 +156,18 @@ void unregister_virtio_device(struct virtio_device *dev)
148} 156}
149EXPORT_SYMBOL_GPL(unregister_virtio_device); 157EXPORT_SYMBOL_GPL(unregister_virtio_device);
150 158
151int __virtio_config_val(struct virtio_device *vdev,
152 u8 type, void *val, size_t size)
153{
154 void *token;
155 unsigned int len;
156
157 token = vdev->config->find(vdev, type, &len);
158 if (!token)
159 return -ENOENT;
160
161 if (len != size)
162 return -EIO;
163
164 vdev->config->get(vdev, token, val, size);
165 return 0;
166}
167EXPORT_SYMBOL_GPL(__virtio_config_val);
168
169int virtio_use_bit(struct virtio_device *vdev,
170 void *token, unsigned int len, unsigned int bitnum)
171{
172 unsigned long bits[16];
173
174 /* This makes it convenient to pass-through find() results. */
175 if (!token)
176 return 0;
177
178 /* bit not in range of this bitfield? */
179 if (bitnum * 8 >= len / 2)
180 return 0;
181
182 /* Giant feature bitfields are silly. */
183 BUG_ON(len > sizeof(bits));
184 vdev->config->get(vdev, token, bits, len);
185
186 if (!test_bit(bitnum, bits))
187 return 0;
188
189 /* Set acknowledge bit, and write it back. */
190 set_bit(bitnum + len * 8 / 2, bits);
191 vdev->config->set(vdev, token, bits, len);
192 return 1;
193}
194EXPORT_SYMBOL_GPL(virtio_use_bit);
195
196static int virtio_init(void) 159static int virtio_init(void)
197{ 160{
198 if (bus_register(&virtio_bus) != 0) 161 if (bus_register(&virtio_bus) != 0)
199 panic("virtio bus registration failed"); 162 panic("virtio bus registration failed");
200 return 0; 163 return 0;
201} 164}
165
166static void __exit virtio_exit(void)
167{
168 bus_unregister(&virtio_bus);
169}
202core_initcall(virtio_init); 170core_initcall(virtio_init);
171module_exit(virtio_exit);
172
173MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
new file mode 100644
index 000000000000..622aece1acce
--- /dev/null
+++ b/drivers/virtio/virtio_balloon.c
@@ -0,0 +1,284 @@
 1/* Virtio balloon implementation, inspired by Dor Laor and Marcelo
2 * Tosatti's implementations.
3 *
4 * Copyright 2008 Rusty Russell IBM Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20//#define DEBUG
21#include <linux/virtio.h>
22#include <linux/virtio_balloon.h>
23#include <linux/swap.h>
24#include <linux/kthread.h>
25#include <linux/freezer.h>
26
27struct virtio_balloon
28{
29 struct virtio_device *vdev;
30 struct virtqueue *inflate_vq, *deflate_vq;
31
32 /* Where the ballooning thread waits for config to change. */
33 wait_queue_head_t config_change;
34
35 /* The thread servicing the balloon. */
36 struct task_struct *thread;
37
38 /* Waiting for host to ack the pages we released. */
39 struct completion acked;
40
41 /* Do we have to tell Host *before* we reuse pages? */
42 bool tell_host_first;
43
44 /* The pages we've told the Host we're not using. */
45 unsigned int num_pages;
46 struct list_head pages;
47
48 /* The array of pfns we tell the Host about. */
49 unsigned int num_pfns;
50 u32 pfns[256];
51};
52
53static struct virtio_device_id id_table[] = {
54 { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID },
55 { 0 },
56};
57
58static void balloon_ack(struct virtqueue *vq)
59{
60 struct virtio_balloon *vb;
61 unsigned int len;
62
63 vb = vq->vq_ops->get_buf(vq, &len);
64 if (vb)
65 complete(&vb->acked);
66}
67
68static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
69{
70 struct scatterlist sg;
71
72 sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
73
74 init_completion(&vb->acked);
75
76 /* We should always be able to add one buffer to an empty queue. */
77 if (vq->vq_ops->add_buf(vq, &sg, 1, 0, vb) != 0)
78 BUG();
79 vq->vq_ops->kick(vq);
80
81 /* When host has read buffer, this completes via balloon_ack */
82 wait_for_completion(&vb->acked);
83}
84
85static void fill_balloon(struct virtio_balloon *vb, size_t num)
86{
87 /* We can only do one array worth at a time. */
88 num = min(num, ARRAY_SIZE(vb->pfns));
89
90 for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns++) {
91 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY);
92 if (!page) {
93 if (printk_ratelimit())
94 dev_printk(KERN_INFO, &vb->vdev->dev,
95 "Out of puff! Can't get %zu pages\n",
96 num);
97 /* Sleep for at least 1/5 of a second before retry. */
98 msleep(200);
99 break;
100 }
101 vb->pfns[vb->num_pfns] = page_to_pfn(page);
102 totalram_pages--;
103 vb->num_pages++;
104 list_add(&page->lru, &vb->pages);
105 }
106
107 /* Didn't get any? Oh well. */
108 if (vb->num_pfns == 0)
109 return;
110
111 tell_host(vb, vb->inflate_vq);
112}
113
114static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
115{
116 unsigned int i;
117
118 for (i = 0; i < num; i++) {
119 __free_page(pfn_to_page(pfns[i]));
120 totalram_pages++;
121 }
122}
123
124static void leak_balloon(struct virtio_balloon *vb, size_t num)
125{
126 struct page *page;
127
128 /* We can only do one array worth at a time. */
129 num = min(num, ARRAY_SIZE(vb->pfns));
130
131 for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns++) {
132 page = list_first_entry(&vb->pages, struct page, lru);
133 list_del(&page->lru);
134 vb->pfns[vb->num_pfns] = page_to_pfn(page);
135 vb->num_pages--;
136 }
137
138 if (vb->tell_host_first) {
139 tell_host(vb, vb->deflate_vq);
140 release_pages_by_pfn(vb->pfns, vb->num_pfns);
141 } else {
142 release_pages_by_pfn(vb->pfns, vb->num_pfns);
143 tell_host(vb, vb->deflate_vq);
144 }
145}
146
147static void virtballoon_changed(struct virtio_device *vdev)
148{
149 struct virtio_balloon *vb = vdev->priv;
150
151 wake_up(&vb->config_change);
152}
153
154static inline int towards_target(struct virtio_balloon *vb)
155{
156 u32 v;
157 __virtio_config_val(vb->vdev,
158 offsetof(struct virtio_balloon_config, num_pages),
159 &v);
160 return v - vb->num_pages;
161}
162
163static void update_balloon_size(struct virtio_balloon *vb)
164{
165 __le32 actual = cpu_to_le32(vb->num_pages);
166
167 vb->vdev->config->set(vb->vdev,
168 offsetof(struct virtio_balloon_config, actual),
169 &actual, sizeof(actual));
170}
171
172static int balloon(void *_vballoon)
173{
174 struct virtio_balloon *vb = _vballoon;
175
176 set_freezable();
177 while (!kthread_should_stop()) {
178 int diff;
179
180 try_to_freeze();
181 wait_event_interruptible(vb->config_change,
182 (diff = towards_target(vb)) != 0
183 || kthread_should_stop());
184 if (diff > 0)
185 fill_balloon(vb, diff);
186 else if (diff < 0)
187 leak_balloon(vb, -diff);
188 update_balloon_size(vb);
189 }
190 return 0;
191}
192
193static int virtballoon_probe(struct virtio_device *vdev)
194{
195 struct virtio_balloon *vb;
196 int err;
197
198 vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
199 if (!vb) {
200 err = -ENOMEM;
201 goto out;
202 }
203
204 INIT_LIST_HEAD(&vb->pages);
205 vb->num_pages = 0;
206 init_waitqueue_head(&vb->config_change);
207 vb->vdev = vdev;
208
209 /* We expect two virtqueues. */
210 vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack);
211 if (IS_ERR(vb->inflate_vq)) {
212 err = PTR_ERR(vb->inflate_vq);
213 goto out_free_vb;
214 }
215
216 vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack);
217 if (IS_ERR(vb->deflate_vq)) {
218 err = PTR_ERR(vb->deflate_vq);
219 goto out_del_inflate_vq;
220 }
221
222 vb->thread = kthread_run(balloon, vb, "vballoon");
223 if (IS_ERR(vb->thread)) {
224 err = PTR_ERR(vb->thread);
225 goto out_del_deflate_vq;
226 }
227
228 vb->tell_host_first
229 = vdev->config->feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
230
231 return 0;
232
233out_del_deflate_vq:
234 vdev->config->del_vq(vb->deflate_vq);
235out_del_inflate_vq:
236 vdev->config->del_vq(vb->inflate_vq);
237out_free_vb:
238 kfree(vb);
239out:
240 return err;
241}
242
243static void virtballoon_remove(struct virtio_device *vdev)
244{
245 struct virtio_balloon *vb = vdev->priv;
246
247 kthread_stop(vb->thread);
248
249 /* There might be pages left in the balloon: free them. */
250 while (vb->num_pages)
251 leak_balloon(vb, vb->num_pages);
252
253 /* Now we reset the device so we can clean up the queues. */
254 vdev->config->reset(vdev);
255
256 vdev->config->del_vq(vb->deflate_vq);
257 vdev->config->del_vq(vb->inflate_vq);
258 kfree(vb);
259}
260
261static struct virtio_driver virtio_balloon = {
262 .driver.name = KBUILD_MODNAME,
263 .driver.owner = THIS_MODULE,
264 .id_table = id_table,
265 .probe = virtballoon_probe,
266 .remove = __devexit_p(virtballoon_remove),
267 .config_changed = virtballoon_changed,
268};
269
270static int __init init(void)
271{
272 return register_virtio_driver(&virtio_balloon);
273}
274
275static void __exit fini(void)
276{
277 unregister_virtio_driver(&virtio_balloon);
278}
279module_init(init);
280module_exit(fini);
281
282MODULE_DEVICE_TABLE(virtio, id_table);
283MODULE_DESCRIPTION("Virtio balloon driver");
284MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
new file mode 100644
index 000000000000..26f787ddd5ff
--- /dev/null
+++ b/drivers/virtio/virtio_pci.c
@@ -0,0 +1,446 @@
1/*
2 * Virtio PCI driver
3 *
4 * This module allows virtio devices to be used over a virtual PCI device.
5 * This can be used with QEMU based VMMs like KVM or Xen.
6 *
7 * Copyright IBM Corp. 2007
8 *
9 * Authors:
10 * Anthony Liguori <aliguori@us.ibm.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2 or later.
13 * See the COPYING file in the top-level directory.
14 *
15 */
16
17#include <linux/module.h>
18#include <linux/list.h>
19#include <linux/pci.h>
20#include <linux/interrupt.h>
21#include <linux/virtio.h>
22#include <linux/virtio_config.h>
23#include <linux/virtio_ring.h>
24#include <linux/virtio_pci.h>
25#include <linux/highmem.h>
26#include <linux/spinlock.h>
27
28MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>");
29MODULE_DESCRIPTION("virtio-pci");
30MODULE_LICENSE("GPL");
31MODULE_VERSION("1");
32
33/* Our device structure */
34struct virtio_pci_device
35{
36 struct virtio_device vdev;
37 struct pci_dev *pci_dev;
38
39 /* the IO mapping for the PCI config space */
40 void *ioaddr;
41
42 /* a list of queues so we can dispatch IRQs */
43 spinlock_t lock;
44 struct list_head virtqueues;
45};
46
47struct virtio_pci_vq_info
48{
49 /* the actual virtqueue */
50 struct virtqueue *vq;
51
52 /* the number of entries in the queue */
53 int num;
54
55 /* the index of the queue */
56 int queue_index;
57
58 /* the virtual address of the ring queue */
59 void *queue;
60
61 /* the list node for the virtqueues list */
62 struct list_head node;
63};
64
65/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */
66static struct pci_device_id virtio_pci_id_table[] = {
67 { 0x1af4, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
68 { 0 },
69};
70
71MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);
72
 73/* A PCI device has its own struct device and so does a virtio device so
 74 * we create a place for the virtio devices to show up in sysfs. I think it
 75 * would make more sense for virtio to not insist on having its own device. */
76static struct device virtio_pci_root = {
77 .parent = NULL,
78 .bus_id = "virtio-pci",
79};
80
81/* Unique numbering for devices under the kvm root */
82static unsigned int dev_index;
83
84/* Convert a generic virtio device to our structure */
85static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
86{
87 return container_of(vdev, struct virtio_pci_device, vdev);
88}
89
90/* virtio config->feature() implementation */
91static bool vp_feature(struct virtio_device *vdev, unsigned bit)
92{
93 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
94 u32 mask;
95
96 /* Since this function is supposed to have the side effect of
97 * enabling a queried feature, we simulate that by doing a read
98 * from the host feature bitmask and then writing to the guest
99 * feature bitmask */
100 mask = ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES);
101 if (mask & (1 << bit)) {
102 mask |= (1 << bit);
103 iowrite32(mask, vp_dev->ioaddr + VIRTIO_PCI_GUEST_FEATURES);
104 }
105
106 return !!(mask & (1 << bit));
107}
108
109/* virtio config->get() implementation */
110static void vp_get(struct virtio_device *vdev, unsigned offset,
111 void *buf, unsigned len)
112{
113 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
114 void *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset;
115 u8 *ptr = buf;
116 int i;
117
118 for (i = 0; i < len; i++)
119 ptr[i] = ioread8(ioaddr + i);
120}
121
122/* the config->set() implementation. it's symmetric to the config->get()
123 * implementation */
124static void vp_set(struct virtio_device *vdev, unsigned offset,
125 const void *buf, unsigned len)
126{
127 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
128 void *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset;
129 const u8 *ptr = buf;
130 int i;
131
132 for (i = 0; i < len; i++)
133 iowrite8(ptr[i], ioaddr + i);
134}
135
136/* config->{get,set}_status() implementations */
137static u8 vp_get_status(struct virtio_device *vdev)
138{
139 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
140 return ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS);
141}
142
143static void vp_set_status(struct virtio_device *vdev, u8 status)
144{
145 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
146 /* We should never be setting status to 0. */
147 BUG_ON(status == 0);
148 return iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
149}
150
151static void vp_reset(struct virtio_device *vdev)
152{
153 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
154 /* 0 status means a reset. */
155 return iowrite8(0, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
156}
157
158/* the notify function used when creating a virt queue */
159static void vp_notify(struct virtqueue *vq)
160{
161 struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
162 struct virtio_pci_vq_info *info = vq->priv;
163
164 /* we write the queue's selector into the notification register to
165 * signal the other end */
166 iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
167}
168
169/* A small wrapper to also acknowledge the interrupt when it's handled.
170 * I really need an EIO hook for the vring so I can ack the interrupt once we
171 * know that we'll be handling the IRQ but before we invoke the callback since
172 * the callback may notify the host which results in the host attempting to
173 * raise an interrupt that we would then mask once we acknowledged the
174 * interrupt. */
175static irqreturn_t vp_interrupt(int irq, void *opaque)
176{
177 struct virtio_pci_device *vp_dev = opaque;
178 struct virtio_pci_vq_info *info;
179 irqreturn_t ret = IRQ_NONE;
180 u8 isr;
181
182 /* reading the ISR has the effect of also clearing it so it's very
183 * important to save off the value. */
184 isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
185
186 /* It's definitely not us if the ISR was not high */
187 if (!isr)
188 return IRQ_NONE;
189
190 /* Configuration change? Tell driver if it wants to know. */
191 if (isr & VIRTIO_PCI_ISR_CONFIG) {
192 struct virtio_driver *drv;
193 drv = container_of(vp_dev->vdev.dev.driver,
194 struct virtio_driver, driver);
195
196 if (drv->config_changed)
197 drv->config_changed(&vp_dev->vdev);
198 }
199
200 spin_lock(&vp_dev->lock);
201 list_for_each_entry(info, &vp_dev->virtqueues, node) {
202 if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
203 ret = IRQ_HANDLED;
204 }
205 spin_unlock(&vp_dev->lock);
206
207 return ret;
208}
209
210/* the config->find_vq() implementation */
211static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
212 void (*callback)(struct virtqueue *vq))
213{
214 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
215 struct virtio_pci_vq_info *info;
216 struct virtqueue *vq;
217 u16 num;
218 int err;
219
220 /* Select the queue we're interested in */
221 iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
222
223 /* Check if queue is either not available or already active. */
224 num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
225 if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
226 return ERR_PTR(-ENOENT);
227
 228 /* allocate and fill out our structure that represents an active
229 * queue */
230 info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL);
231 if (!info)
232 return ERR_PTR(-ENOMEM);
233
234 info->queue_index = index;
235 info->num = num;
236
237 info->queue = kzalloc(PAGE_ALIGN(vring_size(num,PAGE_SIZE)), GFP_KERNEL);
238 if (info->queue == NULL) {
239 err = -ENOMEM;
240 goto out_info;
241 }
242
243 /* activate the queue */
244 iowrite32(virt_to_phys(info->queue) >> PAGE_SHIFT,
245 vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
246
247 /* create the vring */
248 vq = vring_new_virtqueue(info->num, vdev, info->queue,
249 vp_notify, callback);
250 if (!vq) {
251 err = -ENOMEM;
252 goto out_activate_queue;
253 }
254
255 vq->priv = info;
256 info->vq = vq;
257
258 spin_lock(&vp_dev->lock);
259 list_add(&info->node, &vp_dev->virtqueues);
260 spin_unlock(&vp_dev->lock);
261
262 return vq;
263
264out_activate_queue:
265 iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
266 kfree(info->queue);
267out_info:
268 kfree(info);
269 return ERR_PTR(err);
270}
271
272/* the config->del_vq() implementation */
273static void vp_del_vq(struct virtqueue *vq)
274{
275 struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
276 struct virtio_pci_vq_info *info = vq->priv;
277
278 spin_lock(&vp_dev->lock);
279 list_del(&info->node);
280 spin_unlock(&vp_dev->lock);
281
282 vring_del_virtqueue(vq);
283
284 /* Select and deactivate the queue */
285 iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
286 iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
287
288 kfree(info->queue);
289 kfree(info);
290}
291
292static struct virtio_config_ops virtio_pci_config_ops = {
293 .feature = vp_feature,
294 .get = vp_get,
295 .set = vp_set,
296 .get_status = vp_get_status,
297 .set_status = vp_set_status,
298 .reset = vp_reset,
299 .find_vq = vp_find_vq,
300 .del_vq = vp_del_vq,
301};
302
303/* the PCI probing function */
304static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
305 const struct pci_device_id *id)
306{
307 struct virtio_pci_device *vp_dev;
308 int err;
309
310 /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
311 if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
312 return -ENODEV;
313
314 if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) {
315 printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n",
316 VIRTIO_PCI_ABI_VERSION, pci_dev->revision);
317 return -ENODEV;
318 }
319
320 /* allocate our structure and fill it out */
321 vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
322 if (vp_dev == NULL)
323 return -ENOMEM;
324
325 snprintf(vp_dev->vdev.dev.bus_id, BUS_ID_SIZE, "virtio%d", dev_index);
326 vp_dev->vdev.index = dev_index;
327 dev_index++;
328
329 vp_dev->vdev.dev.parent = &virtio_pci_root;
330 vp_dev->vdev.config = &virtio_pci_config_ops;
331 vp_dev->pci_dev = pci_dev;
332 INIT_LIST_HEAD(&vp_dev->virtqueues);
333 spin_lock_init(&vp_dev->lock);
334
335 /* enable the device */
336 err = pci_enable_device(pci_dev);
337 if (err)
338 goto out;
339
340 err = pci_request_regions(pci_dev, "virtio-pci");
341 if (err)
342 goto out_enable_device;
343
344 vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
345 if (vp_dev->ioaddr == NULL)
346 goto out_req_regions;
347
348 pci_set_drvdata(pci_dev, vp_dev);
349
350 /* we use the subsystem vendor/device id as the virtio vendor/device
351 * id. this allows us to use the same PCI vendor/device id for all
352 * virtio devices and to identify the particular virtio driver by
 353 * the subsystem ids */
354 vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
355 vp_dev->vdev.id.device = pci_dev->subsystem_device;
356
357 /* register a handler for the queue with the PCI device's interrupt */
358 err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED,
359 vp_dev->vdev.dev.bus_id, vp_dev);
360 if (err)
361 goto out_set_drvdata;
362
363 /* finally register the virtio device */
364 err = register_virtio_device(&vp_dev->vdev);
365 if (err)
366 goto out_req_irq;
367
368 return 0;
369
370out_req_irq:
371 free_irq(pci_dev->irq, vp_dev);
372out_set_drvdata:
373 pci_set_drvdata(pci_dev, NULL);
374 pci_iounmap(pci_dev, vp_dev->ioaddr);
375out_req_regions:
376 pci_release_regions(pci_dev);
377out_enable_device:
378 pci_disable_device(pci_dev);
379out:
380 kfree(vp_dev);
381 return err;
382}
383
384static void __devexit virtio_pci_remove(struct pci_dev *pci_dev)
385{
386 struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
387
388 free_irq(pci_dev->irq, vp_dev);
389 pci_set_drvdata(pci_dev, NULL);
390 pci_iounmap(pci_dev, vp_dev->ioaddr);
391 pci_release_regions(pci_dev);
392 pci_disable_device(pci_dev);
393 kfree(vp_dev);
394}
395
396#ifdef CONFIG_PM
397static int virtio_pci_suspend(struct pci_dev *pci_dev, pm_message_t state)
398{
399 pci_save_state(pci_dev);
400 pci_set_power_state(pci_dev, PCI_D3hot);
401 return 0;
402}
403
404static int virtio_pci_resume(struct pci_dev *pci_dev)
405{
406 pci_restore_state(pci_dev);
407 pci_set_power_state(pci_dev, PCI_D0);
408 return 0;
409}
410#endif
411
412static struct pci_driver virtio_pci_driver = {
413 .name = "virtio-pci",
414 .id_table = virtio_pci_id_table,
415 .probe = virtio_pci_probe,
416 .remove = virtio_pci_remove,
417#ifdef CONFIG_PM
418 .suspend = virtio_pci_suspend,
419 .resume = virtio_pci_resume,
420#endif
421};
422
423static int __init virtio_pci_init(void)
424{
425 int err;
426
427 err = device_register(&virtio_pci_root);
428 if (err)
429 return err;
430
431 err = pci_register_driver(&virtio_pci_driver);
432 if (err)
433 device_unregister(&virtio_pci_root);
434
435 return err;
436}
437
438module_init(virtio_pci_init);
439
440static void __exit virtio_pci_exit(void)
441{
442 device_unregister(&virtio_pci_root);
443 pci_unregister_driver(&virtio_pci_driver);
444}
445
446module_exit(virtio_pci_exit);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 1dc04b6684e6..3a28c1382131 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -87,6 +87,8 @@ static int vring_add_buf(struct virtqueue *_vq,
87 if (vq->num_free < out + in) { 87 if (vq->num_free < out + in) {
88 pr_debug("Can't add buf len %i - avail = %i\n", 88 pr_debug("Can't add buf len %i - avail = %i\n",
89 out + in, vq->num_free); 89 out + in, vq->num_free);
90 /* We notify *even if* VRING_USED_F_NO_NOTIFY is set here. */
91 vq->notify(&vq->vq);
90 END_USE(vq); 92 END_USE(vq);
91 return -ENOSPC; 93 return -ENOSPC;
92 } 94 }
@@ -97,16 +99,14 @@ static int vring_add_buf(struct virtqueue *_vq,
97 head = vq->free_head; 99 head = vq->free_head;
98 for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { 100 for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
99 vq->vring.desc[i].flags = VRING_DESC_F_NEXT; 101 vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
100 vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) 102 vq->vring.desc[i].addr = sg_phys(sg);
101 + sg->offset;
102 vq->vring.desc[i].len = sg->length; 103 vq->vring.desc[i].len = sg->length;
103 prev = i; 104 prev = i;
104 sg++; 105 sg++;
105 } 106 }
106 for (; in; i = vq->vring.desc[i].next, in--) { 107 for (; in; i = vq->vring.desc[i].next, in--) {
107 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; 108 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
108 vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) 109 vq->vring.desc[i].addr = sg_phys(sg);
109 + sg->offset;
110 vq->vring.desc[i].len = sg->length; 110 vq->vring.desc[i].len = sg->length;
111 prev = i; 111 prev = i;
112 sg++; 112 sg++;
@@ -171,16 +171,6 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
171 vq->num_free++; 171 vq->num_free++;
172} 172}
173 173
174/* FIXME: We need to tell other side about removal, to synchronize. */
175static void vring_shutdown(struct virtqueue *_vq)
176{
177 struct vring_virtqueue *vq = to_vvq(_vq);
178 unsigned int i;
179
180 for (i = 0; i < vq->vring.num; i++)
181 detach_buf(vq, i);
182}
183
184static inline bool more_used(const struct vring_virtqueue *vq) 174static inline bool more_used(const struct vring_virtqueue *vq)
185{ 175{
186 return vq->last_used_idx != vq->vring.used->idx; 176 return vq->last_used_idx != vq->vring.used->idx;
@@ -220,7 +210,17 @@ static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
220 return ret; 210 return ret;
221} 211}
222 212
223static bool vring_restart(struct virtqueue *_vq) 213static void vring_disable_cb(struct virtqueue *_vq)
214{
215 struct vring_virtqueue *vq = to_vvq(_vq);
216
217 START_USE(vq);
218 BUG_ON(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
219 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
220 END_USE(vq);
221}
222
223static bool vring_enable_cb(struct virtqueue *_vq)
224{ 224{
225 struct vring_virtqueue *vq = to_vvq(_vq); 225 struct vring_virtqueue *vq = to_vvq(_vq);
226 226
@@ -253,26 +253,34 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
253 if (unlikely(vq->broken)) 253 if (unlikely(vq->broken))
254 return IRQ_HANDLED; 254 return IRQ_HANDLED;
255 255
256 /* Other side may have missed us turning off the interrupt,
257 * but we should preserve disable semantic for virtio users. */
258 if (unlikely(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
259 pr_debug("virtqueue interrupt after disable for %p\n", vq);
260 return IRQ_HANDLED;
261 }
262
256 pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); 263 pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
257 if (vq->vq.callback && !vq->vq.callback(&vq->vq)) 264 if (vq->vq.callback)
258 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; 265 vq->vq.callback(&vq->vq);
259 266
260 return IRQ_HANDLED; 267 return IRQ_HANDLED;
261} 268}
269EXPORT_SYMBOL_GPL(vring_interrupt);
262 270
263static struct virtqueue_ops vring_vq_ops = { 271static struct virtqueue_ops vring_vq_ops = {
264 .add_buf = vring_add_buf, 272 .add_buf = vring_add_buf,
265 .get_buf = vring_get_buf, 273 .get_buf = vring_get_buf,
266 .kick = vring_kick, 274 .kick = vring_kick,
267 .restart = vring_restart, 275 .disable_cb = vring_disable_cb,
268 .shutdown = vring_shutdown, 276 .enable_cb = vring_enable_cb,
269}; 277};
270 278
271struct virtqueue *vring_new_virtqueue(unsigned int num, 279struct virtqueue *vring_new_virtqueue(unsigned int num,
272 struct virtio_device *vdev, 280 struct virtio_device *vdev,
273 void *pages, 281 void *pages,
274 void (*notify)(struct virtqueue *), 282 void (*notify)(struct virtqueue *),
275 bool (*callback)(struct virtqueue *)) 283 void (*callback)(struct virtqueue *))
276{ 284{
277 struct vring_virtqueue *vq; 285 struct vring_virtqueue *vq;
278 unsigned int i; 286 unsigned int i;
@@ -311,9 +319,12 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
311 319
312 return &vq->vq; 320 return &vq->vq;
313} 321}
322EXPORT_SYMBOL_GPL(vring_new_virtqueue);
314 323
315void vring_del_virtqueue(struct virtqueue *vq) 324void vring_del_virtqueue(struct virtqueue *vq)
316{ 325{
317 kfree(to_vvq(vq)); 326 kfree(to_vvq(vq));
318} 327}
328EXPORT_SYMBOL_GPL(vring_del_virtqueue);
319 329
330MODULE_LICENSE("GPL");
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index 697104da91f1..589be3e1f3ac 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -23,7 +23,12 @@
23struct lguest_device_desc { 23struct lguest_device_desc {
24 /* The device type: console, network, disk etc. Type 0 terminates. */ 24 /* The device type: console, network, disk etc. Type 0 terminates. */
25 __u8 type; 25 __u8 type;
26 /* The number of bytes of the config array. */ 26 /* The number of virtqueues (first in config array) */
27 __u8 num_vq;
28 /* The number of bytes of feature bits. Multiply by 2: one for host
29 * features and one for guest acknowledgements. */
30 __u8 feature_len;
31 /* The number of bytes of the config array after virtqueues. */
27 __u8 config_len; 32 __u8 config_len;
28 /* A status byte, written by the Guest. */ 33 /* A status byte, written by the Guest. */
29 __u8 status; 34 __u8 status;
@@ -31,7 +36,7 @@ struct lguest_device_desc {
31}; 36};
32 37
33/*D:135 This is how we expect the device configuration field for a virtqueue 38/*D:135 This is how we expect the device configuration field for a virtqueue
34 * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */ 39 * to be laid out in config space. */
35struct lguest_vqconfig { 40struct lguest_vqconfig {
36 /* The number of entries in the virtio_ring */ 41 /* The number of entries in the virtio_ring */
37 __u16 num; 42 __u16 num;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index dfe975a9967e..412672a79e8a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1810,5 +1810,6 @@ static inline void skb_forward_csum(struct sk_buff *skb)
1810 skb->ip_summed = CHECKSUM_NONE; 1810 skb->ip_summed = CHECKSUM_NONE;
1811} 1811}
1812 1812
1813bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
1813#endif /* __KERNEL__ */ 1814#endif /* __KERNEL__ */
1814#endif /* _LINUX_SKBUFF_H */ 1815#endif /* _LINUX_SKBUFF_H */
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 14e1379876d3..260d1fcf29a4 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -11,15 +11,13 @@
11/** 11/**
12 * virtqueue - a queue to register buffers for sending or receiving. 12 * virtqueue - a queue to register buffers for sending or receiving.
13 * @callback: the function to call when buffers are consumed (can be NULL). 13 * @callback: the function to call when buffers are consumed (can be NULL).
14 * If this returns false, callbacks are suppressed until vq_ops->restart
15 * is called.
16 * @vdev: the virtio device this queue was created for. 14 * @vdev: the virtio device this queue was created for.
17 * @vq_ops: the operations for this virtqueue (see below). 15 * @vq_ops: the operations for this virtqueue (see below).
18 * @priv: a pointer for the virtqueue implementation to use. 16 * @priv: a pointer for the virtqueue implementation to use.
19 */ 17 */
20struct virtqueue 18struct virtqueue
21{ 19{
22 bool (*callback)(struct virtqueue *vq); 20 void (*callback)(struct virtqueue *vq);
23 struct virtio_device *vdev; 21 struct virtio_device *vdev;
24 struct virtqueue_ops *vq_ops; 22 struct virtqueue_ops *vq_ops;
25 void *priv; 23 void *priv;
@@ -41,13 +39,12 @@ struct virtqueue
41 * vq: the struct virtqueue we're talking about. 39 * vq: the struct virtqueue we're talking about.
42 * len: the length written into the buffer 40 * len: the length written into the buffer
43 * Returns NULL or the "data" token handed to add_buf. 41 * Returns NULL or the "data" token handed to add_buf.
44 * @restart: restart callbacks after callback returned false. 42 * @disable_cb: disable callbacks
43 * vq: the struct virtqueue we're talking about.
44 * @enable_cb: restart callbacks after disable_cb.
45 * vq: the struct virtqueue we're talking about. 45 * vq: the struct virtqueue we're talking about.
46 * This returns "false" (and doesn't re-enable) if there are pending 46 * This returns "false" (and doesn't re-enable) if there are pending
47 * buffers in the queue, to avoid a race. 47 * buffers in the queue, to avoid a race.
48 * @shutdown: "unadd" all buffers.
49 * vq: the struct virtqueue we're talking about.
50 * Remove everything from the queue.
51 * 48 *
52 * Locking rules are straightforward: the driver is responsible for 49 * Locking rules are straightforward: the driver is responsible for
53 * locking. No two operations may be invoked simultaneously. 50 * locking. No two operations may be invoked simultaneously.
@@ -65,9 +62,8 @@ struct virtqueue_ops {
65 62
66 void *(*get_buf)(struct virtqueue *vq, unsigned int *len); 63 void *(*get_buf)(struct virtqueue *vq, unsigned int *len);
67 64
68 bool (*restart)(struct virtqueue *vq); 65 void (*disable_cb)(struct virtqueue *vq);
69 66 bool (*enable_cb)(struct virtqueue *vq);
70 void (*shutdown)(struct virtqueue *vq);
71}; 67};
72 68
73/** 69/**
@@ -97,12 +93,15 @@ void unregister_virtio_device(struct virtio_device *dev);
97 * @probe: the function to call when a device is found. Returns a token for 93 * @probe: the function to call when a device is found. Returns a token for
98 * remove, or PTR_ERR(). 94 * remove, or PTR_ERR().
99 * @remove: the function when a device is removed. 95 * @remove: the function when a device is removed.
96 * @config_changed: optional function to call when the device configuration
97 * changes; may be called in interrupt context.
100 */ 98 */
101struct virtio_driver { 99struct virtio_driver {
102 struct device_driver driver; 100 struct device_driver driver;
103 const struct virtio_device_id *id_table; 101 const struct virtio_device_id *id_table;
104 int (*probe)(struct virtio_device *dev); 102 int (*probe)(struct virtio_device *dev);
105 void (*remove)(struct virtio_device *dev); 103 void (*remove)(struct virtio_device *dev);
104 void (*config_changed)(struct virtio_device *dev);
106}; 105};
107 106
108int register_virtio_driver(struct virtio_driver *drv); 107int register_virtio_driver(struct virtio_driver *drv);
diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h
new file mode 100644
index 000000000000..979524ee75b7
--- /dev/null
+++ b/include/linux/virtio_balloon.h
@@ -0,0 +1,18 @@
1#ifndef _LINUX_VIRTIO_BALLOON_H
2#define _LINUX_VIRTIO_BALLOON_H
3#include <linux/virtio_config.h>
4
5/* The ID for virtio_balloon */
6#define VIRTIO_ID_BALLOON 5
7
8/* The feature bitmap for virtio balloon */
9#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */
10
11struct virtio_balloon_config
12{
13 /* Number of pages host wants Guest to give up. */
14 __le32 num_pages;
15 /* Number of pages we've actually got in balloon. */
16 __le32 actual;
17};
18#endif /* _LINUX_VIRTIO_BALLOON_H */
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 7bd2bce0cfd9..bca0b10d7947 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -6,15 +6,19 @@
6#define VIRTIO_ID_BLOCK 2 6#define VIRTIO_ID_BLOCK 2
7 7
8/* Feature bits */ 8/* Feature bits */
9#define VIRTIO_CONFIG_BLK_F 0x40 9#define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */
10#define VIRTIO_BLK_F_BARRIER 1 /* Does host support barriers? */ 10#define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */
11#define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */
11 12
12/* The capacity (in 512-byte sectors). */ 13struct virtio_blk_config
13#define VIRTIO_CONFIG_BLK_F_CAPACITY 0x41 14{
14/* The maximum segment size. */ 15 /* The capacity (in 512-byte sectors). */
15#define VIRTIO_CONFIG_BLK_F_SIZE_MAX 0x42 16 __le64 capacity;
16/* The maximum number of segments. */ 17 /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */
17#define VIRTIO_CONFIG_BLK_F_SEG_MAX 0x43 18 __le32 size_max;
19 /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */
20 __le32 seg_max;
21} __attribute__((packed));
18 22
19/* These two define direction. */ 23/* These two define direction. */
20#define VIRTIO_BLK_T_IN 0 24#define VIRTIO_BLK_T_IN 0
@@ -35,8 +39,6 @@ struct virtio_blk_outhdr
35 __u32 ioprio; 39 __u32 ioprio;
36 /* Sector (ie. 512 byte offset) */ 40 /* Sector (ie. 512 byte offset) */
37 __u64 sector; 41 __u64 sector;
38 /* Where to put reply. */
39 __u64 id;
40}; 42};
41 43
42#define VIRTIO_BLK_S_OK 0 44#define VIRTIO_BLK_S_OK 0
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index bcc01888df78..d581b2914b34 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -5,7 +5,7 @@
5 * store and access that space differently. */ 5 * store and access that space differently. */
6#include <linux/types.h> 6#include <linux/types.h>
7 7
8/* Status byte for guest to report progress, and synchronize config. */ 8/* Status byte for guest to report progress, and synchronize features. */
9/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */ 9/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
10#define VIRTIO_CONFIG_S_ACKNOWLEDGE 1 10#define VIRTIO_CONFIG_S_ACKNOWLEDGE 1
11/* We have found a driver for the device. */ 11/* We have found a driver for the device. */
@@ -15,34 +15,27 @@
15/* We've given up on this device. */ 15/* We've given up on this device. */
16#define VIRTIO_CONFIG_S_FAILED 0x80 16#define VIRTIO_CONFIG_S_FAILED 0x80
17 17
18/* Feature byte (actually 7 bits availabe): */
19/* Requirements/features of the virtio implementation. */
20#define VIRTIO_CONFIG_F_VIRTIO 1
21/* Requirements/features of the virtqueue (may have more than one). */
22#define VIRTIO_CONFIG_F_VIRTQUEUE 2
23
24#ifdef __KERNEL__ 18#ifdef __KERNEL__
25struct virtio_device; 19struct virtio_device;
26 20
27/** 21/**
28 * virtio_config_ops - operations for configuring a virtio device 22 * virtio_config_ops - operations for configuring a virtio device
29 * @find: search for the next configuration field of the given type. 23 * @feature: search for a feature in this config
30 * vdev: the virtio_device 24 * vdev: the virtio_device
31 * type: the feature type 25 * bit: the feature bit
32 * len: the (returned) length of the field if found. 26 * Returns true if the feature is supported. Acknowledges the feature
33 * Returns a token if found, or NULL. Never returnes the same field twice 27 * so the host can see it.
34 * (ie. it's used up). 28 * @get: read the value of a configuration field
35 * @get: read the value of a configuration field after find().
36 * vdev: the virtio_device 29 * vdev: the virtio_device
37 * token: the token returned from find(). 30 * offset: the offset of the configuration field
38 * buf: the buffer to write the field value into. 31 * buf: the buffer to write the field value into.
39 * len: the length of the buffer (given by find()). 32 * len: the length of the buffer
40 * Note that contents are conventionally little-endian. 33 * Note that contents are conventionally little-endian.
41 * @set: write the value of a configuration field after find(). 34 * @set: write the value of a configuration field
42 * vdev: the virtio_device 35 * vdev: the virtio_device
43 * token: the token returned from find(). 36 * offset: the offset of the configuration field
44 * buf: the buffer to read the field value from. 37 * buf: the buffer to read the field value from.
45 * len: the length of the buffer (given by find()). 38 * len: the length of the buffer
46 * Note that contents are conventionally little-endian. 39 * Note that contents are conventionally little-endian.
47 * @get_status: read the status byte 40 * @get_status: read the status byte
48 * vdev: the virtio_device 41 * vdev: the virtio_device
@@ -50,62 +43,67 @@ struct virtio_device;
50 * @set_status: write the status byte 43 * @set_status: write the status byte
51 * vdev: the virtio_device 44 * vdev: the virtio_device
52 * status: the new status byte 45 * status: the new status byte
53 * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue. 46 * @reset: reset the device
47 * vdev: the virtio device
48 * After this, status and feature negotiation must be done again
49 * @find_vq: find a virtqueue and instantiate it.
54 * vdev: the virtio_device 50 * vdev: the virtio_device
51 * index: the 0-based virtqueue number in case there's more than one.
55 * callback: the virqtueue callback 52 * callback: the virqtueue callback
56 * Returns the new virtqueue or ERR_PTR(). 53 * Returns the new virtqueue or ERR_PTR() (eg. -ENOENT).
57 * @del_vq: free a virtqueue found by find_vq(). 54 * @del_vq: free a virtqueue found by find_vq().
58 */ 55 */
59struct virtio_config_ops 56struct virtio_config_ops
60{ 57{
61 void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len); 58 bool (*feature)(struct virtio_device *vdev, unsigned bit);
62 void (*get)(struct virtio_device *vdev, void *token, 59 void (*get)(struct virtio_device *vdev, unsigned offset,
63 void *buf, unsigned len); 60 void *buf, unsigned len);
64 void (*set)(struct virtio_device *vdev, void *token, 61 void (*set)(struct virtio_device *vdev, unsigned offset,
65 const void *buf, unsigned len); 62 const void *buf, unsigned len);
66 u8 (*get_status)(struct virtio_device *vdev); 63 u8 (*get_status)(struct virtio_device *vdev);
67 void (*set_status)(struct virtio_device *vdev, u8 status); 64 void (*set_status)(struct virtio_device *vdev, u8 status);
65 void (*reset)(struct virtio_device *vdev);
68 struct virtqueue *(*find_vq)(struct virtio_device *vdev, 66 struct virtqueue *(*find_vq)(struct virtio_device *vdev,
69 bool (*callback)(struct virtqueue *)); 67 unsigned index,
68 void (*callback)(struct virtqueue *));
70 void (*del_vq)(struct virtqueue *vq); 69 void (*del_vq)(struct virtqueue *vq);
71}; 70};
72 71
73/** 72/**
74 * virtio_config_val - get a single virtio config and mark it used. 73 * virtio_config_val - look for a feature and get a single virtio config.
75 * @config: the virtio config space 74 * @vdev: the virtio device
76 * @type: the type to search for. 75 * @fbit: the feature bit
76 * @offset: the type to search for.
77 * @val: a pointer to the value to fill in. 77 * @val: a pointer to the value to fill in.
78 * 78 *
79 * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't 79 * The return value is -ENOENT if the feature doesn't exist. Otherwise
80 * be found again. This version does endian conversion. */ 80 * the value is endian-corrected and returned in v. */
81#define virtio_config_val(vdev, type, v) ({ \ 81#define virtio_config_val(vdev, fbit, offset, v) ({ \
82 int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v))); \ 82 int _err; \
83 \ 83 if ((vdev)->config->feature((vdev), (fbit))) { \
84 BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ 84 __virtio_config_val((vdev), (offset), (v)); \
85 && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ 85 _err = 0; \
86 if (!_err) { \ 86 } else \
87 switch (sizeof(*(v))) { \ 87 _err = -ENOENT; \
88 case 2: le16_to_cpus((__u16 *) v); break; \
89 case 4: le32_to_cpus((__u32 *) v); break; \
90 case 8: le64_to_cpus((__u64 *) v); break; \
91 } \
92 } \
93 _err; \ 88 _err; \
94}) 89})
95 90
96int __virtio_config_val(struct virtio_device *dev,
97 u8 type, void *val, size_t size);
98
99/** 91/**
100 * virtio_use_bit - helper to use a feature bit in a bitfield value. 92 * __virtio_config_val - get a single virtio config without feature check.
101 * @dev: the virtio device 93 * @vdev: the virtio device
102 * @token: the token as returned from vdev->config->find(). 94 * @offset: the type to search for.
103 * @len: the length of the field. 95 * @val: a pointer to the value to fill in.
104 * @bitnum: the bit to test.
105 * 96 *
106 * If handed a NULL token, it returns false, otherwise returns bit status. 97 * The value is endian-corrected and returned in v. */
107 * If it's one, it sets the mirroring acknowledgement bit. */ 98#define __virtio_config_val(vdev, offset, v) do { \
108int virtio_use_bit(struct virtio_device *vdev, 99 BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \
109 void *token, unsigned int len, unsigned int bitnum); 100 && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \
101 (vdev)->config->get((vdev), (offset), (v), sizeof(*(v))); \
102 switch (sizeof(*(v))) { \
103 case 2: le16_to_cpus((__u16 *) v); break; \
104 case 4: le32_to_cpus((__u32 *) v); break; \
105 case 8: le64_to_cpus((__u64 *) v); break; \
106 } \
107} while(0)
110#endif /* __KERNEL__ */ 108#endif /* __KERNEL__ */
111#endif /* _LINUX_VIRTIO_CONFIG_H */ 109#endif /* _LINUX_VIRTIO_CONFIG_H */
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index ae469ae55d36..1ea3351df609 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -5,32 +5,32 @@
5/* The ID for virtio_net */ 5/* The ID for virtio_net */
6#define VIRTIO_ID_NET 1 6#define VIRTIO_ID_NET 1
7 7
8/* The bitmap of config for virtio net */ 8/* The feature bitmap for virtio net */
9#define VIRTIO_CONFIG_NET_F 0x40 9#define VIRTIO_NET_F_CSUM 0 /* Can handle pkts w/ partial csum */
10#define VIRTIO_NET_F_NO_CSUM 0 10#define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */
11#define VIRTIO_NET_F_TSO4 1 11#define VIRTIO_NET_F_GSO 6 /* Can handle pkts w/ any GSO type */
12#define VIRTIO_NET_F_UFO 2
13#define VIRTIO_NET_F_TSO4_ECN 3
14#define VIRTIO_NET_F_TSO6 4
15 12
16/* The config defining mac address. */ 13struct virtio_net_config
17#define VIRTIO_CONFIG_NET_MAC_F 0x41 14{
15 /* The config defining mac address (if VIRTIO_NET_F_MAC) */
16 __u8 mac[6];
17} __attribute__((packed));
18 18
19/* This is the first element of the scatter-gather list. If you don't 19/* This is the first element of the scatter-gather list. If you don't
20 * specify GSO or CSUM features, you can simply ignore the header. */ 20 * specify GSO or CSUM features, you can simply ignore the header. */
21struct virtio_net_hdr 21struct virtio_net_hdr
22{ 22{
23#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset 23#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset
24 __u8 flags; 24 __u8 flags;
25#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame 25#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame
26#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) 26#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO)
27/* FIXME: Do we need this? If they said they can handle ECN, do they care? */
28#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN
29#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) 27#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO)
30#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP 28#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP
31 __u8 gso_type; 29#define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set
32 __u16 gso_size; 30 __u8 gso_type;
33 __u16 csum_start; 31 __u16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */
34 __u16 csum_offset; 32 __u16 gso_size; /* Bytes to append to gso_hdr_len per frame */
33 __u16 csum_start; /* Position to start checksumming from */
34 __u16 csum_offset; /* Offset after that to place checksum */
35}; 35};
36#endif /* _LINUX_VIRTIO_NET_H */ 36#endif /* _LINUX_VIRTIO_NET_H */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
new file mode 100644
index 000000000000..b3151659cf49
--- /dev/null
+++ b/include/linux/virtio_pci.h
@@ -0,0 +1,57 @@
1/*
2 * Virtio PCI driver
3 *
4 * This module allows virtio devices to be used over a virtual PCI device.
5 * This can be used with QEMU based VMMs like KVM or Xen.
6 *
7 * Copyright IBM Corp. 2007
8 *
9 * Authors:
10 * Anthony Liguori <aliguori@us.ibm.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2 or later.
13 * See the COPYING file in the top-level directory.
14 *
15 */
16
17#ifndef _LINUX_VIRTIO_PCI_H
18#define _LINUX_VIRTIO_PCI_H
19
20#include <linux/virtio_config.h>
21
22/* A 32-bit r/o bitmask of the features supported by the host */
23#define VIRTIO_PCI_HOST_FEATURES 0
24
25/* A 32-bit r/w bitmask of features activated by the guest */
26#define VIRTIO_PCI_GUEST_FEATURES 4
27
28/* A 32-bit r/w PFN for the currently selected queue */
29#define VIRTIO_PCI_QUEUE_PFN 8
30
31/* A 16-bit r/o queue size for the currently selected queue */
32#define VIRTIO_PCI_QUEUE_NUM 12
33
34/* A 16-bit r/w queue selector */
35#define VIRTIO_PCI_QUEUE_SEL 14
36
37/* A 16-bit r/w queue notifier */
38#define VIRTIO_PCI_QUEUE_NOTIFY 16
39
40/* An 8-bit device status register. */
41#define VIRTIO_PCI_STATUS 18
42
43/* An 8-bit r/o interrupt status register. Reading the value will return the
44 * current contents of the ISR and will also clear it. This is effectively
45 * a read-and-acknowledge. */
46#define VIRTIO_PCI_ISR 19
47
48/* The bit of the ISR which indicates a device configuration change. */
49#define VIRTIO_PCI_ISR_CONFIG 0x2
50
51/* The remaining space is defined by each driver as the per-driver
52 * configuration space */
53#define VIRTIO_PCI_CONFIG 20
54
55/* Virtio ABI version, this must match exactly */
56#define VIRTIO_PCI_ABI_VERSION 0
57#endif
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 1a4ed49f6478..abe481ed990e 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -15,9 +15,13 @@
15/* This marks a buffer as write-only (otherwise read-only). */ 15/* This marks a buffer as write-only (otherwise read-only). */
16#define VRING_DESC_F_WRITE 2 16#define VRING_DESC_F_WRITE 2
17 17
18/* This means don't notify other side when buffer added. */ 18/* The Host uses this in used->flags to advise the Guest: don't kick me when
19 * you add a buffer. It's unreliable, so it's simply an optimization. Guest
20 * will still kick if it's out of buffers. */
19#define VRING_USED_F_NO_NOTIFY 1 21#define VRING_USED_F_NO_NOTIFY 1
20/* This means don't interrupt guest when buffer consumed. */ 22/* The Guest uses this in avail->flags to advise the Host: don't interrupt me
23 * when you consume a buffer. It's unreliable, so it's simply an
24 * optimization. */
21#define VRING_AVAIL_F_NO_INTERRUPT 1 25#define VRING_AVAIL_F_NO_INTERRUPT 1
22 26
23/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ 27/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
@@ -89,7 +93,7 @@ struct vring {
89 * }; 93 * };
90 */ 94 */
91static inline void vring_init(struct vring *vr, unsigned int num, void *p, 95static inline void vring_init(struct vring *vr, unsigned int num, void *p,
92 unsigned int pagesize) 96 unsigned long pagesize)
93{ 97{
94 vr->num = num; 98 vr->num = num;
95 vr->desc = p; 99 vr->desc = p;
@@ -98,7 +102,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p,
98 & ~(pagesize - 1)); 102 & ~(pagesize - 1));
99} 103}
100 104
101static inline unsigned vring_size(unsigned int num, unsigned int pagesize) 105static inline unsigned vring_size(unsigned int num, unsigned long pagesize)
102{ 106{
103 return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) 107 return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num)
104 + pagesize - 1) & ~(pagesize - 1)) 108 + pagesize - 1) & ~(pagesize - 1))
@@ -114,7 +118,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
114 struct virtio_device *vdev, 118 struct virtio_device *vdev,
115 void *pages, 119 void *pages,
116 void (*notify)(struct virtqueue *vq), 120 void (*notify)(struct virtqueue *vq),
117 bool (*callback)(struct virtqueue *vq)); 121 void (*callback)(struct virtqueue *vq));
118void vring_del_virtqueue(struct virtqueue *vq); 122void vring_del_virtqueue(struct virtqueue *vq);
119 123
120irqreturn_t vring_interrupt(int irq, void *_vq); 124irqreturn_t vring_interrupt(int irq, void *_vq);
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 40b71a29fc3f..42eea5fe2628 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -199,14 +199,12 @@ static void p9_virtio_close(struct p9_trans *trans)
199 kfree(trans); 199 kfree(trans);
200} 200}
201 201
202static bool p9_virtio_intr(struct virtqueue *q) 202static void p9_virtio_intr(struct virtqueue *q)
203{ 203{
204 struct virtio_chan *chan = q->vdev->priv; 204 struct virtio_chan *chan = q->vdev->priv;
205 205
206 P9_DPRINTK(P9_DEBUG_TRANS, "9p poll_wakeup: %p\n", &chan->wq); 206 P9_DPRINTK(P9_DEBUG_TRANS, "9p poll_wakeup: %p\n", &chan->wq);
207 wake_up_interruptible(&chan->wq); 207 wake_up_interruptible(&chan->wq);
208
209 return true;
210} 208}
211 209
212static int p9_virtio_probe(struct virtio_device *dev) 210static int p9_virtio_probe(struct virtio_device *dev)
@@ -236,13 +234,13 @@ static int p9_virtio_probe(struct virtio_device *dev)
236 234
237 /* Find the input queue. */ 235 /* Find the input queue. */
238 dev->priv = chan; 236 dev->priv = chan;
239 chan->in_vq = dev->config->find_vq(dev, p9_virtio_intr); 237 chan->in_vq = dev->config->find_vq(dev, 0, p9_virtio_intr);
240 if (IS_ERR(chan->in_vq)) { 238 if (IS_ERR(chan->in_vq)) {
241 err = PTR_ERR(chan->in_vq); 239 err = PTR_ERR(chan->in_vq);
242 goto free; 240 goto free;
243 } 241 }
244 242
245 chan->out_vq = dev->config->find_vq(dev, NULL); 243 chan->out_vq = dev->config->find_vq(dev, 1, NULL);
246 if (IS_ERR(chan->out_vq)) { 244 if (IS_ERR(chan->out_vq)) {
247 err = PTR_ERR(chan->out_vq); 245 err = PTR_ERR(chan->out_vq);
248 goto free_in_vq; 246 goto free_in_vq;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 98420f9c4b6d..4e354221ec23 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2461,6 +2461,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2461 return elt; 2461 return elt;
2462} 2462}
2463 2463
2464/**
2465 * skb_partial_csum_set - set up and verify partial csum values for packet
2466 * @skb: the skb to set
2467 * @start: the number of bytes after skb->data to start checksumming.
2468 * @off: the offset from start to place the checksum.
2469 *
2470 * For untrusted partially-checksummed packets, we need to make sure the values
2471 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
2472 *
2473 * This function checks and sets those values and skb->ip_summed: if this
2474 * returns false you should drop the packet.
2475 */
2476bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
2477{
2478 if (unlikely(start > skb->len - 2) ||
2479 unlikely((int)start + off > skb->len - 2)) {
2480 if (net_ratelimit())
2481 printk(KERN_WARNING
2482 "bad partial csum: csum=%u/%u len=%u\n",
2483 start, off, skb->len);
2484 return false;
2485 }
2486 skb->ip_summed = CHECKSUM_PARTIAL;
2487 skb->csum_start = skb_headroom(skb) + start;
2488 skb->csum_offset = off;
2489 return true;
2490}
2491
2464EXPORT_SYMBOL(___pskb_trim); 2492EXPORT_SYMBOL(___pskb_trim);
2465EXPORT_SYMBOL(__kfree_skb); 2493EXPORT_SYMBOL(__kfree_skb);
2466EXPORT_SYMBOL(kfree_skb); 2494EXPORT_SYMBOL(kfree_skb);
@@ -2497,3 +2525,4 @@ EXPORT_SYMBOL(skb_append_datato_frags);
2497 2525
2498EXPORT_SYMBOL_GPL(skb_to_sgvec); 2526EXPORT_SYMBOL_GPL(skb_to_sgvec);
2499EXPORT_SYMBOL_GPL(skb_cow_data); 2527EXPORT_SYMBOL_GPL(skb_cow_data);
2528EXPORT_SYMBOL_GPL(skb_partial_csum_set);