about | summary | refs | log | tree | commit | diff | stats
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r--tools/hv/Makefile4
-rw-r--r--tools/hv/hv_fcopy_daemon.c10
-rw-r--r--tools/hv/hv_kvp_daemon.c41
-rw-r--r--tools/lguest/Makefile8
-rw-r--r--tools/lguest/lguest.c2016
-rw-r--r--tools/power/acpi/common/cmfsize.c2
-rw-r--r--tools/power/acpi/common/getopt.c2
-rw-r--r--tools/power/acpi/os_specific/service_layers/oslibcfs.c2
-rw-r--r--tools/power/acpi/os_specific/service_layers/oslinuxtbl.c2
-rw-r--r--tools/power/acpi/os_specific/service_layers/osunixdir.c2
-rw-r--r--tools/power/acpi/os_specific/service_layers/osunixmap.c2
-rw-r--r--tools/power/acpi/os_specific/service_layers/osunixxf.c2
-rw-r--r--tools/power/acpi/tools/acpidump/acpidump.h2
-rw-r--r--tools/power/acpi/tools/acpidump/apdump.c2
-rw-r--r--tools/power/acpi/tools/acpidump/apfiles.c2
-rw-r--r--tools/power/acpi/tools/acpidump/apmain.c2
-rw-r--r--tools/power/cpupower/Makefile2
-rw-r--r--tools/power/x86/turbostat/turbostat.8126
-rw-r--r--tools/power/x86/turbostat/turbostat.c662
-rwxr-xr-xtools/testing/ktest/ktest.pl259
-rw-r--r--tools/testing/selftests/powerpc/Makefile2
-rw-r--r--tools/testing/selftests/powerpc/copyloops/.gitignore4
-rw-r--r--tools/testing/selftests/powerpc/mm/.gitignore3
-rw-r--r--tools/testing/selftests/powerpc/mm/Makefile9
-rw-r--r--tools/testing/selftests/powerpc/mm/subpage_prot.c220
-rw-r--r--tools/testing/selftests/powerpc/pmu/.gitignore3
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/.gitignore22
-rw-r--r--tools/testing/selftests/powerpc/primitives/.gitignore1
-rw-r--r--tools/testing/selftests/powerpc/stringloops/.gitignore1
-rw-r--r--tools/testing/selftests/powerpc/stringloops/Makefile20
-rw-r--r--tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h7
-rw-r--r--tools/testing/selftests/powerpc/stringloops/memcmp.c103
l---------tools/testing/selftests/powerpc/stringloops/memcmp_64.S1
-rw-r--r--tools/testing/selftests/powerpc/tm/.gitignore1
-rw-r--r--tools/usb/ffs-aio-example/multibuff/host_app/test.c14
-rw-r--r--tools/usb/ffs-aio-example/simple/device_app/aio_simple.c2
-rw-r--r--tools/usb/ffs-aio-example/simple/host_app/test.c17
-rw-r--r--tools/vm/page-types.c1
38 files changed, 2911 insertions, 670 deletions
diff --git a/tools/hv/Makefile b/tools/hv/Makefile
index bd22f786a60c..99ffe61051a7 100644
--- a/tools/hv/Makefile
+++ b/tools/hv/Makefile
@@ -5,9 +5,9 @@ PTHREAD_LIBS = -lpthread
5WARNINGS = -Wall -Wextra 5WARNINGS = -Wall -Wextra
6CFLAGS = $(WARNINGS) -g $(PTHREAD_LIBS) 6CFLAGS = $(WARNINGS) -g $(PTHREAD_LIBS)
7 7
8all: hv_kvp_daemon hv_vss_daemon 8all: hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon
9%: %.c 9%: %.c
10 $(CC) $(CFLAGS) -o $@ $^ 10 $(CC) $(CFLAGS) -o $@ $^
11 11
12clean: 12clean:
13 $(RM) hv_kvp_daemon hv_vss_daemon 13 $(RM) hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon
diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c
index f437d739f37d..9445d8f264a4 100644
--- a/tools/hv/hv_fcopy_daemon.c
+++ b/tools/hv/hv_fcopy_daemon.c
@@ -43,15 +43,9 @@ static int hv_start_fcopy(struct hv_start_fcopy *smsg)
43 int error = HV_E_FAIL; 43 int error = HV_E_FAIL;
44 char *q, *p; 44 char *q, *p;
45 45
46 /*
47 * If possile append a path seperator to the path.
48 */
49 if (strlen((char *)smsg->path_name) < (W_MAX_PATH - 2))
50 strcat((char *)smsg->path_name, "/");
51
52 p = (char *)smsg->path_name; 46 p = (char *)smsg->path_name;
53 snprintf(target_fname, sizeof(target_fname), "%s/%s", 47 snprintf(target_fname, sizeof(target_fname), "%s/%s",
54 (char *)smsg->path_name, smsg->file_name); 48 (char *)smsg->path_name, (char *)smsg->file_name);
55 49
56 syslog(LOG_INFO, "Target file name: %s", target_fname); 50 syslog(LOG_INFO, "Target file name: %s", target_fname);
57 /* 51 /*
@@ -137,7 +131,7 @@ void print_usage(char *argv[])
137 131
138int main(int argc, char *argv[]) 132int main(int argc, char *argv[])
139{ 133{
140 int fd, fcopy_fd, len; 134 int fcopy_fd, len;
141 int error; 135 int error;
142 int daemonize = 1, long_index = 0, opt; 136 int daemonize = 1, long_index = 0, opt;
143 int version = FCOPY_CURRENT_VERSION; 137 int version = FCOPY_CURRENT_VERSION;
diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index 6a6432a20a1d..408bb076a234 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -147,7 +147,6 @@ static void kvp_release_lock(int pool)
147static void kvp_update_file(int pool) 147static void kvp_update_file(int pool)
148{ 148{
149 FILE *filep; 149 FILE *filep;
150 size_t bytes_written;
151 150
152 /* 151 /*
153 * We are going to write our in-memory registry out to 152 * We are going to write our in-memory registry out to
@@ -163,8 +162,7 @@ static void kvp_update_file(int pool)
163 exit(EXIT_FAILURE); 162 exit(EXIT_FAILURE);
164 } 163 }
165 164
166 bytes_written = fwrite(kvp_file_info[pool].records, 165 fwrite(kvp_file_info[pool].records, sizeof(struct kvp_record),
167 sizeof(struct kvp_record),
168 kvp_file_info[pool].num_records, filep); 166 kvp_file_info[pool].num_records, filep);
169 167
170 if (ferror(filep) || fclose(filep)) { 168 if (ferror(filep) || fclose(filep)) {
@@ -310,7 +308,7 @@ static int kvp_file_init(void)
310 return 0; 308 return 0;
311} 309}
312 310
313static int kvp_key_delete(int pool, const char *key, int key_size) 311static int kvp_key_delete(int pool, const __u8 *key, int key_size)
314{ 312{
315 int i; 313 int i;
316 int j, k; 314 int j, k;
@@ -353,8 +351,8 @@ static int kvp_key_delete(int pool, const char *key, int key_size)
353 return 1; 351 return 1;
354} 352}
355 353
356static int kvp_key_add_or_modify(int pool, const char *key, int key_size, const char *value, 354static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size,
357 int value_size) 355 const __u8 *value, int value_size)
358{ 356{
359 int i; 357 int i;
360 int num_records; 358 int num_records;
@@ -407,7 +405,7 @@ static int kvp_key_add_or_modify(int pool, const char *key, int key_size, const
407 return 0; 405 return 0;
408} 406}
409 407
410static int kvp_get_value(int pool, const char *key, int key_size, char *value, 408static int kvp_get_value(int pool, const __u8 *key, int key_size, __u8 *value,
411 int value_size) 409 int value_size)
412{ 410{
413 int i; 411 int i;
@@ -439,8 +437,8 @@ static int kvp_get_value(int pool, const char *key, int key_size, char *value,
439 return 1; 437 return 1;
440} 438}
441 439
442static int kvp_pool_enumerate(int pool, int index, char *key, int key_size, 440static int kvp_pool_enumerate(int pool, int index, __u8 *key, int key_size,
443 char *value, int value_size) 441 __u8 *value, int value_size)
444{ 442{
445 struct kvp_record *record; 443 struct kvp_record *record;
446 444
@@ -661,7 +659,7 @@ static char *kvp_if_name_to_mac(char *if_name)
661 char *p, *x; 659 char *p, *x;
662 char buf[256]; 660 char buf[256];
663 char addr_file[256]; 661 char addr_file[256];
664 int i; 662 unsigned int i;
665 char *mac_addr = NULL; 663 char *mac_addr = NULL;
666 664
667 snprintf(addr_file, sizeof(addr_file), "%s%s%s", "/sys/class/net/", 665 snprintf(addr_file, sizeof(addr_file), "%s%s%s", "/sys/class/net/",
@@ -700,7 +698,7 @@ static char *kvp_mac_to_if_name(char *mac)
700 char buf[256]; 698 char buf[256];
701 char *kvp_net_dir = "/sys/class/net/"; 699 char *kvp_net_dir = "/sys/class/net/";
702 char dev_id[256]; 700 char dev_id[256];
703 int i; 701 unsigned int i;
704 702
705 dir = opendir(kvp_net_dir); 703 dir = opendir(kvp_net_dir);
706 if (dir == NULL) 704 if (dir == NULL)
@@ -750,7 +748,7 @@ static char *kvp_mac_to_if_name(char *mac)
750 748
751 749
752static void kvp_process_ipconfig_file(char *cmd, 750static void kvp_process_ipconfig_file(char *cmd,
753 char *config_buf, int len, 751 char *config_buf, unsigned int len,
754 int element_size, int offset) 752 int element_size, int offset)
755{ 753{
756 char buf[256]; 754 char buf[256];
@@ -768,7 +766,7 @@ static void kvp_process_ipconfig_file(char *cmd,
768 if (offset == 0) 766 if (offset == 0)
769 memset(config_buf, 0, len); 767 memset(config_buf, 0, len);
770 while ((p = fgets(buf, sizeof(buf), file)) != NULL) { 768 while ((p = fgets(buf, sizeof(buf), file)) != NULL) {
771 if ((len - strlen(config_buf)) < (element_size + 1)) 769 if (len < strlen(config_buf) + element_size + 1)
772 break; 770 break;
773 771
774 x = strchr(p, '\n'); 772 x = strchr(p, '\n');
@@ -916,7 +914,7 @@ static int kvp_process_ip_address(void *addrp,
916 914
917static int 915static int
918kvp_get_ip_info(int family, char *if_name, int op, 916kvp_get_ip_info(int family, char *if_name, int op,
919 void *out_buffer, int length) 917 void *out_buffer, unsigned int length)
920{ 918{
921 struct ifaddrs *ifap; 919 struct ifaddrs *ifap;
922 struct ifaddrs *curp; 920 struct ifaddrs *curp;
@@ -1019,8 +1017,7 @@ kvp_get_ip_info(int family, char *if_name, int op,
1019 weight += hweight32(&w[i]); 1017 weight += hweight32(&w[i]);
1020 1018
1021 sprintf(cidr_mask, "/%d", weight); 1019 sprintf(cidr_mask, "/%d", weight);
1022 if ((length - sn_offset) < 1020 if (length < sn_offset + strlen(cidr_mask) + 1)
1023 (strlen(cidr_mask) + 1))
1024 goto gather_ipaddr; 1021 goto gather_ipaddr;
1025 1022
1026 if (sn_offset == 0) 1023 if (sn_offset == 0)
@@ -1308,16 +1305,17 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val)
1308 if (error) 1305 if (error)
1309 goto setval_error; 1306 goto setval_error;
1310 1307
1308 /*
1309 * The dhcp_enabled flag is only for IPv4. In the case the host only
1310 * injects an IPv6 address, the flag is true, but we still need to
1311 * proceed to parse and pass the IPv6 information to the
1312 * distro-specific script hv_set_ifconfig.
1313 */
1311 if (new_val->dhcp_enabled) { 1314 if (new_val->dhcp_enabled) {
1312 error = kvp_write_file(file, "BOOTPROTO", "", "dhcp"); 1315 error = kvp_write_file(file, "BOOTPROTO", "", "dhcp");
1313 if (error) 1316 if (error)
1314 goto setval_error; 1317 goto setval_error;
1315 1318
1316 /*
1317 * We are done!.
1318 */
1319 goto setval_done;
1320
1321 } else { 1319 } else {
1322 error = kvp_write_file(file, "BOOTPROTO", "", "none"); 1320 error = kvp_write_file(file, "BOOTPROTO", "", "none");
1323 if (error) 1321 if (error)
@@ -1345,7 +1343,6 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val)
1345 if (error) 1343 if (error)
1346 goto setval_error; 1344 goto setval_error;
1347 1345
1348setval_done:
1349 fclose(file); 1346 fclose(file);
1350 1347
1351 /* 1348 /*
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile
index 97bca4871ea3..a107b5e4da13 100644
--- a/tools/lguest/Makefile
+++ b/tools/lguest/Makefile
@@ -1,7 +1,13 @@
1# This creates the demonstration utility "lguest" which runs a Linux guest. 1# This creates the demonstration utility "lguest" which runs a Linux guest.
2CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE 2CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
3 3
4all: lguest 4all: lguest
5 5
6include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
7 mkdir -p include/linux 2>&1 || true
8 ln -sf ../../../../include/uapi/linux/virtio_types.h $@
9
10lguest: include/linux/virtio_types.h
11
6clean: 12clean:
7 rm -f lguest 13 rm -f lguest
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 32cf2ce15d69..e44052483ed9 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -41,6 +41,8 @@
41#include <signal.h> 41#include <signal.h>
42#include <pwd.h> 42#include <pwd.h>
43#include <grp.h> 43#include <grp.h>
44#include <sys/user.h>
45#include <linux/pci_regs.h>
44 46
45#ifndef VIRTIO_F_ANY_LAYOUT 47#ifndef VIRTIO_F_ANY_LAYOUT
46#define VIRTIO_F_ANY_LAYOUT 27 48#define VIRTIO_F_ANY_LAYOUT 27
@@ -61,12 +63,19 @@ typedef uint16_t u16;
61typedef uint8_t u8; 63typedef uint8_t u8;
62/*:*/ 64/*:*/
63 65
64#include <linux/virtio_config.h> 66#define VIRTIO_CONFIG_NO_LEGACY
65#include <linux/virtio_net.h> 67#define VIRTIO_PCI_NO_LEGACY
66#include <linux/virtio_blk.h> 68#define VIRTIO_BLK_NO_LEGACY
67#include <linux/virtio_console.h> 69#define VIRTIO_NET_NO_LEGACY
68#include <linux/virtio_rng.h> 70
71/* Use in-kernel ones, which define VIRTIO_F_VERSION_1 */
72#include "../../include/uapi/linux/virtio_config.h"
73#include "../../include/uapi/linux/virtio_net.h"
74#include "../../include/uapi/linux/virtio_blk.h"
75#include "../../include/uapi/linux/virtio_console.h"
76#include "../../include/uapi/linux/virtio_rng.h"
69#include <linux/virtio_ring.h> 77#include <linux/virtio_ring.h>
78#include "../../include/uapi/linux/virtio_pci.h"
70#include <asm/bootparam.h> 79#include <asm/bootparam.h>
71#include "../../include/linux/lguest_launcher.h" 80#include "../../include/linux/lguest_launcher.h"
72 81
@@ -91,13 +100,16 @@ static bool verbose;
91/* The pointer to the start of guest memory. */ 100/* The pointer to the start of guest memory. */
92static void *guest_base; 101static void *guest_base;
93/* The maximum guest physical address allowed, and maximum possible. */ 102/* The maximum guest physical address allowed, and maximum possible. */
94static unsigned long guest_limit, guest_max; 103static unsigned long guest_limit, guest_max, guest_mmio;
95/* The /dev/lguest file descriptor. */ 104/* The /dev/lguest file descriptor. */
96static int lguest_fd; 105static int lguest_fd;
97 106
98/* a per-cpu variable indicating whose vcpu is currently running */ 107/* a per-cpu variable indicating whose vcpu is currently running */
99static unsigned int __thread cpu_id; 108static unsigned int __thread cpu_id;
100 109
110/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
111#define MAX_PCI_DEVICES 32
112
101/* This is our list of devices. */ 113/* This is our list of devices. */
102struct device_list { 114struct device_list {
103 /* Counter to assign interrupt numbers. */ 115 /* Counter to assign interrupt numbers. */
@@ -106,30 +118,50 @@ struct device_list {
106 /* Counter to print out convenient device numbers. */ 118 /* Counter to print out convenient device numbers. */
107 unsigned int device_num; 119 unsigned int device_num;
108 120
109 /* The descriptor page for the devices. */ 121 /* PCI devices. */
110 u8 *descpage; 122 struct device *pci[MAX_PCI_DEVICES];
111
112 /* A single linked list of devices. */
113 struct device *dev;
114 /* And a pointer to the last device for easy append. */
115 struct device *lastdev;
116}; 123};
117 124
118/* The list of Guest devices, based on command line arguments. */ 125/* The list of Guest devices, based on command line arguments. */
119static struct device_list devices; 126static struct device_list devices;
120 127
121/* The device structure describes a single device. */ 128struct virtio_pci_cfg_cap {
122struct device { 129 struct virtio_pci_cap cap;
123 /* The linked-list pointer. */ 130 u32 pci_cfg_data; /* Data for BAR access. */
124 struct device *next; 131};
125 132
126 /* The device's descriptor, as mapped into the Guest. */ 133struct virtio_pci_mmio {
127 struct lguest_device_desc *desc; 134 struct virtio_pci_common_cfg cfg;
135 u16 notify;
136 u8 isr;
137 u8 padding;
138 /* Device-specific configuration follows this. */
139};
128 140
129 /* We can't trust desc values once Guest has booted: we use these. */ 141/* This is the layout (little-endian) of the PCI config space. */
130 unsigned int feature_len; 142struct pci_config {
131 unsigned int num_vq; 143 u16 vendor_id, device_id;
144 u16 command, status;
145 u8 revid, prog_if, subclass, class;
146 u8 cacheline_size, lat_timer, header_type, bist;
147 u32 bar[6];
148 u32 cardbus_cis_ptr;
149 u16 subsystem_vendor_id, subsystem_device_id;
150 u32 expansion_rom_addr;
151 u8 capabilities, reserved1[3];
152 u32 reserved2;
153 u8 irq_line, irq_pin, min_grant, max_latency;
154
155 /* Now, this is the linked capability list. */
156 struct virtio_pci_cap common;
157 struct virtio_pci_notify_cap notify;
158 struct virtio_pci_cap isr;
159 struct virtio_pci_cap device;
160 struct virtio_pci_cfg_cap cfg_access;
161};
132 162
163/* The device structure describes a single device. */
164struct device {
133 /* The name of this device, for --verbose. */ 165 /* The name of this device, for --verbose. */
134 const char *name; 166 const char *name;
135 167
@@ -139,6 +171,25 @@ struct device {
139 /* Is it operational */ 171 /* Is it operational */
140 bool running; 172 bool running;
141 173
174 /* Has it written FEATURES_OK but not re-checked it? */
175 bool wrote_features_ok;
176
177 /* PCI configuration */
178 union {
179 struct pci_config config;
180 u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
181 };
182
183 /* Features we offer, and those accepted. */
184 u64 features, features_accepted;
185
186 /* Device-specific config hangs off the end of this. */
187 struct virtio_pci_mmio *mmio;
188
189 /* PCI MMIO resources (all in BAR0) */
190 size_t mmio_size;
191 u32 mmio_addr;
192
142 /* Device-specific data. */ 193 /* Device-specific data. */
143 void *priv; 194 void *priv;
144}; 195};
@@ -150,12 +201,15 @@ struct virtqueue {
150 /* Which device owns me. */ 201 /* Which device owns me. */
151 struct device *dev; 202 struct device *dev;
152 203
153 /* The configuration for this queue. */ 204 /* Name for printing errors. */
154 struct lguest_vqconfig config; 205 const char *name;
155 206
156 /* The actual ring of buffers. */ 207 /* The actual ring of buffers. */
157 struct vring vring; 208 struct vring vring;
158 209
210 /* The information about this virtqueue (we only use queue_size) */
211 struct virtio_pci_common_cfg pci_config;
212
159 /* Last available index we saw. */ 213 /* Last available index we saw. */
160 u16 last_avail_idx; 214 u16 last_avail_idx;
161 215
@@ -199,6 +253,16 @@ static struct termios orig_term;
199#define le32_to_cpu(v32) (v32) 253#define le32_to_cpu(v32) (v32)
200#define le64_to_cpu(v64) (v64) 254#define le64_to_cpu(v64) (v64)
201 255
256/*
257 * A real device would ignore weird/non-compliant driver behaviour. We
258 * stop and flag it, to help debugging Linux problems.
259 */
260#define bad_driver(d, fmt, ...) \
261 errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
262#define bad_driver_vq(vq, fmt, ...) \
263 errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
264 vq->name, ## __VA_ARGS__)
265
202/* Is this iovec empty? */ 266/* Is this iovec empty? */
203static bool iov_empty(const struct iovec iov[], unsigned int num_iov) 267static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
204{ 268{
@@ -211,7 +275,8 @@ static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
211} 275}
212 276
213/* Take len bytes from the front of this iovec. */ 277/* Take len bytes from the front of this iovec. */
214static void iov_consume(struct iovec iov[], unsigned num_iov, 278static void iov_consume(struct device *d,
279 struct iovec iov[], unsigned num_iov,
215 void *dest, unsigned len) 280 void *dest, unsigned len)
216{ 281{
217 unsigned int i; 282 unsigned int i;
@@ -229,14 +294,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov,
229 len -= used; 294 len -= used;
230 } 295 }
231 if (len != 0) 296 if (len != 0)
232 errx(1, "iovec too short!"); 297 bad_driver(d, "iovec too short!");
233}
234
235/* The device virtqueue descriptors are followed by feature bitmasks. */
236static u8 *get_feature_bits(struct device *dev)
237{
238 return (u8 *)(dev->desc + 1)
239 + dev->num_vq * sizeof(struct lguest_vqconfig);
240} 298}
241 299
242/*L:100 300/*L:100
@@ -309,14 +367,20 @@ static void *map_zeroed_pages(unsigned int num)
309 return addr + getpagesize(); 367 return addr + getpagesize();
310} 368}
311 369
312/* Get some more pages for a device. */ 370/* Get some bytes which won't be mapped into the guest. */
313static void *get_pages(unsigned int num) 371static unsigned long get_mmio_region(size_t size)
314{ 372{
315 void *addr = from_guest_phys(guest_limit); 373 unsigned long addr = guest_mmio;
374 size_t i;
375
376 if (!size)
377 return addr;
378
379 /* Size has to be a power of 2 (and multiple of 16) */
380 for (i = 1; i < size; i <<= 1);
381
382 guest_mmio += i;
316 383
317 guest_limit += num * getpagesize();
318 if (guest_limit > guest_max)
319 errx(1, "Not enough memory for devices");
320 return addr; 384 return addr;
321} 385}
322 386
@@ -547,9 +611,11 @@ static void tell_kernel(unsigned long start)
547{ 611{
548 unsigned long args[] = { LHREQ_INITIALIZE, 612 unsigned long args[] = { LHREQ_INITIALIZE,
549 (unsigned long)guest_base, 613 (unsigned long)guest_base,
550 guest_limit / getpagesize(), start }; 614 guest_limit / getpagesize(), start,
551 verbose("Guest: %p - %p (%#lx)\n", 615 (guest_mmio+getpagesize()-1) / getpagesize() };
552 guest_base, guest_base + guest_limit, guest_limit); 616 verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
617 guest_base, guest_base + guest_limit,
618 guest_limit, guest_mmio);
553 lguest_fd = open_or_die("/dev/lguest", O_RDWR); 619 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
554 if (write(lguest_fd, args, sizeof(args)) < 0) 620 if (write(lguest_fd, args, sizeof(args)) < 0)
555 err(1, "Writing to /dev/lguest"); 621 err(1, "Writing to /dev/lguest");
@@ -564,7 +630,8 @@ static void tell_kernel(unsigned long start)
564 * we have a convenient routine which checks it and exits with an error message 630 * we have a convenient routine which checks it and exits with an error message
565 * if something funny is going on: 631 * if something funny is going on:
566 */ 632 */
567static void *_check_pointer(unsigned long addr, unsigned int size, 633static void *_check_pointer(struct device *d,
634 unsigned long addr, unsigned int size,
568 unsigned int line) 635 unsigned int line)
569{ 636{
570 /* 637 /*
@@ -572,7 +639,8 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
572 * or addr + size wraps around. 639 * or addr + size wraps around.
573 */ 640 */
574 if ((addr + size) > guest_limit || (addr + size) < addr) 641 if ((addr + size) > guest_limit || (addr + size) < addr)
575 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 642 bad_driver(d, "%s:%i: Invalid address %#lx",
643 __FILE__, line, addr);
576 /* 644 /*
577 * We return a pointer for the caller's convenience, now we know it's 645 * We return a pointer for the caller's convenience, now we know it's
578 * safe to use. 646 * safe to use.
@@ -580,14 +648,14 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
580 return from_guest_phys(addr); 648 return from_guest_phys(addr);
581} 649}
582/* A macro which transparently hands the line number to the real function. */ 650/* A macro which transparently hands the line number to the real function. */
583#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 651#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
584 652
585/* 653/*
586 * Each buffer in the virtqueues is actually a chain of descriptors. This 654 * Each buffer in the virtqueues is actually a chain of descriptors. This
587 * function returns the next descriptor in the chain, or vq->vring.num if we're 655 * function returns the next descriptor in the chain, or vq->vring.num if we're
588 * at the end. 656 * at the end.
589 */ 657 */
590static unsigned next_desc(struct vring_desc *desc, 658static unsigned next_desc(struct device *d, struct vring_desc *desc,
591 unsigned int i, unsigned int max) 659 unsigned int i, unsigned int max)
592{ 660{
593 unsigned int next; 661 unsigned int next;
@@ -602,7 +670,7 @@ static unsigned next_desc(struct vring_desc *desc,
602 wmb(); 670 wmb();
603 671
604 if (next >= max) 672 if (next >= max)
605 errx(1, "Desc next is %u", next); 673 bad_driver(d, "Desc next is %u", next);
606 674
607 return next; 675 return next;
608} 676}
@@ -613,21 +681,48 @@ static unsigned next_desc(struct vring_desc *desc,
613 */ 681 */
614static void trigger_irq(struct virtqueue *vq) 682static void trigger_irq(struct virtqueue *vq)
615{ 683{
616 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 684 unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
617 685
618 /* Don't inform them if nothing used. */ 686 /* Don't inform them if nothing used. */
619 if (!vq->pending_used) 687 if (!vq->pending_used)
620 return; 688 return;
621 vq->pending_used = 0; 689 vq->pending_used = 0;
622 690
623 /* If they don't want an interrupt, don't send one... */ 691 /*
692 * 2.4.7.1:
693 *
694 * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
695 * The driver MUST set flags to 0 or 1.
696 */
697 if (vq->vring.avail->flags > 1)
698 bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
699
700 /*
701 * 2.4.7.2:
702 *
703 * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
704 *
705 * - The device MUST ignore the used_event value.
706 * - After the device writes a descriptor index into the used ring:
707 * - If flags is 1, the device SHOULD NOT send an interrupt.
708 * - If flags is 0, the device MUST send an interrupt.
709 */
624 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { 710 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
625 return; 711 return;
626 } 712 }
627 713
714 /*
715 * 4.1.4.5.1:
716 *
717 * If MSI-X capability is disabled, the device MUST set the Queue
718 * Interrupt bit in ISR status before sending a virtqueue notification
719 * to the driver.
720 */
721 vq->dev->mmio->isr = 0x1;
722
628 /* Send the Guest an interrupt to tell them we used something up. */ 723 /* Send the Guest an interrupt to tell them we used something up. */
629 if (write(lguest_fd, buf, sizeof(buf)) != 0) 724 if (write(lguest_fd, buf, sizeof(buf)) != 0)
630 err(1, "Triggering irq %i", vq->config.irq); 725 err(1, "Triggering irq %i", vq->dev->config.irq_line);
631} 726}
632 727
633/* 728/*
@@ -646,6 +741,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
646 struct vring_desc *desc; 741 struct vring_desc *desc;
647 u16 last_avail = lg_last_avail(vq); 742 u16 last_avail = lg_last_avail(vq);
648 743
744 /*
745 * 2.4.7.1:
746 *
747 * The driver MUST handle spurious interrupts from the device.
748 *
749 * That's why this is a while loop.
750 */
751
649 /* There's nothing available? */ 752 /* There's nothing available? */
650 while (last_avail == vq->vring.avail->idx) { 753 while (last_avail == vq->vring.avail->idx) {
651 u64 event; 754 u64 event;
@@ -679,8 +782,8 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
679 782
680 /* Check it isn't doing very strange things with descriptor numbers. */ 783 /* Check it isn't doing very strange things with descriptor numbers. */
681 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 784 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
682 errx(1, "Guest moved used index from %u to %u", 785 bad_driver_vq(vq, "Guest moved used index from %u to %u",
683 last_avail, vq->vring.avail->idx); 786 last_avail, vq->vring.avail->idx);
684 787
685 /* 788 /*
686 * Make sure we read the descriptor number *after* we read the ring 789 * Make sure we read the descriptor number *after* we read the ring
@@ -697,7 +800,7 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
697 800
698 /* If their number is silly, that's a fatal mistake. */ 801 /* If their number is silly, that's a fatal mistake. */
699 if (head >= vq->vring.num) 802 if (head >= vq->vring.num)
700 errx(1, "Guest says index %u is available", head); 803 bad_driver_vq(vq, "Guest says index %u is available", head);
701 804
702 /* When we start there are none of either input nor output. */ 805 /* When we start there are none of either input nor output. */
703 *out_num = *in_num = 0; 806 *out_num = *in_num = 0;
@@ -712,24 +815,73 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
712 * that: no rmb() required. 815 * that: no rmb() required.
713 */ 816 */
714 817
715 /* 818 do {
716 * If this is an indirect entry, then this buffer contains a descriptor 819 /*
717 * table which we handle as if it's any normal descriptor chain. 820 * If this is an indirect entry, then this buffer contains a
718 */ 821 * descriptor table which we handle as if it's any normal
719 if (desc[i].flags & VRING_DESC_F_INDIRECT) { 822 * descriptor chain.
720 if (desc[i].len % sizeof(struct vring_desc)) 823 */
721 errx(1, "Invalid size for indirect buffer table"); 824 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
825 /* 2.4.5.3.1:
826 *
827 * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
828 * flag unless the VIRTIO_F_INDIRECT_DESC feature was
829 * negotiated.
830 */
831 if (!(vq->dev->features_accepted &
832 (1<<VIRTIO_RING_F_INDIRECT_DESC)))
833 bad_driver_vq(vq, "vq indirect not negotiated");
722 834
723 max = desc[i].len / sizeof(struct vring_desc); 835 /*
724 desc = check_pointer(desc[i].addr, desc[i].len); 836 * 2.4.5.3.1:
725 i = 0; 837 *
726 } 838 * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
839 * flag within an indirect descriptor (ie. only one
840 * table per descriptor).
841 */
842 if (desc != vq->vring.desc)
843 bad_driver_vq(vq, "Indirect within indirect");
844
845 /*
846 * Proposed update VIRTIO-134 spells this out:
847 *
848 * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
849 * and VIRTQ_DESC_F_NEXT in flags.
850 */
851 if (desc[i].flags & VRING_DESC_F_NEXT)
852 bad_driver_vq(vq, "indirect and next together");
853
854 if (desc[i].len % sizeof(struct vring_desc))
855 bad_driver_vq(vq,
856 "Invalid size for indirect table");
857 /*
858 * 2.4.5.3.2:
859 *
860 * The device MUST ignore the write-only flag
861 * (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
862 * refers to an indirect table.
863 *
864 * We ignore it here: :)
865 */
866
867 max = desc[i].len / sizeof(struct vring_desc);
868 desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
869 i = 0;
870
871 /* 2.4.5.3.1:
872 *
873 * A driver MUST NOT create a descriptor chain longer
874 * than the Queue Size of the device.
875 */
876 if (max > vq->pci_config.queue_size)
877 bad_driver_vq(vq,
878 "indirect has too many entries");
879 }
727 880
728 do {
729 /* Grab the first descriptor, and check it's OK. */ 881 /* Grab the first descriptor, and check it's OK. */
730 iov[*out_num + *in_num].iov_len = desc[i].len; 882 iov[*out_num + *in_num].iov_len = desc[i].len;
731 iov[*out_num + *in_num].iov_base 883 iov[*out_num + *in_num].iov_base
732 = check_pointer(desc[i].addr, desc[i].len); 884 = check_pointer(vq->dev, desc[i].addr, desc[i].len);
733 /* If this is an input descriptor, increment that count. */ 885 /* If this is an input descriptor, increment that count. */
734 if (desc[i].flags & VRING_DESC_F_WRITE) 886 if (desc[i].flags & VRING_DESC_F_WRITE)
735 (*in_num)++; 887 (*in_num)++;
@@ -739,14 +891,15 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
739 * to come before any input descriptors. 891 * to come before any input descriptors.
740 */ 892 */
741 if (*in_num) 893 if (*in_num)
742 errx(1, "Descriptor has out after in"); 894 bad_driver_vq(vq,
895 "Descriptor has out after in");
743 (*out_num)++; 896 (*out_num)++;
744 } 897 }
745 898
746 /* If we've got too many, that implies a descriptor loop. */ 899 /* If we've got too many, that implies a descriptor loop. */
747 if (*out_num + *in_num > max) 900 if (*out_num + *in_num > max)
748 errx(1, "Looped descriptor"); 901 bad_driver_vq(vq, "Looped descriptor");
749 } while ((i = next_desc(desc, i, max)) != max); 902 } while ((i = next_desc(vq->dev, desc, i, max)) != max);
750 903
751 return head; 904 return head;
752} 905}
@@ -803,7 +956,7 @@ static void console_input(struct virtqueue *vq)
803 /* Make sure there's a descriptor available. */ 956 /* Make sure there's a descriptor available. */
804 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 957 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
805 if (out_num) 958 if (out_num)
806 errx(1, "Output buffers in console in queue?"); 959 bad_driver_vq(vq, "Output buffers in console in queue?");
807 960
808 /* Read into it. This is where we usually wait. */ 961 /* Read into it. This is where we usually wait. */
809 len = readv(STDIN_FILENO, iov, in_num); 962 len = readv(STDIN_FILENO, iov, in_num);
@@ -856,7 +1009,7 @@ static void console_output(struct virtqueue *vq)
856 /* We usually wait in here, for the Guest to give us something. */ 1009 /* We usually wait in here, for the Guest to give us something. */
857 head = wait_for_vq_desc(vq, iov, &out, &in); 1010 head = wait_for_vq_desc(vq, iov, &out, &in);
858 if (in) 1011 if (in)
859 errx(1, "Input buffers in console output queue?"); 1012 bad_driver_vq(vq, "Input buffers in console output queue?");
860 1013
861 /* writev can return a partial write, so we loop here. */ 1014 /* writev can return a partial write, so we loop here. */
862 while (!iov_empty(iov, out)) { 1015 while (!iov_empty(iov, out)) {
@@ -865,7 +1018,7 @@ static void console_output(struct virtqueue *vq)
865 warn("Write to stdout gave %i (%d)", len, errno); 1018 warn("Write to stdout gave %i (%d)", len, errno);
866 break; 1019 break;
867 } 1020 }
868 iov_consume(iov, out, NULL, len); 1021 iov_consume(vq->dev, iov, out, NULL, len);
869 } 1022 }
870 1023
871 /* 1024 /*
@@ -894,7 +1047,7 @@ static void net_output(struct virtqueue *vq)
894 /* We usually wait in here for the Guest to give us a packet. */ 1047 /* We usually wait in here for the Guest to give us a packet. */
895 head = wait_for_vq_desc(vq, iov, &out, &in); 1048 head = wait_for_vq_desc(vq, iov, &out, &in);
896 if (in) 1049 if (in)
897 errx(1, "Input buffers in net output queue?"); 1050 bad_driver_vq(vq, "Input buffers in net output queue?");
898 /* 1051 /*
899 * Send the whole thing through to /dev/net/tun. It expects the exact 1052 * Send the whole thing through to /dev/net/tun. It expects the exact
900 * same format: what a coincidence! 1053 * same format: what a coincidence!
@@ -942,7 +1095,7 @@ static void net_input(struct virtqueue *vq)
942 */ 1095 */
943 head = wait_for_vq_desc(vq, iov, &out, &in); 1096 head = wait_for_vq_desc(vq, iov, &out, &in);
944 if (out) 1097 if (out)
945 errx(1, "Output buffers in net input queue?"); 1098 bad_driver_vq(vq, "Output buffers in net input queue?");
946 1099
947 /* 1100 /*
948 * If it looks like we'll block reading from the tun device, send them 1101 * If it looks like we'll block reading from the tun device, send them
@@ -986,6 +1139,12 @@ static void kill_launcher(int signal)
986 kill(0, SIGTERM); 1139 kill(0, SIGTERM);
987} 1140}
988 1141
1142static void reset_vq_pci_config(struct virtqueue *vq)
1143{
1144 vq->pci_config.queue_size = VIRTQUEUE_NUM;
1145 vq->pci_config.queue_enable = 0;
1146}
1147
989static void reset_device(struct device *dev) 1148static void reset_device(struct device *dev)
990{ 1149{
991 struct virtqueue *vq; 1150 struct virtqueue *vq;
@@ -993,53 +1152,705 @@ static void reset_device(struct device *dev)
993 verbose("Resetting device %s\n", dev->name); 1152 verbose("Resetting device %s\n", dev->name);
994 1153
995 /* Clear any features they've acked. */ 1154 /* Clear any features they've acked. */
996 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); 1155 dev->features_accepted = 0;
997 1156
998 /* We're going to be explicitly killing threads, so ignore them. */ 1157 /* We're going to be explicitly killing threads, so ignore them. */
999 signal(SIGCHLD, SIG_IGN); 1158 signal(SIGCHLD, SIG_IGN);
1000 1159
1001 /* Zero out the virtqueues, get rid of their threads */ 1160 /*
1161 * 4.1.4.3.1:
1162 *
1163 * The device MUST present a 0 in queue_enable on reset.
1164 *
1165 * This means we set it here, and reset the saved ones in every vq.
1166 */
1167 dev->mmio->cfg.queue_enable = 0;
1168
1169 /* Get rid of the virtqueue threads */
1002 for (vq = dev->vq; vq; vq = vq->next) { 1170 for (vq = dev->vq; vq; vq = vq->next) {
1171 vq->last_avail_idx = 0;
1172 reset_vq_pci_config(vq);
1003 if (vq->thread != (pid_t)-1) { 1173 if (vq->thread != (pid_t)-1) {
1004 kill(vq->thread, SIGTERM); 1174 kill(vq->thread, SIGTERM);
1005 waitpid(vq->thread, NULL, 0); 1175 waitpid(vq->thread, NULL, 0);
1006 vq->thread = (pid_t)-1; 1176 vq->thread = (pid_t)-1;
1007 } 1177 }
1008 memset(vq->vring.desc, 0,
1009 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
1010 lg_last_avail(vq) = 0;
1011 } 1178 }
1012 dev->running = false; 1179 dev->running = false;
1180 dev->wrote_features_ok = false;
1013 1181
1014 /* Now we care if threads die. */ 1182 /* Now we care if threads die. */
1015 signal(SIGCHLD, (void *)kill_launcher); 1183 signal(SIGCHLD, (void *)kill_launcher);
1016} 1184}
1017 1185
1186static void cleanup_devices(void)
1187{
1188 unsigned int i;
1189
1190 for (i = 1; i < MAX_PCI_DEVICES; i++) {
1191 struct device *d = devices.pci[i];
1192 if (!d)
1193 continue;
1194 reset_device(d);
1195 }
1196
1197 /* If we saved off the original terminal settings, restore them now. */
1198 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
1199 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
1200}
1201
1202/*L:217
1203 * We do PCI. This is mainly done to let us test the kernel virtio PCI
1204 * code.
1205 */
1206
1207/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
1208static struct device pci_host_bridge;
1209
1210static void init_pci_host_bridge(void)
1211{
1212 pci_host_bridge.name = "PCI Host Bridge";
1213 pci_host_bridge.config.class = 0x06; /* bridge */
1214 pci_host_bridge.config.subclass = 0; /* host bridge */
1215 devices.pci[0] = &pci_host_bridge;
1216}
1217
1218/* The IO ports used to read the PCI config space. */
1219#define PCI_CONFIG_ADDR 0xCF8
1220#define PCI_CONFIG_DATA 0xCFC
1221
1222/*
1223 * Not really portable, but does help readability: this is what the Guest
1224 * writes to the PCI_CONFIG_ADDR IO port.
1225 */
1226union pci_config_addr {
1227 struct {
1228 unsigned mbz: 2;
1229 unsigned offset: 6;
1230 unsigned funcnum: 3;
1231 unsigned devnum: 5;
1232 unsigned busnum: 8;
1233 unsigned reserved: 7;
1234 unsigned enabled : 1;
1235 } bits;
1236 u32 val;
1237};
1238
1239/*
1240 * We cache what they wrote to the address port, so we know what they're
1241 * talking about when they access the data port.
1242 */
1243static union pci_config_addr pci_config_addr;
1244
1245static struct device *find_pci_device(unsigned int index)
1246{
1247 return devices.pci[index];
1248}
1249
1250/* PCI can do 1, 2 and 4 byte reads; we handle that here. */
1251static void ioread(u16 off, u32 v, u32 mask, u32 *val)
1252{
1253 assert(off < 4);
1254 assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
1255 *val = (v >> (off * 8)) & mask;
1256}
1257
1258/* PCI can do 1, 2 and 4 byte writes; we handle that here. */
1259static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
1260{
1261 assert(off < 4);
1262 assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
1263 *dst &= ~(mask << (off * 8));
1264 *dst |= (v & mask) << (off * 8);
1265}
1266
1267/*
1268 * Where PCI_CONFIG_DATA accesses depends on the previous write to
1269 * PCI_CONFIG_ADDR.
1270 */
1271static struct device *dev_and_reg(u32 *reg)
1272{
1273 if (!pci_config_addr.bits.enabled)
1274 return NULL;
1275
1276 if (pci_config_addr.bits.funcnum != 0)
1277 return NULL;
1278
1279 if (pci_config_addr.bits.busnum != 0)
1280 return NULL;
1281
1282 if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
1283 return NULL;
1284
1285 *reg = pci_config_addr.bits.offset;
1286 return find_pci_device(pci_config_addr.bits.devnum);
1287}
1288
1289/*
1290 * We can get invalid combinations of values while they're writing, so we
1291 * only fault if they try to write with some invalid bar/offset/length.
1292 */
1293static bool valid_bar_access(struct device *d,
1294 struct virtio_pci_cfg_cap *cfg_access)
1295{
1296 /* We only have 1 bar (BAR0) */
1297 if (cfg_access->cap.bar != 0)
1298 return false;
1299
1300 /* Check it's within BAR0. */
1301 if (cfg_access->cap.offset >= d->mmio_size
1302 || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
1303 return false;
1304
1305 /* Check length is 1, 2 or 4. */
1306 if (cfg_access->cap.length != 1
1307 && cfg_access->cap.length != 2
1308 && cfg_access->cap.length != 4)
1309 return false;
1310
1311 /*
1312 * 4.1.4.7.2:
1313 *
1314 * The driver MUST NOT write a cap.offset which is not a multiple of
1315 * cap.length (ie. all accesses MUST be aligned).
1316 */
1317 if (cfg_access->cap.offset % cfg_access->cap.length != 0)
1318 return false;
1319
1320 /* All checks passed: the access is valid. */
1321 return true;
1322}
1323
1324/* Is this accessing the PCI config address port? */
1325static bool is_pci_addr_port(u16 port)
1326{
1327 return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
1328}
1329
1330static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
1331{
1332 iowrite(port - PCI_CONFIG_ADDR, val, mask,
1333 &pci_config_addr.val);
1334 verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
1335 pci_config_addr.bits.enabled ? "" : " DISABLED",
1336 val, mask,
1337 pci_config_addr.bits.busnum,
1338 pci_config_addr.bits.devnum,
1339 pci_config_addr.bits.funcnum,
1340 pci_config_addr.bits.offset);
1341 return true;
1342}
1343
1344static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
1345{
1346 ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
1347}
1348
1349/* Is this accessing the PCI config data port? */
1350static bool is_pci_data_port(u16 port)
1351{
1352 return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
1353}
1354
1355static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);
1356
1357static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
1358{
1359 u32 reg, portoff;
1360 struct device *d = dev_and_reg(&reg);
1361
1362 /* Complain if they don't belong to a device. */
1363 if (!d)
1364 return false;
1365
1366 /* They can do 1 byte writes, etc. */
1367 portoff = port - PCI_CONFIG_DATA;
1368
1369 /*
1370 * PCI uses a weird way to determine the BAR size: the OS
1371 * writes all 1's, and sees which ones stick.
1372 */
1373 if (&d->config_words[reg] == &d->config.bar[0]) {
1374 int i;
1375
1376 iowrite(portoff, val, mask, &d->config.bar[0]);
1377 for (i = 0; (1 << i) < d->mmio_size; i++)
1378 d->config.bar[0] &= ~(1 << i);
1379 return true;
1380 } else if ((&d->config_words[reg] > &d->config.bar[0]
1381 && &d->config_words[reg] <= &d->config.bar[6])
1382 || &d->config_words[reg] == &d->config.expansion_rom_addr) {
1383 /* Allow writing to any other BAR, or expansion ROM */
1384 iowrite(portoff, val, mask, &d->config_words[reg]);
1385 return true;
1386 /* We let them override latency timer and cacheline size */
1387 } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
1388 /* Only let them change the first two fields. */
1389 if (mask == 0xFFFFFFFF)
1390 mask = 0xFFFF;
1391 iowrite(portoff, val, mask, &d->config_words[reg]);
1392 return true;
1393 } else if (&d->config_words[reg] == (void *)&d->config.command
1394 && mask == 0xFFFF) {
1395 /* Ignore command writes. */
1396 return true;
1397 } else if (&d->config_words[reg]
1398 == (void *)&d->config.cfg_access.cap.bar
1399 || &d->config_words[reg]
1400 == &d->config.cfg_access.cap.length
1401 || &d->config_words[reg]
1402 == &d->config.cfg_access.cap.offset) {
1403
1404 /*
1405 * The VIRTIO_PCI_CAP_PCI_CFG capability
1406 * provides a backdoor to access the MMIO
1407 * regions without mapping them. Weird, but
1408 * useful.
1409 */
1410 iowrite(portoff, val, mask, &d->config_words[reg]);
1411 return true;
1412 } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
1413 u32 write_mask;
1414
1415 /*
1416 * 4.1.4.7.1:
1417 *
1418 * Upon detecting driver write access to pci_cfg_data, the
1419 * device MUST execute a write access at offset cap.offset at
1420 * BAR selected by cap.bar using the first cap.length bytes
1421 * from pci_cfg_data.
1422 */
1423
1424 /* Must be bar 0 */
1425 if (!valid_bar_access(d, &d->config.cfg_access))
1426 return false;
1427
1428 iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);
1429
1430 /*
1431 * Now emulate a write. The mask we use is set by
1432 * len, *not* this write!
1433 */
1434 write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
1435 verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
1436 d->config.cfg_access.pci_cfg_data, write_mask,
1437 d->config.cfg_access.cap.bar,
1438 d->config.cfg_access.cap.offset,
1439 d->config.cfg_access.cap.length);
1440
1441 emulate_mmio_write(d, d->config.cfg_access.cap.offset,
1442 d->config.cfg_access.pci_cfg_data,
1443 write_mask);
1444 return true;
1445 }
1446
1447 /*
1448 * 4.1.4.1:
1449 *
1450 * The driver MUST NOT write into any field of the capability
1451 * structure, with the exception of those with cap_type
1452 * VIRTIO_PCI_CAP_PCI_CFG...
1453 */
1454 return false;
1455}
1456
1457static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
1458
1459static void pci_data_ioread(u16 port, u32 mask, u32 *val)
1460{
1461 u32 reg;
1462 struct device *d = dev_and_reg(&reg);
1463
1464 if (!d)
1465 return;
1466
1467 /* Read through the PCI MMIO access window is special */
1468 if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
1469 u32 read_mask;
1470
1471 /*
1472 * 4.1.4.7.1:
1473 *
1474 * Upon detecting driver read access to pci_cfg_data, the
1475 * device MUST execute a read access of length cap.length at
1476 * offset cap.offset at BAR selected by cap.bar and store the
1477 * first cap.length bytes in pci_cfg_data.
1478 */
1479 /* Must be bar 0 */
1480 if (!valid_bar_access(d, &d->config.cfg_access))
1481 bad_driver(d,
1482 "Invalid cfg_access to bar%u, offset %u len %u",
1483 d->config.cfg_access.cap.bar,
1484 d->config.cfg_access.cap.offset,
1485 d->config.cfg_access.cap.length);
1486
1487 /*
1488 * Read into the window. The mask we use is set by
1489 * len, *not* this read!
1490 */
1491 read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
1492 d->config.cfg_access.pci_cfg_data
1493 = emulate_mmio_read(d,
1494 d->config.cfg_access.cap.offset,
1495 read_mask);
1496 verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
1497 d->config.cfg_access.pci_cfg_data, read_mask,
1498 d->config.cfg_access.cap.bar,
1499 d->config.cfg_access.cap.offset,
1500 d->config.cfg_access.cap.length);
1501 }
1502 ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
1503}
1504
1018/*L:216 1505/*L:216
1019 * This actually creates the thread which services the virtqueue for a device. 1506 * This is where we emulate a handful of Guest instructions. It's ugly
1507 * and we used to do it in the kernel but it grew over time.
1508 */
1509
1510/*
1511 * We use the ptrace syscall's pt_regs struct to talk about registers
1512 * to lguest: these macros convert the names to the offsets.
1513 */
1514#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
1515#define setreg(name, val) \
1516 setreg_off(offsetof(struct user_regs_struct, name), (val))
1517
1518static u32 getreg_off(size_t offset)
1519{
1520 u32 r;
1521 unsigned long args[] = { LHREQ_GETREG, offset };
1522
1523 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1524 err(1, "Getting register %u", offset);
1525 if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
1526 err(1, "Reading register %u", offset);
1527
1528 return r;
1529}
1530
1531static void setreg_off(size_t offset, u32 val)
1532{
1533 unsigned long args[] = { LHREQ_SETREG, offset, val };
1534
1535 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1536 err(1, "Setting register %u", offset);
1537}
1538
1539/* Get register by instruction encoding */
1540static u32 getreg_num(unsigned regnum, u32 mask)
1541{
1542 /* 8 bit ops use regnums 4-7 for high parts of word */
1543 if (mask == 0xFF && (regnum & 0x4))
1544 return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
1545
1546 switch (regnum) {
1547 case 0: return getreg(eax) & mask;
1548 case 1: return getreg(ecx) & mask;
1549 case 2: return getreg(edx) & mask;
1550 case 3: return getreg(ebx) & mask;
1551 case 4: return getreg(esp) & mask;
1552 case 5: return getreg(ebp) & mask;
1553 case 6: return getreg(esi) & mask;
1554 case 7: return getreg(edi) & mask;
1555 }
1556 abort();
1557}
1558
1559/* Set register by instruction encoding */
1560static void setreg_num(unsigned regnum, u32 val, u32 mask)
1561{
1562 /* Don't try to set bits out of range */
1563 assert(~(val & ~mask));
1564
1565 /* 8 bit ops use regnums 4-7 for high parts of word */
1566 if (mask == 0xFF && (regnum & 0x4)) {
1567 /* Construct the 16 bits we want. */
1568 val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
1569 setreg_num(regnum & 0x3, val, 0xFFFF);
1570 return;
1571 }
1572
1573 switch (regnum) {
1574 case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
1575 case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
1576 case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
1577 case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
1578 case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
1579 case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
1580 case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
1581 case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
1582 }
1583 abort();
1584}
1585
1586/* Get bytes of displacement appended to instruction, from r/m encoding */
1587static u32 insn_displacement_len(u8 mod_reg_rm)
1588{
1589 /* Switch on the mod bits */
1590 switch (mod_reg_rm >> 6) {
1591 case 0:
1592 /* If mod == 0, and r/m == 101, 16-bit displacement follows */
1593 if ((mod_reg_rm & 0x7) == 0x5)
1594 return 2;
1595 /* Normally, mod == 0 means no literal displacement */
1596 return 0;
1597 case 1:
1598 /* One byte displacement */
1599 return 1;
1600 case 2:
1601 /* Four byte displacement */
1602 return 4;
1603 case 3:
1604 /* Register mode */
1605 return 0;
1606 }
1607 abort();
1608}
1609
1610static void emulate_insn(const u8 insn[])
1611{
1612 unsigned long args[] = { LHREQ_TRAP, 13 };
1613 unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
1614 unsigned int eax, port, mask;
1615 /*
1616 * Default is to return all-ones on IO port reads, which traditionally
1617 * means "there's nothing there".
1618 */
1619 u32 val = 0xFFFFFFFF;
1620
1621 /*
1622 * This must be the Guest kernel trying to do something, not userspace!
1623 * The bottom two bits of the CS segment register are the privilege
1624 * level.
1625 */
1626 if ((getreg(xcs) & 3) != 0x1)
1627 goto no_emulate;
1628
1629 /* Decoding x86 instructions is icky. */
1630
1631 /*
1632 * Around 2.6.33, the kernel started using an emulation for the
1633 * cmpxchg8b instruction in early boot on many configurations. This
1634 * code isn't paravirtualized, and it tries to disable interrupts.
1635 * Ignore it, which will Mostly Work.
1636 */
1637 if (insn[insnlen] == 0xfa) {
1638 /* "cli", or Clear Interrupt Enable instruction. Skip it. */
1639 insnlen = 1;
1640 goto skip_insn;
1641 }
1642
1643 /*
1644 * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
1645 */
1646 if (insn[insnlen] == 0x66) {
1647 small_operand = 1;
1648 /* The instruction is 1 byte so far, read the next byte. */
1649 insnlen = 1;
1650 }
1651
1652 /* If the lower bit isn't set, it's a single byte access */
1653 byte_access = !(insn[insnlen] & 1);
1654
1655 /*
1656 * Now we can ignore the lower bit and decode the 4 opcodes
1657 * we need to emulate.
1658 */
1659 switch (insn[insnlen] & 0xFE) {
1660 case 0xE4: /* in <next byte>,%al */
1661 port = insn[insnlen+1];
1662 insnlen += 2;
1663 in = 1;
1664 break;
1665 case 0xEC: /* in (%dx),%al */
1666 port = getreg(edx) & 0xFFFF;
1667 insnlen += 1;
1668 in = 1;
1669 break;
1670 case 0xE6: /* out %al,<next byte> */
1671 port = insn[insnlen+1];
1672 insnlen += 2;
1673 break;
1674 case 0xEE: /* out %al,(%dx) */
1675 port = getreg(edx) & 0xFFFF;
1676 insnlen += 1;
1677 break;
1678 default:
1679 /* OK, we don't know what this is, can't emulate. */
1680 goto no_emulate;
1681 }
1682
1683 /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
1684 if (byte_access)
1685 mask = 0xFF;
1686 else if (small_operand)
1687 mask = 0xFFFF;
1688 else
1689 mask = 0xFFFFFFFF;
1690
1691 /*
1692 * If it was an "IN" instruction, they expect the result to be read
1693 * into %eax, so we change %eax.
1694 */
1695 eax = getreg(eax);
1696
1697 if (in) {
1698 /* This is the PS/2 keyboard status; 1 means ready for output */
1699 if (port == 0x64)
1700 val = 1;
1701 else if (is_pci_addr_port(port))
1702 pci_addr_ioread(port, mask, &val);
1703 else if (is_pci_data_port(port))
1704 pci_data_ioread(port, mask, &val);
1705
1706 /* Clear the bits we're about to read */
1707 eax &= ~mask;
1708 /* Copy bits in from val. */
1709 eax |= val & mask;
1710 /* Now update the register. */
1711 setreg(eax, eax);
1712 } else {
1713 if (is_pci_addr_port(port)) {
1714 if (!pci_addr_iowrite(port, mask, eax))
1715 goto bad_io;
1716 } else if (is_pci_data_port(port)) {
1717 if (!pci_data_iowrite(port, mask, eax))
1718 goto bad_io;
1719 }
1720 /* There are many other ports, eg. CMOS clock, serial
1721 * and parallel ports, so we ignore them all. */
1722 }
1723
1724 verbose("IO %s of %x to %u: %#08x\n",
1725 in ? "IN" : "OUT", mask, port, eax);
1726skip_insn:
1727 /* Finally, we've "done" the instruction, so move past it. */
1728 setreg(eip, getreg(eip) + insnlen);
1729 return;
1730
1731bad_io:
1732 warnx("Attempt to %s port %u (%#x mask)",
1733 in ? "read from" : "write to", port, mask);
1734
1735no_emulate:
1736 /* Inject trap into Guest. */
1737 if (write(lguest_fd, args, sizeof(args)) < 0)
1738 err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
1739}
1740
1741static struct device *find_mmio_region(unsigned long paddr, u32 *off)
1742{
1743 unsigned int i;
1744
1745 for (i = 1; i < MAX_PCI_DEVICES; i++) {
1746 struct device *d = devices.pci[i];
1747
1748 if (!d)
1749 continue;
1750 if (paddr < d->mmio_addr)
1751 continue;
1752 if (paddr >= d->mmio_addr + d->mmio_size)
1753 continue;
1754 *off = paddr - d->mmio_addr;
1755 return d;
1756 }
1757 return NULL;
1758}
1759
1760/* FIXME: Use vq array. */
1761static struct virtqueue *vq_by_num(struct device *d, u32 num)
1762{
1763 struct virtqueue *vq = d->vq;
1764
1765 while (num-- && vq)
1766 vq = vq->next;
1767
1768 return vq;
1769}
1770
1771static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
1772 struct virtqueue *vq)
1773{
1774 vq->pci_config = *cfg;
1775}
1776
1777static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
1778 struct virtqueue *vq)
1779{
1780 /* Only restore the per-vq part */
1781 size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
1782
1783 memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
1784 sizeof(*cfg) - off);
1785}
1786
1787/*
1788 * 4.1.4.3.2:
1789 *
1790 * The driver MUST configure the other virtqueue fields before
1791 * enabling the virtqueue with queue_enable.
1792 *
1793 * When they enable the virtqueue, we check that their setup is valid.
1020 */ 1794 */
1021static void create_thread(struct virtqueue *vq) 1795static void check_virtqueue(struct device *d, struct virtqueue *vq)
1796{
1797 /* Because lguest is 32 bit, all the descriptor high bits must be 0 */
1798 if (vq->pci_config.queue_desc_hi
1799 || vq->pci_config.queue_avail_hi
1800 || vq->pci_config.queue_used_hi)
1801 bad_driver_vq(vq, "invalid 64-bit queue address");
1802
1803 /*
1804 * 2.4.1:
1805 *
1806 * The driver MUST ensure that the physical address of the first byte
1807 * of each virtqueue part is a multiple of the specified alignment
1808 * value in the above table.
1809 */
1810 if (vq->pci_config.queue_desc_lo % 16
1811 || vq->pci_config.queue_avail_lo % 2
1812 || vq->pci_config.queue_used_lo % 4)
1813 bad_driver_vq(vq, "invalid alignment in queue addresses");
1814
1815 /* Initialize the virtqueue and check they're all in range. */
1816 vq->vring.num = vq->pci_config.queue_size;
1817 vq->vring.desc = check_pointer(vq->dev,
1818 vq->pci_config.queue_desc_lo,
1819 sizeof(*vq->vring.desc) * vq->vring.num);
1820 vq->vring.avail = check_pointer(vq->dev,
1821 vq->pci_config.queue_avail_lo,
1822 sizeof(*vq->vring.avail)
1823 + (sizeof(vq->vring.avail->ring[0])
1824 * vq->vring.num));
1825 vq->vring.used = check_pointer(vq->dev,
1826 vq->pci_config.queue_used_lo,
1827 sizeof(*vq->vring.used)
1828 + (sizeof(vq->vring.used->ring[0])
1829 * vq->vring.num));
1830
1831 /*
1832 * 2.4.9.1:
1833 *
1834 * The driver MUST initialize flags in the used ring to 0
1835 * when allocating the used ring.
1836 */
1837 if (vq->vring.used->flags != 0)
1838 bad_driver_vq(vq, "invalid initial used.flags %#x",
1839 vq->vring.used->flags);
1840}
1841
1842static void start_virtqueue(struct virtqueue *vq)
1022{ 1843{
1023 /* 1844 /*
1024 * Create stack for thread. Since the stack grows downwards, we point 1845 * Create stack for thread. Since the stack grows downwards, we point
1025 * the stack pointer to the end of this region. 1846 * the stack pointer to the end of this region.
1026 */ 1847 */
1027 char *stack = malloc(32768); 1848 char *stack = malloc(32768);
1028 unsigned long args[] = { LHREQ_EVENTFD,
1029 vq->config.pfn*getpagesize(), 0 };
1030 1849
1031 /* Create a zero-initialized eventfd. */ 1850 /* Create a zero-initialized eventfd. */
1032 vq->eventfd = eventfd(0, 0); 1851 vq->eventfd = eventfd(0, 0);
1033 if (vq->eventfd < 0) 1852 if (vq->eventfd < 0)
1034 err(1, "Creating eventfd"); 1853 err(1, "Creating eventfd");
1035 args[2] = vq->eventfd;
1036
1037 /*
1038 * Attach an eventfd to this virtqueue: it will go off when the Guest
1039 * does an LHCALL_NOTIFY for this vq.
1040 */
1041 if (write(lguest_fd, &args, sizeof(args)) != 0)
1042 err(1, "Attaching eventfd");
1043 1854
1044 /* 1855 /*
1045 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so 1856 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
@@ -1048,167 +1859,531 @@ static void create_thread(struct virtqueue *vq)
1048 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1859 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
1049 if (vq->thread == (pid_t)-1) 1860 if (vq->thread == (pid_t)-1)
1050 err(1, "Creating clone"); 1861 err(1, "Creating clone");
1051
1052 /* We close our local copy now the child has it. */
1053 close(vq->eventfd);
1054} 1862}
1055 1863
1056static void start_device(struct device *dev) 1864static void start_virtqueues(struct device *d)
1057{ 1865{
1058 unsigned int i;
1059 struct virtqueue *vq; 1866 struct virtqueue *vq;
1060 1867
1061 verbose("Device %s OK: offered", dev->name); 1868 for (vq = d->vq; vq; vq = vq->next) {
1062 for (i = 0; i < dev->feature_len; i++) 1869 if (vq->pci_config.queue_enable)
1063 verbose(" %02x", get_feature_bits(dev)[i]); 1870 start_virtqueue(vq);
1064 verbose(", accepted");
1065 for (i = 0; i < dev->feature_len; i++)
1066 verbose(" %02x", get_feature_bits(dev)
1067 [dev->feature_len+i]);
1068
1069 for (vq = dev->vq; vq; vq = vq->next) {
1070 if (vq->service)
1071 create_thread(vq);
1072 } 1871 }
1073 dev->running = true;
1074} 1872}
1075 1873
1076static void cleanup_devices(void) 1874static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
1077{ 1875{
1078 struct device *dev; 1876 struct virtqueue *vq;
1079 1877
1080 for (dev = devices.dev; dev; dev = dev->next) 1878 switch (off) {
1081 reset_device(dev); 1879 case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
1880 /*
1881 * 4.1.4.3.1:
1882 *
1883 * The device MUST present the feature bits it is offering in
1884 * device_feature, starting at bit device_feature_select ∗ 32
1885 * for any device_feature_select written by the driver
1886 */
1887 if (val == 0)
1888 d->mmio->cfg.device_feature = d->features;
1889 else if (val == 1)
1890 d->mmio->cfg.device_feature = (d->features >> 32);
1891 else
1892 d->mmio->cfg.device_feature = 0;
1893 goto feature_write_through32;
1894 case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
1895 if (val > 1)
1896 bad_driver(d, "Unexpected driver select %u", val);
1897 goto feature_write_through32;
1898 case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
1899 if (d->mmio->cfg.guest_feature_select == 0) {
1900 d->features_accepted &= ~((u64)0xFFFFFFFF);
1901 d->features_accepted |= val;
1902 } else {
1903 assert(d->mmio->cfg.guest_feature_select == 1);
1904 d->features_accepted &= 0xFFFFFFFF;
1905 d->features_accepted |= ((u64)val) << 32;
1906 }
1907 /*
1908 * 2.2.1:
1909 *
1910 * The driver MUST NOT accept a feature which the device did
1911 * not offer
1912 */
1913 if (d->features_accepted & ~d->features)
1914 bad_driver(d, "over-accepted features %#llx of %#llx",
1915 d->features_accepted, d->features);
1916 goto feature_write_through32;
1917 case offsetof(struct virtio_pci_mmio, cfg.device_status): {
1918 u8 prev;
1919
1920 verbose("%s: device status -> %#x\n", d->name, val);
1921 /*
1922 * 4.1.4.3.1:
1923 *
1924 * The device MUST reset when 0 is written to device_status,
1925 * and present a 0 in device_status once that is done.
1926 */
1927 if (val == 0) {
1928 reset_device(d);
1929 goto write_through8;
1930 }
1082 1931
1083 /* If we saved off the original terminal settings, restore them now. */ 1932 /* 2.1.1: The driver MUST NOT clear a device status bit. */
1084 if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) 1933 if (d->mmio->cfg.device_status & ~val)
1085 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); 1934 bad_driver(d, "unset of device status bit %#x -> %#x",
1086} 1935 d->mmio->cfg.device_status, val);
1087 1936
1088/* When the Guest tells us they updated the status field, we handle it. */ 1937 /*
1089static void update_device_status(struct device *dev) 1938 * 2.1.2:
1090{ 1939 *
1091 /* A zero status is a reset, otherwise it's a set of flags. */ 1940 * The device MUST NOT consume buffers or notify the driver
1092 if (dev->desc->status == 0) 1941 * before DRIVER_OK.
1093 reset_device(dev); 1942 */
1094 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 1943 if (val & VIRTIO_CONFIG_S_DRIVER_OK
1095 warnx("Device %s configuration FAILED", dev->name); 1944 && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
1096 if (dev->running) 1945 start_virtqueues(d);
1097 reset_device(dev); 1946
1098 } else { 1947 /*
1099 if (dev->running) 1948 * 3.1.1:
1100 err(1, "Device %s features finalized twice", dev->name); 1949 *
1101 start_device(dev); 1950 * The driver MUST follow this sequence to initialize a device:
1951 * - Reset the device.
1952 * - Set the ACKNOWLEDGE status bit: the guest OS has
1953 * notice the device.
1954 * - Set the DRIVER status bit: the guest OS knows how
1955 * to drive the device.
1956 * - Read device feature bits, and write the subset
1957 * of feature bits understood by the OS and driver
1958 * to the device. During this step the driver MAY
1959 * read (but MUST NOT write) the device-specific
1960 * configuration fields to check that it can
1961 * support the device before accepting it.
1962 * - Set the FEATURES_OK status bit. The driver
1963 * MUST not accept new feature bits after this
1964 * step.
1965 * - Re-read device status to ensure the FEATURES_OK
1966 * bit is still set: otherwise, the device does
1967 * not support our subset of features and the
1968 * device is unusable.
1969 * - Perform device-specific setup, including
1970 * discovery of virtqueues for the device,
1971 * optional per-bus setup, reading and possibly
1972 * writing the device’s virtio configuration
1973 * space, and population of virtqueues.
1974 * - Set the DRIVER_OK status bit. At this point the
1975 * device is “live”.
1976 */
1977 prev = 0;
1978 switch (val & ~d->mmio->cfg.device_status) {
1979 case VIRTIO_CONFIG_S_DRIVER_OK:
1980 prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
1981 case VIRTIO_CONFIG_S_FEATURES_OK:
1982 prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
1983 case VIRTIO_CONFIG_S_DRIVER:
1984 prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
1985 case VIRTIO_CONFIG_S_ACKNOWLEDGE:
1986 break;
1987 default:
1988 bad_driver(d, "unknown device status bit %#x -> %#x",
1989 d->mmio->cfg.device_status, val);
1990 }
1991 if (d->mmio->cfg.device_status != prev)
1992 bad_driver(d, "unexpected status transition %#x -> %#x",
1993 d->mmio->cfg.device_status, val);
1994
1995 /* If they just wrote FEATURES_OK, we make sure they read */
1996 switch (val & ~d->mmio->cfg.device_status) {
1997 case VIRTIO_CONFIG_S_FEATURES_OK:
1998 d->wrote_features_ok = true;
1999 break;
2000 case VIRTIO_CONFIG_S_DRIVER_OK:
2001 if (d->wrote_features_ok)
2002 bad_driver(d, "did not re-read FEATURES_OK");
2003 break;
2004 }
2005 goto write_through8;
1102 } 2006 }
1103} 2007 case offsetof(struct virtio_pci_mmio, cfg.queue_select):
2008 vq = vq_by_num(d, val);
2009 /*
2010 * 4.1.4.3.1:
2011 *
2012 * The device MUST present a 0 in queue_size if the virtqueue
2013 * corresponding to the current queue_select is unavailable.
2014 */
2015 if (!vq) {
2016 d->mmio->cfg.queue_size = 0;
2017 goto write_through16;
2018 }
2019 /* Save registers for old vq, if it was a valid vq */
2020 if (d->mmio->cfg.queue_size)
2021 save_vq_config(&d->mmio->cfg,
2022 vq_by_num(d, d->mmio->cfg.queue_select));
2023 /* Restore the registers for the queue they asked for */
2024 restore_vq_config(&d->mmio->cfg, vq);
2025 goto write_through16;
2026 case offsetof(struct virtio_pci_mmio, cfg.queue_size):
2027 /*
2028 * 4.1.4.3.2:
2029 *
2030 * The driver MUST NOT write a value which is not a power of 2
2031 * to queue_size.
2032 */
2033 if (val & (val-1))
2034 bad_driver(d, "invalid queue size %u", val);
2035 if (d->mmio->cfg.queue_enable)
2036 bad_driver(d, "changing queue size on live device");
2037 goto write_through16;
2038 case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
2039 bad_driver(d, "attempt to set MSIX vector to %u", val);
2040 case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
2041 struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
1104 2042
1105/*L:215 2043 /*
1106 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In 2044 * 4.1.4.3.2:
1107 * particular, it's used to notify us of device status changes during boot. 2045 *
1108 */ 2046 * The driver MUST NOT write a 0 to queue_enable.
1109static void handle_output(unsigned long addr) 2047 */
1110{ 2048 if (val != 1)
1111 struct device *i; 2049 bad_driver(d, "setting queue_enable to %u", val);
1112 2050
1113 /* Check each device. */ 2051 /*
1114 for (i = devices.dev; i; i = i->next) { 2052 * 3.1.1:
1115 struct virtqueue *vq; 2053 *
2054 * 7. Perform device-specific setup, including discovery of
2055 * virtqueues for the device, optional per-bus setup,
2056 * reading and possibly writing the device’s virtio
2057 * configuration space, and population of virtqueues.
2058 * 8. Set the DRIVER_OK status bit.
2059 *
2060 * All our devices require all virtqueues to be enabled, so
2061 * they should have done that before setting DRIVER_OK.
2062 */
2063 if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
2064 bad_driver(d, "enabling vq after DRIVER_OK");
1116 2065
2066 d->mmio->cfg.queue_enable = val;
2067 save_vq_config(&d->mmio->cfg, vq);
2068 check_virtqueue(d, vq);
2069 goto write_through16;
2070 }
2071 case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
2072 bad_driver(d, "attempt to write to queue_notify_off");
2073 case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
2074 case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
2075 case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
2076 case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
2077 case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
2078 case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
1117 /* 2079 /*
1118 * Notifications to device descriptors mean they updated the 2080 * 4.1.4.3.2:
1119 * device status. 2081 *
2082 * The driver MUST configure the other virtqueue fields before
2083 * enabling the virtqueue with queue_enable.
1120 */ 2084 */
1121 if (from_guest_phys(addr) == i->desc) { 2085 if (d->mmio->cfg.queue_enable)
1122 update_device_status(i); 2086 bad_driver(d, "changing queue on live device");
1123 return; 2087
1124 } 2088 /*
2089 * 3.1.1:
2090 *
2091 * The driver MUST follow this sequence to initialize a device:
2092 *...
2093 * 5. Set the FEATURES_OK status bit. The driver MUST not
2094 * accept new feature bits after this step.
2095 */
2096 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
2097 bad_driver(d, "setting up vq before FEATURES_OK");
1125 2098
1126 /* Devices should not be used before features are finalized. */ 2099 /*
1127 for (vq = i->vq; vq; vq = vq->next) { 2100 * 6. Re-read device status to ensure the FEATURES_OK bit is
1128 if (addr != vq->config.pfn*getpagesize()) 2101 * still set...
1129 continue; 2102 */
1130 errx(1, "Notification on %s before setup!", i->name); 2103 if (d->wrote_features_ok)
2104 bad_driver(d, "didn't re-read FEATURES_OK before setup");
2105
2106 goto write_through32;
2107 case offsetof(struct virtio_pci_mmio, notify):
2108 vq = vq_by_num(d, val);
2109 if (!vq)
2110 bad_driver(d, "Invalid vq notification on %u", val);
2111 /* Notify the process handling this vq by adding 1 to eventfd */
2112 write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
2113 goto write_through16;
2114 case offsetof(struct virtio_pci_mmio, isr):
2115 bad_driver(d, "Unexpected write to isr");
2116 /* Weird corner case: write to emerg_wr of console */
2117 case sizeof(struct virtio_pci_mmio)
2118 + offsetof(struct virtio_console_config, emerg_wr):
2119 if (strcmp(d->name, "console") == 0) {
2120 char c = val;
2121 write(STDOUT_FILENO, &c, 1);
2122 goto write_through32;
1131 } 2123 }
2124 /* Fall through... */
2125 default:
2126 /*
2127 * 4.1.4.3.2:
2128 *
2129 * The driver MUST NOT write to device_feature, num_queues,
2130 * config_generation or queue_notify_off.
2131 */
2132 bad_driver(d, "Unexpected write to offset %u", off);
1132 } 2133 }
1133 2134
2135feature_write_through32:
1134 /* 2136 /*
1135 * Early console write is done using notify on a nul-terminated string 2137 * 3.1.1:
1136 * in Guest memory. It's also great for hacking debugging messages 2138 *
1137 * into a Guest. 2139 * The driver MUST follow this sequence to initialize a device:
2140 *...
2141 * - Set the DRIVER status bit: the guest OS knows how
2142 * to drive the device.
2143 * - Read device feature bits, and write the subset
2144 * of feature bits understood by the OS and driver
2145 * to the device.
2146 *...
2147 * - Set the FEATURES_OK status bit. The driver MUST not
2148 * accept new feature bits after this step.
1138 */ 2149 */
1139 if (addr >= guest_limit) 2150 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
1140 errx(1, "Bad NOTIFY %#lx", addr); 2151 bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
2152 if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
2153 bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
1141 2154
1142 write(STDOUT_FILENO, from_guest_phys(addr), 2155 /*
1143 strnlen(from_guest_phys(addr), guest_limit - addr)); 2156 * 4.1.3.1:
2157 *
2158 * The driver MUST access each field using the “natural” access
2159 * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
2160 * 16-bit fields and 8-bit accesses for 8-bit fields.
2161 */
2162write_through32:
2163 if (mask != 0xFFFFFFFF) {
2164 bad_driver(d, "non-32-bit write to offset %u (%#x)",
2165 off, getreg(eip));
2166 return;
2167 }
2168 memcpy((char *)d->mmio + off, &val, 4);
2169 return;
2170
2171write_through16:
2172 if (mask != 0xFFFF)
2173 bad_driver(d, "non-16-bit write to offset %u (%#x)",
2174 off, getreg(eip));
2175 memcpy((char *)d->mmio + off, &val, 2);
2176 return;
2177
2178write_through8:
2179 if (mask != 0xFF)
2180 bad_driver(d, "non-8-bit write to offset %u (%#x)",
2181 off, getreg(eip));
2182 memcpy((char *)d->mmio + off, &val, 1);
2183 return;
1144} 2184}
1145 2185
1146/*L:190 2186static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
1147 * Device Setup
1148 *
1149 * All devices need a descriptor so the Guest knows it exists, and a "struct
1150 * device" so the Launcher can keep track of it. We have common helper
1151 * routines to allocate and manage them.
1152 */
1153
1154/*
1155 * The layout of the device page is a "struct lguest_device_desc" followed by a
1156 * number of virtqueue descriptors, then two sets of feature bits, then an
1157 * array of configuration bytes. This routine returns the configuration
1158 * pointer.
1159 */
1160static u8 *device_config(const struct device *dev)
1161{ 2187{
1162 return (void *)(dev->desc + 1) 2188 u8 isr;
1163 + dev->num_vq * sizeof(struct lguest_vqconfig) 2189 u32 val = 0;
1164 + dev->feature_len * 2; 2190
2191 switch (off) {
2192 case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
2193 case offsetof(struct virtio_pci_mmio, cfg.device_feature):
2194 case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
2195 case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
2196 /*
2197 * 3.1.1:
2198 *
2199 * The driver MUST follow this sequence to initialize a device:
2200 *...
2201 * - Set the DRIVER status bit: the guest OS knows how
2202 * to drive the device.
2203 * - Read device feature bits, and write the subset
2204 * of feature bits understood by the OS and driver
2205 * to the device.
2206 */
2207 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
2208 bad_driver(d,
2209 "feature read before VIRTIO_CONFIG_S_DRIVER");
2210 goto read_through32;
2211 case offsetof(struct virtio_pci_mmio, cfg.msix_config):
2212 bad_driver(d, "read of msix_config");
2213 case offsetof(struct virtio_pci_mmio, cfg.num_queues):
2214 goto read_through16;
2215 case offsetof(struct virtio_pci_mmio, cfg.device_status):
2216 /* As they did read, any write of FEATURES_OK is now fine. */
2217 d->wrote_features_ok = false;
2218 goto read_through8;
2219 case offsetof(struct virtio_pci_mmio, cfg.config_generation):
2220 /*
2221 * 4.1.4.3.1:
2222 *
2223 * The device MUST present a changed config_generation after
2224 * the driver has read a device-specific configuration value
2225 * which has changed since any part of the device-specific
2226 * configuration was last read.
2227 *
2228 * This is simple: none of our devices change config, so this
2229 * is always 0.
2230 */
2231 goto read_through8;
2232 case offsetof(struct virtio_pci_mmio, notify):
2233 /*
2234 * 3.1.1:
2235 *
2236 * The driver MUST NOT notify the device before setting
2237 * DRIVER_OK.
2238 */
2239 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
2240 bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
2241 goto read_through16;
2242 case offsetof(struct virtio_pci_mmio, isr):
2243 if (mask != 0xFF)
2244 bad_driver(d, "non-8-bit read from offset %u (%#x)",
2245 off, getreg(eip));
2246 isr = d->mmio->isr;
2247 /*
2248 * 4.1.4.5.1:
2249 *
2250 * The device MUST reset ISR status to 0 on driver read.
2251 */
2252 d->mmio->isr = 0;
2253 return isr;
2254 case offsetof(struct virtio_pci_mmio, padding):
2255 bad_driver(d, "read from padding (%#x)", getreg(eip));
2256 default:
2257 /* Read from device config space, beware unaligned overflow */
2258 if (off > d->mmio_size - 4)
2259 bad_driver(d, "read past end (%#x)", getreg(eip));
2260
2261 /*
2262 * 3.1.1:
2263 * The driver MUST follow this sequence to initialize a device:
2264 *...
2265 * 3. Set the DRIVER status bit: the guest OS knows how to
2266 * drive the device.
2267 * 4. Read device feature bits, and write the subset of
2268 * feature bits understood by the OS and driver to the
2269 * device. During this step the driver MAY read (but MUST NOT
2270 * write) the device-specific configuration fields to check
2271 * that it can support the device before accepting it.
2272 */
2273 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
2274 bad_driver(d,
2275 "config read before VIRTIO_CONFIG_S_DRIVER");
2276
2277 if (mask == 0xFFFFFFFF)
2278 goto read_through32;
2279 else if (mask == 0xFFFF)
2280 goto read_through16;
2281 else
2282 goto read_through8;
2283 }
2284
2285 /*
2286 * 4.1.3.1:
2287 *
2288 * The driver MUST access each field using the “natural” access
2289 * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
2290 * 16-bit fields and 8-bit accesses for 8-bit fields.
2291 */
2292read_through32:
2293 if (mask != 0xFFFFFFFF)
2294 bad_driver(d, "non-32-bit read to offset %u (%#x)",
2295 off, getreg(eip));
2296 memcpy(&val, (char *)d->mmio + off, 4);
2297 return val;
2298
2299read_through16:
2300 if (mask != 0xFFFF)
2301 bad_driver(d, "non-16-bit read to offset %u (%#x)",
2302 off, getreg(eip));
2303 memcpy(&val, (char *)d->mmio + off, 2);
2304 return val;
2305
2306read_through8:
2307 if (mask != 0xFF)
2308 bad_driver(d, "non-8-bit read to offset %u (%#x)",
2309 off, getreg(eip));
2310 memcpy(&val, (char *)d->mmio + off, 1);
2311 return val;
1165} 2312}
1166 2313
1167/* 2314static void emulate_mmio(unsigned long paddr, const u8 *insn)
1168 * This routine allocates a new "struct lguest_device_desc" from descriptor
1169 * table page just above the Guest's normal memory. It returns a pointer to
1170 * that descriptor.
1171 */
1172static struct lguest_device_desc *new_dev_desc(u16 type)
1173{ 2315{
1174 struct lguest_device_desc d = { .type = type }; 2316 u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
1175 void *p; 2317 struct device *d = find_mmio_region(paddr, &off);
2318 unsigned long args[] = { LHREQ_TRAP, 14 };
1176 2319
1177 /* Figure out where the next device config is, based on the last one. */ 2320 if (!d) {
1178 if (devices.lastdev) 2321 warnx("MMIO touching %#08lx (not a device)", paddr);
1179 p = device_config(devices.lastdev) 2322 goto reinject;
1180 + devices.lastdev->desc->config_len; 2323 }
1181 else 2324
1182 p = devices.descpage; 2325 /* Prefix makes it a 16 bit op */
2326 if (insn[0] == 0x66) {
2327 mask = 0xFFFF;
2328 insnlen++;
2329 }
1183 2330
1184 /* We only have one page for all the descriptors. */ 2331 /* iowrite */
1185 if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) 2332 if (insn[insnlen] == 0x89) {
1186 errx(1, "Too many devices"); 2333 /* Next byte is r/m byte: bits 3-5 are register. */
2334 val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
2335 emulate_mmio_write(d, off, val, mask);
2336 insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
2337 } else if (insn[insnlen] == 0x8b) { /* ioread */
2338 /* Next byte is r/m byte: bits 3-5 are register. */
2339 val = emulate_mmio_read(d, off, mask);
2340 setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
2341 insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
2342 } else if (insn[0] == 0x88) { /* 8-bit iowrite */
2343 mask = 0xff;
2344 /* Next byte is r/m byte: bits 3-5 are register. */
2345 val = getreg_num((insn[1] >> 3) & 0x7, mask);
2346 emulate_mmio_write(d, off, val, mask);
2347 insnlen = 2 + insn_displacement_len(insn[1]);
2348 } else if (insn[0] == 0x8a) { /* 8-bit ioread */
2349 mask = 0xff;
2350 val = emulate_mmio_read(d, off, mask);
2351 setreg_num((insn[1] >> 3) & 0x7, val, mask);
2352 insnlen = 2 + insn_displacement_len(insn[1]);
2353 } else {
2354 warnx("Unknown MMIO instruction touching %#08lx:"
2355 " %02x %02x %02x %02x at %u",
2356 paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
2357 reinject:
2358 /* Inject trap into Guest. */
2359 if (write(lguest_fd, args, sizeof(args)) < 0)
2360 err(1, "Reinjecting trap 14 for fault at %#x",
2361 getreg(eip));
2362 return;
2363 }
1187 2364
1188 /* p might not be aligned, so we memcpy in. */ 2365 /* Finally, we've "done" the instruction, so move past it. */
1189 return memcpy(p, &d, sizeof(d)); 2366 setreg(eip, getreg(eip) + insnlen);
1190} 2367}
1191 2368
1192/* 2369/*L:190
1193 * Each device descriptor is followed by the description of its virtqueues. We 2370 * Device Setup
1194 * specify how many descriptors the virtqueue is to have. 2371 *
2372 * All devices need a descriptor so the Guest knows it exists, and a "struct
2373 * device" so the Launcher can keep track of it. We have common helper
2374 * routines to allocate and manage them.
1195 */ 2375 */
1196static void add_virtqueue(struct device *dev, unsigned int num_descs, 2376static void add_pci_virtqueue(struct device *dev,
1197 void (*service)(struct virtqueue *)) 2377 void (*service)(struct virtqueue *),
2378 const char *name)
1198{ 2379{
1199 unsigned int pages;
1200 struct virtqueue **i, *vq = malloc(sizeof(*vq)); 2380 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1201 void *p;
1202
1203 /* First we need some memory for this virtqueue. */
1204 pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
1205 / getpagesize();
1206 p = get_pages(pages);
1207 2381
1208 /* Initialize the virtqueue */ 2382 /* Initialize the virtqueue */
1209 vq->next = NULL; 2383 vq->next = NULL;
1210 vq->last_avail_idx = 0; 2384 vq->last_avail_idx = 0;
1211 vq->dev = dev; 2385 vq->dev = dev;
2386 vq->name = name;
1212 2387
1213 /* 2388 /*
1214 * This is the routine the service thread will run, and its Process ID 2389 * This is the routine the service thread will run, and its Process ID
@@ -1218,25 +2393,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1218 vq->thread = (pid_t)-1; 2393 vq->thread = (pid_t)-1;
1219 2394
1220 /* Initialize the configuration. */ 2395 /* Initialize the configuration. */
1221 vq->config.num = num_descs; 2396 reset_vq_pci_config(vq);
1222 vq->config.irq = devices.next_irq++; 2397 vq->pci_config.queue_notify_off = 0;
1223 vq->config.pfn = to_guest_phys(p) / getpagesize();
1224
1225 /* Initialize the vring. */
1226 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
1227
1228 /*
1229 * Append virtqueue to this device's descriptor. We use
1230 * device_config() to get the end of the device's current virtqueues;
1231 * we check that we haven't added any config or feature information
1232 * yet, otherwise we'd be overwriting them.
1233 */
1234 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1235 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1236 dev->num_vq++;
1237 dev->desc->num_vq++;
1238 2398
1239 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 2399 /* Add one to the number of queues */
2400 vq->dev->mmio->cfg.num_queues++;
1240 2401
1241 /* 2402 /*
1242 * Add to tail of list, so dev->vq is first vq, dev->vq->next is 2403 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
@@ -1246,73 +2407,239 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1246 *i = vq; 2407 *i = vq;
1247} 2408}
1248 2409
1249/* 2410/* The Guest accesses the feature bits via the PCI common config MMIO region */
1250 * The first half of the feature bitmask is for us to advertise features. The 2411static void add_pci_feature(struct device *dev, unsigned bit)
1251 * second half is for the Guest to accept features.
1252 */
1253static void add_feature(struct device *dev, unsigned bit)
1254{ 2412{
1255 u8 *features = get_feature_bits(dev); 2413 dev->features |= (1ULL << bit);
2414}
1256 2415
1257 /* We can't extend the feature bits once we've added config bytes */ 2416/* For devices with no config. */
1258 if (dev->desc->feature_len <= bit / CHAR_BIT) { 2417static void no_device_config(struct device *dev)
1259 assert(dev->desc->config_len == 0); 2418{
1260 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; 2419 dev->mmio_addr = get_mmio_region(dev->mmio_size);
1261 }
1262 2420
1263 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 2421 dev->config.bar[0] = dev->mmio_addr;
2422 /* Bottom 4 bits must be zero */
2423 assert(~(dev->config.bar[0] & 0xF));
2424}
2425
2426/* This puts the device config into BAR0 */
2427static void set_device_config(struct device *dev, const void *conf, size_t len)
2428{
2429 /* Set up BAR 0 */
2430 dev->mmio_size += len;
2431 dev->mmio = realloc(dev->mmio, dev->mmio_size);
2432 memcpy(dev->mmio + 1, conf, len);
2433
2434 /*
2435 * 4.1.4.6:
2436 *
2437 * The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
2438 * capability for any device type which has a device-specific
2439 * configuration.
2440 */
2441 /* Hook up device cfg */
2442 dev->config.cfg_access.cap.cap_next
2443 = offsetof(struct pci_config, device);
2444
2445 /*
2446 * 4.1.4.6.1:
2447 *
2448 * The offset for the device-specific configuration MUST be 4-byte
2449 * aligned.
2450 */
2451 assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
2452
2453 /* Fix up device cfg field length. */
2454 dev->config.device.length = len;
2455
2456 /* The rest is the same as the no-config case */
2457 no_device_config(dev);
2458}
2459
2460static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
2461 size_t bar_offset, size_t bar_bytes, u8 next)
2462{
2463 cap->cap_vndr = PCI_CAP_ID_VNDR;
2464 cap->cap_next = next;
2465 cap->cap_len = caplen;
2466 cap->cfg_type = type;
2467 cap->bar = 0;
2468 memset(cap->padding, 0, sizeof(cap->padding));
2469 cap->offset = bar_offset;
2470 cap->length = bar_bytes;
1264} 2471}
1265 2472
1266/* 2473/*
1267 * This routine sets the configuration fields for an existing device's 2474 * This sets up the pci_config structure, as defined in the virtio 1.0
1268 * descriptor. It only works for the last device, but that's OK because that's 2475 * standard (and PCI standard).
1269 * how we use it.
1270 */ 2476 */
1271static void set_config(struct device *dev, unsigned len, const void *conf) 2477static void init_pci_config(struct pci_config *pci, u16 type,
2478 u8 class, u8 subclass)
1272{ 2479{
1273 /* Check we haven't overflowed our single page. */ 2480 size_t bar_offset, bar_len;
1274 if (device_config(dev) + len > devices.descpage + getpagesize()) 2481
1275 errx(1, "Too many devices"); 2482 /*
2483 * 4.1.4.4.1:
2484 *
2485 * The device MUST either present notify_off_multiplier as an even
2486 * power of 2, or present notify_off_multiplier as 0.
2487 *
2488 * 2.1.2:
2489 *
2490 * The device MUST initialize device status to 0 upon reset.
2491 */
2492 memset(pci, 0, sizeof(*pci));
2493
2494 /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
2495 pci->vendor_id = 0x1AF4;
2496 /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
2497 pci->device_id = 0x1040 + type;
2498
2499 /*
2500 * PCI have specific codes for different types of devices.
2501 * Linux doesn't care, but it's a good clue for people looking
2502 * at the device.
2503 */
2504 pci->class = class;
2505 pci->subclass = subclass;
2506
2507 /*
2508 * 4.1.2.1:
2509 *
2510 * Non-transitional devices SHOULD have a PCI Revision ID of 1 or
2511 * higher
2512 */
2513 pci->revid = 1;
2514
2515 /*
2516 * 4.1.2.1:
2517 *
2518 * Non-transitional devices SHOULD have a PCI Subsystem Device ID of
2519 * 0x40 or higher.
2520 */
2521 pci->subsystem_device_id = 0x40;
2522
2523 /* We use our dummy interrupt controller, and irq_line is the irq */
2524 pci->irq_line = devices.next_irq++;
2525 pci->irq_pin = 0;
2526
2527 /* Support for extended capabilities. */
2528 pci->status = (1 << 4);
2529
2530 /* Link them in. */
2531 /*
2532 * 4.1.4.3.1:
2533 *
2534 * The device MUST present at least one common configuration
2535 * capability.
2536 */
2537 pci->capabilities = offsetof(struct pci_config, common);
2538
2539 /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
2540 assert(pci->capabilities % 4 == 0);
2541
2542 bar_offset = offsetof(struct virtio_pci_mmio, cfg);
2543 bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
2544 init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
2545 bar_offset, bar_len,
2546 offsetof(struct pci_config, notify));
2547
2548 /*
2549 * 4.1.4.4.1:
2550 *
2551 * The device MUST present at least one notification capability.
2552 */
2553 bar_offset += bar_len;
2554 bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
2555
2556 /*
2557 * 4.1.4.4.1:
2558 *
2559 * The cap.offset MUST be 2-byte aligned.
2560 */
2561 assert(pci->common.cap_next % 2 == 0);
2562
2563 /* FIXME: Use a non-zero notify_off, for per-queue notification? */
2564 /*
2565 * 4.1.4.4.1:
2566 *
2567 * The value cap.length presented by the device MUST be at least 2 and
2568 * MUST be large enough to support queue notification offsets for all
2569 * supported queues in all possible configurations.
2570 */
2571 assert(bar_len >= 2);
2572
2573 init_cap(&pci->notify.cap, sizeof(pci->notify),
2574 VIRTIO_PCI_CAP_NOTIFY_CFG,
2575 bar_offset, bar_len,
2576 offsetof(struct pci_config, isr));
2577
2578 bar_offset += bar_len;
2579 bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
2580 /*
2581 * 4.1.4.5.1:
2582 *
2583 * The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
2584 * capability.
2585 */
2586 init_cap(&pci->isr, sizeof(pci->isr),
2587 VIRTIO_PCI_CAP_ISR_CFG,
2588 bar_offset, bar_len,
2589 offsetof(struct pci_config, cfg_access));
2590
2591 /*
2592 * 4.1.4.7.1:
2593 *
2594 * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
2595 * capability.
2596 */
2597 /* This doesn't have any presence in the BAR */
2598 init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
2599 VIRTIO_PCI_CAP_PCI_CFG,
2600 0, 0, 0);
1276 2601
1277 /* Copy in the config information, and store the length. */ 2602 bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
1278 memcpy(device_config(dev), conf, len); 2603 assert(bar_offset == sizeof(struct virtio_pci_mmio));
1279 dev->desc->config_len = len;
1280 2604
1281 /* Size must fit in config_len field (8 bits)! */ 2605 /*
1282 assert(dev->desc->config_len == len); 2606 * This gets sewn in and length set in set_device_config().
2607 * Some devices don't have a device configuration interface, so
2608 * we never expose this if we don't call set_device_config().
2609 */
2610 init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
2611 bar_offset, 0, 0);
1283} 2612}
1284 2613
1285/* 2614/*
1286 * This routine does all the creation and setup of a new device, including 2615 * This routine does all the creation and setup of a new device, but we don't
1287 * calling new_dev_desc() to allocate the descriptor and device memory. We 2616 * actually place the MMIO region until we know the size (if any) of the
1288 * don't actually start the service threads until later. 2617 * device-specific config. And we don't actually start the service threads
2618 * until later.
1289 * 2619 *
1290 * See what I mean about userspace being boring? 2620 * See what I mean about userspace being boring?
1291 */ 2621 */
1292static struct device *new_device(const char *name, u16 type) 2622static struct device *new_pci_device(const char *name, u16 type,
2623 u8 class, u8 subclass)
1293{ 2624{
1294 struct device *dev = malloc(sizeof(*dev)); 2625 struct device *dev = malloc(sizeof(*dev));
1295 2626
1296 /* Now we populate the fields one at a time. */ 2627 /* Now we populate the fields one at a time. */
1297 dev->desc = new_dev_desc(type);
1298 dev->name = name; 2628 dev->name = name;
1299 dev->vq = NULL; 2629 dev->vq = NULL;
1300 dev->feature_len = 0;
1301 dev->num_vq = 0;
1302 dev->running = false; 2630 dev->running = false;
1303 dev->next = NULL; 2631 dev->wrote_features_ok = false;
2632 dev->mmio_size = sizeof(struct virtio_pci_mmio);
2633 dev->mmio = calloc(1, dev->mmio_size);
2634 dev->features = (u64)1 << VIRTIO_F_VERSION_1;
2635 dev->features_accepted = 0;
1304 2636
1305 /* 2637 if (devices.device_num + 1 >= MAX_PCI_DEVICES)
1306 * Append to device list. Prepending to a single-linked list is 2638 errx(1, "Can only handle 31 PCI devices");
1307 * easier, but the user expects the devices to be arranged on the bus 2639
1308 * in command-line order. The first network device on the command line 2640 init_pci_config(&dev->config, type, class, subclass);
1309 * is eth0, the first block device /dev/vda, etc. 2641 assert(!devices.pci[devices.device_num+1]);
1310 */ 2642 devices.pci[++devices.device_num] = dev;
1311 if (devices.lastdev)
1312 devices.lastdev->next = dev;
1313 else
1314 devices.dev = dev;
1315 devices.lastdev = dev;
1316 2643
1317 return dev; 2644 return dev;
1318} 2645}
@@ -1324,6 +2651,7 @@ static struct device *new_device(const char *name, u16 type)
1324static void setup_console(void) 2651static void setup_console(void)
1325{ 2652{
1326 struct device *dev; 2653 struct device *dev;
2654 struct virtio_console_config conf;
1327 2655
1328 /* If we can save the initial standard input settings... */ 2656 /* If we can save the initial standard input settings... */
1329 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 2657 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
@@ -1336,7 +2664,7 @@ static void setup_console(void)
1336 tcsetattr(STDIN_FILENO, TCSANOW, &term); 2664 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1337 } 2665 }
1338 2666
1339 dev = new_device("console", VIRTIO_ID_CONSOLE); 2667 dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
1340 2668
1341 /* We store the console state in dev->priv, and initialize it. */ 2669 /* We store the console state in dev->priv, and initialize it. */
1342 dev->priv = malloc(sizeof(struct console_abort)); 2670 dev->priv = malloc(sizeof(struct console_abort));
@@ -1348,10 +2676,14 @@ static void setup_console(void)
1348 * stdin. When they put something in the output queue, we write it to 2676 * stdin. When they put something in the output queue, we write it to
1349 * stdout. 2677 * stdout.
1350 */ 2678 */
1351 add_virtqueue(dev, VIRTQUEUE_NUM, console_input); 2679 add_pci_virtqueue(dev, console_input, "input");
1352 add_virtqueue(dev, VIRTQUEUE_NUM, console_output); 2680 add_pci_virtqueue(dev, console_output, "output");
2681
2682 /* We need a configuration area for the emerg_wr early writes. */
2683 add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
2684 set_device_config(dev, &conf, sizeof(conf));
1353 2685
1354 verbose("device %u: console\n", ++devices.device_num); 2686 verbose("device %u: console\n", devices.device_num);
1355} 2687}
1356/*:*/ 2688/*:*/
1357 2689
@@ -1449,6 +2781,7 @@ static void configure_device(int fd, const char *tapif, u32 ipaddr)
1449static int get_tun_device(char tapif[IFNAMSIZ]) 2781static int get_tun_device(char tapif[IFNAMSIZ])
1450{ 2782{
1451 struct ifreq ifr; 2783 struct ifreq ifr;
2784 int vnet_hdr_sz;
1452 int netfd; 2785 int netfd;
1453 2786
1454 /* Start with this zeroed. Messy but sure. */ 2787 /* Start with this zeroed. Messy but sure. */
@@ -1476,6 +2809,18 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1476 */ 2809 */
1477 ioctl(netfd, TUNSETNOCSUM, 1); 2810 ioctl(netfd, TUNSETNOCSUM, 1);
1478 2811
2812 /*
2813 * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
2814 * field at the end of the network header iff
2815 * VIRTIO_NET_F_MRG_RXBUF was negotiated. For virtio 1.0,
2816 * that became the norm, but we need to tell the tun device
2817 * about our expanded header (which is called
2818 * virtio_net_hdr_mrg_rxbuf in the legacy system).
2819 */
2820 vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
2821 if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
2822 err(1, "Setting tun header size to %u", vnet_hdr_sz);
2823
1479 memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 2824 memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
1480 return netfd; 2825 return netfd;
1481} 2826}
@@ -1499,12 +2844,12 @@ static void setup_tun_net(char *arg)
1499 net_info->tunfd = get_tun_device(tapif); 2844 net_info->tunfd = get_tun_device(tapif);
1500 2845
1501 /* First we create a new network device. */ 2846 /* First we create a new network device. */
1502 dev = new_device("net", VIRTIO_ID_NET); 2847 dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
1503 dev->priv = net_info; 2848 dev->priv = net_info;
1504 2849
1505 /* Network devices need a recv and a send queue, just like console. */ 2850 /* Network devices need a recv and a send queue, just like console. */
1506 add_virtqueue(dev, VIRTQUEUE_NUM, net_input); 2851 add_pci_virtqueue(dev, net_input, "rx");
1507 add_virtqueue(dev, VIRTQUEUE_NUM, net_output); 2852 add_pci_virtqueue(dev, net_output, "tx");
1508 2853
1509 /* 2854 /*
1510 * We need a socket to perform the magic network ioctls to bring up the 2855 * We need a socket to perform the magic network ioctls to bring up the
@@ -1524,7 +2869,7 @@ static void setup_tun_net(char *arg)
1524 p = strchr(arg, ':'); 2869 p = strchr(arg, ':');
1525 if (p) { 2870 if (p) {
1526 str2mac(p+1, conf.mac); 2871 str2mac(p+1, conf.mac);
1527 add_feature(dev, VIRTIO_NET_F_MAC); 2872 add_pci_feature(dev, VIRTIO_NET_F_MAC);
1528 *p = '\0'; 2873 *p = '\0';
1529 } 2874 }
1530 2875
@@ -1538,25 +2883,21 @@ static void setup_tun_net(char *arg)
1538 configure_device(ipfd, tapif, ip); 2883 configure_device(ipfd, tapif, ip);
1539 2884
1540 /* Expect Guest to handle everything except UFO */ 2885 /* Expect Guest to handle everything except UFO */
1541 add_feature(dev, VIRTIO_NET_F_CSUM); 2886 add_pci_feature(dev, VIRTIO_NET_F_CSUM);
1542 add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); 2887 add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
1543 add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); 2888 add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
1544 add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); 2889 add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
1545 add_feature(dev, VIRTIO_NET_F_GUEST_ECN); 2890 add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
1546 add_feature(dev, VIRTIO_NET_F_HOST_TSO4); 2891 add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
1547 add_feature(dev, VIRTIO_NET_F_HOST_TSO6); 2892 add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
1548 add_feature(dev, VIRTIO_NET_F_HOST_ECN); 2893 add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
1549 /* We handle indirect ring entries */ 2894 /* We handle indirect ring entries */
1550 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); 2895 add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
1551 /* We're compliant with the damn spec. */ 2896 set_device_config(dev, &conf, sizeof(conf));
1552 add_feature(dev, VIRTIO_F_ANY_LAYOUT);
1553 set_config(dev, sizeof(conf), &conf);
1554 2897
1555 /* We don't need the socket any more; setup is done. */ 2898 /* We don't need the socket any more; setup is done. */
1556 close(ipfd); 2899 close(ipfd);
1557 2900
1558 devices.device_num++;
1559
1560 if (bridging) 2901 if (bridging)
1561 verbose("device %u: tun %s attached to bridge: %s\n", 2902 verbose("device %u: tun %s attached to bridge: %s\n",
1562 devices.device_num, tapif, arg); 2903 devices.device_num, tapif, arg);
@@ -1607,7 +2948,7 @@ static void blk_request(struct virtqueue *vq)
1607 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 2948 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1608 2949
1609 /* Copy the output header from the front of the iov (adjusts iov) */ 2950 /* Copy the output header from the front of the iov (adjusts iov) */
1610 iov_consume(iov, out_num, &out, sizeof(out)); 2951 iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
1611 2952
1612 /* Find and trim end of iov input array, for our status byte. */ 2953 /* Find and trim end of iov input array, for our status byte. */
1613 in = NULL; 2954 in = NULL;
@@ -1619,7 +2960,7 @@ static void blk_request(struct virtqueue *vq)
1619 } 2960 }
1620 } 2961 }
1621 if (!in) 2962 if (!in)
1622 errx(1, "Bad virtblk cmd with no room for status"); 2963 bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
1623 2964
1624 /* 2965 /*
1625 * For historical reasons, block operations are expressed in 512 byte 2966 * For historical reasons, block operations are expressed in 512 byte
@@ -1627,15 +2968,7 @@ static void blk_request(struct virtqueue *vq)
1627 */ 2968 */
1628 off = out.sector * 512; 2969 off = out.sector * 512;
1629 2970
1630 /* 2971 if (out.type & VIRTIO_BLK_T_OUT) {
1631 * In general the virtio block driver is allowed to try SCSI commands.
1632 * It'd be nice if we supported eject, for example, but we don't.
1633 */
1634 if (out.type & VIRTIO_BLK_T_SCSI_CMD) {
1635 fprintf(stderr, "Scsi commands unsupported\n");
1636 *in = VIRTIO_BLK_S_UNSUPP;
1637 wlen = sizeof(*in);
1638 } else if (out.type & VIRTIO_BLK_T_OUT) {
1639 /* 2972 /*
1640 * Write 2973 * Write
1641 * 2974 *
@@ -1657,7 +2990,7 @@ static void blk_request(struct virtqueue *vq)
1657 /* Trim it back to the correct length */ 2990 /* Trim it back to the correct length */
1658 ftruncate64(vblk->fd, vblk->len); 2991 ftruncate64(vblk->fd, vblk->len);
1659 /* Die, bad Guest, die. */ 2992 /* Die, bad Guest, die. */
1660 errx(1, "Write past end %llu+%u", off, ret); 2993 bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
1661 } 2994 }
1662 2995
1663 wlen = sizeof(*in); 2996 wlen = sizeof(*in);
@@ -1699,11 +3032,11 @@ static void setup_block_file(const char *filename)
1699 struct vblk_info *vblk; 3032 struct vblk_info *vblk;
1700 struct virtio_blk_config conf; 3033 struct virtio_blk_config conf;
1701 3034
1702 /* Creat the device. */ 3035 /* Create the device. */
1703 dev = new_device("block", VIRTIO_ID_BLOCK); 3036 dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
1704 3037
1705 /* The device has one virtqueue, where the Guest places requests. */ 3038 /* The device has one virtqueue, where the Guest places requests. */
1706 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); 3039 add_pci_virtqueue(dev, blk_request, "request");
1707 3040
1708 /* Allocate the room for our own bookkeeping */ 3041 /* Allocate the room for our own bookkeeping */
1709 vblk = dev->priv = malloc(sizeof(*vblk)); 3042 vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1712,9 +3045,6 @@ static void setup_block_file(const char *filename)
1712 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 3045 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1713 vblk->len = lseek64(vblk->fd, 0, SEEK_END); 3046 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1714 3047
1715 /* We support FLUSH. */
1716 add_feature(dev, VIRTIO_BLK_F_FLUSH);
1717
1718 /* Tell Guest how many sectors this device has. */ 3048 /* Tell Guest how many sectors this device has. */
1719 conf.capacity = cpu_to_le64(vblk->len / 512); 3049 conf.capacity = cpu_to_le64(vblk->len / 512);
1720 3050
@@ -1722,20 +3052,19 @@ static void setup_block_file(const char *filename)
1722 * Tell Guest not to put in too many descriptors at once: two are used 3052 * Tell Guest not to put in too many descriptors at once: two are used
1723 * for the in and out elements. 3053 * for the in and out elements.
1724 */ 3054 */
1725 add_feature(dev, VIRTIO_BLK_F_SEG_MAX); 3055 add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
1726 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 3056 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
1727 3057
1728 /* Don't try to put whole struct: we have 8 bit limit. */ 3058 set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
1729 set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
1730 3059
1731 verbose("device %u: virtblock %llu sectors\n", 3060 verbose("device %u: virtblock %llu sectors\n",
1732 ++devices.device_num, le64_to_cpu(conf.capacity)); 3061 devices.device_num, le64_to_cpu(conf.capacity));
1733} 3062}
1734 3063
1735/*L:211 3064/*L:211
1736 * Our random number generator device reads from /dev/random into the Guest's 3065 * Our random number generator device reads from /dev/urandom into the Guest's
1737 * input buffers. The usual case is that the Guest doesn't want random numbers 3066 * input buffers. The usual case is that the Guest doesn't want random numbers
1738 * and so has no buffers although /dev/random is still readable, whereas 3067 * and so has no buffers although /dev/urandom is still readable, whereas
1739 * console is the reverse. 3068 * console is the reverse.
1740 * 3069 *
1741 * The same logic applies, however. 3070 * The same logic applies, however.
@@ -1754,7 +3083,7 @@ static void rng_input(struct virtqueue *vq)
1754 /* First we need a buffer from the Guests's virtqueue. */ 3083 /* First we need a buffer from the Guests's virtqueue. */
1755 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 3084 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1756 if (out_num) 3085 if (out_num)
1757 errx(1, "Output buffers in rng?"); 3086 bad_driver_vq(vq, "Output buffers in rng?");
1758 3087
1759 /* 3088 /*
1760 * Just like the console write, we loop to cover the whole iovec. 3089 * Just like the console write, we loop to cover the whole iovec.
@@ -1763,8 +3092,8 @@ static void rng_input(struct virtqueue *vq)
1763 while (!iov_empty(iov, in_num)) { 3092 while (!iov_empty(iov, in_num)) {
1764 len = readv(rng_info->rfd, iov, in_num); 3093 len = readv(rng_info->rfd, iov, in_num);
1765 if (len <= 0) 3094 if (len <= 0)
1766 err(1, "Read from /dev/random gave %i", len); 3095 err(1, "Read from /dev/urandom gave %i", len);
1767 iov_consume(iov, in_num, NULL, len); 3096 iov_consume(vq->dev, iov, in_num, NULL, len);
1768 totlen += len; 3097 totlen += len;
1769 } 3098 }
1770 3099
@@ -1780,17 +3109,20 @@ static void setup_rng(void)
1780 struct device *dev; 3109 struct device *dev;
1781 struct rng_info *rng_info = malloc(sizeof(*rng_info)); 3110 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1782 3111
1783 /* Our device's privat info simply contains the /dev/random fd. */ 3112 /* Our device's private info simply contains the /dev/urandom fd. */
1784 rng_info->rfd = open_or_die("/dev/random", O_RDONLY); 3113 rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
1785 3114
1786 /* Create the new device. */ 3115 /* Create the new device. */
1787 dev = new_device("rng", VIRTIO_ID_RNG); 3116 dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
1788 dev->priv = rng_info; 3117 dev->priv = rng_info;
1789 3118
1790 /* The device has one virtqueue, where the Guest places inbufs. */ 3119 /* The device has one virtqueue, where the Guest places inbufs. */
1791 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); 3120 add_pci_virtqueue(dev, rng_input, "input");
1792 3121
1793 verbose("device %u: rng\n", devices.device_num++); 3122 /* We don't have any configuration space */
3123 no_device_config(dev);
3124
3125 verbose("device %u: rng\n", devices.device_num);
1794} 3126}
1795/* That's the end of device setup. */ 3127/* That's the end of device setup. */
1796 3128
@@ -1820,17 +3152,23 @@ static void __attribute__((noreturn)) restart_guest(void)
1820static void __attribute__((noreturn)) run_guest(void) 3152static void __attribute__((noreturn)) run_guest(void)
1821{ 3153{
1822 for (;;) { 3154 for (;;) {
1823 unsigned long notify_addr; 3155 struct lguest_pending notify;
1824 int readval; 3156 int readval;
1825 3157
1826 /* We read from the /dev/lguest device to run the Guest. */ 3158 /* We read from the /dev/lguest device to run the Guest. */
1827 readval = pread(lguest_fd, &notify_addr, 3159 readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
1828 sizeof(notify_addr), cpu_id); 3160 if (readval == sizeof(notify)) {
1829 3161 if (notify.trap == 13) {
1830 /* One unsigned long means the Guest did HCALL_NOTIFY */ 3162 verbose("Emulating instruction at %#x\n",
1831 if (readval == sizeof(notify_addr)) { 3163 getreg(eip));
1832 verbose("Notify on address %#lx\n", notify_addr); 3164 emulate_insn(notify.insn);
1833 handle_output(notify_addr); 3165 } else if (notify.trap == 14) {
3166 verbose("Emulating MMIO at %#x\n",
3167 getreg(eip));
3168 emulate_mmio(notify.addr, notify.insn);
3169 } else
3170 errx(1, "Unknown trap %i addr %#08x\n",
3171 notify.trap, notify.addr);
1834 /* ENOENT means the Guest died. Reading tells us why. */ 3172 /* ENOENT means the Guest died. Reading tells us why. */
1835 } else if (errno == ENOENT) { 3173 } else if (errno == ENOENT) {
1836 char reason[1024] = { 0 }; 3174 char reason[1024] = { 0 };
@@ -1893,11 +3231,9 @@ int main(int argc, char *argv[])
1893 main_args = argv; 3231 main_args = argv;
1894 3232
1895 /* 3233 /*
1896 * First we initialize the device list. We keep a pointer to the last 3234 * First we initialize the device list. We remember the next interrupt
1897 * device, and the next interrupt number to use for devices (1: 3235 * number to use for devices (1: remember that 0 is used by the timer).
1898 * remember that 0 is used by the timer).
1899 */ 3236 */
1900 devices.lastdev = NULL;
1901 devices.next_irq = 1; 3237 devices.next_irq = 1;
1902 3238
1903 /* We're CPU 0. In fact, that's the only CPU possible right now. */ 3239 /* We're CPU 0. In fact, that's the only CPU possible right now. */
@@ -1921,12 +3257,14 @@ int main(int argc, char *argv[])
1921 guest_base = map_zeroed_pages(mem / getpagesize() 3257 guest_base = map_zeroed_pages(mem / getpagesize()
1922 + DEVICE_PAGES); 3258 + DEVICE_PAGES);
1923 guest_limit = mem; 3259 guest_limit = mem;
1924 guest_max = mem + DEVICE_PAGES*getpagesize(); 3260 guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
1925 devices.descpage = get_pages(1);
1926 break; 3261 break;
1927 } 3262 }
1928 } 3263 }
1929 3264
3265 /* We always have a console device, and it's always device 1. */
3266 setup_console();
3267
1930 /* The options are fairly straight-forward */ 3268 /* The options are fairly straight-forward */
1931 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { 3269 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
1932 switch (c) { 3270 switch (c) {
@@ -1967,8 +3305,8 @@ int main(int argc, char *argv[])
1967 3305
1968 verbose("Guest base is at %p\n", guest_base); 3306 verbose("Guest base is at %p\n", guest_base);
1969 3307
1970 /* We always have a console device */ 3308 /* Initialize the (fake) PCI host bridge device. */
1971 setup_console(); 3309 init_pci_host_bridge();
1972 3310
1973 /* Now we load the kernel */ 3311 /* Now we load the kernel */
1974 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 3312 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
diff --git a/tools/power/acpi/common/cmfsize.c b/tools/power/acpi/common/cmfsize.c
index f4b953354ff7..eec688041500 100644
--- a/tools/power/acpi/common/cmfsize.c
+++ b/tools/power/acpi/common/cmfsize.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/common/getopt.c b/tools/power/acpi/common/getopt.c
index 2f0f34a36db4..5da129e10aa2 100644
--- a/tools/power/acpi/common/getopt.c
+++ b/tools/power/acpi/common/getopt.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/oslibcfs.c b/tools/power/acpi/os_specific/service_layers/oslibcfs.c
index c13ff9c51d74..b51e40a9a120 100644
--- a/tools/power/acpi/os_specific/service_layers/oslibcfs.c
+++ b/tools/power/acpi/os_specific/service_layers/oslibcfs.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
index 0dc2485dedf5..92f1fd700344 100644
--- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
+++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/osunixdir.c b/tools/power/acpi/os_specific/service_layers/osunixdir.c
index 733f9e490fc4..e153fcb12b1a 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixdir.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixdir.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/osunixmap.c b/tools/power/acpi/os_specific/service_layers/osunixmap.c
index 99b47b6194a3..3853a7350440 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixmap.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixmap.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/osunixxf.c b/tools/power/acpi/os_specific/service_layers/osunixxf.c
index 7ccb073f8316..6858c0893c91 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixxf.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixxf.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/acpidump.h b/tools/power/acpi/tools/acpidump/acpidump.h
index a2d37d610639..84bdef0136cb 100644
--- a/tools/power/acpi/tools/acpidump/acpidump.h
+++ b/tools/power/acpi/tools/acpidump/acpidump.h
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c
index 24d32968802d..c736adf5fb55 100644
--- a/tools/power/acpi/tools/acpidump/apdump.c
+++ b/tools/power/acpi/tools/acpidump/apdump.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/apfiles.c b/tools/power/acpi/tools/acpidump/apfiles.c
index d470046a6d81..8f2fe168228e 100644
--- a/tools/power/acpi/tools/acpidump/apfiles.c
+++ b/tools/power/acpi/tools/acpidump/apfiles.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/apmain.c b/tools/power/acpi/tools/acpidump/apmain.c
index 853b4da22c3e..d0ba6535f5af 100644
--- a/tools/power/acpi/tools/acpidump/apmain.c
+++ b/tools/power/acpi/tools/acpidump/apmain.c
@@ -5,7 +5,7 @@
5 *****************************************************************************/ 5 *****************************************************************************/
6 6
7/* 7/*
8 * Copyright (C) 2000 - 2014, Intel Corp. 8 * Copyright (C) 2000 - 2015, Intel Corp.
9 * All rights reserved. 9 * All rights reserved.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile
index 2e2ba2efa0d9..3ed7c0476d48 100644
--- a/tools/power/cpupower/Makefile
+++ b/tools/power/cpupower/Makefile
@@ -209,7 +209,7 @@ $(OUTPUT)%.o: %.c
209 209
210$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_MAJ) 210$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_MAJ)
211 $(ECHO) " CC " $@ 211 $(ECHO) " CC " $@
212 $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lcpupower -lrt -lpci -L$(OUTPUT) -o $@ 212 $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lcpupower -Wl,-rpath=./ -lrt -lpci -L$(OUTPUT) -o $@
213 $(QUIET) $(STRIPCMD) $@ 213 $(QUIET) $(STRIPCMD) $@
214 214
215$(OUTPUT)po/$(PACKAGE).pot: $(UTIL_SRC) 215$(OUTPUT)po/$(PACKAGE).pot: $(UTIL_SRC)
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 56bfb523c5bb..feea7ad9500b 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -9,40 +9,50 @@ turbostat \- Report processor frequency and idle statistics
9.br 9.br
10.B turbostat 10.B turbostat
11.RB [ Options ] 11.RB [ Options ]
12.RB [ "\-i interval_sec" ] 12.RB [ "\--interval seconds" ]
13.SH DESCRIPTION 13.SH DESCRIPTION
14\fBturbostat \fP reports processor topology, frequency, 14\fBturbostat \fP reports processor topology, frequency,
15idle power-state statistics, temperature and power on modern X86 processors. 15idle power-state statistics, temperature and power on X86 processors.
16Either \fBcommand\fP is forked and statistics are printed 16There are two ways to invoke turbostat.
17upon its completion, or statistics are printed periodically. 17The first method is to supply a
18 18\fBcommand\fP, which is forked and statistics are printed
19\fBturbostat \fP 19upon its completion.
20must be run on root, and 20The second method is to omit the command,
21minimally requires that the processor 21and turbostat displays statistics every 5 seconds.
22supports an "invariant" TSC, plus the APERF and MPERF MSRs. 22The 5-second interval can be changed using the --interval option.
23Additional information is reported depending on hardware counter support. 23
24 24Some information is not available on older processors.
25.SS Options 25.SS Options
26The \fB-p\fP option limits output to the 1st thread in 1st core of each package. 26\fB--Counter MSR#\fP shows the delta of the specified 64-bit MSR counter.
27.PP
28\fB--counter MSR#\fP shows the delta of the specified 32-bit MSR counter.
29.PP
30\fB--Dump\fP displays the raw counter values.
31.PP
32\fB--debug\fP displays additional system configuration information. Invoking this parameter
33more than once may also enable internal turbostat debug information.
34.PP
35\fB--interval seconds\fP overrides the default 5-second measurement interval.
36.PP
37\fB--help\fP displays usage for the most common parameters.
27.PP 38.PP
28The \fB-P\fP option limits output to the 1st thread in each Package. 39\fB--Joules\fP displays energy in Joules, rather than dividing Joules by time to print power in Watts.
29.PP 40.PP
30The \fB-S\fP option limits output to a 1-line System Summary for each interval. 41\fB--MSR MSR#\fP shows the specified 64-bit MSR value.
31.PP 42.PP
32The \fB-v\fP option increases verbosity. 43\fB--msr MSR#\fP shows the specified 32-bit MSR value.
33.PP 44.PP
34The \fB-c MSR#\fP option includes the delta of the specified 32-bit MSR counter. 45\fB--Package\fP limits output to the system summary plus the 1st thread in each Package.
35.PP 46.PP
36The \fB-C MSR#\fP option includes the delta of the specified 64-bit MSR counter. 47\fB--processor\fP limits output to the system summary plus the 1st thread in each processor of each package. I.e., it skips hyper-threaded siblings.
37.PP 48.PP
38The \fB-m MSR#\fP option includes the the specified 32-bit MSR value. 49\fB--Summary\fP limits output to a 1-line System Summary for each interval.
39.PP 50.PP
40The \fB-M MSR#\fP option includes the the specified 64-bit MSR value. 51\fB--TCC temperature\fP sets the Thermal Control Circuit temperature for systems which do not export that value. This is used for making sense of the Digital Thermal Sensor outputs, as they return degrees Celsius below the TCC activation temperature.
41.PP 52.PP
42The \fB-i interval_sec\fP option prints statistics every \fiinterval_sec\fP seconds. 53\fB--version\fP displays the version.
43The default is 5 seconds.
44.PP 54.PP
45The \fBcommand\fP parameter forks \fBcommand\fP and upon its exit, 55The \fBcommand\fP parameter forks \fBcommand\fP, and upon its exit,
46displays the statistics gathered since it was forked. 56displays the statistics gathered since it was forked.
47.PP 57.PP
48.SH FIELD DESCRIPTIONS 58.SH FIELD DESCRIPTIONS
@@ -52,7 +62,7 @@ displays the statistics gathered since it was forked.
52\fBCPU\fP Linux CPU (logical processor) number. 62\fBCPU\fP Linux CPU (logical processor) number.
53Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading Technology. 63Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading Technology.
54\fBAVG_MHz\fP number of cycles executed divided by time elapsed. 64\fBAVG_MHz\fP number of cycles executed divided by time elapsed.
55\fB%Buzy\fP percent of the interval that the CPU retired instructions, aka. % of time in "C0" state. 65\fB%Busy\fP percent of the interval that the CPU retired instructions, aka. % of time in "C0" state.
56\fBBzy_MHz\fP average clock rate while the CPU was busy (in "c0" state). 66\fBBzy_MHz\fP average clock rate while the CPU was busy (in "c0" state).
57\fBTSC_MHz\fP average MHz that the TSC ran during the entire interval. 67\fBTSC_MHz\fP average MHz that the TSC ran during the entire interval.
58\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. 68\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states.
@@ -68,7 +78,7 @@ Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading T
68.fi 78.fi
69.PP 79.PP
70.SH EXAMPLE 80.SH EXAMPLE
71Without any parameters, turbostat prints out counters every 5 seconds. 81Without any parameters, turbostat displays statistics every 5 seconds.
72(override interval with "-i sec" option, or specify a command 82(override interval with "-i sec" option, or specify a command
73for turbostat to fork). 83for turbostat to fork).
74 84
@@ -91,19 +101,19 @@ Subsequent rows show per-CPU statistics.
91 3 3 3 0.20 1596 3492 0 0.44 0.00 99.37 0.00 23 101 3 3 3 0.20 1596 3492 0 0.44 0.00 99.37 0.00 23
92 3 7 5 0.31 1596 3492 0 0.33 102 3 7 5 0.31 1596 3492 0 0.33
93.fi 103.fi
94.SH VERBOSE EXAMPLE 104.SH DEBUG EXAMPLE
95The "-v" option adds verbosity to the output: 105The "--debug" option prints additional system information before measurements:
96 106
97.nf 107.nf
98[root@ivy]# turbostat -v 108turbostat version 4.0 10-Feb, 2015 - Len Brown <lenb@kernel.org>
99turbostat v3.0 November 23, 2012 - Len Brown <lenb@kernel.org>
100CPUID(0): GenuineIntel 13 CPUID levels; family:model:stepping 0x6:3a:9 (6:58:9) 109CPUID(0): GenuineIntel 13 CPUID levels; family:model:stepping 0x6:3a:9 (6:58:9)
101CPUID(6): APERF, DTS, PTM, EPB 110CPUID(6): APERF, DTS, PTM, EPB
102RAPL: 851 sec. Joule Counter Range 111RAPL: 851 sec. Joule Counter Range, at 77 Watts
103cpu0: MSR_NHM_PLATFORM_INFO: 0x81010f0012300 112cpu0: MSR_NHM_PLATFORM_INFO: 0x81010f0012300
10416 * 100 = 1600 MHz max efficiency 11316 * 100 = 1600 MHz max efficiency
10535 * 100 = 3500 MHz TSC frequency 11435 * 100 = 3500 MHz TSC frequency
106cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x1e008402 (UNdemote-C3, UNdemote-C1, demote-C3, demote-C1, locked: pkg-cstate-limit=2: pc6-noret) 115cpu0: MSR_IA32_POWER_CTL: 0x0014005d (C1E auto-promotion: DISabled)
116cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x1e008402 (UNdemote-C3, UNdemote-C1, demote-C3, demote-C1, locked: pkg-cstate-limit=2: pc6n)
107cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x25262727 117cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x25262727
10837 * 100 = 3700 MHz max turbo 4 active cores 11837 * 100 = 3700 MHz max turbo 4 active cores
10938 * 100 = 3800 MHz max turbo 3 active cores 11938 * 100 = 3800 MHz max turbo 3 active cores
@@ -112,9 +122,9 @@ cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x25262727
112cpu0: MSR_IA32_ENERGY_PERF_BIAS: 0x00000006 (balanced) 122cpu0: MSR_IA32_ENERGY_PERF_BIAS: 0x00000006 (balanced)
113cpu0: MSR_RAPL_POWER_UNIT: 0x000a1003 (0.125000 Watts, 0.000015 Joules, 0.000977 sec.) 123cpu0: MSR_RAPL_POWER_UNIT: 0x000a1003 (0.125000 Watts, 0.000015 Joules, 0.000977 sec.)
114cpu0: MSR_PKG_POWER_INFO: 0x01e00268 (77 W TDP, RAPL 60 - 0 W, 0.000000 sec.) 124cpu0: MSR_PKG_POWER_INFO: 0x01e00268 (77 W TDP, RAPL 60 - 0 W, 0.000000 sec.)
115cpu0: MSR_PKG_POWER_LIMIT: 0x830000148268 (UNlocked) 125cpu0: MSR_PKG_POWER_LIMIT: 0x30000148268 (UNlocked)
116cpu0: PKG Limit #1: ENabled (77.000000 Watts, 1.000000 sec, clamp DISabled) 126cpu0: PKG Limit #1: ENabled (77.000000 Watts, 1.000000 sec, clamp DISabled)
117cpu0: PKG Limit #2: ENabled (96.000000 Watts, 0.000977* sec, clamp DISabled) 127cpu0: PKG Limit #2: DISabled (96.000000 Watts, 0.000977* sec, clamp DISabled)
118cpu0: MSR_PP0_POLICY: 0 128cpu0: MSR_PP0_POLICY: 0
119cpu0: MSR_PP0_POWER_LIMIT: 0x00000000 (UNlocked) 129cpu0: MSR_PP0_POWER_LIMIT: 0x00000000 (UNlocked)
120cpu0: Cores Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) 130cpu0: Cores Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled)
@@ -123,19 +133,20 @@ cpu0: MSR_PP1_POWER_LIMIT: 0x00000000 (UNlocked)
123cpu0: GFX Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) 133cpu0: GFX Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled)
124cpu0: MSR_IA32_TEMPERATURE_TARGET: 0x00691400 (105 C) 134cpu0: MSR_IA32_TEMPERATURE_TARGET: 0x00691400 (105 C)
125cpu0: MSR_IA32_PACKAGE_THERM_STATUS: 0x884e0000 (27 C) 135cpu0: MSR_IA32_PACKAGE_THERM_STATUS: 0x884e0000 (27 C)
126cpu0: MSR_IA32_THERM_STATUS: 0x88560000 (19 C +/- 1) 136cpu0: MSR_IA32_THERM_STATUS: 0x88580000 (17 C +/- 1)
127cpu1: MSR_IA32_THERM_STATUS: 0x88560000 (19 C +/- 1) 137cpu1: MSR_IA32_THERM_STATUS: 0x885a0000 (15 C +/- 1)
128cpu2: MSR_IA32_THERM_STATUS: 0x88540000 (21 C +/- 1) 138cpu2: MSR_IA32_THERM_STATUS: 0x88570000 (18 C +/- 1)
129cpu3: MSR_IA32_THERM_STATUS: 0x884e0000 (27 C +/- 1) 139cpu3: MSR_IA32_THERM_STATUS: 0x884e0000 (27 C +/- 1)
130 ... 140 ...
131.fi 141.fi
132The \fBmax efficiency\fP frequency, a.k.a. Low Frequency Mode, is the frequency 142The \fBmax efficiency\fP frequency, a.k.a. Low Frequency Mode, is the frequency
133available at the minimum package voltage. The \fBTSC frequency\fP is the nominal 143available at the minimum package voltage. The \fBTSC frequency\fP is the base
134maximum frequency of the processor if turbo-mode were not available. This frequency 144frequency of the processor -- this should match the brand string
145in /proc/cpuinfo. This base frequency
135should be sustainable on all CPUs indefinitely, given nominal power and cooling. 146should be sustainable on all CPUs indefinitely, given nominal power and cooling.
136The remaining rows show what maximum turbo frequency is possible 147The remaining rows show what maximum turbo frequency is possible
137depending on the number of idle cores. Note that this information is 148depending on the number of idle cores. Note that not all information is
138not available on all processors. 149available on all processors.
139.SH FORK EXAMPLE 150.SH FORK EXAMPLE
140If turbostat is invoked with a command, it will fork that command 151If turbostat is invoked with a command, it will fork that command
141and output the statistics gathered when the command exits. 152and output the statistics gathered when the command exits.
@@ -176,6 +187,11 @@ not including any non-busy idle time.
176 187
177.B "turbostat " 188.B "turbostat "
178must be run as root. 189must be run as root.
190Alternatively, non-root users can be enabled to run turbostat this way:
191
192# setcap cap_sys_rawio=ep ./turbostat
193
194# chmod +r /dev/cpu/*/msr
179 195
180.B "turbostat " 196.B "turbostat "
181reads hardware counters, but doesn't write them. 197reads hardware counters, but doesn't write them.
@@ -184,15 +200,33 @@ multiple invocations of itself.
184 200
185\fBturbostat \fP 201\fBturbostat \fP
186may work poorly on Linux-2.6.20 through 2.6.29, 202may work poorly on Linux-2.6.20 through 2.6.29,
187as \fBacpi-cpufreq \fPperiodically cleared the APERF and MPERF 203as \fBacpi-cpufreq \fPperiodically cleared the APERF and MPERF MSRs
188in those kernels. 204in those kernels.
189 205
190If the TSC column does not make sense, then 206AVG_MHz = APERF_delta/measurement_interval. This is the actual
191the other numbers will also make no sense. 207number of elapsed cycles divided by the entire sample interval --
192Turbostat is lightweight, and its data collection is not atomic. 208including idle time. Note that this calculation is resilient
193These issues are usually caused by an extremely short measurement 209to systems lacking a non-stop TSC.
194interval (much less than 1 second), or system activity that prevents 210
195turbostat from being able to run on all CPUS to quickly collect data. 211TSC_MHz = TSC_delta/measurement_interval.
212On a system with an invariant TSC, this value will be constant
213and will closely match the base frequency value shown
214in the brand string in /proc/cpuinfo. On a system where
215the TSC stops in idle, TSC_MHz will drop
216below the processor's base frequency.
217
218%Busy = MPERF_delta/TSC_delta
219
220Bzy_MHz = TSC_delta*APERF_delta/MPERF_delta/measurement_interval
221
222Note that these calculations depend on TSC_delta, so they
223are not reliable during intervals when TSC_MHz is not running at the base frequency.
224
225Turbostat data collection is not atomic.
226Extremely short measurement intervals (much less than 1 second),
227or system activity that prevents turbostat from being able
228to run on all CPUS to quickly collect data, will result in
229inconsistent results.
196 230
197The APERF, MPERF MSRs are defined to count non-halted cycles. 231The APERF, MPERF MSRs are defined to count non-halted cycles.
198Although it is not guaranteed by the architecture, turbostat assumes 232Although it is not guaranteed by the architecture, turbostat assumes
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 5b1b807265a1..2d089cac8580 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -33,24 +33,29 @@
33#include <signal.h> 33#include <signal.h>
34#include <sys/time.h> 34#include <sys/time.h>
35#include <stdlib.h> 35#include <stdlib.h>
36#include <getopt.h>
36#include <dirent.h> 37#include <dirent.h>
37#include <string.h> 38#include <string.h>
38#include <ctype.h> 39#include <ctype.h>
39#include <sched.h> 40#include <sched.h>
40#include <cpuid.h> 41#include <cpuid.h>
42#include <linux/capability.h>
43#include <errno.h>
41 44
42char *proc_stat = "/proc/stat"; 45char *proc_stat = "/proc/stat";
43unsigned int interval_sec = 5; /* set with -i interval_sec */ 46unsigned int interval_sec = 5;
44unsigned int verbose; /* set with -v */ 47unsigned int debug;
45unsigned int rapl_verbose; /* set with -R */ 48unsigned int rapl_joules;
46unsigned int rapl_joules; /* set with -J */ 49unsigned int summary_only;
47unsigned int thermal_verbose; /* set with -T */ 50unsigned int dump_only;
48unsigned int summary_only; /* set with -S */
49unsigned int dump_only; /* set with -s */
50unsigned int skip_c0; 51unsigned int skip_c0;
51unsigned int skip_c1; 52unsigned int skip_c1;
52unsigned int do_nhm_cstates; 53unsigned int do_nhm_cstates;
53unsigned int do_snb_cstates; 54unsigned int do_snb_cstates;
55unsigned int do_pc2;
56unsigned int do_pc3;
57unsigned int do_pc6;
58unsigned int do_pc7;
54unsigned int do_c8_c9_c10; 59unsigned int do_c8_c9_c10;
55unsigned int do_slm_cstates; 60unsigned int do_slm_cstates;
56unsigned int use_c1_residency_msr; 61unsigned int use_c1_residency_msr;
@@ -59,8 +64,8 @@ unsigned int has_epb;
59unsigned int units = 1000000; /* MHz etc */ 64unsigned int units = 1000000; /* MHz etc */
60unsigned int genuine_intel; 65unsigned int genuine_intel;
61unsigned int has_invariant_tsc; 66unsigned int has_invariant_tsc;
62unsigned int do_nehalem_platform_info; 67unsigned int do_nhm_platform_info;
63unsigned int do_nehalem_turbo_ratio_limit; 68unsigned int do_nhm_turbo_ratio_limit;
64unsigned int do_ivt_turbo_ratio_limit; 69unsigned int do_ivt_turbo_ratio_limit;
65unsigned int extra_msr_offset32; 70unsigned int extra_msr_offset32;
66unsigned int extra_msr_offset64; 71unsigned int extra_msr_offset64;
@@ -81,6 +86,9 @@ unsigned int tcc_activation_temp;
81unsigned int tcc_activation_temp_override; 86unsigned int tcc_activation_temp_override;
82double rapl_power_units, rapl_energy_units, rapl_time_units; 87double rapl_power_units, rapl_energy_units, rapl_time_units;
83double rapl_joule_counter_range; 88double rapl_joule_counter_range;
89unsigned int do_core_perf_limit_reasons;
90unsigned int do_gfx_perf_limit_reasons;
91unsigned int do_ring_perf_limit_reasons;
84 92
85#define RAPL_PKG (1 << 0) 93#define RAPL_PKG (1 << 0)
86 /* 0x610 MSR_PKG_POWER_LIMIT */ 94 /* 0x610 MSR_PKG_POWER_LIMIT */
@@ -251,15 +259,13 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
251 sprintf(pathname, "/dev/cpu/%d/msr", cpu); 259 sprintf(pathname, "/dev/cpu/%d/msr", cpu);
252 fd = open(pathname, O_RDONLY); 260 fd = open(pathname, O_RDONLY);
253 if (fd < 0) 261 if (fd < 0)
254 return -1; 262 err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname);
255 263
256 retval = pread(fd, msr, sizeof *msr, offset); 264 retval = pread(fd, msr, sizeof *msr, offset);
257 close(fd); 265 close(fd);
258 266
259 if (retval != sizeof *msr) { 267 if (retval != sizeof *msr)
260 fprintf(stderr, "%s offset 0x%llx read failed\n", pathname, (unsigned long long)offset); 268 err(-1, "%s offset 0x%llx read failed", pathname, (unsigned long long)offset);
261 return -1;
262 }
263 269
264 return 0; 270 return 0;
265} 271}
@@ -281,7 +287,7 @@ void print_header(void)
281 outp += sprintf(outp, " CPU"); 287 outp += sprintf(outp, " CPU");
282 if (has_aperf) 288 if (has_aperf)
283 outp += sprintf(outp, " Avg_MHz"); 289 outp += sprintf(outp, " Avg_MHz");
284 if (do_nhm_cstates) 290 if (has_aperf)
285 outp += sprintf(outp, " %%Busy"); 291 outp += sprintf(outp, " %%Busy");
286 if (has_aperf) 292 if (has_aperf)
287 outp += sprintf(outp, " Bzy_MHz"); 293 outp += sprintf(outp, " Bzy_MHz");
@@ -310,13 +316,13 @@ void print_header(void)
310 if (do_ptm) 316 if (do_ptm)
311 outp += sprintf(outp, " PkgTmp"); 317 outp += sprintf(outp, " PkgTmp");
312 318
313 if (do_snb_cstates) 319 if (do_pc2)
314 outp += sprintf(outp, " Pkg%%pc2"); 320 outp += sprintf(outp, " Pkg%%pc2");
315 if (do_nhm_cstates && !do_slm_cstates) 321 if (do_pc3)
316 outp += sprintf(outp, " Pkg%%pc3"); 322 outp += sprintf(outp, " Pkg%%pc3");
317 if (do_nhm_cstates && !do_slm_cstates) 323 if (do_pc6)
318 outp += sprintf(outp, " Pkg%%pc6"); 324 outp += sprintf(outp, " Pkg%%pc6");
319 if (do_snb_cstates) 325 if (do_pc7)
320 outp += sprintf(outp, " Pkg%%pc7"); 326 outp += sprintf(outp, " Pkg%%pc7");
321 if (do_c8_c9_c10) { 327 if (do_c8_c9_c10) {
322 outp += sprintf(outp, " Pkg%%pc8"); 328 outp += sprintf(outp, " Pkg%%pc8");
@@ -337,7 +343,7 @@ void print_header(void)
337 outp += sprintf(outp, " PKG_%%"); 343 outp += sprintf(outp, " PKG_%%");
338 if (do_rapl & RAPL_DRAM_PERF_STATUS) 344 if (do_rapl & RAPL_DRAM_PERF_STATUS)
339 outp += sprintf(outp, " RAM_%%"); 345 outp += sprintf(outp, " RAM_%%");
340 } else { 346 } else if (do_rapl && rapl_joules) {
341 if (do_rapl & RAPL_PKG) 347 if (do_rapl & RAPL_PKG)
342 outp += sprintf(outp, " Pkg_J"); 348 outp += sprintf(outp, " Pkg_J");
343 if (do_rapl & RAPL_CORES) 349 if (do_rapl & RAPL_CORES)
@@ -391,9 +397,12 @@ int dump_counters(struct thread_data *t, struct core_data *c,
391 if (p) { 397 if (p) {
392 outp += sprintf(outp, "package: %d\n", p->package_id); 398 outp += sprintf(outp, "package: %d\n", p->package_id);
393 outp += sprintf(outp, "pc2: %016llX\n", p->pc2); 399 outp += sprintf(outp, "pc2: %016llX\n", p->pc2);
394 outp += sprintf(outp, "pc3: %016llX\n", p->pc3); 400 if (do_pc3)
395 outp += sprintf(outp, "pc6: %016llX\n", p->pc6); 401 outp += sprintf(outp, "pc3: %016llX\n", p->pc3);
396 outp += sprintf(outp, "pc7: %016llX\n", p->pc7); 402 if (do_pc6)
403 outp += sprintf(outp, "pc6: %016llX\n", p->pc6);
404 if (do_pc7)
405 outp += sprintf(outp, "pc7: %016llX\n", p->pc7);
397 outp += sprintf(outp, "pc8: %016llX\n", p->pc8); 406 outp += sprintf(outp, "pc8: %016llX\n", p->pc8);
398 outp += sprintf(outp, "pc9: %016llX\n", p->pc9); 407 outp += sprintf(outp, "pc9: %016llX\n", p->pc9);
399 outp += sprintf(outp, "pc10: %016llX\n", p->pc10); 408 outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
@@ -457,25 +466,25 @@ int format_counters(struct thread_data *t, struct core_data *c,
457 outp += sprintf(outp, "%8d", t->cpu_id); 466 outp += sprintf(outp, "%8d", t->cpu_id);
458 } 467 }
459 468
460 /* AvgMHz */ 469 /* Avg_MHz */
461 if (has_aperf) 470 if (has_aperf)
462 outp += sprintf(outp, "%8.0f", 471 outp += sprintf(outp, "%8.0f",
463 1.0 / units * t->aperf / interval_float); 472 1.0 / units * t->aperf / interval_float);
464 473
465 /* %c0 */ 474 /* %Busy */
466 if (do_nhm_cstates) { 475 if (has_aperf) {
467 if (!skip_c0) 476 if (!skip_c0)
468 outp += sprintf(outp, "%8.2f", 100.0 * t->mperf/t->tsc); 477 outp += sprintf(outp, "%8.2f", 100.0 * t->mperf/t->tsc);
469 else 478 else
470 outp += sprintf(outp, "********"); 479 outp += sprintf(outp, "********");
471 } 480 }
472 481
473 /* BzyMHz */ 482 /* Bzy_MHz */
474 if (has_aperf) 483 if (has_aperf)
475 outp += sprintf(outp, "%8.0f", 484 outp += sprintf(outp, "%8.0f",
476 1.0 * t->tsc / units * t->aperf / t->mperf / interval_float); 485 1.0 * t->tsc / units * t->aperf / t->mperf / interval_float);
477 486
478 /* TSC */ 487 /* TSC_MHz */
479 outp += sprintf(outp, "%8.0f", 1.0 * t->tsc/units/interval_float); 488 outp += sprintf(outp, "%8.0f", 1.0 * t->tsc/units/interval_float);
480 489
481 /* SMI */ 490 /* SMI */
@@ -525,13 +534,13 @@ int format_counters(struct thread_data *t, struct core_data *c,
525 if (do_ptm) 534 if (do_ptm)
526 outp += sprintf(outp, "%8d", p->pkg_temp_c); 535 outp += sprintf(outp, "%8d", p->pkg_temp_c);
527 536
528 if (do_snb_cstates) 537 if (do_pc2)
529 outp += sprintf(outp, "%8.2f", 100.0 * p->pc2/t->tsc); 538 outp += sprintf(outp, "%8.2f", 100.0 * p->pc2/t->tsc);
530 if (do_nhm_cstates && !do_slm_cstates) 539 if (do_pc3)
531 outp += sprintf(outp, "%8.2f", 100.0 * p->pc3/t->tsc); 540 outp += sprintf(outp, "%8.2f", 100.0 * p->pc3/t->tsc);
532 if (do_nhm_cstates && !do_slm_cstates) 541 if (do_pc6)
533 outp += sprintf(outp, "%8.2f", 100.0 * p->pc6/t->tsc); 542 outp += sprintf(outp, "%8.2f", 100.0 * p->pc6/t->tsc);
534 if (do_snb_cstates) 543 if (do_pc7)
535 outp += sprintf(outp, "%8.2f", 100.0 * p->pc7/t->tsc); 544 outp += sprintf(outp, "%8.2f", 100.0 * p->pc7/t->tsc);
536 if (do_c8_c9_c10) { 545 if (do_c8_c9_c10) {
537 outp += sprintf(outp, "%8.2f", 100.0 * p->pc8/t->tsc); 546 outp += sprintf(outp, "%8.2f", 100.0 * p->pc8/t->tsc);
@@ -561,7 +570,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
561 outp += sprintf(outp, fmt8, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); 570 outp += sprintf(outp, fmt8, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float);
562 if (do_rapl & RAPL_DRAM_PERF_STATUS) 571 if (do_rapl & RAPL_DRAM_PERF_STATUS)
563 outp += sprintf(outp, fmt8, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); 572 outp += sprintf(outp, fmt8, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float);
564 } else { 573 } else if (do_rapl && rapl_joules) {
565 if (do_rapl & RAPL_PKG) 574 if (do_rapl & RAPL_PKG)
566 outp += sprintf(outp, fmt8, 575 outp += sprintf(outp, fmt8,
567 p->energy_pkg * rapl_energy_units); 576 p->energy_pkg * rapl_energy_units);
@@ -578,8 +587,8 @@ int format_counters(struct thread_data *t, struct core_data *c,
578 outp += sprintf(outp, fmt8, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); 587 outp += sprintf(outp, fmt8, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float);
579 if (do_rapl & RAPL_DRAM_PERF_STATUS) 588 if (do_rapl & RAPL_DRAM_PERF_STATUS)
580 outp += sprintf(outp, fmt8, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); 589 outp += sprintf(outp, fmt8, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float);
581 outp += sprintf(outp, fmt8, interval_float);
582 590
591 outp += sprintf(outp, fmt8, interval_float);
583 } 592 }
584done: 593done:
585 outp += sprintf(outp, "\n"); 594 outp += sprintf(outp, "\n");
@@ -628,9 +637,12 @@ void
628delta_package(struct pkg_data *new, struct pkg_data *old) 637delta_package(struct pkg_data *new, struct pkg_data *old)
629{ 638{
630 old->pc2 = new->pc2 - old->pc2; 639 old->pc2 = new->pc2 - old->pc2;
631 old->pc3 = new->pc3 - old->pc3; 640 if (do_pc3)
632 old->pc6 = new->pc6 - old->pc6; 641 old->pc3 = new->pc3 - old->pc3;
633 old->pc7 = new->pc7 - old->pc7; 642 if (do_pc6)
643 old->pc6 = new->pc6 - old->pc6;
644 if (do_pc7)
645 old->pc7 = new->pc7 - old->pc7;
634 old->pc8 = new->pc8 - old->pc8; 646 old->pc8 = new->pc8 - old->pc8;
635 old->pc9 = new->pc9 - old->pc9; 647 old->pc9 = new->pc9 - old->pc9;
636 old->pc10 = new->pc10 - old->pc10; 648 old->pc10 = new->pc10 - old->pc10;
@@ -670,24 +682,26 @@ delta_thread(struct thread_data *new, struct thread_data *old,
670 682
671 old->c1 = new->c1 - old->c1; 683 old->c1 = new->c1 - old->c1;
672 684
673 if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) { 685 if (has_aperf) {
674 old->aperf = new->aperf - old->aperf; 686 if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
675 old->mperf = new->mperf - old->mperf; 687 old->aperf = new->aperf - old->aperf;
676 } else { 688 old->mperf = new->mperf - old->mperf;
689 } else {
677 690
678 if (!aperf_mperf_unstable) { 691 if (!aperf_mperf_unstable) {
679 fprintf(stderr, "%s: APERF or MPERF went backwards *\n", progname); 692 fprintf(stderr, "%s: APERF or MPERF went backwards *\n", progname);
680 fprintf(stderr, "* Frequency results do not cover entire interval *\n"); 693 fprintf(stderr, "* Frequency results do not cover entire interval *\n");
681 fprintf(stderr, "* fix this by running Linux-2.6.30 or later *\n"); 694 fprintf(stderr, "* fix this by running Linux-2.6.30 or later *\n");
682 695
683 aperf_mperf_unstable = 1; 696 aperf_mperf_unstable = 1;
697 }
698 /*
699 * mperf delta is likely a huge "positive" number
700 * can not use it for calculating c0 time
701 */
702 skip_c0 = 1;
703 skip_c1 = 1;
684 } 704 }
685 /*
686 * mperf delta is likely a huge "positive" number
687 * can not use it for calculating c0 time
688 */
689 skip_c0 = 1;
690 skip_c1 = 1;
691 } 705 }
692 706
693 707
@@ -712,7 +726,7 @@ delta_thread(struct thread_data *new, struct thread_data *old,
712 } 726 }
713 727
714 if (old->mperf == 0) { 728 if (old->mperf == 0) {
715 if (verbose > 1) fprintf(stderr, "cpu%d MPERF 0!\n", old->cpu_id); 729 if (debug > 1) fprintf(stderr, "cpu%d MPERF 0!\n", old->cpu_id);
716 old->mperf = 1; /* divide by 0 protection */ 730 old->mperf = 1; /* divide by 0 protection */
717 } 731 }
718 732
@@ -769,9 +783,12 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
769 c->core_temp_c = 0; 783 c->core_temp_c = 0;
770 784
771 p->pc2 = 0; 785 p->pc2 = 0;
772 p->pc3 = 0; 786 if (do_pc3)
773 p->pc6 = 0; 787 p->pc3 = 0;
774 p->pc7 = 0; 788 if (do_pc6)
789 p->pc6 = 0;
790 if (do_pc7)
791 p->pc7 = 0;
775 p->pc8 = 0; 792 p->pc8 = 0;
776 p->pc9 = 0; 793 p->pc9 = 0;
777 p->pc10 = 0; 794 p->pc10 = 0;
@@ -810,9 +827,12 @@ int sum_counters(struct thread_data *t, struct core_data *c,
810 return 0; 827 return 0;
811 828
812 average.packages.pc2 += p->pc2; 829 average.packages.pc2 += p->pc2;
813 average.packages.pc3 += p->pc3; 830 if (do_pc3)
814 average.packages.pc6 += p->pc6; 831 average.packages.pc3 += p->pc3;
815 average.packages.pc7 += p->pc7; 832 if (do_pc6)
833 average.packages.pc6 += p->pc6;
834 if (do_pc7)
835 average.packages.pc7 += p->pc7;
816 average.packages.pc8 += p->pc8; 836 average.packages.pc8 += p->pc8;
817 average.packages.pc9 += p->pc9; 837 average.packages.pc9 += p->pc9;
818 average.packages.pc10 += p->pc10; 838 average.packages.pc10 += p->pc10;
@@ -854,9 +874,12 @@ void compute_average(struct thread_data *t, struct core_data *c,
854 average.cores.c7 /= topo.num_cores; 874 average.cores.c7 /= topo.num_cores;
855 875
856 average.packages.pc2 /= topo.num_packages; 876 average.packages.pc2 /= topo.num_packages;
857 average.packages.pc3 /= topo.num_packages; 877 if (do_pc3)
858 average.packages.pc6 /= topo.num_packages; 878 average.packages.pc3 /= topo.num_packages;
859 average.packages.pc7 /= topo.num_packages; 879 if (do_pc6)
880 average.packages.pc6 /= topo.num_packages;
881 if (do_pc7)
882 average.packages.pc7 /= topo.num_packages;
860 883
861 average.packages.pc8 /= topo.num_packages; 884 average.packages.pc8 /= topo.num_packages;
862 average.packages.pc9 /= topo.num_packages; 885 average.packages.pc9 /= topo.num_packages;
@@ -956,18 +979,18 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
956 if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) 979 if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
957 return 0; 980 return 0;
958 981
959 if (do_nhm_cstates && !do_slm_cstates) { 982 if (do_pc3)
960 if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) 983 if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3))
961 return -9; 984 return -9;
985 if (do_pc6)
962 if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6)) 986 if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6))
963 return -10; 987 return -10;
964 } 988 if (do_pc2)
965 if (do_snb_cstates) {
966 if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2)) 989 if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2))
967 return -11; 990 return -11;
991 if (do_pc7)
968 if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) 992 if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7))
969 return -12; 993 return -12;
970 }
971 if (do_c8_c9_c10) { 994 if (do_c8_c9_c10) {
972 if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8)) 995 if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8))
973 return -13; 996 return -13;
@@ -1014,12 +1037,43 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1014 return 0; 1037 return 0;
1015} 1038}
1016 1039
1040/*
1041 * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit:
1042 * If you change the values, note they are used both in comparisons
1043 * (>= PCL__7) and to index pkg_cstate_limit_strings[].
1044 */
1045
1046#define PCLUKN 0 /* Unknown */
1047#define PCLRSV 1 /* Reserved */
1048#define PCL__0 2 /* PC0 */
1049#define PCL__1 3 /* PC1 */
1050#define PCL__2 4 /* PC2 */
1051#define PCL__3 5 /* PC3 */
1052#define PCL__4 6 /* PC4 */
1053#define PCL__6 7 /* PC6 */
1054#define PCL_6N 8 /* PC6 No Retention */
1055#define PCL_6R 9 /* PC6 Retention */
1056#define PCL__7 10 /* PC7 */
1057#define PCL_7S 11 /* PC7 Shrink */
1058#define PCLUNL 12 /* Unlimited */
1059
1060int pkg_cstate_limit = PCLUKN;
1061char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2",
1062 "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "unlimited"};
1063
1064int nhm_pkg_cstate_limits[8] = {PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL};
1065int snb_pkg_cstate_limits[8] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL};
1066int hsw_pkg_cstate_limits[8] = {PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCLRSV, PCLUNL};
1067int slv_pkg_cstate_limits[8] = {PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7};
1068int amt_pkg_cstate_limits[8] = {PCL__0, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7};
1069int phi_pkg_cstate_limits[8] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL};
1070
1017void print_verbose_header(void) 1071void print_verbose_header(void)
1018{ 1072{
1019 unsigned long long msr; 1073 unsigned long long msr;
1020 unsigned int ratio; 1074 unsigned int ratio;
1021 1075
1022 if (!do_nehalem_platform_info) 1076 if (!do_nhm_platform_info)
1023 return; 1077 return;
1024 1078
1025 get_msr(0, MSR_NHM_PLATFORM_INFO, &msr); 1079 get_msr(0, MSR_NHM_PLATFORM_INFO, &msr);
@@ -1093,46 +1147,16 @@ print_nhm_turbo_ratio_limits:
1093 1147
1094 fprintf(stderr, "cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x%08llx", msr); 1148 fprintf(stderr, "cpu0: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x%08llx", msr);
1095 1149
1096 fprintf(stderr, " (%s%s%s%s%slocked: pkg-cstate-limit=%d: ", 1150 fprintf(stderr, " (%s%s%s%s%slocked: pkg-cstate-limit=%d: %s)\n",
1097 (msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "", 1151 (msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "",
1098 (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "", 1152 (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "",
1099 (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "", 1153 (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "",
1100 (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "", 1154 (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "",
1101 (msr & (1 << 15)) ? "" : "UN", 1155 (msr & (1 << 15)) ? "" : "UN",
1102 (unsigned int)msr & 7); 1156 (unsigned int)msr & 7,
1103 1157 pkg_cstate_limit_strings[pkg_cstate_limit]);
1104
1105 switch(msr & 0x7) {
1106 case 0:
1107 fprintf(stderr, do_slm_cstates ? "no pkg states" : "pc0");
1108 break;
1109 case 1:
1110 fprintf(stderr, do_slm_cstates ? "no pkg states" : do_snb_cstates ? "pc2" : "pc0");
1111 break;
1112 case 2:
1113 fprintf(stderr, do_slm_cstates ? "invalid" : do_snb_cstates ? "pc6-noret" : "pc3");
1114 break;
1115 case 3:
1116 fprintf(stderr, do_slm_cstates ? "invalid" : "pc6");
1117 break;
1118 case 4:
1119 fprintf(stderr, do_slm_cstates ? "pc4" : "pc7");
1120 break;
1121 case 5:
1122 fprintf(stderr, do_slm_cstates ? "invalid" : do_snb_cstates ? "pc7s" : "invalid");
1123 break;
1124 case 6:
1125 fprintf(stderr, do_slm_cstates ? "pc6" : "invalid");
1126 break;
1127 case 7:
1128 fprintf(stderr, do_slm_cstates ? "pc7" : "unlimited");
1129 break;
1130 default:
1131 fprintf(stderr, "invalid");
1132 }
1133 fprintf(stderr, ")\n");
1134 1158
1135 if (!do_nehalem_turbo_ratio_limit) 1159 if (!do_nhm_turbo_ratio_limit)
1136 return; 1160 return;
1137 1161
1138 get_msr(0, MSR_NHM_TURBO_RATIO_LIMIT, &msr); 1162 get_msr(0, MSR_NHM_TURBO_RATIO_LIMIT, &msr);
@@ -1178,6 +1202,7 @@ print_nhm_turbo_ratio_limits:
1178 if (ratio) 1202 if (ratio)
1179 fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 1 active cores\n", 1203 fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 1 active cores\n",
1180 ratio, bclk, ratio * bclk); 1204 ratio, bclk, ratio * bclk);
1205
1181} 1206}
1182 1207
1183void free_all_buffers(void) 1208void free_all_buffers(void)
@@ -1458,18 +1483,66 @@ void check_dev_msr()
1458 struct stat sb; 1483 struct stat sb;
1459 1484
1460 if (stat("/dev/cpu/0/msr", &sb)) 1485 if (stat("/dev/cpu/0/msr", &sb))
1461 err(-5, "no /dev/cpu/0/msr\n" 1486 err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" ");
1462 "Try \"# modprobe msr\"");
1463} 1487}
1464 1488
1465void check_super_user() 1489void check_permissions()
1466{ 1490{
1467 if (getuid() != 0) 1491 struct __user_cap_header_struct cap_header_data;
1468 errx(-6, "must be root"); 1492 cap_user_header_t cap_header = &cap_header_data;
1493 struct __user_cap_data_struct cap_data_data;
1494 cap_user_data_t cap_data = &cap_data_data;
1495 extern int capget(cap_user_header_t hdrp, cap_user_data_t datap);
1496 int do_exit = 0;
1497
1498 /* check for CAP_SYS_RAWIO */
1499 cap_header->pid = getpid();
1500 cap_header->version = _LINUX_CAPABILITY_VERSION;
1501 if (capget(cap_header, cap_data) < 0)
1502 err(-6, "capget(2) failed");
1503
1504 if ((cap_data->effective & (1 << CAP_SYS_RAWIO)) == 0) {
1505 do_exit++;
1506 warnx("capget(CAP_SYS_RAWIO) failed,"
1507 " try \"# setcap cap_sys_rawio=ep %s\"", progname);
1508 }
1509
1510 /* test file permissions */
1511 if (euidaccess("/dev/cpu/0/msr", R_OK)) {
1512 do_exit++;
1513 warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr");
1514 }
1515
1516 /* if all else fails, tell them to be root */
1517 if (do_exit)
1518 if (getuid() != 0)
1519 warnx("... or simply run as root");
1520
1521 if (do_exit)
1522 exit(-6);
1469} 1523}
1470 1524
1471int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model) 1525/*
1526 * NHM adds support for additional MSRs:
1527 *
1528 * MSR_SMI_COUNT 0x00000034
1529 *
1530 * MSR_NHM_PLATFORM_INFO 0x000000ce
1531 * MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
1532 *
1533 * MSR_PKG_C3_RESIDENCY 0x000003f8
1534 * MSR_PKG_C6_RESIDENCY 0x000003f9
1535 * MSR_CORE_C3_RESIDENCY 0x000003fc
1536 * MSR_CORE_C6_RESIDENCY 0x000003fd
1537 *
1538 * Side effect:
1539 * sets global pkg_cstate_limit to decode MSR_NHM_SNB_PKG_CST_CFG_CTL
1540 */
1541int probe_nhm_msrs(unsigned int family, unsigned int model)
1472{ 1542{
1543 unsigned long long msr;
1544 int *pkg_cstate_limits;
1545
1473 if (!genuine_intel) 1546 if (!genuine_intel)
1474 return 0; 1547 return 0;
1475 1548
@@ -1482,24 +1555,54 @@ int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model)
1482 case 0x1F: /* Core i7 and i5 Processor - Nehalem */ 1555 case 0x1F: /* Core i7 and i5 Processor - Nehalem */
1483 case 0x25: /* Westmere Client - Clarkdale, Arrandale */ 1556 case 0x25: /* Westmere Client - Clarkdale, Arrandale */
1484 case 0x2C: /* Westmere EP - Gulftown */ 1557 case 0x2C: /* Westmere EP - Gulftown */
1558 case 0x2E: /* Nehalem-EX Xeon - Beckton */
1559 case 0x2F: /* Westmere-EX Xeon - Eagleton */
1560 pkg_cstate_limits = nhm_pkg_cstate_limits;
1561 break;
1485 case 0x2A: /* SNB */ 1562 case 0x2A: /* SNB */
1486 case 0x2D: /* SNB Xeon */ 1563 case 0x2D: /* SNB Xeon */
1487 case 0x3A: /* IVB */ 1564 case 0x3A: /* IVB */
1488 case 0x3E: /* IVB Xeon */ 1565 case 0x3E: /* IVB Xeon */
1566 pkg_cstate_limits = snb_pkg_cstate_limits;
1567 break;
1489 case 0x3C: /* HSW */ 1568 case 0x3C: /* HSW */
1490 case 0x3F: /* HSX */ 1569 case 0x3F: /* HSX */
1491 case 0x45: /* HSW */ 1570 case 0x45: /* HSW */
1492 case 0x46: /* HSW */ 1571 case 0x46: /* HSW */
1493 case 0x37: /* BYT */
1494 case 0x4D: /* AVN */
1495 case 0x3D: /* BDW */ 1572 case 0x3D: /* BDW */
1573 case 0x47: /* BDW */
1496 case 0x4F: /* BDX */ 1574 case 0x4F: /* BDX */
1497 case 0x56: /* BDX-DE */ 1575 case 0x56: /* BDX-DE */
1498 return 1; 1576 pkg_cstate_limits = hsw_pkg_cstate_limits;
1577 break;
1578 case 0x37: /* BYT */
1579 case 0x4D: /* AVN */
1580 pkg_cstate_limits = slv_pkg_cstate_limits;
1581 break;
1582 case 0x4C: /* AMT */
1583 pkg_cstate_limits = amt_pkg_cstate_limits;
1584 break;
1585 case 0x57: /* PHI */
1586 pkg_cstate_limits = phi_pkg_cstate_limits;
1587 break;
1588 default:
1589 return 0;
1590 }
1591 get_msr(0, MSR_NHM_SNB_PKG_CST_CFG_CTL, &msr);
1592
1593 pkg_cstate_limit = pkg_cstate_limits[msr & 0x7];
1594
1595 return 1;
1596}
1597int has_nhm_turbo_ratio_limit(unsigned int family, unsigned int model)
1598{
1599 switch (model) {
1600 /* Nehalem compatible, but do not include turbo-ratio limit support */
1499 case 0x2E: /* Nehalem-EX Xeon - Beckton */ 1601 case 0x2E: /* Nehalem-EX Xeon - Beckton */
1500 case 0x2F: /* Westmere-EX Xeon - Eagleton */ 1602 case 0x2F: /* Westmere-EX Xeon - Eagleton */
1501 default:
1502 return 0; 1603 return 0;
1604 default:
1605 return 1;
1503 } 1606 }
1504} 1607}
1505int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) 1608int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model)
@@ -1564,6 +1667,103 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1564 return 0; 1667 return 0;
1565} 1668}
1566 1669
1670/*
1671 * print_perf_limit()
1672 */
1673int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1674{
1675 unsigned long long msr;
1676 int cpu;
1677
1678 cpu = t->cpu_id;
1679
1680 /* per-package */
1681 if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
1682 return 0;
1683
1684 if (cpu_migrate(cpu)) {
1685 fprintf(stderr, "Could not migrate to CPU %d\n", cpu);
1686 return -1;
1687 }
1688
1689 if (do_core_perf_limit_reasons) {
1690 get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
1691 fprintf(stderr, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
1692 fprintf(stderr, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
1693 (msr & 1 << 0) ? "PROCHOT, " : "",
1694 (msr & 1 << 1) ? "ThermStatus, " : "",
1695 (msr & 1 << 2) ? "bit2, " : "",
1696 (msr & 1 << 4) ? "Graphics, " : "",
1697 (msr & 1 << 5) ? "Auto-HWP, " : "",
1698 (msr & 1 << 6) ? "VR-Therm, " : "",
1699 (msr & 1 << 8) ? "Amps, " : "",
1700 (msr & 1 << 9) ? "CorePwr, " : "",
1701 (msr & 1 << 10) ? "PkgPwrL1, " : "",
1702 (msr & 1 << 11) ? "PkgPwrL2, " : "",
1703 (msr & 1 << 12) ? "MultiCoreTurbo, " : "",
1704 (msr & 1 << 13) ? "Transitions, " : "",
1705 (msr & 1 << 14) ? "bit14, " : "",
1706 (msr & 1 << 15) ? "bit15, " : "");
1707 fprintf(stderr, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
1708 (msr & 1 << 16) ? "PROCHOT, " : "",
1709 (msr & 1 << 17) ? "ThermStatus, " : "",
1710 (msr & 1 << 18) ? "bit18, " : "",
1711 (msr & 1 << 20) ? "Graphics, " : "",
1712 (msr & 1 << 21) ? "Auto-HWP, " : "",
1713 (msr & 1 << 22) ? "VR-Therm, " : "",
1714 (msr & 1 << 24) ? "Amps, " : "",
1715 (msr & 1 << 25) ? "CorePwr, " : "",
1716 (msr & 1 << 26) ? "PkgPwrL1, " : "",
1717 (msr & 1 << 27) ? "PkgPwrL2, " : "",
1718 (msr & 1 << 28) ? "MultiCoreTurbo, " : "",
1719 (msr & 1 << 29) ? "Transitions, " : "",
1720 (msr & 1 << 30) ? "bit30, " : "",
1721 (msr & 1 << 31) ? "bit31, " : "");
1722
1723 }
1724 if (do_gfx_perf_limit_reasons) {
1725 get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr);
1726 fprintf(stderr, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
1727 fprintf(stderr, " (Active: %s%s%s%s%s%s%s%s)",
1728 (msr & 1 << 0) ? "PROCHOT, " : "",
1729 (msr & 1 << 1) ? "ThermStatus, " : "",
1730 (msr & 1 << 4) ? "Graphics, " : "",
1731 (msr & 1 << 6) ? "VR-Therm, " : "",
1732 (msr & 1 << 8) ? "Amps, " : "",
1733 (msr & 1 << 9) ? "GFXPwr, " : "",
1734 (msr & 1 << 10) ? "PkgPwrL1, " : "",
1735 (msr & 1 << 11) ? "PkgPwrL2, " : "");
1736 fprintf(stderr, " (Logged: %s%s%s%s%s%s%s%s)\n",
1737 (msr & 1 << 16) ? "PROCHOT, " : "",
1738 (msr & 1 << 17) ? "ThermStatus, " : "",
1739 (msr & 1 << 20) ? "Graphics, " : "",
1740 (msr & 1 << 22) ? "VR-Therm, " : "",
1741 (msr & 1 << 24) ? "Amps, " : "",
1742 (msr & 1 << 25) ? "GFXPwr, " : "",
1743 (msr & 1 << 26) ? "PkgPwrL1, " : "",
1744 (msr & 1 << 27) ? "PkgPwrL2, " : "");
1745 }
1746 if (do_ring_perf_limit_reasons) {
1747 get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
1748 fprintf(stderr, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
1749 fprintf(stderr, " (Active: %s%s%s%s%s%s)",
1750 (msr & 1 << 0) ? "PROCHOT, " : "",
1751 (msr & 1 << 1) ? "ThermStatus, " : "",
1752 (msr & 1 << 6) ? "VR-Therm, " : "",
1753 (msr & 1 << 8) ? "Amps, " : "",
1754 (msr & 1 << 10) ? "PkgPwrL1, " : "",
1755 (msr & 1 << 11) ? "PkgPwrL2, " : "");
1756 fprintf(stderr, " (Logged: %s%s%s%s%s%s)\n",
1757 (msr & 1 << 16) ? "PROCHOT, " : "",
1758 (msr & 1 << 17) ? "ThermStatus, " : "",
1759 (msr & 1 << 22) ? "VR-Therm, " : "",
1760 (msr & 1 << 24) ? "Amps, " : "",
1761 (msr & 1 << 26) ? "PkgPwrL1, " : "",
1762 (msr & 1 << 27) ? "PkgPwrL2, " : "");
1763 }
1764 return 0;
1765}
1766
1567#define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ 1767#define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */
1568#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ 1768#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */
1569 1769
@@ -1609,6 +1809,7 @@ void rapl_probe(unsigned int family, unsigned int model)
1609 case 0x45: /* HSW */ 1809 case 0x45: /* HSW */
1610 case 0x46: /* HSW */ 1810 case 0x46: /* HSW */
1611 case 0x3D: /* BDW */ 1811 case 0x3D: /* BDW */
1812 case 0x47: /* BDW */
1612 do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO; 1813 do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO;
1613 break; 1814 break;
1614 case 0x3F: /* HSX */ 1815 case 0x3F: /* HSX */
@@ -1647,12 +1848,33 @@ void rapl_probe(unsigned int family, unsigned int model)
1647 tdp = get_tdp(model); 1848 tdp = get_tdp(model);
1648 1849
1649 rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; 1850 rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
1650 if (verbose) 1851 if (debug)
1651 fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); 1852 fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
1652 1853
1653 return; 1854 return;
1654} 1855}
1655 1856
1857void perf_limit_reasons_probe(family, model)
1858{
1859 if (!genuine_intel)
1860 return;
1861
1862 if (family != 6)
1863 return;
1864
1865 switch (model) {
1866 case 0x3C: /* HSW */
1867 case 0x45: /* HSW */
1868 case 0x46: /* HSW */
1869 do_gfx_perf_limit_reasons = 1;
1870 case 0x3F: /* HSX */
1871 do_core_perf_limit_reasons = 1;
1872 do_ring_perf_limit_reasons = 1;
1873 default:
1874 return;
1875 }
1876}
1877
1656int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) 1878int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1657{ 1879{
1658 unsigned long long msr; 1880 unsigned long long msr;
@@ -1751,7 +1973,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1751 if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr)) 1973 if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr))
1752 return -1; 1974 return -1;
1753 1975
1754 if (verbose) { 1976 if (debug) {
1755 fprintf(stderr, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx " 1977 fprintf(stderr, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx "
1756 "(%f Watts, %f Joules, %f sec.)\n", cpu, msr, 1978 "(%f Watts, %f Joules, %f sec.)\n", cpu, msr,
1757 rapl_power_units, rapl_energy_units, rapl_time_units); 1979 rapl_power_units, rapl_energy_units, rapl_time_units);
@@ -1808,7 +2030,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1808 print_power_limit_msr(cpu, msr, "DRAM Limit"); 2030 print_power_limit_msr(cpu, msr, "DRAM Limit");
1809 } 2031 }
1810 if (do_rapl & RAPL_CORE_POLICY) { 2032 if (do_rapl & RAPL_CORE_POLICY) {
1811 if (verbose) { 2033 if (debug) {
1812 if (get_msr(cpu, MSR_PP0_POLICY, &msr)) 2034 if (get_msr(cpu, MSR_PP0_POLICY, &msr))
1813 return -7; 2035 return -7;
1814 2036
@@ -1816,7 +2038,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1816 } 2038 }
1817 } 2039 }
1818 if (do_rapl & RAPL_CORES) { 2040 if (do_rapl & RAPL_CORES) {
1819 if (verbose) { 2041 if (debug) {
1820 2042
1821 if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) 2043 if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
1822 return -9; 2044 return -9;
@@ -1826,7 +2048,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1826 } 2048 }
1827 } 2049 }
1828 if (do_rapl & RAPL_GFX) { 2050 if (do_rapl & RAPL_GFX) {
1829 if (verbose) { 2051 if (debug) {
1830 if (get_msr(cpu, MSR_PP1_POLICY, &msr)) 2052 if (get_msr(cpu, MSR_PP1_POLICY, &msr))
1831 return -8; 2053 return -8;
1832 2054
@@ -1842,8 +2064,15 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1842 return 0; 2064 return 0;
1843} 2065}
1844 2066
2067/*
2068 * SNB adds support for additional MSRs:
2069 *
2070 * MSR_PKG_C7_RESIDENCY 0x000003fa
2071 * MSR_CORE_C7_RESIDENCY 0x000003fe
2072 * MSR_PKG_C2_RESIDENCY 0x0000060d
2073 */
1845 2074
1846int is_snb(unsigned int family, unsigned int model) 2075int has_snb_msrs(unsigned int family, unsigned int model)
1847{ 2076{
1848 if (!genuine_intel) 2077 if (!genuine_intel)
1849 return 0; 2078 return 0;
@@ -1858,6 +2087,7 @@ int is_snb(unsigned int family, unsigned int model)
1858 case 0x45: /* HSW */ 2087 case 0x45: /* HSW */
1859 case 0x46: /* HSW */ 2088 case 0x46: /* HSW */
1860 case 0x3D: /* BDW */ 2089 case 0x3D: /* BDW */
2090 case 0x47: /* BDW */
1861 case 0x4F: /* BDX */ 2091 case 0x4F: /* BDX */
1862 case 0x56: /* BDX-DE */ 2092 case 0x56: /* BDX-DE */
1863 return 1; 2093 return 1;
@@ -1865,7 +2095,14 @@ int is_snb(unsigned int family, unsigned int model)
1865 return 0; 2095 return 0;
1866} 2096}
1867 2097
1868int has_c8_c9_c10(unsigned int family, unsigned int model) 2098/*
2099 * HSW adds support for additional MSRs:
2100 *
2101 * MSR_PKG_C8_RESIDENCY 0x00000630
2102 * MSR_PKG_C9_RESIDENCY 0x00000631
2103 * MSR_PKG_C10_RESIDENCY 0x00000632
2104 */
2105int has_hsw_msrs(unsigned int family, unsigned int model)
1869{ 2106{
1870 if (!genuine_intel) 2107 if (!genuine_intel)
1871 return 0; 2108 return 0;
@@ -1917,7 +2154,7 @@ double slm_bclk(void)
1917 2154
1918double discover_bclk(unsigned int family, unsigned int model) 2155double discover_bclk(unsigned int family, unsigned int model)
1919{ 2156{
1920 if (is_snb(family, model)) 2157 if (has_snb_msrs(family, model))
1921 return 100.00; 2158 return 100.00;
1922 else if (is_slm(family, model)) 2159 else if (is_slm(family, model))
1923 return slm_bclk(); 2160 return slm_bclk();
@@ -1965,7 +2202,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
1965 } 2202 }
1966 2203
1967 /* Temperature Target MSR is Nehalem and newer only */ 2204 /* Temperature Target MSR is Nehalem and newer only */
1968 if (!do_nehalem_platform_info) 2205 if (!do_nhm_platform_info)
1969 goto guess; 2206 goto guess;
1970 2207
1971 if (get_msr(0, MSR_IA32_TEMPERATURE_TARGET, &msr)) 2208 if (get_msr(0, MSR_IA32_TEMPERATURE_TARGET, &msr))
@@ -1973,7 +2210,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
1973 2210
1974 target_c_local = (msr >> 16) & 0xFF; 2211 target_c_local = (msr >> 16) & 0xFF;
1975 2212
1976 if (verbose) 2213 if (debug)
1977 fprintf(stderr, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", 2214 fprintf(stderr, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n",
1978 cpu, msr, target_c_local); 2215 cpu, msr, target_c_local);
1979 2216
@@ -2003,7 +2240,7 @@ void check_cpuid()
2003 if (ebx == 0x756e6547 && edx == 0x49656e69 && ecx == 0x6c65746e) 2240 if (ebx == 0x756e6547 && edx == 0x49656e69 && ecx == 0x6c65746e)
2004 genuine_intel = 1; 2241 genuine_intel = 1;
2005 2242
2006 if (verbose) 2243 if (debug)
2007 fprintf(stderr, "CPUID(0): %.4s%.4s%.4s ", 2244 fprintf(stderr, "CPUID(0): %.4s%.4s%.4s ",
2008 (char *)&ebx, (char *)&edx, (char *)&ecx); 2245 (char *)&ebx, (char *)&edx, (char *)&ecx);
2009 2246
@@ -2014,7 +2251,7 @@ void check_cpuid()
2014 if (family == 6 || family == 0xf) 2251 if (family == 6 || family == 0xf)
2015 model += ((fms >> 16) & 0xf) << 4; 2252 model += ((fms >> 16) & 0xf) << 4;
2016 2253
2017 if (verbose) 2254 if (debug)
2018 fprintf(stderr, "%d CPUID levels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", 2255 fprintf(stderr, "%d CPUID levels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n",
2019 max_level, family, model, stepping, family, model, stepping); 2256 max_level, family, model, stepping, family, model, stepping);
2020 2257
@@ -2029,18 +2266,15 @@ void check_cpuid()
2029 ebx = ecx = edx = 0; 2266 ebx = ecx = edx = 0;
2030 __get_cpuid(0x80000000, &max_level, &ebx, &ecx, &edx); 2267 __get_cpuid(0x80000000, &max_level, &ebx, &ecx, &edx);
2031 2268
2032 if (max_level < 0x80000007) 2269 if (max_level >= 0x80000007) {
2033 errx(1, "CPUID: no invariant TSC (max_level 0x%x)", max_level);
2034 2270
2035 /* 2271 /*
2036 * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8 2272 * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
2037 * this check is valid for both Intel and AMD 2273 * this check is valid for both Intel and AMD
2038 */ 2274 */
2039 __get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx); 2275 __get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
2040 has_invariant_tsc = edx & (1 << 8); 2276 has_invariant_tsc = edx & (1 << 8);
2041 2277 }
2042 if (!has_invariant_tsc)
2043 errx(1, "No invariant TSC");
2044 2278
2045 /* 2279 /*
2046 * APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0 2280 * APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
@@ -2053,36 +2287,51 @@ void check_cpuid()
2053 do_ptm = eax & (1 << 6); 2287 do_ptm = eax & (1 << 6);
2054 has_epb = ecx & (1 << 3); 2288 has_epb = ecx & (1 << 3);
2055 2289
2056 if (verbose) 2290 if (debug)
2057 fprintf(stderr, "CPUID(6): %s%s%s%s\n", 2291 fprintf(stderr, "CPUID(6): %sAPERF, %sDTS, %sPTM, %sEPB\n",
2058 has_aperf ? "APERF" : "No APERF!", 2292 has_aperf ? "" : "No ",
2059 do_dts ? ", DTS" : "", 2293 do_dts ? "" : "No ",
2060 do_ptm ? ", PTM": "", 2294 do_ptm ? "" : "No ",
2061 has_epb ? ", EPB": ""); 2295 has_epb ? "" : "No ");
2062 2296
2063 if (!has_aperf) 2297 do_nhm_platform_info = do_nhm_cstates = do_smi = probe_nhm_msrs(family, model);
2064 errx(-1, "No APERF"); 2298 do_snb_cstates = has_snb_msrs(family, model);
2065 2299 do_pc2 = do_snb_cstates && (pkg_cstate_limit >= PCL__2);
2066 do_nehalem_platform_info = genuine_intel && has_invariant_tsc; 2300 do_pc3 = (pkg_cstate_limit >= PCL__3);
2067 do_nhm_cstates = genuine_intel; /* all Intel w/ non-stop TSC have NHM counters */ 2301 do_pc6 = (pkg_cstate_limit >= PCL__6);
2068 do_smi = do_nhm_cstates; 2302 do_pc7 = do_snb_cstates && (pkg_cstate_limit >= PCL__7);
2069 do_snb_cstates = is_snb(family, model); 2303 do_c8_c9_c10 = has_hsw_msrs(family, model);
2070 do_c8_c9_c10 = has_c8_c9_c10(family, model);
2071 do_slm_cstates = is_slm(family, model); 2304 do_slm_cstates = is_slm(family, model);
2072 bclk = discover_bclk(family, model); 2305 bclk = discover_bclk(family, model);
2073 2306
2074 do_nehalem_turbo_ratio_limit = has_nehalem_turbo_ratio_limit(family, model); 2307 do_nhm_turbo_ratio_limit = do_nhm_platform_info && has_nhm_turbo_ratio_limit(family, model);
2075 do_ivt_turbo_ratio_limit = has_ivt_turbo_ratio_limit(family, model); 2308 do_ivt_turbo_ratio_limit = has_ivt_turbo_ratio_limit(family, model);
2076 rapl_probe(family, model); 2309 rapl_probe(family, model);
2310 perf_limit_reasons_probe(family, model);
2077 2311
2078 return; 2312 return;
2079} 2313}
2080 2314
2081 2315
2082void usage() 2316void help()
2083{ 2317{
2084 errx(1, "%s: [-v][-R][-T][-p|-P|-S][-c MSR#][-C MSR#][-m MSR#][-M MSR#][-i interval_sec | command ...]\n", 2318 fprintf(stderr,
2085 progname); 2319 "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
2320 "\n"
2321 "Turbostat forks the specified COMMAND and prints statistics\n"
2322 "when COMMAND completes.\n"
2323 "If no COMMAND is specified, turbostat wakes every 5-seconds\n"
2324 "to print statistics, until interrupted.\n"
2325 "--debug run in \"debug\" mode\n"
2326 "--interval sec Override default 5-second measurement interval\n"
2327 "--help print this help message\n"
2328 "--counter msr print 32-bit counter at address \"msr\"\n"
2329 "--Counter msr print 64-bit Counter at address \"msr\"\n"
2330 "--msr msr print 32-bit value at address \"msr\"\n"
2331 "--MSR msr print 64-bit Value at address \"msr\"\n"
2332 "--version print version information\n"
2333 "\n"
2334 "For more help, run \"man turbostat\"\n");
2086} 2335}
2087 2336
2088 2337
@@ -2121,7 +2370,7 @@ void topology_probe()
2121 if (!summary_only && topo.num_cpus > 1) 2370 if (!summary_only && topo.num_cpus > 1)
2122 show_cpu = 1; 2371 show_cpu = 1;
2123 2372
2124 if (verbose > 1) 2373 if (debug > 1)
2125 fprintf(stderr, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num); 2374 fprintf(stderr, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num);
2126 2375
2127 cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology)); 2376 cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology));
@@ -2156,7 +2405,7 @@ void topology_probe()
2156 int siblings; 2405 int siblings;
2157 2406
2158 if (cpu_is_not_present(i)) { 2407 if (cpu_is_not_present(i)) {
2159 if (verbose > 1) 2408 if (debug > 1)
2160 fprintf(stderr, "cpu%d NOT PRESENT\n", i); 2409 fprintf(stderr, "cpu%d NOT PRESENT\n", i);
2161 continue; 2410 continue;
2162 } 2411 }
@@ -2171,26 +2420,26 @@ void topology_probe()
2171 siblings = get_num_ht_siblings(i); 2420 siblings = get_num_ht_siblings(i);
2172 if (siblings > max_siblings) 2421 if (siblings > max_siblings)
2173 max_siblings = siblings; 2422 max_siblings = siblings;
2174 if (verbose > 1) 2423 if (debug > 1)
2175 fprintf(stderr, "cpu %d pkg %d core %d\n", 2424 fprintf(stderr, "cpu %d pkg %d core %d\n",
2176 i, cpus[i].physical_package_id, cpus[i].core_id); 2425 i, cpus[i].physical_package_id, cpus[i].core_id);
2177 } 2426 }
2178 topo.num_cores_per_pkg = max_core_id + 1; 2427 topo.num_cores_per_pkg = max_core_id + 1;
2179 if (verbose > 1) 2428 if (debug > 1)
2180 fprintf(stderr, "max_core_id %d, sizing for %d cores per package\n", 2429 fprintf(stderr, "max_core_id %d, sizing for %d cores per package\n",
2181 max_core_id, topo.num_cores_per_pkg); 2430 max_core_id, topo.num_cores_per_pkg);
2182 if (!summary_only && topo.num_cores_per_pkg > 1) 2431 if (!summary_only && topo.num_cores_per_pkg > 1)
2183 show_core = 1; 2432 show_core = 1;
2184 2433
2185 topo.num_packages = max_package_id + 1; 2434 topo.num_packages = max_package_id + 1;
2186 if (verbose > 1) 2435 if (debug > 1)
2187 fprintf(stderr, "max_package_id %d, sizing for %d packages\n", 2436 fprintf(stderr, "max_package_id %d, sizing for %d packages\n",
2188 max_package_id, topo.num_packages); 2437 max_package_id, topo.num_packages);
2189 if (!summary_only && topo.num_packages > 1) 2438 if (!summary_only && topo.num_packages > 1)
2190 show_pkg = 1; 2439 show_pkg = 1;
2191 2440
2192 topo.num_threads_per_core = max_siblings; 2441 topo.num_threads_per_core = max_siblings;
2193 if (verbose > 1) 2442 if (debug > 1)
2194 fprintf(stderr, "max_siblings %d\n", max_siblings); 2443 fprintf(stderr, "max_siblings %d\n", max_siblings);
2195 2444
2196 free(cpus); 2445 free(cpus);
@@ -2299,25 +2548,27 @@ void setup_all_buffers(void)
2299 2548
2300void turbostat_init() 2549void turbostat_init()
2301{ 2550{
2302 check_cpuid();
2303
2304 check_dev_msr(); 2551 check_dev_msr();
2305 check_super_user(); 2552 check_permissions();
2553 check_cpuid();
2306 2554
2307 setup_all_buffers(); 2555 setup_all_buffers();
2308 2556
2309 if (verbose) 2557 if (debug)
2310 print_verbose_header(); 2558 print_verbose_header();
2311 2559
2312 if (verbose) 2560 if (debug)
2313 for_all_cpus(print_epb, ODD_COUNTERS); 2561 for_all_cpus(print_epb, ODD_COUNTERS);
2314 2562
2315 if (verbose) 2563 if (debug)
2564 for_all_cpus(print_perf_limit, ODD_COUNTERS);
2565
2566 if (debug)
2316 for_all_cpus(print_rapl, ODD_COUNTERS); 2567 for_all_cpus(print_rapl, ODD_COUNTERS);
2317 2568
2318 for_all_cpus(set_temperature_target, ODD_COUNTERS); 2569 for_all_cpus(set_temperature_target, ODD_COUNTERS);
2319 2570
2320 if (verbose) 2571 if (debug)
2321 for_all_cpus(print_thermal, ODD_COUNTERS); 2572 for_all_cpus(print_thermal, ODD_COUNTERS);
2322} 2573}
2323 2574
@@ -2382,56 +2633,82 @@ int get_and_dump_counters(void)
2382 return status; 2633 return status;
2383} 2634}
2384 2635
2636void print_version() {
2637 fprintf(stderr, "turbostat version 4.1 10-Feb, 2015"
2638 " - Len Brown <lenb@kernel.org>\n");
2639}
2640
2385void cmdline(int argc, char **argv) 2641void cmdline(int argc, char **argv)
2386{ 2642{
2387 int opt; 2643 int opt;
2644 int option_index = 0;
2645 static struct option long_options[] = {
2646 {"Counter", required_argument, 0, 'C'},
2647 {"counter", required_argument, 0, 'c'},
2648 {"Dump", no_argument, 0, 'D'},
2649 {"debug", no_argument, 0, 'd'},
2650 {"interval", required_argument, 0, 'i'},
2651 {"help", no_argument, 0, 'h'},
2652 {"Joules", no_argument, 0, 'J'},
2653 {"MSR", required_argument, 0, 'M'},
2654 {"msr", required_argument, 0, 'm'},
2655 {"Package", no_argument, 0, 'p'},
2656 {"processor", no_argument, 0, 'p'},
2657 {"Summary", no_argument, 0, 'S'},
2658 {"TCC", required_argument, 0, 'T'},
2659 {"version", no_argument, 0, 'v' },
2660 {0, 0, 0, 0 }
2661 };
2388 2662
2389 progname = argv[0]; 2663 progname = argv[0];
2390 2664
2391 while ((opt = getopt(argc, argv, "+pPsSvi:c:C:m:M:RJT:")) != -1) { 2665 while ((opt = getopt_long_only(argc, argv, "C:c:Ddhi:JM:m:PpST:v",
2666 long_options, &option_index)) != -1) {
2392 switch (opt) { 2667 switch (opt) {
2393 case 'p': 2668 case 'C':
2394 show_core_only++; 2669 sscanf(optarg, "%x", &extra_delta_offset64);
2395 break; 2670 break;
2396 case 'P': 2671 case 'c':
2397 show_pkg_only++; 2672 sscanf(optarg, "%x", &extra_delta_offset32);
2398 break; 2673 break;
2399 case 's': 2674 case 'D':
2400 dump_only++; 2675 dump_only++;
2401 break; 2676 break;
2402 case 'S': 2677 case 'd':
2403 summary_only++; 2678 debug++;
2404 break;
2405 case 'v':
2406 verbose++;
2407 break; 2679 break;
2680 case 'h':
2681 default:
2682 help();
2683 exit(1);
2408 case 'i': 2684 case 'i':
2409 interval_sec = atoi(optarg); 2685 interval_sec = atoi(optarg);
2410 break; 2686 break;
2411 case 'c': 2687 case 'J':
2412 sscanf(optarg, "%x", &extra_delta_offset32); 2688 rapl_joules++;
2413 break; 2689 break;
2414 case 'C': 2690 case 'M':
2415 sscanf(optarg, "%x", &extra_delta_offset64); 2691 sscanf(optarg, "%x", &extra_msr_offset64);
2416 break; 2692 break;
2417 case 'm': 2693 case 'm':
2418 sscanf(optarg, "%x", &extra_msr_offset32); 2694 sscanf(optarg, "%x", &extra_msr_offset32);
2419 break; 2695 break;
2420 case 'M': 2696 case 'P':
2421 sscanf(optarg, "%x", &extra_msr_offset64); 2697 show_pkg_only++;
2422 break; 2698 break;
2423 case 'R': 2699 case 'p':
2424 rapl_verbose++; 2700 show_core_only++;
2701 break;
2702 case 'S':
2703 summary_only++;
2425 break; 2704 break;
2426 case 'T': 2705 case 'T':
2427 tcc_activation_temp_override = atoi(optarg); 2706 tcc_activation_temp_override = atoi(optarg);
2428 break; 2707 break;
2429 case 'J': 2708 case 'v':
2430 rapl_joules++; 2709 print_version();
2710 exit(0);
2431 break; 2711 break;
2432
2433 default:
2434 usage();
2435 } 2712 }
2436 } 2713 }
2437} 2714}
@@ -2440,9 +2717,8 @@ int main(int argc, char **argv)
2440{ 2717{
2441 cmdline(argc, argv); 2718 cmdline(argc, argv);
2442 2719
2443 if (verbose) 2720 if (debug)
2444 fprintf(stderr, "turbostat v3.7 Feb 6, 2014" 2721 print_version();
2445 " - Len Brown <lenb@kernel.org>\n");
2446 2722
2447 turbostat_init(); 2723 turbostat_init();
2448 2724
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index b9cd036f0442..d08e214ec6e7 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -178,6 +178,7 @@ my $checkout;
178my $localversion; 178my $localversion;
179my $iteration = 0; 179my $iteration = 0;
180my $successes = 0; 180my $successes = 0;
181my $stty_orig;
181 182
182my $bisect_good; 183my $bisect_good;
183my $bisect_bad; 184my $bisect_bad;
@@ -197,6 +198,11 @@ my $patchcheck_start;
197my $patchcheck_cherry; 198my $patchcheck_cherry;
198my $patchcheck_end; 199my $patchcheck_end;
199 200
201my $build_time;
202my $install_time;
203my $reboot_time;
204my $test_time;
205
200# set when a test is something other that just building or install 206# set when a test is something other that just building or install
201# which would require more options. 207# which would require more options.
202my $buildonly = 1; 208my $buildonly = 1;
@@ -554,6 +560,66 @@ sub get_mandatory_config {
554 } 560 }
555} 561}
556 562
563sub show_time {
564 my ($time) = @_;
565
566 my $hours = 0;
567 my $minutes = 0;
568
569 if ($time > 3600) {
570 $hours = int($time / 3600);
571 $time -= $hours * 3600;
572 }
573 if ($time > 60) {
574 $minutes = int($time / 60);
575 $time -= $minutes * 60;
576 }
577
578 if ($hours > 0) {
579 doprint "$hours hour";
580 doprint "s" if ($hours > 1);
581 doprint " ";
582 }
583
584 if ($minutes > 0) {
585 doprint "$minutes minute";
586 doprint "s" if ($minutes > 1);
587 doprint " ";
588 }
589
590 doprint "$time second";
591 doprint "s" if ($time != 1);
592}
593
594sub print_times {
595 doprint "\n";
596 if ($build_time) {
597 doprint "Build time: ";
598 show_time($build_time);
599 doprint "\n";
600 }
601 if ($install_time) {
602 doprint "Install time: ";
603 show_time($install_time);
604 doprint "\n";
605 }
606 if ($reboot_time) {
607 doprint "Reboot time: ";
608 show_time($reboot_time);
609 doprint "\n";
610 }
611 if ($test_time) {
612 doprint "Test time: ";
613 show_time($test_time);
614 doprint "\n";
615 }
616 # reset for iterations like bisect
617 $build_time = 0;
618 $install_time = 0;
619 $reboot_time = 0;
620 $test_time = 0;
621}
622
557sub get_mandatory_configs { 623sub get_mandatory_configs {
558 get_mandatory_config("MACHINE"); 624 get_mandatory_config("MACHINE");
559 get_mandatory_config("BUILD_DIR"); 625 get_mandatory_config("BUILD_DIR");
@@ -1341,23 +1407,83 @@ sub dodie {
1341 print " See $opt{LOG_FILE} for more info.\n"; 1407 print " See $opt{LOG_FILE} for more info.\n";
1342 } 1408 }
1343 1409
1410 if ($monitor_cnt) {
1411 # restore terminal settings
1412 system("stty $stty_orig");
1413 }
1414
1344 die @_, "\n"; 1415 die @_, "\n";
1345} 1416}
1346 1417
1418sub create_pty {
1419 my ($ptm, $pts) = @_;
1420 my $tmp;
1421 my $TIOCSPTLCK = 0x40045431;
1422 my $TIOCGPTN = 0x80045430;
1423
1424 sysopen($ptm, "/dev/ptmx", O_RDWR | O_NONBLOCK) or
1425 dodie "Cant open /dev/ptmx";
1426
1427 # unlockpt()
1428 $tmp = pack("i", 0);
1429 ioctl($ptm, $TIOCSPTLCK, $tmp) or
1430 dodie "ioctl TIOCSPTLCK for /dev/ptmx failed";
1431
1432 # ptsname()
1433 ioctl($ptm, $TIOCGPTN, $tmp) or
1434 dodie "ioctl TIOCGPTN for /dev/ptmx failed";
1435 $tmp = unpack("i", $tmp);
1436
1437 sysopen($pts, "/dev/pts/$tmp", O_RDWR | O_NONBLOCK) or
1438 dodie "Can't open /dev/pts/$tmp";
1439}
1440
1441sub exec_console {
1442 my ($ptm, $pts) = @_;
1443
1444 close($ptm);
1445
1446 close(\*STDIN);
1447 close(\*STDOUT);
1448 close(\*STDERR);
1449
1450 open(\*STDIN, '<&', $pts);
1451 open(\*STDOUT, '>&', $pts);
1452 open(\*STDERR, '>&', $pts);
1453
1454 close($pts);
1455
1456 exec $console or
1457 die "Can't open console $console";
1458}
1459
1347sub open_console { 1460sub open_console {
1348 my ($fp) = @_; 1461 my ($ptm) = @_;
1462 my $pts = \*PTSFD;
1463 my $pid;
1349 1464
1350 my $flags; 1465 # save terminal settings
1466 $stty_orig = `stty -g`;
1351 1467
1352 my $pid = open($fp, "$console|") or 1468 # place terminal in cbreak mode so that stdin can be read one character at
1353 dodie "Can't open console $console"; 1469 # a time without having to wait for a newline
1470 system("stty -icanon -echo -icrnl");
1354 1471
1355 $flags = fcntl($fp, F_GETFL, 0) or 1472 create_pty($ptm, $pts);
1356 dodie "Can't get flags for the socket: $!"; 1473
1357 $flags = fcntl($fp, F_SETFL, $flags | O_NONBLOCK) or 1474 $pid = fork;
1358 dodie "Can't set flags for the socket: $!"; 1475
1476 if (!$pid) {
1477 # child
1478 exec_console($ptm, $pts)
1479 }
1480
1481 # parent
1482 close($pts);
1359 1483
1360 return $pid; 1484 return $pid;
1485
1486 open(PTSFD, "Stop perl from warning about single use of PTSFD");
1361} 1487}
1362 1488
1363sub close_console { 1489sub close_console {
@@ -1368,6 +1494,9 @@ sub close_console {
1368 1494
1369 print "closing!\n"; 1495 print "closing!\n";
1370 close($fp); 1496 close($fp);
1497
1498 # restore terminal settings
1499 system("stty $stty_orig");
1371} 1500}
1372 1501
1373sub start_monitor { 1502sub start_monitor {
@@ -1519,6 +1648,8 @@ sub fail {
1519 $name = " ($test_name)"; 1648 $name = " ($test_name)";
1520 } 1649 }
1521 1650
1651 print_times;
1652
1522 doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; 1653 doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
1523 doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; 1654 doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
1524 doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n"; 1655 doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n";
@@ -1534,10 +1665,14 @@ sub fail {
1534 1665
1535sub run_command { 1666sub run_command {
1536 my ($command, $redirect) = @_; 1667 my ($command, $redirect) = @_;
1668 my $start_time;
1669 my $end_time;
1537 my $dolog = 0; 1670 my $dolog = 0;
1538 my $dord = 0; 1671 my $dord = 0;
1539 my $pid; 1672 my $pid;
1540 1673
1674 $start_time = time;
1675
1541 $command =~ s/\$SSH_USER/$ssh_user/g; 1676 $command =~ s/\$SSH_USER/$ssh_user/g;
1542 $command =~ s/\$MACHINE/$machine/g; 1677 $command =~ s/\$MACHINE/$machine/g;
1543 1678
@@ -1570,6 +1705,15 @@ sub run_command {
1570 close(LOG) if ($dolog); 1705 close(LOG) if ($dolog);
1571 close(RD) if ($dord); 1706 close(RD) if ($dord);
1572 1707
1708 $end_time = time;
1709 my $delta = $end_time - $start_time;
1710
1711 if ($delta == 1) {
1712 doprint "[1 second] ";
1713 } else {
1714 doprint "[$delta seconds] ";
1715 }
1716
1573 if ($failed) { 1717 if ($failed) {
1574 doprint "FAILED!\n"; 1718 doprint "FAILED!\n";
1575 } else { 1719 } else {
@@ -1694,7 +1838,9 @@ sub wait_for_input
1694{ 1838{
1695 my ($fp, $time) = @_; 1839 my ($fp, $time) = @_;
1696 my $rin; 1840 my $rin;
1697 my $ready; 1841 my $rout;
1842 my $nr;
1843 my $buf;
1698 my $line; 1844 my $line;
1699 my $ch; 1845 my $ch;
1700 1846
@@ -1704,21 +1850,36 @@ sub wait_for_input
1704 1850
1705 $rin = ''; 1851 $rin = '';
1706 vec($rin, fileno($fp), 1) = 1; 1852 vec($rin, fileno($fp), 1) = 1;
1707 ($ready, $time) = select($rin, undef, undef, $time); 1853 vec($rin, fileno(\*STDIN), 1) = 1;
1708 1854
1709 $line = ""; 1855 while (1) {
1856 $nr = select($rout=$rin, undef, undef, $time);
1710 1857
1711 # try to read one char at a time 1858 if ($nr <= 0) {
1712 while (sysread $fp, $ch, 1) { 1859 return undef;
1713 $line .= $ch; 1860 }
1714 last if ($ch eq "\n");
1715 }
1716 1861
1717 if (!length($line)) { 1862 # copy data from stdin to the console
1718 return undef; 1863 if (vec($rout, fileno(\*STDIN), 1) == 1) {
1719 } 1864 sysread(\*STDIN, $buf, 1000);
1865 syswrite($fp, $buf, 1000);
1866 next;
1867 }
1720 1868
1721 return $line; 1869 $line = "";
1870
1871 # try to read one char at a time
1872 while (sysread $fp, $ch, 1) {
1873 $line .= $ch;
1874 last if ($ch eq "\n");
1875 }
1876
1877 if (!length($line)) {
1878 return undef;
1879 }
1880
1881 return $line;
1882 }
1722} 1883}
1723 1884
1724sub reboot_to { 1885sub reboot_to {
@@ -1766,6 +1927,8 @@ sub monitor {
1766 my $skip_call_trace = 0; 1927 my $skip_call_trace = 0;
1767 my $loops; 1928 my $loops;
1768 1929
1930 my $start_time = time;
1931
1769 wait_for_monitor 5; 1932 wait_for_monitor 5;
1770 1933
1771 my $line; 1934 my $line;
@@ -1890,6 +2053,9 @@ sub monitor {
1890 } 2053 }
1891 } 2054 }
1892 2055
2056 my $end_time = time;
2057 $reboot_time = $end_time - $start_time;
2058
1893 close(DMESG); 2059 close(DMESG);
1894 2060
1895 if ($bug) { 2061 if ($bug) {
@@ -1938,6 +2104,8 @@ sub install {
1938 2104
1939 return if ($no_install); 2105 return if ($no_install);
1940 2106
2107 my $start_time = time;
2108
1941 if (defined($pre_install)) { 2109 if (defined($pre_install)) {
1942 my $cp_pre_install = eval_kernel_version $pre_install; 2110 my $cp_pre_install = eval_kernel_version $pre_install;
1943 run_command "$cp_pre_install" or 2111 run_command "$cp_pre_install" or
@@ -1969,6 +2137,8 @@ sub install {
1969 if (!$install_mods) { 2137 if (!$install_mods) {
1970 do_post_install; 2138 do_post_install;
1971 doprint "No modules needed\n"; 2139 doprint "No modules needed\n";
2140 my $end_time = time;
2141 $install_time = $end_time - $start_time;
1972 return; 2142 return;
1973 } 2143 }
1974 2144
@@ -1996,6 +2166,9 @@ sub install {
1996 run_ssh "rm -f /tmp/$modtar"; 2166 run_ssh "rm -f /tmp/$modtar";
1997 2167
1998 do_post_install; 2168 do_post_install;
2169
2170 my $end_time = time;
2171 $install_time = $end_time - $start_time;
1999} 2172}
2000 2173
2001sub get_version { 2174sub get_version {
@@ -2008,7 +2181,7 @@ sub get_version {
2008 $have_version = 1; 2181 $have_version = 1;
2009} 2182}
2010 2183
2011sub start_monitor_and_boot { 2184sub start_monitor_and_install {
2012 # Make sure the stable kernel has finished booting 2185 # Make sure the stable kernel has finished booting
2013 2186
2014 # Install bisects, don't need console 2187 # Install bisects, don't need console
@@ -2208,6 +2381,8 @@ sub build {
2208 2381
2209 unlink $buildlog; 2382 unlink $buildlog;
2210 2383
2384 my $start_time = time;
2385
2211 # Failed builds should not reboot the target 2386 # Failed builds should not reboot the target
2212 my $save_no_reboot = $no_reboot; 2387 my $save_no_reboot = $no_reboot;
2213 $no_reboot = 1; 2388 $no_reboot = 1;
@@ -2293,6 +2468,9 @@ sub build {
2293 2468
2294 $no_reboot = $save_no_reboot; 2469 $no_reboot = $save_no_reboot;
2295 2470
2471 my $end_time = time;
2472 $build_time = $end_time - $start_time;
2473
2296 return 1; 2474 return 1;
2297} 2475}
2298 2476
@@ -2323,6 +2501,8 @@ sub success {
2323 $name = " ($test_name)"; 2501 $name = " ($test_name)";
2324 } 2502 }
2325 2503
2504 print_times;
2505
2326 doprint "\n\n*******************************************\n"; 2506 doprint "\n\n*******************************************\n";
2327 doprint "*******************************************\n"; 2507 doprint "*******************************************\n";
2328 doprint "KTEST RESULT: TEST $i$name SUCCESS!!!! **\n"; 2508 doprint "KTEST RESULT: TEST $i$name SUCCESS!!!! **\n";
@@ -2383,6 +2563,8 @@ sub do_run_test {
2383 my $bug = 0; 2563 my $bug = 0;
2384 my $bug_ignored = 0; 2564 my $bug_ignored = 0;
2385 2565
2566 my $start_time = time;
2567
2386 wait_for_monitor 1; 2568 wait_for_monitor 1;
2387 2569
2388 doprint "run test $run_test\n"; 2570 doprint "run test $run_test\n";
@@ -2449,6 +2631,9 @@ sub do_run_test {
2449 waitpid $child_pid, 0; 2631 waitpid $child_pid, 0;
2450 $child_exit = $?; 2632 $child_exit = $?;
2451 2633
2634 my $end_time = time;
2635 $test_time = $end_time - $start_time;
2636
2452 if (!$bug && $in_bisect) { 2637 if (!$bug && $in_bisect) {
2453 if (defined($bisect_ret_good)) { 2638 if (defined($bisect_ret_good)) {
2454 if ($child_exit == $bisect_ret_good) { 2639 if ($child_exit == $bisect_ret_good) {
@@ -2549,7 +2734,7 @@ sub run_bisect_test {
2549 dodie "Failed on build" if $failed; 2734 dodie "Failed on build" if $failed;
2550 2735
2551 # Now boot the box 2736 # Now boot the box
2552 start_monitor_and_boot or $failed = 1; 2737 start_monitor_and_install or $failed = 1;
2553 2738
2554 if ($type ne "boot") { 2739 if ($type ne "boot") {
2555 if ($failed && $bisect_skip) { 2740 if ($failed && $bisect_skip) {
@@ -2755,6 +2940,7 @@ sub bisect {
2755 do { 2940 do {
2756 $result = run_bisect $type; 2941 $result = run_bisect $type;
2757 $test = run_git_bisect "git bisect $result"; 2942 $test = run_git_bisect "git bisect $result";
2943 print_times;
2758 } while ($test); 2944 } while ($test);
2759 2945
2760 run_command "git bisect log" or 2946 run_command "git bisect log" or
@@ -3168,6 +3354,7 @@ sub config_bisect {
3168 3354
3169 do { 3355 do {
3170 $ret = run_config_bisect \%good_configs, \%bad_configs; 3356 $ret = run_config_bisect \%good_configs, \%bad_configs;
3357 print_times;
3171 } while (!$ret); 3358 } while (!$ret);
3172 3359
3173 return $ret if ($ret < 0); 3360 return $ret if ($ret < 0);
@@ -3260,7 +3447,7 @@ sub patchcheck {
3260 my $sha1 = $item; 3447 my $sha1 = $item;
3261 $sha1 =~ s/^([[:xdigit:]]+).*/$1/; 3448 $sha1 =~ s/^([[:xdigit:]]+).*/$1/;
3262 3449
3263 doprint "\nProcessing commit $item\n\n"; 3450 doprint "\nProcessing commit \"$item\"\n\n";
3264 3451
3265 run_command "git checkout $sha1" or 3452 run_command "git checkout $sha1" or
3266 die "Failed to checkout $sha1"; 3453 die "Failed to checkout $sha1";
@@ -3291,16 +3478,18 @@ sub patchcheck {
3291 3478
3292 my $failed = 0; 3479 my $failed = 0;
3293 3480
3294 start_monitor_and_boot or $failed = 1; 3481 start_monitor_and_install or $failed = 1;
3295 3482
3296 if (!$failed && $type ne "boot"){ 3483 if (!$failed && $type ne "boot"){
3297 do_run_test or $failed = 1; 3484 do_run_test or $failed = 1;
3298 } 3485 }
3299 end_monitor; 3486 end_monitor;
3300 return 0 if ($failed); 3487 if ($failed) {
3301 3488 print_times;
3489 return 0;
3490 }
3302 patchcheck_reboot; 3491 patchcheck_reboot;
3303 3492 print_times;
3304 } 3493 }
3305 $in_patchcheck = 0; 3494 $in_patchcheck = 0;
3306 success $i; 3495 success $i;
@@ -3753,7 +3942,7 @@ sub make_min_config {
3753 my $failed = 0; 3942 my $failed = 0;
3754 build "oldconfig" or $failed = 1; 3943 build "oldconfig" or $failed = 1;
3755 if (!$failed) { 3944 if (!$failed) {
3756 start_monitor_and_boot or $failed = 1; 3945 start_monitor_and_install or $failed = 1;
3757 3946
3758 if ($type eq "test" && !$failed) { 3947 if ($type eq "test" && !$failed) {
3759 do_run_test or $failed = 1; 3948 do_run_test or $failed = 1;
@@ -4000,6 +4189,11 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) {
4000 4189
4001 $iteration = $i; 4190 $iteration = $i;
4002 4191
4192 $build_time = 0;
4193 $install_time = 0;
4194 $reboot_time = 0;
4195 $test_time = 0;
4196
4003 undef %force_config; 4197 undef %force_config;
4004 4198
4005 my $makecmd = set_test_option("MAKE_CMD", $i); 4199 my $makecmd = set_test_option("MAKE_CMD", $i);
@@ -4157,15 +4351,20 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) {
4157 4351
4158 if ($test_type ne "build") { 4352 if ($test_type ne "build") {
4159 my $failed = 0; 4353 my $failed = 0;
4160 start_monitor_and_boot or $failed = 1; 4354 start_monitor_and_install or $failed = 1;
4161 4355
4162 if (!$failed && $test_type ne "boot" && defined($run_test)) { 4356 if (!$failed && $test_type ne "boot" && defined($run_test)) {
4163 do_run_test or $failed = 1; 4357 do_run_test or $failed = 1;
4164 } 4358 }
4165 end_monitor; 4359 end_monitor;
4166 next if ($failed); 4360 if ($failed) {
4361 print_times;
4362 next;
4363 }
4167 } 4364 }
4168 4365
4366 print_times;
4367
4169 success $i; 4368 success $i;
4170} 4369}
4171 4370
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index f6ff90a76bd7..1d5e7ad2c460 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -13,7 +13,7 @@ CFLAGS := -Wall -O2 -flto -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CUR
13 13
14export CC CFLAGS 14export CC CFLAGS
15 15
16TARGETS = pmu copyloops mm tm primitives 16TARGETS = pmu copyloops mm tm primitives stringloops
17 17
18endif 18endif
19 19
diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore
new file mode 100644
index 000000000000..25a192f62c4d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/.gitignore
@@ -0,0 +1,4 @@
1copyuser_64
2copyuser_power7
3memcpy_64
4memcpy_power7
diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore
new file mode 100644
index 000000000000..b43ade0ec861
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -0,0 +1,3 @@
1hugetlb_vs_thp_test
2subpage_prot
3tempfile
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
index 357ccbd6bad9..a14c538dd7f8 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -1,9 +1,9 @@
1noarg: 1noarg:
2 $(MAKE) -C ../ 2 $(MAKE) -C ../
3 3
4PROGS := hugetlb_vs_thp_test 4PROGS := hugetlb_vs_thp_test subpage_prot
5 5
6all: $(PROGS) 6all: $(PROGS) tempfile
7 7
8$(PROGS): ../harness.c 8$(PROGS): ../harness.c
9 9
@@ -12,7 +12,10 @@ run_tests: all
12 ./$$PROG; \ 12 ./$$PROG; \
13 done; 13 done;
14 14
15tempfile:
16 dd if=/dev/zero of=tempfile bs=64k count=1
17
15clean: 18clean:
16 rm -f $(PROGS) 19 rm -f $(PROGS) tempfile
17 20
18.PHONY: all run_tests clean 21.PHONY: all run_tests clean
diff --git a/tools/testing/selftests/powerpc/mm/subpage_prot.c b/tools/testing/selftests/powerpc/mm/subpage_prot.c
new file mode 100644
index 000000000000..440180ff8089
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/subpage_prot.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2.1 of the GNU Lesser General Public License
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 */
13
14#include <assert.h>
15#include <errno.h>
16#include <fcntl.h>
17#include <signal.h>
18#include <stdarg.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <sys/mman.h>
23#include <sys/ptrace.h>
24#include <sys/syscall.h>
25#include <ucontext.h>
26#include <unistd.h>
27
28#include "utils.h"
29
30char *file_name;
31
32int in_test;
33volatile int faulted;
34volatile void *dar;
35int errors;
36
37static void segv(int signum, siginfo_t *info, void *ctxt_v)
38{
39 ucontext_t *ctxt = (ucontext_t *)ctxt_v;
40 struct pt_regs *regs = ctxt->uc_mcontext.regs;
41
42 if (!in_test) {
43 fprintf(stderr, "Segfault outside of test !\n");
44 exit(1);
45 }
46
47 faulted = 1;
48 dar = (void *)regs->dar;
49 regs->nip += 4;
50}
51
52static inline void do_read(const volatile void *addr)
53{
54 int ret;
55
56 asm volatile("lwz %0,0(%1); twi 0,%0,0; isync;\n"
57 : "=r" (ret) : "r" (addr) : "memory");
58}
59
60static inline void do_write(const volatile void *addr)
61{
62 int val = 0x1234567;
63
64 asm volatile("stw %0,0(%1); sync; \n"
65 : : "r" (val), "r" (addr) : "memory");
66}
67
68static inline void check_faulted(void *addr, long page, long subpage, int write)
69{
70 int want_fault = (subpage == ((page + 3) % 16));
71
72 if (write)
73 want_fault |= (subpage == ((page + 1) % 16));
74
75 if (faulted != want_fault) {
76 printf("Failed at 0x%p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
77 addr, page, subpage, write,
78 want_fault ? "fault" : "pass",
79 faulted ? "fault" : "pass");
80 ++errors;
81 }
82
83 if (faulted) {
84 if (dar != addr) {
85 printf("Fault expected at 0x%p and happened at 0x%p !\n",
86 addr, dar);
87 }
88 faulted = 0;
89 asm volatile("sync" : : : "memory");
90 }
91}
92
93static int run_test(void *addr, unsigned long size)
94{
95 unsigned int *map;
96 long i, j, pages, err;
97
98 pages = size / 0x10000;
99 map = malloc(pages * 4);
100 assert(map);
101
102 /*
103 * for each page, mark subpage i % 16 read only and subpage
104 * (i + 3) % 16 inaccessible
105 */
106 for (i = 0; i < pages; i++) {
107 map[i] = (0x40000000 >> (((i + 1) * 2) % 32)) |
108 (0xc0000000 >> (((i + 3) * 2) % 32));
109 }
110
111 err = syscall(__NR_subpage_prot, addr, size, map);
112 if (err) {
113 perror("subpage_perm");
114 return 1;
115 }
116 free(map);
117
118 in_test = 1;
119 errors = 0;
120 for (i = 0; i < pages; i++) {
121 for (j = 0; j < 16; j++, addr += 0x1000) {
122 do_read(addr);
123 check_faulted(addr, i, j, 0);
124 do_write(addr);
125 check_faulted(addr, i, j, 1);
126 }
127 }
128
129 in_test = 0;
130 if (errors) {
131 printf("%d errors detected\n", errors);
132 return 1;
133 }
134
135 return 0;
136}
137
138int test_anon(void)
139{
140 unsigned long align;
141 struct sigaction act = {
142 .sa_sigaction = segv,
143 .sa_flags = SA_SIGINFO
144 };
145 void *mallocblock;
146 unsigned long mallocsize;
147
148 if (getpagesize() != 0x10000) {
149 fprintf(stderr, "Kernel page size must be 64K!\n");
150 return 1;
151 }
152
153 sigaction(SIGSEGV, &act, NULL);
154
155 mallocsize = 4 * 16 * 1024 * 1024;
156
157 FAIL_IF(posix_memalign(&mallocblock, 64 * 1024, mallocsize));
158
159 align = (unsigned long)mallocblock;
160 if (align & 0xffff)
161 align = (align | 0xffff) + 1;
162
163 mallocblock = (void *)align;
164
165 printf("allocated malloc block of 0x%lx bytes at 0x%p\n",
166 mallocsize, mallocblock);
167
168 printf("testing malloc block...\n");
169
170 return run_test(mallocblock, mallocsize);
171}
172
173int test_file(void)
174{
175 struct sigaction act = {
176 .sa_sigaction = segv,
177 .sa_flags = SA_SIGINFO
178 };
179 void *fileblock;
180 off_t filesize;
181 int fd;
182
183 fd = open(file_name, O_RDWR);
184 if (fd == -1) {
185 perror("failed to open file");
186 return 1;
187 }
188 sigaction(SIGSEGV, &act, NULL);
189
190 filesize = lseek(fd, 0, SEEK_END);
191 if (filesize & 0xffff)
192 filesize &= ~0xfffful;
193
194 fileblock = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
195 MAP_SHARED, fd, 0);
196 if (fileblock == MAP_FAILED) {
197 perror("failed to map file");
198 return 1;
199 }
200 printf("allocated %s for 0x%lx bytes at 0x%p\n",
201 file_name, filesize, fileblock);
202
203 printf("testing file map...\n");
204
205 return run_test(fileblock, filesize);
206}
207
208int main(int argc, char *argv[])
209{
210 test_harness(test_anon, "subpage_prot_anon");
211
212 if (argc > 1)
213 file_name = argv[1];
214 else
215 file_name = "tempfile";
216
217 test_harness(test_file, "subpage_prot_file");
218
219 return 0;
220}
diff --git a/tools/testing/selftests/powerpc/pmu/.gitignore b/tools/testing/selftests/powerpc/pmu/.gitignore
new file mode 100644
index 000000000000..e748f336eed3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/.gitignore
@@ -0,0 +1,3 @@
1count_instructions
2l3_bank_test
3per_event_excludes
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/.gitignore b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore
new file mode 100644
index 000000000000..42bddbed8b64
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore
@@ -0,0 +1,22 @@
1reg_access_test
2event_attributes_test
3cycles_test
4cycles_with_freeze_test
5pmc56_overflow_test
6ebb_vs_cpu_event_test
7cpu_event_vs_ebb_test
8cpu_event_pinned_vs_ebb_test
9task_event_vs_ebb_test
10task_event_pinned_vs_ebb_test
11multi_ebb_procs_test
12multi_counter_test
13pmae_handling_test
14close_clears_pmcc_test
15instruction_count_test
16fork_cleanup_test
17ebb_on_child_test
18ebb_on_willing_child_test
19back_to_back_ebbs_test
20lost_exception_test
21no_handler_test
22cycles_with_mmcr2_test
diff --git a/tools/testing/selftests/powerpc/primitives/.gitignore b/tools/testing/selftests/powerpc/primitives/.gitignore
new file mode 100644
index 000000000000..4cc4e31bed1d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/.gitignore
@@ -0,0 +1 @@
load_unaligned_zeropad
diff --git a/tools/testing/selftests/powerpc/stringloops/.gitignore b/tools/testing/selftests/powerpc/stringloops/.gitignore
new file mode 100644
index 000000000000..0b43da74ee46
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/.gitignore
@@ -0,0 +1 @@
memcmp
diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
new file mode 100644
index 000000000000..506d77346477
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -0,0 +1,20 @@
1# The loops are all 64-bit code
2CFLAGS += -m64
3CFLAGS += -I$(CURDIR)
4
5PROGS := memcmp
6EXTRA_SOURCES := memcmp_64.S ../harness.c
7
8all: $(PROGS)
9
10$(PROGS): $(EXTRA_SOURCES)
11
12run_tests: all
13 @-for PROG in $(PROGS); do \
14 ./$$PROG; \
15 done;
16
17clean:
18 rm -f $(PROGS) *.o
19
20.PHONY: all run_tests clean
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
new file mode 100644
index 000000000000..11bece87e880
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -0,0 +1,7 @@
1#include <ppc-asm.h>
2
3#ifndef r1
4#define r1 sp
5#endif
6
7#define _GLOBAL(A) FUNC_START(test_ ## A)
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c b/tools/testing/selftests/powerpc/stringloops/memcmp.c
new file mode 100644
index 000000000000..17417dd70708
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -0,0 +1,103 @@
1#include <malloc.h>
2#include <stdlib.h>
3#include <string.h>
4#include "../utils.h"
5
6#define SIZE 256
7#define ITERATIONS 10000
8
9int test_memcmp(const void *s1, const void *s2, size_t n);
10
11/* test all offsets and lengths */
12static void test_one(char *s1, char *s2)
13{
14 unsigned long offset, size;
15
16 for (offset = 0; offset < SIZE; offset++) {
17 for (size = 0; size < (SIZE-offset); size++) {
18 int x, y;
19 unsigned long i;
20
21 y = memcmp(s1+offset, s2+offset, size);
22 x = test_memcmp(s1+offset, s2+offset, size);
23
24 if (((x ^ y) < 0) && /* Trick to compare sign */
25 ((x | y) != 0)) { /* check for zero */
26 printf("memcmp returned %d, should have returned %d (offset %ld size %ld)\n", x, y, offset, size);
27
28 for (i = offset; i < offset+size; i++)
29 printf("%02x ", s1[i]);
30 printf("\n");
31
32 for (i = offset; i < offset+size; i++)
33 printf("%02x ", s2[i]);
34 printf("\n");
35 abort();
36 }
37 }
38 }
39}
40
41static int testcase(void)
42{
43 char *s1;
44 char *s2;
45 unsigned long i;
46
47 s1 = memalign(128, SIZE);
48 if (!s1) {
49 perror("memalign");
50 exit(1);
51 }
52
53 s2 = memalign(128, SIZE);
54 if (!s2) {
55 perror("memalign");
56 exit(1);
57 }
58
59 srandom(1);
60
61 for (i = 0; i < ITERATIONS; i++) {
62 unsigned long j;
63 unsigned long change;
64
65 for (j = 0; j < SIZE; j++)
66 s1[j] = random();
67
68 memcpy(s2, s1, SIZE);
69
70 /* change one byte */
71 change = random() % SIZE;
72 s2[change] = random() & 0xff;
73
74 test_one(s1, s2);
75 }
76
77 srandom(1);
78
79 for (i = 0; i < ITERATIONS; i++) {
80 unsigned long j;
81 unsigned long change;
82
83 for (j = 0; j < SIZE; j++)
84 s1[j] = random();
85
86 memcpy(s2, s1, SIZE);
87
88 /* change multiple bytes, 1/8 of total */
89 for (j = 0; j < SIZE / 8; j++) {
90 change = random() % SIZE;
91 s2[change] = random() & 0xff;
92 }
93
94 test_one(s1, s2);
95 }
96
97 return 0;
98}
99
100int main(void)
101{
102 return test_harness(testcase, "memcmp");
103}
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_64.S b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
new file mode 120000
index 000000000000..9bc87e438ae9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
@@ -0,0 +1 @@
../../../../../arch/powerpc/lib/memcmp_64.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
new file mode 100644
index 000000000000..33d02cc54a3e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -0,0 +1 @@
tm-resched-dscr
diff --git a/tools/usb/ffs-aio-example/multibuff/host_app/test.c b/tools/usb/ffs-aio-example/multibuff/host_app/test.c
index daa3abe6bebd..2cbcce6e8dd7 100644
--- a/tools/usb/ffs-aio-example/multibuff/host_app/test.c
+++ b/tools/usb/ffs-aio-example/multibuff/host_app/test.c
@@ -33,11 +33,6 @@
33#define VENDOR 0x1d6b 33#define VENDOR 0x1d6b
34#define PRODUCT 0x0105 34#define PRODUCT 0x0105
35 35
36/* endpoints indexes */
37
38#define EP_BULK_IN (1 | LIBUSB_ENDPOINT_IN)
39#define EP_BULK_OUT (2 | LIBUSB_ENDPOINT_OUT)
40
41#define BUF_LEN 8192 36#define BUF_LEN 8192
42 37
43/* 38/*
@@ -159,14 +154,21 @@ void test_exit(struct test_state *state)
159int main(void) 154int main(void)
160{ 155{
161 struct test_state state; 156 struct test_state state;
157 struct libusb_config_descriptor *conf;
158 struct libusb_interface_descriptor const *iface;
159 unsigned char addr;
162 160
163 if (test_init(&state)) 161 if (test_init(&state))
164 return 1; 162 return 1;
165 163
164 libusb_get_config_descriptor(state.found, 0, &conf);
165 iface = &conf->interface[0].altsetting[0];
166 addr = iface->endpoint[0].bEndpointAddress;
167
166 while (1) { 168 while (1) {
167 static unsigned char buffer[BUF_LEN]; 169 static unsigned char buffer[BUF_LEN];
168 int bytes; 170 int bytes;
169 libusb_bulk_transfer(state.handle, EP_BULK_IN, buffer, BUF_LEN, 171 libusb_bulk_transfer(state.handle, addr, buffer, BUF_LEN,
170 &bytes, 500); 172 &bytes, 500);
171 } 173 }
172 test_exit(&state); 174 test_exit(&state);
diff --git a/tools/usb/ffs-aio-example/simple/device_app/aio_simple.c b/tools/usb/ffs-aio-example/simple/device_app/aio_simple.c
index adc310a6d489..1f44a29818bf 100644
--- a/tools/usb/ffs-aio-example/simple/device_app/aio_simple.c
+++ b/tools/usb/ffs-aio-example/simple/device_app/aio_simple.c
@@ -103,12 +103,14 @@ static const struct {
103 .bDescriptorType = USB_DT_ENDPOINT, 103 .bDescriptorType = USB_DT_ENDPOINT,
104 .bEndpointAddress = 1 | USB_DIR_IN, 104 .bEndpointAddress = 1 | USB_DIR_IN,
105 .bmAttributes = USB_ENDPOINT_XFER_BULK, 105 .bmAttributes = USB_ENDPOINT_XFER_BULK,
106 .wMaxPacketSize = htole16(512),
106 }, 107 },
107 .bulk_source = { 108 .bulk_source = {
108 .bLength = sizeof(descriptors.hs_descs.bulk_source), 109 .bLength = sizeof(descriptors.hs_descs.bulk_source),
109 .bDescriptorType = USB_DT_ENDPOINT, 110 .bDescriptorType = USB_DT_ENDPOINT,
110 .bEndpointAddress = 2 | USB_DIR_OUT, 111 .bEndpointAddress = 2 | USB_DIR_OUT,
111 .bmAttributes = USB_ENDPOINT_XFER_BULK, 112 .bmAttributes = USB_ENDPOINT_XFER_BULK,
113 .wMaxPacketSize = htole16(512),
112 }, 114 },
113 }, 115 },
114}; 116};
diff --git a/tools/usb/ffs-aio-example/simple/host_app/test.c b/tools/usb/ffs-aio-example/simple/host_app/test.c
index acd6332811f3..aed86ffff280 100644
--- a/tools/usb/ffs-aio-example/simple/host_app/test.c
+++ b/tools/usb/ffs-aio-example/simple/host_app/test.c
@@ -33,11 +33,6 @@
33#define VENDOR 0x1d6b 33#define VENDOR 0x1d6b
34#define PRODUCT 0x0105 34#define PRODUCT 0x0105
35 35
36/* endpoints indexes */
37
38#define EP_BULK_IN (1 | LIBUSB_ENDPOINT_IN)
39#define EP_BULK_OUT (2 | LIBUSB_ENDPOINT_OUT)
40
41#define BUF_LEN 8192 36#define BUF_LEN 8192
42 37
43/* 38/*
@@ -159,16 +154,24 @@ void test_exit(struct test_state *state)
159int main(void) 154int main(void)
160{ 155{
161 struct test_state state; 156 struct test_state state;
157 struct libusb_config_descriptor *conf;
158 struct libusb_interface_descriptor const *iface;
159 unsigned char in_addr, out_addr;
162 160
163 if (test_init(&state)) 161 if (test_init(&state))
164 return 1; 162 return 1;
165 163
164 libusb_get_config_descriptor(state.found, 0, &conf);
165 iface = &conf->interface[0].altsetting[0];
166 in_addr = iface->endpoint[0].bEndpointAddress;
167 out_addr = iface->endpoint[1].bEndpointAddress;
168
166 while (1) { 169 while (1) {
167 static unsigned char buffer[BUF_LEN]; 170 static unsigned char buffer[BUF_LEN];
168 int bytes; 171 int bytes;
169 libusb_bulk_transfer(state.handle, EP_BULK_IN, buffer, BUF_LEN, 172 libusb_bulk_transfer(state.handle, in_addr, buffer, BUF_LEN,
170 &bytes, 500); 173 &bytes, 500);
171 libusb_bulk_transfer(state.handle, EP_BULK_OUT, buffer, BUF_LEN, 174 libusb_bulk_transfer(state.handle, out_addr, buffer, BUF_LEN,
172 &bytes, 500); 175 &bytes, 500);
173 } 176 }
174 test_exit(&state); 177 test_exit(&state);
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 264fbc297e0b..8bdf16b8ba60 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -133,6 +133,7 @@ static const char * const page_flag_names[] = {
133 [KPF_KSM] = "x:ksm", 133 [KPF_KSM] = "x:ksm",
134 [KPF_THP] = "t:thp", 134 [KPF_THP] = "t:thp",
135 [KPF_BALLOON] = "o:balloon", 135 [KPF_BALLOON] = "o:balloon",
136 [KPF_ZERO_PAGE] = "z:zero_page",
136 137
137 [KPF_RESERVED] = "r:reserved", 138 [KPF_RESERVED] = "r:reserved",
138 [KPF_MLOCKED] = "m:mlocked", 139 [KPF_MLOCKED] = "m:mlocked",