aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/kernel-parameters.txt3
-rw-r--r--Documentation/lguest/lguest.c178
-rw-r--r--Documentation/networking/ip-sysctl.txt6
-rw-r--r--Documentation/networking/tc-actions-env-rules.txt29
4 files changed, 140 insertions, 76 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a13d69b2217d..8ae5fac08dfa 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1444,7 +1444,8 @@ and is between 256 and 4096 characters. It is defined in the file
1444 Param: "schedule" - profile schedule points. 1444 Param: "schedule" - profile schedule points.
1445 Param: <number> - step/bucket size as a power of 2 for 1445 Param: <number> - step/bucket size as a power of 2 for
1446 statistical time based profiling. 1446 statistical time based profiling.
1447 Param: "sleep" - profile D-state sleeping (millisecs) 1447 Param: "sleep" - profile D-state sleeping (millisecs).
1448 Requires CONFIG_SCHEDSTATS
1448 Param: "kvm" - profile VM exits. 1449 Param: "kvm" - profile VM exits.
1449 1450
1450 processor.max_cstate= [HW,ACPI] 1451 processor.max_cstate= [HW,ACPI]
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 5bdc37f81842..f2668390e8f7 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -34,25 +34,24 @@
34#include <zlib.h> 34#include <zlib.h>
35#include <assert.h> 35#include <assert.h>
36#include <sched.h> 36#include <sched.h>
37/*L:110 We can ignore the 30 include files we need for this program, but I do
38 * want to draw attention to the use of kernel-style types.
39 *
40 * As Linus said, "C is a Spartan language, and so should your naming be." I
41 * like these abbreviations and the header we need uses them, so we define them
42 * here.
43 */
44typedef unsigned long long u64;
45typedef uint32_t u32;
46typedef uint16_t u16;
47typedef uint8_t u8;
48#include "linux/lguest_launcher.h" 37#include "linux/lguest_launcher.h"
49#include "linux/pci_ids.h"
50#include "linux/virtio_config.h" 38#include "linux/virtio_config.h"
51#include "linux/virtio_net.h" 39#include "linux/virtio_net.h"
52#include "linux/virtio_blk.h" 40#include "linux/virtio_blk.h"
53#include "linux/virtio_console.h" 41#include "linux/virtio_console.h"
54#include "linux/virtio_ring.h" 42#include "linux/virtio_ring.h"
55#include "asm-x86/bootparam.h" 43#include "asm-x86/bootparam.h"
44/*L:110 We can ignore the 38 include files we need for this program, but I do
45 * want to draw attention to the use of kernel-style types.
46 *
47 * As Linus said, "C is a Spartan language, and so should your naming be." I
48 * like these abbreviations, so we define them here. Note that u64 is always
49 * unsigned long long, which works on all Linux systems: this means that we can
50 * use %llu in printf for any u64. */
51typedef unsigned long long u64;
52typedef uint32_t u32;
53typedef uint16_t u16;
54typedef uint8_t u8;
56/*:*/ 55/*:*/
57 56
58#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 57#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
@@ -361,8 +360,8 @@ static unsigned long load_bzimage(int fd)
361} 360}
362 361
363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 362/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
364 * come wrapped up in the self-decompressing "bzImage" format. With some funky 363 * come wrapped up in the self-decompressing "bzImage" format. With a little
365 * coding, we can load those, too. */ 364 * work, we can load those, too. */
366static unsigned long load_kernel(int fd) 365static unsigned long load_kernel(int fd)
367{ 366{
368 Elf32_Ehdr hdr; 367 Elf32_Ehdr hdr;
@@ -465,6 +464,7 @@ static unsigned long setup_pagetables(unsigned long mem,
465 * to know where it is. */ 464 * to know where it is. */
466 return to_guest_phys(pgdir); 465 return to_guest_phys(pgdir);
467} 466}
467/*:*/
468 468
469/* Simple routine to roll all the commandline arguments together with spaces 469/* Simple routine to roll all the commandline arguments together with spaces
470 * between them. */ 470 * between them. */
@@ -481,9 +481,9 @@ static void concat(char *dst, char *args[])
481 dst[len] = '\0'; 481 dst[len] = '\0';
482} 482}
483 483
484/* This is where we actually tell the kernel to initialize the Guest. We saw 484/*L:185 This is where we actually tell the kernel to initialize the Guest. We
485 * the arguments it expects when we looked at initialize() in lguest_user.c: 485 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
486 * the base of guest "physical" memory, the top physical page to allow, the 486 * the base of Guest "physical" memory, the top physical page to allow, the
487 * top level pagetable and the entry point for the Guest. */ 487 * top level pagetable and the entry point for the Guest. */
488static int tell_kernel(unsigned long pgdir, unsigned long start) 488static int tell_kernel(unsigned long pgdir, unsigned long start)
489{ 489{
@@ -513,13 +513,14 @@ static void add_device_fd(int fd)
513/*L:200 513/*L:200
514 * The Waker. 514 * The Waker.
515 * 515 *
516 * With a console and network devices, we can have lots of input which we need 516 * With console, block and network devices, we can have lots of input which we
517 * to process. We could try to tell the kernel what file descriptors to watch, 517 * need to process. We could try to tell the kernel what file descriptors to
518 * but handing a file descriptor mask through to the kernel is fairly icky. 518 * watch, but handing a file descriptor mask through to the kernel is fairly
519 * icky.
519 * 520 *
520 * Instead, we fork off a process which watches the file descriptors and writes 521 * Instead, we fork off a process which watches the file descriptors and writes
521 * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host 522 * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
522 * loop to stop running the Guest. This causes it to return from the 523 * stop running the Guest. This causes the Launcher to return from the
523 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset 524 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
524 * the LHREQ_BREAK and wake us up again. 525 * the LHREQ_BREAK and wake us up again.
525 * 526 *
@@ -545,7 +546,9 @@ static void wake_parent(int pipefd, int lguest_fd)
545 if (read(pipefd, &fd, sizeof(fd)) == 0) 546 if (read(pipefd, &fd, sizeof(fd)) == 0)
546 exit(0); 547 exit(0);
547 /* Otherwise it's telling us to change what file 548 /* Otherwise it's telling us to change what file
548 * descriptors we're to listen to. */ 549 * descriptors we're to listen to. Positive means
550 * listen to a new one, negative means stop
551 * listening. */
549 if (fd >= 0) 552 if (fd >= 0)
550 FD_SET(fd, &devices.infds); 553 FD_SET(fd, &devices.infds);
551 else 554 else
@@ -560,7 +563,7 @@ static int setup_waker(int lguest_fd)
560{ 563{
561 int pipefd[2], child; 564 int pipefd[2], child;
562 565
563 /* We create a pipe to talk to the waker, and also so it knows when the 566 /* We create a pipe to talk to the Waker, and also so it knows when the
564 * Launcher dies (and closes pipe). */ 567 * Launcher dies (and closes pipe). */
565 pipe(pipefd); 568 pipe(pipefd);
566 child = fork(); 569 child = fork();
@@ -568,7 +571,8 @@ static int setup_waker(int lguest_fd)
568 err(1, "forking"); 571 err(1, "forking");
569 572
570 if (child == 0) { 573 if (child == 0) {
571 /* Close the "writing" end of our copy of the pipe */ 574 /* We are the Waker: close the "writing" end of our copy of the
575 * pipe and start waiting for input. */
572 close(pipefd[1]); 576 close(pipefd[1]);
573 wake_parent(pipefd[0], lguest_fd); 577 wake_parent(pipefd[0], lguest_fd);
574 } 578 }
@@ -579,12 +583,12 @@ static int setup_waker(int lguest_fd)
579 return pipefd[1]; 583 return pipefd[1];
580} 584}
581 585
582/*L:210 586/*
583 * Device Handling. 587 * Device Handling.
584 * 588 *
585 * When the Guest sends DMA to us, it sends us an array of addresses and sizes. 589 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
586 * We need to make sure it's not trying to reach into the Launcher itself, so 590 * We need to make sure it's not trying to reach into the Launcher itself, so
587 * we have a convenient routine which check it and exits with an error message 591 * we have a convenient routine which checks it and exits with an error message
588 * if something funny is going on: 592 * if something funny is going on:
589 */ 593 */
590static void *_check_pointer(unsigned long addr, unsigned int size, 594static void *_check_pointer(unsigned long addr, unsigned int size,
@@ -601,7 +605,9 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
601/* A macro which transparently hands the line number to the real function. */ 605/* A macro which transparently hands the line number to the real function. */
602#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 606#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
603 607
604/* This function returns the next descriptor in the chain, or vq->vring.num. */ 608/* Each buffer in the virtqueues is actually a chain of descriptors. This
609 * function returns the next descriptor in the chain, or vq->vring.num if we're
610 * at the end. */
605static unsigned next_desc(struct virtqueue *vq, unsigned int i) 611static unsigned next_desc(struct virtqueue *vq, unsigned int i)
606{ 612{
607 unsigned int next; 613 unsigned int next;
@@ -680,13 +686,14 @@ static unsigned get_vq_desc(struct virtqueue *vq,
680 return head; 686 return head;
681} 687}
682 688
683/* Once we've used one of their buffers, we tell them about it. We'll then 689/* After we've used one of their buffers, we tell them about it. We'll then
684 * want to send them an interrupt, using trigger_irq(). */ 690 * want to send them an interrupt, using trigger_irq(). */
685static void add_used(struct virtqueue *vq, unsigned int head, int len) 691static void add_used(struct virtqueue *vq, unsigned int head, int len)
686{ 692{
687 struct vring_used_elem *used; 693 struct vring_used_elem *used;
688 694
689 /* Get a pointer to the next entry in the used ring. */ 695 /* The virtqueue contains a ring of used buffers. Get a pointer to the
696 * next entry in that used ring. */
690 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 697 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
691 used->id = head; 698 used->id = head;
692 used->len = len; 699 used->len = len;
@@ -700,6 +707,7 @@ static void trigger_irq(int fd, struct virtqueue *vq)
700{ 707{
701 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 708 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
702 709
710 /* If they don't want an interrupt, don't send one. */
703 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) 711 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
704 return; 712 return;
705 713
@@ -716,8 +724,11 @@ static void add_used_and_trigger(int fd, struct virtqueue *vq,
716 trigger_irq(fd, vq); 724 trigger_irq(fd, vq);
717} 725}
718 726
719/* Here is the input terminal setting we save, and the routine to restore them 727/*
720 * on exit so the user can see what they type next. */ 728 * The Console
729 *
730 * Here is the input terminal setting we save, and the routine to restore them
731 * on exit so the user gets their terminal back. */
721static struct termios orig_term; 732static struct termios orig_term;
722static void restore_term(void) 733static void restore_term(void)
723{ 734{
@@ -818,7 +829,10 @@ static void handle_console_output(int fd, struct virtqueue *vq)
818 } 829 }
819} 830}
820 831
821/* Handling output for network is also simple: we get all the output buffers 832/*
833 * The Network
834 *
835 * Handling output for network is also simple: we get all the output buffers
822 * and write them (ignoring the first element) to this device's file descriptor 836 * and write them (ignoring the first element) to this device's file descriptor
823 * (stdout). */ 837 * (stdout). */
824static void handle_net_output(int fd, struct virtqueue *vq) 838static void handle_net_output(int fd, struct virtqueue *vq)
@@ -831,8 +845,9 @@ static void handle_net_output(int fd, struct virtqueue *vq)
831 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 845 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
832 if (in) 846 if (in)
833 errx(1, "Input buffers in output queue?"); 847 errx(1, "Input buffers in output queue?");
834 /* Check header, but otherwise ignore it (we said we supported 848 /* Check header, but otherwise ignore it (we told the Guest we
835 * no features). */ 849 * supported no features, so it shouldn't have anything
850 * interesting). */
836 (void)convert(&iov[0], struct virtio_net_hdr); 851 (void)convert(&iov[0], struct virtio_net_hdr);
837 len = writev(vq->dev->fd, iov+1, out-1); 852 len = writev(vq->dev->fd, iov+1, out-1);
838 add_used_and_trigger(fd, vq, head, len); 853 add_used_and_trigger(fd, vq, head, len);
@@ -883,7 +898,8 @@ static bool handle_tun_input(int fd, struct device *dev)
883 return true; 898 return true;
884} 899}
885 900
886/* This callback ensures we try again, in case we stopped console or net 901/*L:215 This is the callback attached to the network and console input
902 * virtqueues: it ensures we try again, in case we stopped console or net
887 * delivery because Guest didn't have any buffers. */ 903 * delivery because Guest didn't have any buffers. */
888static void enable_fd(int fd, struct virtqueue *vq) 904static void enable_fd(int fd, struct virtqueue *vq)
889{ 905{
@@ -919,7 +935,7 @@ static void handle_output(int fd, unsigned long addr)
919 strnlen(from_guest_phys(addr), guest_limit - addr)); 935 strnlen(from_guest_phys(addr), guest_limit - addr));
920} 936}
921 937
922/* This is called when the waker wakes us up: check for incoming file 938/* This is called when the Waker wakes us up: check for incoming file
923 * descriptors. */ 939 * descriptors. */
924static void handle_input(int fd) 940static void handle_input(int fd)
925{ 941{
@@ -986,8 +1002,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
986} 1002}
987 1003
988/* Each device descriptor is followed by some configuration information. 1004/* Each device descriptor is followed by some configuration information.
989 * The first byte is a "status" byte for the Guest to report what's happening. 1005 * Each configuration field looks like: u8 type, u8 len, [... len bytes...].
990 * After that are fields: u8 type, u8 len, [... len bytes...].
991 * 1006 *
992 * This routine adds a new field to an existing device's descriptor. It only 1007 * This routine adds a new field to an existing device's descriptor. It only
993 * works for the last device, but that's OK because that's how we use it. */ 1008 * works for the last device, but that's OK because that's how we use it. */
@@ -1044,14 +1059,17 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1044 /* Link virtqueue back to device. */ 1059 /* Link virtqueue back to device. */
1045 vq->dev = dev; 1060 vq->dev = dev;
1046 1061
1047 /* Set up handler. */ 1062 /* Set the routine to call when the Guest does something to this
1063 * virtqueue. */
1048 vq->handle_output = handle_output; 1064 vq->handle_output = handle_output;
1065
1066 /* Set the "Don't Notify Me" flag if we don't have a handler */
1049 if (!handle_output) 1067 if (!handle_output)
1050 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; 1068 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1051} 1069}
1052 1070
1053/* This routine does all the creation and setup of a new device, including 1071/* This routine does all the creation and setup of a new device, including
1054 * caling new_dev_desc() to allocate the descriptor and device memory. */ 1072 * calling new_dev_desc() to allocate the descriptor and device memory. */
1055static struct device *new_device(const char *name, u16 type, int fd, 1073static struct device *new_device(const char *name, u16 type, int fd,
1056 bool (*handle_input)(int, struct device *)) 1074 bool (*handle_input)(int, struct device *))
1057{ 1075{
@@ -1060,7 +1078,7 @@ static struct device *new_device(const char *name, u16 type, int fd,
1060 /* Append to device list. Prepending to a single-linked list is 1078 /* Append to device list. Prepending to a single-linked list is
1061 * easier, but the user expects the devices to be arranged on the bus 1079 * easier, but the user expects the devices to be arranged on the bus
1062 * in command-line order. The first network device on the command line 1080 * in command-line order. The first network device on the command line
1063 * is eth0, the first block device /dev/lgba, etc. */ 1081 * is eth0, the first block device /dev/vda, etc. */
1064 *devices.lastdev = dev; 1082 *devices.lastdev = dev;
1065 dev->next = NULL; 1083 dev->next = NULL;
1066 devices.lastdev = &dev->next; 1084 devices.lastdev = &dev->next;
@@ -1104,7 +1122,7 @@ static void setup_console(void)
1104 /* The console needs two virtqueues: the input then the output. When 1122 /* The console needs two virtqueues: the input then the output. When
1105 * they put something in the input queue, we make sure we're listening to 1123 * they put something in the input queue, we make sure we're listening to
1106 * stdin. When they put something in the output queue, we write it to 1124 * stdin. When they put something in the output queue, we write it to
1107 * stdout. */ 1125 * stdout. */
1108 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1126 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1109 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); 1127 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
1110 1128
@@ -1252,21 +1270,17 @@ static void setup_tun_net(const char *arg)
1252 verbose("attached to bridge: %s\n", br_name); 1270 verbose("attached to bridge: %s\n", br_name);
1253} 1271}
1254 1272
1255 1273/* Our block (disk) device should be really simple: the Guest asks for a block
1256/* 1274 * number and we read or write that position in the file. Unfortunately, that
1257 * Block device. 1275 * was amazingly slow: the Guest waits until the read is finished before
1276 * running anything else, even if it could have been doing useful work.
1258 * 1277 *
1259 * Serving a block device is really easy: the Guest asks for a block number and 1278 * We could use async I/O, except it's reputed to suck so hard that characters
1260 * we read or write that position in the file. 1279 * actually go missing from your code when you try to use it.
1261 *
1262 * Unfortunately, this is amazingly slow: the Guest waits until the read is
1263 * finished before running anything else, even if it could be doing useful
1264 * work. We could use async I/O, except it's reputed to suck so hard that
1265 * characters actually go missing from your code when you try to use it.
1266 * 1280 *
1267 * So we farm the I/O out to a thread, and communicate with it via a pipe. */ 1281 * So we farm the I/O out to a thread, and communicate with it via a pipe. */
1268 1282
1269/* This hangs off device->priv, with the data. */ 1283/* This hangs off device->priv. */
1270struct vblk_info 1284struct vblk_info
1271{ 1285{
1272 /* The size of the file. */ 1286 /* The size of the file. */
@@ -1282,8 +1296,14 @@ struct vblk_info
1282 * Launcher triggers interrupt to Guest. */ 1296 * Launcher triggers interrupt to Guest. */
1283 int done_fd; 1297 int done_fd;
1284}; 1298};
1299/*:*/
1285 1300
1286/* This is the core of the I/O thread. It returns true if it did something. */ 1301/*L:210
1302 * The Disk
1303 *
1304 * Remember that the block device is handled by a separate I/O thread. We head
1305 * straight into the core of that thread here:
1306 */
1287static bool service_io(struct device *dev) 1307static bool service_io(struct device *dev)
1288{ 1308{
1289 struct vblk_info *vblk = dev->priv; 1309 struct vblk_info *vblk = dev->priv;
@@ -1294,10 +1314,14 @@ static bool service_io(struct device *dev)
1294 struct iovec iov[dev->vq->vring.num]; 1314 struct iovec iov[dev->vq->vring.num];
1295 off64_t off; 1315 off64_t off;
1296 1316
1317 /* See if there's a request waiting. If not, nothing to do. */
1297 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1318 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
1298 if (head == dev->vq->vring.num) 1319 if (head == dev->vq->vring.num)
1299 return false; 1320 return false;
1300 1321
1322 /* Every block request should contain at least one output buffer
1323 * (detailing the location on disk and the type of request) and one
1324 * input buffer (to hold the result). */
1301 if (out_num == 0 || in_num == 0) 1325 if (out_num == 0 || in_num == 0)
1302 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1326 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1303 head, out_num, in_num); 1327 head, out_num, in_num);
@@ -1306,10 +1330,15 @@ static bool service_io(struct device *dev)
1306 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); 1330 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
1307 off = out->sector * 512; 1331 off = out->sector * 512;
1308 1332
1309 /* This is how we implement barriers. Pretty poor, no? */ 1333 /* The block device implements "barriers", where the Guest indicates
1334 * that it wants all previous writes to occur before this write. We
1335 * don't have a way of asking our kernel to do a barrier, so we just
1336 * synchronize all the data in the file. Pretty poor, no? */
1310 if (out->type & VIRTIO_BLK_T_BARRIER) 1337 if (out->type & VIRTIO_BLK_T_BARRIER)
1311 fdatasync(vblk->fd); 1338 fdatasync(vblk->fd);
1312 1339
1340 /* In general the virtio block driver is allowed to try SCSI commands.
1341 * It'd be nice if we supported eject, for example, but we don't. */
1313 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1342 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1314 fprintf(stderr, "Scsi commands unsupported\n"); 1343 fprintf(stderr, "Scsi commands unsupported\n");
1315 in->status = VIRTIO_BLK_S_UNSUPP; 1344 in->status = VIRTIO_BLK_S_UNSUPP;
@@ -1375,7 +1404,7 @@ static int io_thread(void *_dev)
1375 1404
1376 /* When this read fails, it means Launcher died, so we follow. */ 1405 /* When this read fails, it means Launcher died, so we follow. */
1377 while (read(vblk->workpipe[0], &c, 1) == 1) { 1406 while (read(vblk->workpipe[0], &c, 1) == 1) {
1378 /* We acknowledge each request immediately, to reduce latency, 1407 /* We acknowledge each request immediately to reduce latency,
1379 * rather than waiting until we've done them all. I haven't 1408 * rather than waiting until we've done them all. I haven't
1380 * measured to see if it makes any difference. */ 1409 * measured to see if it makes any difference. */
1381 while (service_io(dev)) 1410 while (service_io(dev))
@@ -1384,12 +1413,14 @@ static int io_thread(void *_dev)
1384 return 0; 1413 return 0;
1385} 1414}
1386 1415
1387/* When the thread says some I/O is done, we interrupt the Guest. */ 1416/* Now we've seen the I/O thread, we return to the Launcher to see what happens
1417 * when the thread tells us it's completed some I/O. */
1388static bool handle_io_finish(int fd, struct device *dev) 1418static bool handle_io_finish(int fd, struct device *dev)
1389{ 1419{
1390 char c; 1420 char c;
1391 1421
1392 /* If child died, presumably it printed message. */ 1422 /* If the I/O thread died, presumably it printed the error, so we
1423 * simply exit. */
1393 if (read(dev->fd, &c, 1) != 1) 1424 if (read(dev->fd, &c, 1) != 1)
1394 exit(1); 1425 exit(1);
1395 1426
@@ -1398,7 +1429,7 @@ static bool handle_io_finish(int fd, struct device *dev)
1398 return true; 1429 return true;
1399} 1430}
1400 1431
1401/* When the Guest submits some I/O, we wake the I/O thread. */ 1432/* When the Guest submits some I/O, we just need to wake the I/O thread. */
1402static void handle_virtblk_output(int fd, struct virtqueue *vq) 1433static void handle_virtblk_output(int fd, struct virtqueue *vq)
1403{ 1434{
1404 struct vblk_info *vblk = vq->dev->priv; 1435 struct vblk_info *vblk = vq->dev->priv;
@@ -1410,7 +1441,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq)
1410 exit(1); 1441 exit(1);
1411} 1442}
1412 1443
1413/* This creates a virtual block device. */ 1444/*L:198 This actually sets up a virtual block device. */
1414static void setup_block_file(const char *filename) 1445static void setup_block_file(const char *filename)
1415{ 1446{
1416 int p[2]; 1447 int p[2];
@@ -1426,7 +1457,7 @@ static void setup_block_file(const char *filename)
1426 /* The device responds to return from I/O thread. */ 1457 /* The device responds to return from I/O thread. */
1427 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1458 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
1428 1459
1429 /* The device has a virtqueue. */ 1460 /* The device has one virtqueue, where the Guest places requests. */
1430 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1461 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
1431 1462
1432 /* Allocate the room for our own bookkeeping */ 1463 /* Allocate the room for our own bookkeeping */
@@ -1448,7 +1479,8 @@ static void setup_block_file(const char *filename)
1448 /* The I/O thread writes to this end of the pipe when done. */ 1479 /* The I/O thread writes to this end of the pipe when done. */
1449 vblk->done_fd = p[1]; 1480 vblk->done_fd = p[1];
1450 1481
1451 /* This is how we tell the I/O thread about more work. */ 1482 /* This is the second pipe, which is how we tell the I/O thread about
1483 * more work. */
1452 pipe(vblk->workpipe); 1484 pipe(vblk->workpipe);
1453 1485
1454 /* Create stack for thread and run it */ 1486 /* Create stack for thread and run it */
@@ -1487,24 +1519,25 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1487 char reason[1024] = { 0 }; 1519 char reason[1024] = { 0 };
1488 read(lguest_fd, reason, sizeof(reason)-1); 1520 read(lguest_fd, reason, sizeof(reason)-1);
1489 errx(1, "%s", reason); 1521 errx(1, "%s", reason);
1490 /* EAGAIN means the waker wanted us to look at some input. 1522 /* EAGAIN means the Waker wanted us to look at some input.
1491 * Anything else means a bug or incompatible change. */ 1523 * Anything else means a bug or incompatible change. */
1492 } else if (errno != EAGAIN) 1524 } else if (errno != EAGAIN)
1493 err(1, "Running guest failed"); 1525 err(1, "Running guest failed");
1494 1526
1495 /* Service input, then unset the BREAK which releases 1527 /* Service input, then unset the BREAK to release the Waker. */
1496 * the Waker. */
1497 handle_input(lguest_fd); 1528 handle_input(lguest_fd);
1498 if (write(lguest_fd, args, sizeof(args)) < 0) 1529 if (write(lguest_fd, args, sizeof(args)) < 0)
1499 err(1, "Resetting break"); 1530 err(1, "Resetting break");
1500 } 1531 }
1501} 1532}
1502/* 1533/*
1503 * This is the end of the Launcher. 1534 * This is the end of the Launcher. The good news: we are over halfway
1535 * through! The bad news: the most fiendish part of the code still lies ahead
1536 * of us.
1504 * 1537 *
1505 * But wait! We've seen I/O from the Launcher, and we've seen I/O from the 1538 * Are you ready? Take a deep breath and join me in the core of the Host, in
1506 * Drivers. If we were to see the Host kernel I/O code, our understanding 1539 * "make Host".
1507 * would be complete... :*/ 1540 :*/
1508 1541
1509static struct option opts[] = { 1542static struct option opts[] = {
1510 { "verbose", 0, NULL, 'v' }, 1543 { "verbose", 0, NULL, 'v' },
@@ -1527,7 +1560,7 @@ int main(int argc, char *argv[])
1527 /* Memory, top-level pagetable, code startpoint and size of the 1560 /* Memory, top-level pagetable, code startpoint and size of the
1528 * (optional) initrd. */ 1561 * (optional) initrd. */
1529 unsigned long mem = 0, pgdir, start, initrd_size = 0; 1562 unsigned long mem = 0, pgdir, start, initrd_size = 0;
1530 /* A temporary and the /dev/lguest file descriptor. */ 1563 /* Two temporaries and the /dev/lguest file descriptor. */
1531 int i, c, lguest_fd; 1564 int i, c, lguest_fd;
1532 /* The boot information for the Guest. */ 1565 /* The boot information for the Guest. */
1533 struct boot_params *boot; 1566 struct boot_params *boot;
@@ -1622,6 +1655,7 @@ int main(int argc, char *argv[])
1622 /* The boot header contains a command line pointer: we put the command 1655 /* The boot header contains a command line pointer: we put the command
1623 * line after the boot header. */ 1656 * line after the boot header. */
1624 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1657 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1658 /* We use a simple helper to copy the arguments separated by spaces. */
1625 concat((char *)(boot + 1), argv+optind+2); 1659 concat((char *)(boot + 1), argv+optind+2);
1626 1660
1627 /* Boot protocol version: 2.07 supports the fields for lguest. */ 1661 /* Boot protocol version: 2.07 supports the fields for lguest. */
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 747a5d15d529..6f7872ba1def 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -184,14 +184,14 @@ tcp_frto - INTEGER
184 F-RTO is an enhanced recovery algorithm for TCP retransmission 184 F-RTO is an enhanced recovery algorithm for TCP retransmission
185 timeouts. It is particularly beneficial in wireless environments 185 timeouts. It is particularly beneficial in wireless environments
186 where packet loss is typically due to random radio interference 186 where packet loss is typically due to random radio interference
187 rather than intermediate router congestion. FRTO is sender-side 187 rather than intermediate router congestion. F-RTO is sender-side
188 only modification. Therefore it does not require any support from 188 only modification. Therefore it does not require any support from
189 the peer, but in a typical case, however, where wireless link is 189 the peer, but in a typical case, however, where wireless link is
190 the local access link and most of the data flows downlink, the 190 the local access link and most of the data flows downlink, the
191 faraway servers should have FRTO enabled to take advantage of it. 191 faraway servers should have F-RTO enabled to take advantage of it.
192 If set to 1, basic version is enabled. 2 enables SACK enhanced 192 If set to 1, basic version is enabled. 2 enables SACK enhanced
193 F-RTO if flow uses SACK. The basic version can be used also when 193 F-RTO if flow uses SACK. The basic version can be used also when
194 SACK is in use though scenario(s) with it exists where FRTO 194 SACK is in use though scenario(s) with it exists where F-RTO
195 interacts badly with the packet counting of the SACK enabled TCP 195 interacts badly with the packet counting of the SACK enabled TCP
196 flow. 196 flow.
197 197
diff --git a/Documentation/networking/tc-actions-env-rules.txt b/Documentation/networking/tc-actions-env-rules.txt
new file mode 100644
index 000000000000..01e716d185f4
--- /dev/null
+++ b/Documentation/networking/tc-actions-env-rules.txt
@@ -0,0 +1,29 @@
1
2The "environmental" rules for authors of any new tc actions are:
3
41) If you stealeth or borroweth any packet thou shalt be branching
5from the righteous path and thou shalt cloneth.
6
7For example if your action queues a packet to be processed later
8or intentionally branches by redirecting a packet then you need to
9clone the packet.
10There are certain fields in the skb tc_verd that need to be reset so we
11avoid loops etc. A few are generic enough that skb_act_clone()
12resets them for you, so invoke skb_act_clone() rather than skb_clone().
13
142) If you munge any packet thou shalt call pskb_expand_head in the case
15someone else is referencing the skb. After that you "own" the skb.
16You must also tell us if it is ok to munge the packet (TC_OK2MUNGE),
17this way any action downstream can stomp on the packet.
18
193) Dropping packets you don't own is a no-no. You simply return
20TC_ACT_SHOT to the caller and they will drop it.
21
22The "environmental" rules for callers of actions (qdiscs etc) are:
23
24*) thou art responsible for freeing anything returned as being
25TC_ACT_SHOT/STOLEN/QUEUED. If none of TC_ACT_SHOT/STOLEN/QUEUED is
26returned then all is great and you don't need to do anything.
27
28Post on netdev if something is unclear.
29