82 files changed, 2515 insertions, 1492 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 1a7f53068ec2..054a7ecf64c6 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -165,7 +165,7 @@ quiet_cmd_db2man = MAN     $@
        @touch $@
 ###
-# Rules to generate postscripts and PNG imgages from .fig format files
+# Rules to generate postscripts and PNG images from .fig format files
 quiet_cmd_fig2eps = FIG2EPS $@
      cmd_fig2eps = fig2dev -Leps $< $@
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index d3290c46af51..aa38cc5692a0 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -46,7 +46,7 @@
     <sect1><title>Atomic and pointer manipulation</title>
 !Iinclude/asm-x86/atomic_32.h
-!Iinclude/asm-x86/unaligned_32.h
+!Iinclude/asm-x86/unaligned.h
     </sect1>
     <sect1><title>Delaying, scheduling, and timer routines</title>
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 6fbc41d98c1e..957cf5c26831 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -282,7 +282,7 @@ int __init board_init (void)
                goto out;
        }
-        /* map physical adress */
+        /* map physical address */
        baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
        if(!baseaddr){
                printk("Ioremap to access NAND chip failed\n");
@@ -306,7 +306,7 @@ int __init board_init (void)
        this->dev_ready = board_dev_ready;
        this->eccmode = NAND_ECC_SOFT;
-        /* Scan to find existance of the device */
+        /* Scan to find existence of the device */
        if (nand_scan (board_mtd, 1)) {
                err = -ENXIO;
                goto out_ior;
@@ -340,7 +340,7 @@ static void __exit board_cleanup (void)
        /* Release resources, unregister device */
        nand_release (board_mtd);
-        /* unmap physical adress */
+        /* unmap physical address */
        iounmap((void *)baseaddr);
        
        /* Free the MTD device structure */
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 24dc3fcf1594..bc38283379f0 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -441,17 +441,20 @@ ACPI, and if none of those then a KCS device at the spec-specified
 0xca2.  If you want to turn this off, set the "trydefaults" option to
 false.
-If you have high-res timers compiled into the kernel, the driver will
+If your IPMI interface does not support interrupts and is a KCS or
-use them to provide much better performance.  Note that if you do not
+SMIC interface, the IPMI driver will start a kernel thread for the
-have high-res timers enabled in the kernel and you don't have
+interface to help speed things up.  This is a low-priority kernel
-interrupts enabled, the driver will run VERY slowly.  Don't blame me,
+thread that constantly polls the IPMI driver while an IPMI operation
+is in progress.  The force_kipmid module parameter will allow the user to
+force this thread on or off.  If you force it off and don't have
+interrupts, the driver will run VERY slowly.  Don't blame me,
 these interfaces suck.
 The driver supports a hot add and remove of interfaces.  This way,
 interfaces can be added or removed after the kernel is up and running.
-This is done using /sys/modules/ipmi_si/hotmod, which is a write-only
+This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
-parameter.  You write a string to this interface.  The string has the
+write-only parameter.  You write a string to this interface.  The string
-format:
+has the format:
   <op1>[:op2[:op3...]]
 The "op"s are:
   add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
@@ -581,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it
 gets a pre-action.  During a panic or a reboot, the watchdog will
 start a 120 timer if it is running to make sure the reboot occurs.
-Note that if you use the NMI preaction for the watchdog, you MUST
+Note that if you use the NMI preaction for the watchdog, you MUST NOT
-NOT use nmi watchdog mode 1.  If you use the NMI watchdog, you
+use the nmi watchdog.  There is no reasonable way to tell if an NMI
-must use mode 2.
+comes from the IPMI controller, so it must assume that if it gets an
+otherwise unhandled NMI, it must be from IPMI and it will panic
+immediately.
 Once you open the watchdog timer, you must write a 'V' character to the
 device to close it, or the timer will not stop.  This is a new semantic
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 000000000000..c2321903aa09
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,115 @@
+Linux IOMMU Support
+===================
+The architecture spec can be obtained from the below location.
+http://www.intel.com/technology/virtualization/
+This guide gives a quick cheat sheet for some basic understanding.
+Some Keywords
+DMAR - DMA remapping
+DRHD - DMA Engine Reporting Structure
+RMRR - Reserved memory Region Reporting Structure
+ZLR  - Zero length reads from PCI devices
+IOVA - IO Virtual address.
+Basic stuff
+-----------
+ACPI enumerates and lists the different DMA engines in the platform, and
+device scope relationships between PCI devices and which DMA engine  controls
+them.
+What is RMRR?
+-------------
+There are some devices the BIOS controls, e.g. USB devices to perform
+PS2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence BIOS uses RMRR to specify these regions along with
+devices that need to access these regions. OS is expected to setup
+unity mappings for these regions for these devices to access these regions.
+How is IOVA generated?
+---------------------
+Well behaved drivers call pci_map_*() before sending a command to a device
+that needs to perform DMA. Once DMA is completed and the mapping is no longer
+required, the driver calls pci_unmap_*() to unmap the region.
+The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
+device has its own domain (hence protection). Devices under p2p bridges
+share the virtual address with all devices under the p2p bridge due to
+transaction id aliasing for p2p bridges.
+IOVA generation is pretty generic. We used the same technique as vmalloc()
+but these are not global address spaces, but separate for each domain.
+Different DMA engines may support different number of domains.
+We also allocate guard pages with each mapping, so we can attempt to catch
+any overflow that might happen.
+Graphics Problems?
+------------------
+If you encounter issues with graphics devices, you can try adding
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+If it happens to be a PCI device included in the INCLUDE_ALL Engine,
+then try enabling CONFIG_DMAR_GFX_WA to setup a 1-1 map. We hear
+graphics drivers may be in process of using DMA api's in the near
+future and at that time this option can be yanked out.
+Some exceptions to IOVA
+-----------------------
+Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
+The same is true for peer to peer transactions. Hence we reserve the
+address from PCI MMIO ranges so they are not allocated for IOVA addresses.
+Fault reporting
+---------------
+When errors are reported, the DMA engine signals via an interrupt. The fault
+reason and the device that caused the fault are printed on the console.
+See below for sample.
+Boot Message Sample
+-------------------
+Something like this gets printed indicating presence of DMAR tables
+in ACPI.
+ACPI: DMAR (v001 A M I  OEMDMAR  0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+When DMAR is being processed and initialized by ACPI, prints DMAR locations
+and any RMRR's processed.
+ACPI DMAR:Host address width 36
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+When DMAR is enabled for use, you will notice..
+PCI-DMA: Using DMAR IOMMU
+Fault reporting
+---------------
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+TBD
+----
+- For compatibility testing, could use unity map domain for all devices, just
+  provide a 1-1 for all useful memory under a single domain for all devices.
+- API for paravirt ops for abstracting functionality for VMM folks.
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index 19e7f65c269f..34e06d2f194f 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -67,7 +67,7 @@ kernel patches.
 20: Check that it all passes `make headers_check'.
 21: Has been checked with injection of at least slab and page-allocation
-    fauilures.  See Documentation/fault-injection/.
+    failures.  See Documentation/fault-injection/.
    If the new code is substantial, addition of subsystem-specific fault
    injection might be appropriate.
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index d7e26427e426..24f2eb40cae5 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -36,8 +36,7 @@ Linux 2.4:
        If the code area has a general maintainer then please submit it to
        the maintainer listed in MAINTAINERS in the kernel file. If the
        maintainer does not respond or you cannot find the appropriate
-        maintainer then please contact Marcelo Tosatti
+        maintainer then please contact Willy Tarreau <w@1wt.eu>.
-        <marcelo.tosatti@cyclades.com>.
 Linux 2.6:
        The same rules apply as 2.4 except that you should follow linux-kernel
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
new file mode 100644
index 000000000000..eda40fd39cad
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.txt
@@ -0,0 +1,27 @@
+Control Groupstats is inspired by the discussion at
+http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
+suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
+Per cgroup statistics infrastructure re-uses code from the taskstats
+interface. A new set of cgroup operations are registered with commands
+and attributes specific to cgroups. It should be very easy to
+extend per cgroup statistics, by adding members to the cgroupstats
+structure.
+The current model for cgroupstats is a pull model; a push model (to post
+statistics on interesting events) should be very easy to add. Currently
+user space requests statistics by passing the cgroup path.
+Statistics about the state of all the tasks in the cgroup are returned to
+user space.
+NOTE: We currently rely on delay accounting for extracting information
+about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
+information will not be available.
+To extract cgroup statistics, a utility very similar to getdelays.c
+has been developed; sample output of the utility is shown below:
+~/balbir/cgroupstats # ./getdelays  -C "/cgroup/a"
+sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
+~/balbir/cgroupstats # ./getdelays  -C "/cgroup"
+sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/arm/Samsung-S3C24XX/DMA.txt b/Documentation/arm/Samsung-S3C24XX/DMA.txt
index 37f4edcc5d87..3ed82383efea 100644
--- a/Documentation/arm/Samsung-S3C24XX/DMA.txt
+++ b/Documentation/arm/Samsung-S3C24XX/DMA.txt
@@ -5,7 +5,7 @@ Introduction
 ------------
   The kernel provides an interface to manage DMA transfers
-   using the DMA channels in the cpu, so that the central
+   using the DMA channels in the CPU, so that the central
   duty of managing channel mappings, and programming the
   channel generators is in one place.
@@ -17,24 +17,24 @@ DMA Channel Ordering
   channels to all sources, which means that some devices
   have a restricted number of channels that can be used.
-   To allow flexibilty for each cpu type and board, the
+   To allow flexibility for each CPU type and board, the
-   dma code can be given an dma ordering structure which
+   DMA code can be given a DMA ordering structure which
   allows the order of channel search to be specified, as
   well as allowing the prohibition of certain claims.
   struct s3c24xx_dma_order has a list of channels, and
-   each channel within has a slot for a list of dma
+   each channel within has a slot for a list of DMA
-   channel numbers. The slots are searched in order, for
+   channel numbers. The slots are searched in order for
-   the presence of a dma channel number with DMA_CH_VALID
+   the presence of a DMA channel number with DMA_CH_VALID
-   orred in.
+   or-ed in.
   If the order has the flag DMA_CH_NEVER set, then after
   checking the channel list, the system will return no
   found channel, thus denying the request.
   A board support file can call s3c24xx_dma_order_set()
-   to register an complete ordering set. The routine will
+   to register a complete ordering set. The routine will
-   copy the data, so the original can be discared with
+   copy the data, so the original can be discarded with
   __initdata.
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index d46306fea230..f20c10c2858f 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -418,6 +418,20 @@ brothers:
         */
         smp_mb__after_clear_bit();
+There are two special bitops with lock barrier semantics (acquire/release,
+same as spinlocks). These operate in the same way as their non-_lock/unlock
+postfixed variants, except that they are to provide acquire/release semantics,
+respectively. This means they can be used for bit_spin_trylock and
+bit_spin_unlock type operations without specifying any more barriers.
+        int test_and_set_bit_lock(unsigned long nr, unsigned long *addr);
+        void clear_bit_unlock(unsigned long nr, unsigned long *addr);
+        void __clear_bit_unlock(unsigned long nr, unsigned long *addr);
+The __clear_bit_unlock version is non-atomic, however it still implements
+unlock barrier semantics. This can be useful if the lock itself is protecting
+the other bits in the word.
 Finally, there are non-atomic versions of the bitmask operations
 provided.  They are used in contexts where some other higher-level SMP
 locking scheme is being used to protect the bitmask, and thus less
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 552cabac0608..da42ab414c48 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -87,30 +87,7 @@ changes occur:
        This is used primarily during fault processing.
-5) void flush_tlb_pgtables(struct mm_struct *mm,
+5) void update_mmu_cache(struct vm_area_struct *vma,
-                           unsigned long start, unsigned long end)
-   The software page tables for address space 'mm' for virtual
-   addresses in the range 'start' to 'end-1' are being torn down.
-   Some platforms cache the lowest level of the software page tables
-   in a linear virtually mapped array, to make TLB miss processing
-   more efficient.  On such platforms, since the TLB is caching the
-   software page table structure, it needs to be flushed when parts
-   of the software page table tree are unlinked/freed.
-   Sparc64 is one example of a platform which does this.
-   Usually, when munmap()'ing an area of user virtual address
-   space, the kernel leaves the page table parts around and just
-   marks the individual pte's as invalid.  However, if very large
-   portions of the address space are unmapped, the kernel frees up
-   those portions of the software page tables to prevent potential
-   excessive kernel memory usage caused by erratic mmap/mmunmap
-   sequences.  It is at these times that flush_tlb_pgtables will
-   be invoked.
-6) void update_mmu_cache(struct vm_area_struct *vma,
                         unsigned long address, pte_t pte)
        At the end of every page fault, this routine is invoked to
@@ -123,7 +100,7 @@ changes occur:
        translations for software managed TLB configurations.
        The sparc64 port currently does this.
-7) void tlb_migrate_finish(struct mm_struct *mm)
+6) void tlb_migrate_finish(struct mm_struct *mm)
        This interface is called at the end of an explicit
        process migration. This interface provides a hook
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex
index 92f94e597582..c713aeb020c4 100644
--- a/Documentation/cdrom/cdrom-standard.tex
+++ b/Documentation/cdrom/cdrom-standard.tex
@@ -1009,7 +1009,7 @@ taken over the torch in maintaining \cdromc\ and integrating much
 \cdrom-related code in the 2.1-kernel.  Thanks to Scott Snyder and
 Gerd Knorr, who were the first to implement this interface for SCSI
 and IDE-CD drivers and added many ideas for extension of the data
-structures relative to kernel~2.0.  Further thanks to Heiko Eissfeldt,
+structures relative to kernel~2.0.  Further thanks to Heiko Ei{\sz}feldt,
 Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
 Kroll, the \linux\ \cdrom\ device driver developers who were kind
 enough to give suggestions and criticisms during the writing. Finally
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
new file mode 100644
index 000000000000..98a26f81fa75
--- /dev/null
+++ b/Documentation/cgroups.txt
@@ -0,0 +1,545 @@
+                                CGROUPS
+                                -------
+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
+Original copyright statements from cpusets.txt:
+Portions Copyright (C) 2004 BULL SA.
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+CONTENTS:
+=========
+1. Control Groups
+  1.1 What are cgroups ?
+  1.2 Why are cgroups needed ?
+  1.3 How are cgroups implemented ?
+  1.4 What does notify_on_release do ?
+  1.5 How do I use cgroups ?
+2. Usage Examples and Syntax
+  2.1 Basic Usage
+  2.2 Attaching processes
+3. Kernel API
+  3.1 Overview
+  3.2 Synchronization
+  3.3 Subsystem API
+4. Questions
+1. Control Groups
+==========
+1.1 What are cgroups ?
+----------------------
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+Definitions:
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy.  Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+User level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task pids assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cpusets.txt) allows
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+1.2 Why are cgroups needed ?
+----------------------------
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+in the same group (cgroup) as their parent process.
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+       CPU :           Top cpuset
+                       /       \
+               CPUSet1         CPUSet2
+                  |              |
+               (Profs)         (Students)
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+       Memory : Professors (50%), students (30%), system (20%)
+       Disk : Prof (50%), students (30%), system (20%)
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                               / \
+                       Prof (15%) students (5%)
+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
+into NFS network class.
+At the same time firefox/lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies) then
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can
+       # echo browser_pid > /mnt/<restype>/<userclass>/tasks
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+the appropriate network and other resource classes.  This may lead to
+proliferation of such cgroups.
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :)  OR give one of the students' simulation
+apps enhanced CPU power.
+With the ability to write pids directly to resource classes, it's just a
+matter of:
+       # echo pid > /mnt/network/<new_class>/tasks
+       (after some time)
+       # echo pid > /mnt/network/<orig_class>/tasks
+Without this ability, he would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+1.3 How are cgroups implemented ?
+---------------------------------
+Control Groups extends the kernel as follows:
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
+ - A cgroup hierarchy filesystem can be mounted  for browsing and
+   manipulation from user space.
+ - You can list all the tasks (by pid) attached to any cgroup.
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+ - in fork and exit, to attach and detach a task from its css_set.
+In addition a new file system, of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel.  When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options.  By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+ - tasks: list of tasks (by pid) attached to that cgroup
+ - notify_on_release flag: run /sbin/cgroup_release_agent on exit?
+Other subsystems such as cpusets may add additional files in each
+cgroup dir
+New cgroups are created using the mkdir system call or shell
+command.  The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroup's
+directory, as listed above.
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks.  A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, else a new
+css_set is allocated. Note that the current implementation uses a
+linear search to locate an appropriate existing css_set, so isn't
+very efficient. A future version will use a hash table for better
+performance.
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cont_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+1.4 What does notify_on_release do ?
+------------------------------------
+*** notify_on_release is disabled in the current patch set. It will be
+*** reactivated in a future patch in a less-intrusive manner
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup.  This enables automatic
+removal of abandoned cgroups.  The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0).  The default value of other cgroups at creation is the current
+value of their parent's notify_on_release setting. The default value of
+a cgroup hierarchy's release_agent path is empty.
+1.5 How do I use cgroups ?
+--------------------------
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like:
+ 1) mkdir /dev/cgroup
+ 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
+ 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
+    the /dev/cgroup virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cgroup by writing its pid to the
+    /dev/cgroup tasks file for that cgroup.
+ 6) fork, exec or clone the job tasks from this founding father task.
+For example, the following sequence of commands will setup a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup:
+  mount -t cgroup cpuset -ocpuset /dev/cgroup
+  cd /dev/cgroup
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpus
+  /bin/echo 1 > mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cgroup Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cgroup
+2. Usage Examples and Syntax
+============================
+2.1 Basic Usage
+---------------
+Creating, modifying, using the cgroups can be done through the cgroup
+virtual filesystem.
+To mount a cgroup hierarchy with all available subsystems, type:
+# mount -t cgroup xxx /dev/cgroup
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+To mount a cgroup hierarchy with just the cpuset and numtasks
+subsystems, type:
+# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
+To change the set of subsystems bound to a mounted hierarchy, just
+remount with different options:
+# mount -o remount,cpuset,ns  /dev/cgroup
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+Then under /dev/cgroup you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /dev/cgroup
+is the cgroup that holds the whole system.
+If you want to create a new cgroup under /dev/cgroup:
+# cd /dev/cgroup
+# mkdir my_cgroup
+Now you want to do something with this cgroup.
+# cd my_cgroup
+In this directory you can find several files:
+# ls
+notify_on_release release_agent tasks
+(plus whatever files are added by the attached subsystems)
+Now attach your shell to this cgroup:
+# /bin/echo $$ > tasks
+You can also create cgroups inside your cgroup by using mkdir in this
+directory.
+# mkdir my_sub_cs
+To remove a cgroup, just use rmdir:
+# rmdir my_sub_cs
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+2.2 Attaching processes
+-----------------------
+# /bin/echo PID > tasks
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+        ...
+# /bin/echo PIDn > tasks
+3. Kernel API
+=============
+3.1 Overview
+------------
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem id which will be assigned by the cgroup system.
+Other fields in the cgroup_subsys object include:
+- subsys_id: a unique array index for the subsystem, indicating which
+  entry in cgroup->subsys[] this subsystem should be
+  managing. Initialized by cgroup_register_subsys(); prior to this
+  it should be initialized to -1
+- hierarchy: an index indicating which hierarchy, if any, this
+  subsystem is currently attached to. If this is -1, then the
+  subsystem is not attached to any hierarchy, and all tasks should be
+  considered to be members of the subsystem's top_cgroup. It should
+  be initialized to -1.
+- name: should be initialized to a unique subsystem name prior to
+  calling cgroup_register_subsystem. Should be no longer than
+  MAX_CGROUP_TYPE_NAMELEN
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem id; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+3.2 Synchronization
+-------------------
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+See kernel/cgroup.c for more details.
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock(), and can
+take/release the callback_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+3.3 Subsystem API
+-----------------
+Each subsystem should:
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_subsys
+Each subsystem may export the following methods. The only mandatory
+methods are create/destroy. Any others that are null are presumed to
+be successful no-ops.
+struct cgroup_subsys_state *create(struct cgroup *cont)
+LL=cgroup_mutex
+Called to create a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+negative error code. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+void destroy(struct cgroup *cont)
+LL=cgroup_mutex
+The cgroup system is about to destroy the passed cgroup; the
+subsystem should do any necessary cleanup
+int can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+               struct task_struct *task)
+LL=cgroup_mutex
+Called prior to moving a task into a cgroup; if the subsystem
+returns an error, this will abort the attach operation.  If a NULL
+task is passed, then a successful result indicates that *any*
+unspecified task can be moved into the cgroup. Note that this isn't
+called on a fork. If this method returns 0 (success) then this should
+remain valid while the caller holds cgroup_mutex.
+void attach(struct cgroup_subsys *ss, struct cgroup *cont,
+            struct cgroup *old_cont, struct task_struct *task)
+LL=cgroup_mutex
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+void fork(struct cgroup_subsys *ss, struct task_struct *task)
+LL=callback_mutex, maybe read_lock(tasklist_lock)
+Called when a task is forked into a cgroup. Also called during
+registration for all existing tasks.
+void exit(struct cgroup_subsys *ss, struct task_struct *task)
+LL=callback_mutex
+Called during task exit
+int populate(struct cgroup_subsys *ss, struct cgroup *cont)
+LL=none
+Called after creation of a cgroup to allow a subsystem to populate
+the cgroup directory with file entries.  The subsystem should make
+calls to cgroup_add_file() with objects of type cftype (see
+include/linux/cgroup.h for details).  Note that although this
+method can return an error code, the error code is currently not
+always handled well.
+void post_clone(struct cgroup_subsys *ss, struct cgroup *cont)
+Called at the end of cgroup_clone() to do any parameter
+initialization which might be required before a task could attach.  For
+example in cpusets, no task may attach before 'cpus' and 'mems' are set
+up.
+void bind(struct cgroup_subsys *ss, struct cgroup *root)
+LL=callback_mutex
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+4. Questions
+============
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cgroup file system, you won't be
+   able to tell whether a command succeeded or failed.
+Q: When I attach processes, only the first of the line gets really attached !
+A: We can only return one error code per call to write(). So you should also
+   put only ONE pid.
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index b6d24c22274b..a741f658a3c9 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-)
  CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
  CPU is being offlined while tasks are frozen due to a suspend operation in
  progress
- All process is migrated away from this outgoing CPU to a new CPU
+- All processes are migrated away from this outgoing CPU to new CPUs.
+  The new CPU is chosen from each process' current cpuset, which may be
+  a subset of all online CPUs.
 - All interrupts targeted to this CPU is migrated to a new CPU
 - timers/bottom half/task lets are also migrated to a new CPU
 - Once all services are migrated, kernel calls an arch specific routine
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ec9de6917f01..141bef1c8599 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net
 Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
 Modified by Paul Jackson <pj@sgi.com>
 Modified by Christoph Lameter <clameter@sgi.com>
+Modified by Paul Menage <menage@google.com>
 CONTENTS:
 =========
@@ -16,9 +17,9 @@ CONTENTS:
  1.2 Why are cpusets needed ?
  1.3 How are cpusets implemented ?
  1.4 What are exclusive cpusets ?
-  1.5 What does notify_on_release do ?
+  1.5 What is memory_pressure ?
-  1.6 What is memory_pressure ?
+  1.6 What is memory spread ?
-  1.7 What is memory spread ?
+  1.7 What is sched_load_balance ?
  1.8 How do I use cpusets ?
 2. Usage Examples and Syntax
  2.1 Basic Usage
@@ -44,18 +45,19 @@ hierarchy visible in a virtual file system.  These are the essential
 hooks, beyond what is already present, required to manage dynamic
 job placement on large systems.
-Each task has a pointer to a cpuset.  Multiple tasks may reference
+Cpusets use the generic cgroup subsystem described in
-the same cpuset.  Requests by a task, using the sched_setaffinity(2)
+Documentation/cgroup.txt.
-system call to include CPUs in its CPU affinity mask, and using the
-mbind(2) and set_mempolicy(2) system calls to include Memory Nodes
+Requests by a task, using the sched_setaffinity(2) system call to
-in its memory policy, are both filtered through that tasks cpuset,
+include CPUs in its CPU affinity mask, and using the mbind(2) and
-filtering out any CPUs or Memory Nodes not in that cpuset.  The
+set_mempolicy(2) system calls to include Memory Nodes in its memory
-scheduler will not schedule a task on a CPU that is not allowed in
+policy, are both filtered through that tasks cpuset, filtering out any
-its cpus_allowed vector, and the kernel page allocator will not
+CPUs or Memory Nodes not in that cpuset.  The scheduler will not
-allocate a page on a node that is not allowed in the requesting tasks
+schedule a task on a CPU that is not allowed in its cpus_allowed
-mems_allowed vector.
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting tasks mems_allowed vector.
-User level code may create and destroy cpusets by name in the cpuset
+User level code may create and destroy cpusets by name in the cgroup
 virtual file system, manage the attributes and permissions of these
 cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
 specify and query to which cpuset a task is assigned, and list the
@@ -115,7 +117,7 @@ Cpusets extends these two mechanisms as follows:
 - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
   kernel.
 - Each task in the system is attached to a cpuset, via a pointer
-   in the task structure to a reference counted cpuset structure.
+   in the task structure to a reference counted cgroup structure.
 - Calls to sched_setaffinity are filtered to just those CPUs
   allowed in that tasks cpuset.
 - Calls to mbind and set_mempolicy are filtered to just
@@ -145,15 +147,10 @@ into the rest of the kernel, none in performance critical paths:
 - in page_alloc.c, to restrict memory to allowed nodes.
 - in vmscan.c, to restrict page recovery to the current cpuset.
-In addition a new file system, of type "cpuset" may be mounted,
+You should mount the "cgroup" filesystem type in order to enable
-typically at /dev/cpuset, to enable browsing and modifying the cpusets
+browsing and modifying the cpusets presently known to the kernel.  No
-presently known to the kernel.  No new system calls are added for
+new system calls are added for cpusets - all support for querying and
-cpusets - all support for querying and modifying cpusets is via
+modifying cpusets is via this cpuset file system.
-this cpuset file system.
-Each task under /proc has an added file named 'cpuset', displaying
-the cpuset name, as the path relative to the root of the cpuset file
-system.
 The /proc/<pid>/status file for each task has two added lines,
 displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
@@ -163,16 +160,15 @@ in the format seen in the following example:
  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
  Mems_allowed:   ffffffff,ffffffff
-Each cpuset is represented by a directory in the cpuset file system
+Each cpuset is represented by a directory in the cgroup file system
-containing the following files describing that cpuset:
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
 - cpus: list of CPUs in that cpuset
 - mems: list of Memory Nodes in that cpuset
 - memory_migrate flag: if set, move pages to cpusets nodes
 - cpu_exclusive flag: is cpu placement exclusive?
 - mem_exclusive flag: is memory placement exclusive?
- - tasks: list of tasks (by pid) attached to that cpuset
- - notify_on_release flag: run /sbin/cpuset_release_agent on exit?
 - memory_pressure: measure of how much paging pressure in cpuset
 In addition, the root cpuset only has the following file:
@@ -237,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken
 outside even a mem_exclusive cpuset.
-1.5 What does notify_on_release do ?
+1.5 What is memory_pressure ?
------------------------------------
-If the notify_on_release flag is enabled (1) in a cpuset, then whenever
-the last task in the cpuset leaves (exits or attaches to some other
-cpuset) and the last child cpuset of that cpuset is removed, then
-the kernel runs the command /sbin/cpuset_release_agent, supplying the
-pathname (relative to the mount point of the cpuset file system) of the
-abandoned cpuset.  This enables automatic removal of abandoned cpusets.
-The default value of notify_on_release in the root cpuset at system
-boot is disabled (0).  The default value of other cpusets at creation
-is the current value of their parents notify_on_release setting.
-1.6 What is memory_pressure ?
 -----------------------------
 The memory_pressure of a cpuset provides a simple per-cpuset metric
 of the rate that the tasks in a cpuset are attempting to free up in
@@ -308,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second,
 times 1000.
-1.7 What is memory spread ?
+1.6 What is memory spread ?
 ---------------------------
 There are two boolean flag files per cpuset that control where the
 kernel allocates pages for the file system buffers and related in
@@ -378,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the
 data set, the memory allocation across the nodes in the jobs cpuset
 can become very uneven.
+1.7 What is sched_load_balance ?
+--------------------------------
+The kernel scheduler (kernel/sched.c) automatically load balances
+tasks.  If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced.  So the scheduler
+has support to partition the system's CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+By default, there is one sched domain covering all CPUs, except those
+marked isolated using the kernel boot time "isolcpus=" argument.
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+ 1) On large systems, load balancing across many CPUs is expensive.
+    If the system is managed using cpusets to place independent jobs
+    on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+    system overhead on those CPUs, including avoiding task load
+    balancing if that is not needed.
+When the per-cpuset flag "sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwise pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+When the per-cpuset flag "sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+So, for example, if the top cpuset has the flag "sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+Therefore in the above two situations, the top cpuset flag
+"sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendent cpusets.  Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest.  Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding.  So if each of two partially
+overlapping cpusets enables the flag 'sched_load_balance', then we
+form a single sched domain that is a superset of both.  We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "sched_load_balance" enabled,
+and the sched domain configuration.  If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+If two cpusets have partially overlapping 'cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above.  In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.)  When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+If two overlapping cpusets both have 'sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+If, as is the default, the top cpuset has 'sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+The kernel commits to user space that it will avoid load balancing
+where it can.  It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all
+the CPUs that must be load balanced.
+Whenever the 'sched_load_balance' flag changes, or CPUs come or go
+from a cpuset with this flag enabled, or a cpuset with this flag
+enabled is removed, the cpuset code builds a new such partition and
+passes it to the scheduler sched domain setup code, to have the sched
+domains rebuilt as necessary.
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (cpumask_t) in the partition.
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
 1.8 How do I use cpusets ?
 --------------------------
@@ -469,7 +587,7 @@ than stress the kernel.
 To start a new job that is to be contained within a cpuset, the steps are:
 1) mkdir /dev/cpuset
- 2) mount -t cpuset none /dev/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /dev/cpuset
 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
    the /dev/cpuset virtual file system.
 4) Start a task that will be the "founding father" of the new job.
@@ -481,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset
 named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
 and then start a subshell 'sh' in that cpuset:
-  mount -t cpuset none /dev/cpuset
+  mount -t cgroup -ocpuset cpuset /dev/cpuset
  cd /dev/cpuset
  mkdir Charlie
  cd Charlie
@@ -513,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset
 virtual filesystem.
 To mount it, type:
-# mount -t cpuset none /dev/cpuset
+# mount -t cgroup -o cpuset cpuset /dev/cpuset
 Then under /dev/cpuset you can find a tree that corresponds to the
 tree of the cpusets in the system. For instance, /dev/cpuset
@@ -556,6 +674,18 @@ To remove a cpuset, just use rmdir:
 This will fail if the cpuset is in use (has cpusets inside, or has
 processes attached).
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+The command
+mount -t cpuset X /dev/cpuset
+is equivalent to
+mount -t cgroup -ocpuset X /dev/cpuset
+echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
 2.2 Adding/removing cpus
 ------------------------
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt
new file mode 100644
index 000000000000..07edbd85c714
--- /dev/null
+++ b/Documentation/device-mapper/dm-uevent.txt
@@ -0,0 +1,97 @@
+The device-mapper uevent code adds the capability to device-mapper to create
+and send kobject uevents (uevents).  Previously device-mapper events were only
+available through the ioctl interface.  The advantage of the uevents interface
+is the event contains environment attributes providing increased context for
+the event avoiding the need to query the state of the device-mapper device after
+the event is received.
+There are two functions currently for device-mapper events.  The first function
+listed creates the event and the second function sends the event(s).
+void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
+                    const char *path, unsigned nr_valid_paths)
+void dm_send_uevents(struct list_head *events, struct kobject *kobj)
+The variables added to the uevent environment are:
+Variable Name: DM_TARGET
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Name of device-mapper target that generated the event.
+Variable Name: DM_ACTION
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Device-mapper specific action that caused the uevent action.
+        PATH_FAILED - A path has failed.
+        PATH_REINSTATED - A path has been reinstated.
+Variable Name: DM_SEQNUM
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description: A sequence number for this specific device-mapper device.
+Value: Valid unsigned integer range.
+Variable Name: DM_PATH
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Major and minor number of the path device pertaining to this
+event.
+Value: Path name in the form of "Major:Minor"
+Variable Name: DM_NR_VALID_PATHS
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description:
+Value: Valid unsigned integer range.
+Variable Name: DM_NAME
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Name of the device-mapper device.
+Value: Name
+Variable Name: DM_UUID
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: UUID of the device-mapper device.
+Value: UUID. (Empty string if there isn't one.)
+An example of the uevents generated as captured by udevmonitor is shown
+below.
+1.) Path failure.
+UEVENT[1192521009.711215] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_FAILED
+DM_SEQNUM=1
+DM_PATH=8:32
+DM_NR_VALID_PATHS=0
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1130
+2.) Path reinstate.
+UEVENT[1192521132.989927] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_REINSTATED
+DM_SEQNUM=2
+DM_PATH=8:32
+DM_NR_VALID_PATHS=1
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1131
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 6c46730c631a..e6244cde26e9 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -2188,7 +2188,7 @@ Your cooperation is appreciated.
 136-143 char    Unix98 PTY slaves
                  0 = /dev/pts/0        First Unix98 pseudo-TTY
-                  1 = /dev/pts/1        Second Unix98 pesudo-TTY
+                  1 = /dev/pts/1        Second Unix98 pseudo-TTY
                    ...
                These device nodes are automatically generated with
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 8569072fa387..387b8a720f4a 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -32,7 +32,7 @@ braindamaged document, if it's finally working, well, it's working.
 For one reason or another, low level drivers don't receive as much
 attention or testing as core code, and bugs on driver detach or
-initilaization failure doesn't happen often enough to be noticeable.
+initialization failure don't happen often enough to be noticeable.
 Init failure path is worse because it's much less travelled while
 needs to handle multiple entry points.
@@ -160,7 +160,7 @@ resources on failure.  For example,
  devres_release_group(dev, NULL);
  return err_code;
-As resource acquision failure usually means probe failure, constructs
+As resource acquisition failure usually means probe failure, constructs
 like above are usually useful in midlayer driver (e.g. libata core
 layer) where interface function shouldn't have side effect on failure.
 For LLDs, just returning error code suffices in most cases.
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
index 73cf9fb7cf60..63883a892120 100644
--- a/Documentation/fb/deferred_io.txt
+++ b/Documentation/fb/deferred_io.txt
@@ -3,7 +3,7 @@ Deferred IO
 Deferred IO is a way to delay and repurpose IO. It uses host memory as a
 buffer and the MMU pagefault as a pretrigger for when to perform the device
-IO. The following example may be a useful explaination of how one such setup
+IO. The following example may be a useful explanation of how one such setup
 works:
 - userspace app like Xfbdev mmaps framebuffer
@@ -28,7 +28,7 @@ a relatively more expensive operation.
 For some types of nonvolatile high latency displays, the desired image is
 the final image rather than the intermediate stages which is why it's okay
-to not update for each write that is occuring.
+to not update for each write that is occurring.
 It may be the case that this is useful in other scenarios as well. Paul Mundt
 has mentioned a case where it is beneficial to use the page count to decide
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 280ec06573e6..6bb9be54ab76 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -14,18 +14,6 @@ Who:	Jiri Slaby <jirislaby@gmail.com>
 ---------------------------
-What:   V4L2 VIDIOC_G_MPEGCOMP and VIDIOC_S_MPEGCOMP
-When:   October 2007
-Why:    Broken attempt to set MPEG compression parameters. These ioctls are
-        not able to implement the wide variety of parameters that can be set
-        by hardware MPEG encoders. A new MPEG control mechanism was created
-        in kernel 2.6.18 that replaces these ioctls. See the V4L2 specification
-        (section 1.9: Extended controls) for more information on this topic.
-Who:    Hans Verkuil <hverkuil@xs4all.nl> and
-        Mauro Carvalho Chehab <mchehab@infradead.org>
---------------------------
 What:   dev->power.power_state
 When:   July 2007
 Why:    Broken design for runtime control over driver power states, confusing
@@ -49,10 +37,10 @@ Who:	David Miller <davem@davemloft.net>
 ---------------------------
 What:   Video4Linux API 1 ioctls and video_decoder.h from Video devices.
-When:   December 2006
+When:   December 2008
-Files:  include/linux/video_decoder.h
+Files:  include/linux/video_decoder.h include/linux/videodev.h
-Check:  include/linux/video_decoder.h
+Check:  include/linux/video_decoder.h include/linux/videodev.h
-Why:    V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6
+Why:    V4L1 API was replaced by V4L2 API during migration from 2.4 to 2.6
        series. The old API have lots of drawbacks and don't provide enough
        means to work with all video and audio standards. The newer API is
        already available on the main drivers and should be used instead.
@@ -61,7 +49,9 @@ Why:	V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6
        Decoder iocts are using internally to allow video drivers to
        communicate with video decoders. This should also be improved to allow
        V4L2 calls being translated into compatible internal ioctls.
-Who:    Mauro Carvalho Chehab <mchehab@brturbo.com.br>
+        Compatibility ioctls will be provided, for a while, via
+        v4l1-compat module.
+Who:    Mauro Carvalho Chehab <mchehab@infradead.org>
 ---------------------------
@@ -82,6 +72,41 @@ Who:	Dominik Brodowski <linux@brodo.de>
 ---------------------------
+What:   sys_sysctl
+When:   September 2010
+Option: CONFIG_SYSCTL_SYSCALL
+Why:    The same information is available in a more convenient form from
+        /proc/sys, and none of the sysctl variables appear to be
+        important performance wise.
+        Binary sysctls are a long standing source of subtle kernel
+        bugs and security issues.
+        When I looked several months ago all I could find after
+        searching several distributions were 5 user space programs and
+        glibc (which falls back to /proc/sys) using this syscall.
+        The man page for sysctl(2) documents it as unusable for user
+        space programs.
+        sysctl(2) is not generally ABI compatible to a 32bit user
+        space application on a 64bit and a 32bit kernel.
+        For the last several months the policy has been no new binary
+        sysctls and no one has put forward an argument to use them.
+        Binary sysctl issues seem to keep appearing so
+        properly deprecating them (with a warning to user space) and a
+        2 year grace warning period will mean eventually we can kill
+        them and end the pain.
+        In the mean time individual binary sysctls can be dealt with
+        in a piecewise fashion.
+Who:    Eric Biederman <ebiederm@xmission.com>
+---------------------------
 What:  a.out interpreter support for ELF executables
 When:  2.6.25
 Files: fs/binfmt_elf.c
@@ -184,13 +209,6 @@ Who:	Jean Delvare <khali@linux-fr.org>,
 ---------------------------
-What:  drivers depending on OBSOLETE_OSS
-When:  options in 2.6.22, code in 2.6.24
-Why:   OSS drivers with ALSA replacements
-Who:   Adrian Bunk <bunk@stusta.de>
---------------------------
 What:   ACPI procfs interface
 When:   July 2008
 Why:    ACPI sysfs conversion should be finished by January 2008.
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index d6fd6c6e4244..bf8080640eba 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -42,10 +42,12 @@ OPTIONS
  trans=name    select an alternative transport.  Valid options are
                currently:
-                        unix - specifying a named pipe mount point
+                        unix    - specifying a named pipe mount point
-                        tcp  - specifying a normal TCP/IP connection
+                        tcp     - specifying a normal TCP/IP connection
-                        fd   - used passed file descriptors for connection
+                        fd      - used passed file descriptors for connection
                                (see rfdno and wfdno)
+                        virtio  - connect to the next virtio channel available
+                                (from lguest or KVM with trans_virtio module)
  uname=name    user name to attempt mount as on the remote server.  The
                server may override or ignore this value.  Certain user
@@ -54,7 +56,7 @@ OPTIONS
  aname=name    aname specifies the file tree to access when the server is
                offering several exported file systems.
-  cache=mode    specifies a cacheing policy.  By default, no caches are used.
+  cache=mode    specifies a caching policy.  By default, no caches are used.
                        loose = no attempts are made at consistency,
                                intended for exclusive, read-only mounts
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting
index 31047e0fe14b..87019d2b5981 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/Exporting
@@ -2,9 +2,12 @@
 Making Filesystems Exportable
 =============================
-Most filesystem operations require a dentry (or two) as a starting
+Overview
+--------
+All filesystem operations require a dentry (or two) as a starting
 point.  Local applications have a reference-counted hold on suitable
-dentrys via open file descriptors or cwd/root.  However remote
+dentries via open file descriptors or cwd/root.  However remote
 applications that access a filesystem via a remote filesystem protocol
 such as NFS may not be able to hold such a reference, and so need a
 different way to refer to a particular dentry.  As the alternative
@@ -13,14 +16,14 @@ server-reboot (among other things, though these tend to be the most
 problematic), there is no simple answer like 'filename'.
 The mechanism discussed here allows each filesystem implementation to
-specify how to generate an opaque (out side of the filesystem) byte
+specify how to generate an opaque (outside of the filesystem) byte
 string for any dentry, and how to find an appropriate dentry for any
 given opaque byte string.
 This byte string will be called a "filehandle fragment" as it
 corresponds to part of an NFS filehandle.
 A filesystem which supports the mapping between filehandle fragments
-and dentrys will be termed "exportable".
+and dentries will be termed "exportable".
@@ -89,11 +92,9 @@ For a filesystem to be exportable it must:
   1/ provide the filehandle fragment routines described below.
   2/ make sure that d_splice_alias is used rather than d_add
      when ->lookup finds an inode for a given parent and name.
-      Typically the ->lookup routine will end:
+      Typically the ->lookup routine will end with a:
-                if (inode)
-                        return d_splice(inode, dentry);
+                return d_splice_alias(inode, dentry);
-                d_add(dentry, inode);
-                return NULL;
        }
@@ -101,67 +102,39 @@ For a filesystem to be exportable it must:
  A file system implementation declares that instances of the filesystem
 are exportable by setting the s_export_op field in the struct
 super_block.  This field must point to a "struct export_operations"
-struct which could potentially be full of NULLs, though normally at
+struct which has the following members:
-least get_parent will be set.
+ encode_fh  (optional)
- The primary operations are decode_fh and encode_fh.  
+    Takes a dentry and creates a filehandle fragment which can later be used
-decode_fh takes a filehandle fragment and tries to find or create a
+    to find or create a dentry for the same object.  The default
-dentry for the object referred to by the filehandle.
+    implementation creates a filehandle fragment that encodes a 32bit inode
-encode_fh takes a dentry and creates a filehandle fragment which can
+    and generation number for the inode encoded, and if necessary the
-later be used to find/create a dentry for the same object.
+    same information for the parent.
-decode_fh will probably make use of "find_exported_dentry".
+  fh_to_dentry (mandatory)
-This function lives in the "exportfs" module which a filesystem does
+    Given a filehandle fragment, this should find the implied object and
-not need unless it is being exported.  So rather that calling
+    create a dentry for it (possibly with d_alloc_anon).
-find_exported_dentry directly, each filesystem should call it through
-the find_exported_dentry pointer in it's export_operations table.
+  fh_to_parent (optional but strongly recommended)
-This field is set correctly by the exporting agent (e.g. nfsd) when a
+    Given a filehandle fragment, this should find the parent of the
-filesystem is exported, and before any export operations are called.
+    implied object and create a dentry for it (possibly with d_alloc_anon).
+    May fail if the filehandle fragment is too small.
-find_exported_dentry needs three support functions from the
-filesystem:
+  get_parent (optional but strongly recommended)
-  get_name.  When given a parent dentry and a child dentry, this
+    When given a dentry for a directory, this should return  a dentry for
-    should find a name in the directory identified by the parent
+    the parent.  Quite possibly the parent dentry will have been allocated
-    dentry, which leads to the object identified by the child dentry.
+    by d_alloc_anon.  The default get_parent function just returns an error
-    If no get_name function is supplied, a default implementation is
+    so any filehandle lookup that requires finding a parent will fail.
-    provided which uses vfs_readdir to find potential names, and
+    ->lookup("..") is *not* used as a default as it can leave ".." entries
-    matches inode numbers to find the correct match.
+    in the dcache which are too messy to work with.
-  get_parent.  When given a dentry for a directory, this should return 
+  get_name (optional)
-    a dentry for the parent.  Quite possibly the parent dentry will
+    When given a parent dentry and a child dentry, this should find a name
-    have been allocated by d_alloc_anon.  
+    in the directory identified by the parent dentry, which leads to the
-    The default get_parent function just returns an error so any
+    object identified by the child dentry.  If no get_name function is
-    filehandle lookup that requires finding a parent will fail.
+    supplied, a default implementation is provided which uses vfs_readdir
-    ->lookup("..") is *not* used as a default as it can leave ".."
+    to find potential names, and matches inode numbers to find the correct
-    entries in the dcache which are too messy to work with.
+    match.
-  get_dentry.  When given an opaque datum, this should find the
-    implied object and create a dentry for it (possibly with
-    d_alloc_anon). 
-    The opaque datum is whatever is passed down by the decode_fh
-    function, and is often simply a fragment of the filehandle
-    fragment.
-    decode_fh passes two datums through find_exported_dentry.  One that 
-    should be used to identify the target object, and one that can be
-    used to identify the object's parent, should that be necessary.
-    The default get_dentry function assumes that the datum contains an
-    inode number and a generation number, and it attempts to get the
-    inode using "iget" and check it's validity by matching the
-    generation number.  A filesystem should only depend on the default
-    if iget can safely be used this way.
-If decode_fh and/or encode_fh are left as NULL, then default
-implementations are used.  These defaults are suitable for ext2 and 
-extremely similar filesystems (like ext3).
-The default encode_fh creates a filehandle fragment from the inode
-number and generation number of the target together with the inode
-number and generation number of the parent (if the parent is
-required).
-The default decode_fh extract the target and parent datums from the
-filehandle assuming the format used by the default encode_fh and
-passed them to find_exported_dentry.
 A filehandle fragment consists of an array of 1 or more 4byte words,
@@ -172,5 +145,3 @@ generated by encode_fh, in which case it will have been padded with
 nuls.  Rather, the encode_fh routine should choose a "type" which
 indicates the decode_fh how much of the filehandle is valid, and how
 it should be interpreted.
- 
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index fe26cc978523..37c10cba7177 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -224,7 +224,7 @@ against the page the filesystem should redirty the page with
 redirty_page_for_writepage(), then unlock the page and return zero.
 This may also be done to avoid internal deadlocks, but rarely.
-If the filesytem is called for sync then it must wait on any
+If the filesystem is called for sync then it must wait on any
 in-progress I/O and then start new I/O.
 The filesystem should unlock the page synchronously, before returning to the
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 4aecc9bdb273..b45f3c1b8b43 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -130,12 +130,12 @@ Device layer.
 Journaling Block Device layer
 -----------------------------
-The Journaling Block Device layer (JBD) isn't ext3 specific.  It was design to
+The Journaling Block Device layer (JBD) isn't ext3 specific.  It was designed
-add journaling capabilities on a block device.  The ext3 filesystem code will
+to add journaling capabilities to a block device.  The ext3 filesystem code
-inform the JBD of modifications it is performing (called a transaction).  The
+will inform the JBD of modifications it is performing (called a transaction).
-journal supports the transactions start and stop, and in case of crash, the
+The journal supports the transactions start and stop, and in case of a crash,
-journal can replayed the transactions to put the partition back in a
+the journal can replay the transactions to quickly put the partition back into
-consistent state fast.
+a consistent state.
 Handles represent a single atomic update to a filesystem.  JBD can handle an
 external journal on a block device.
@@ -164,7 +164,7 @@ written to the journal first, and then to its final location.
 In the event of a crash, the journal can be replayed, bringing both data and
 metadata into a consistent state.  This mode is the slowest except when data
 needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all other modes.
 Compatibility
 -------------
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
index 133e213ebb72..bb0142f61084 100644
--- a/Documentation/filesystems/files.txt
+++ b/Documentation/filesystems/files.txt
@@ -76,13 +76,13 @@ the fdtable structure -
 5. Handling of the file structures is special. Since the look-up
   of the fd (fget()/fget_light()) are lock-free, it is possible
   that look-up may race with the last put() operation on the
-   file structure. This is avoided using the rcuref APIs
+   file structure. This is avoided using atomic_inc_not_zero()
   on ->f_count :
        rcu_read_lock();
        file = fcheck_files(files, fd);
        if (file) {
-                if (rcuref_inc_lf(&file->f_count))
+                if (atomic_inc_not_zero(&file->f_count))
                        *fput_needed = 1;
                else
                /* Didn't get the reference, someone's freed */
@@ -92,7 +92,7 @@ the fdtable structure -
        ....
        return file;
-   rcuref_inc_lf() detects if refcounts is already zero or
+   atomic_inc_not_zero() detects if the refcount is already zero or
   goes to zero during increment. If it does, we fail
   fget()/fget_light().
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index e5c1df52a876..dec99455321f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -813,9 +813,9 @@ Various pieces   of  information about  kernel activity  are  available in the
 since the system first booted.  For a quick look, simply cat the file:
  > cat /proc/stat
-  cpu  2255 34 2290 22625563 6290 127 456
+  cpu  2255 34 2290 22625563 6290 127 456 0
-  cpu0 1132 34 1441 11311718 3675 127 438
+  cpu0 1132 34 1441 11311718 3675 127 438 0
-  cpu1 1123 0 849 11313845 2614 0 18
+  cpu1 1123 0 849 11313845 2614 0 18 0
  intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...]
  ctxt 1990473
  btime 1062191376
@@ -835,6 +835,7 @@ second).  The meanings of the columns are as follows, from left to right:
 - iowait: waiting for I/O to complete
 - irq: servicing interrupts
 - softirq: servicing softirqs
+- steal: involuntary wait
 The "intr" line gives counts of interrupts  serviced since boot time, for each
 of the  possible system interrupts.   The first  column  is the  total of  all
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 4b5ca26e5048..4598ef7b622b 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -51,7 +51,7 @@ for the attributes, providing a means to read and write kernel
 attributes.
 Attributes should be ASCII text files, preferably with only one value
-per file. It is noted that it may not be efficient to contain only
+per file. It is noted that it may not be efficient to contain only one
 value per file, so it is socially acceptable to express an array of
 values of the same type. 
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 6f8e16e3d6c0..9d019d35728f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -706,7 +706,7 @@ struct address_space_operations {
        wants to make it a free page.  If ->releasepage succeeds, the
        page will be removed from the address_space and become free.
-        The second case if when a request has been made to invalidate
+        The second case is when a request has been made to invalidate
        some or all pages in an address_space.  This can happen
        through the fadvice(POSIX_FADV_DONTNEED) system call or by the
        filesystem explicitly requesting it as nfs and 9fs do (when
diff --git a/Documentation/i2c/i2c-protocol b/Documentation/i2c/i2c-protocol
index 579b92d5f3a3..10518dd58814 100644
--- a/Documentation/i2c/i2c-protocol
+++ b/Documentation/i2c/i2c-protocol
@@ -68,7 +68,7 @@ We have found some I2C devices that needs the following modifications:
  Flags I2C_M_IGNORE_NAK
    Normally message is interrupted immediately if there is [NA] from the
-    client. Setting this flag treats any [NA] as [A], and all of
+    client. Setting this flag treats any [NA] as [A], and all of
    message is sent.
    These messages may still fail to SCL lo->hi timeout.
diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt
index 35985b34d5a6..2f75e750e4f5 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/i386/boot.txt
@@ -168,6 +168,8 @@ Offset	Proto	Name		Meaning
 0234/1  2.05+   relocatable_kernel Whether kernel is relocatable or not
 0235/3  N/A     pad2            Unused
 0238/4  2.06+   cmdline_size    Maximum size of the kernel command line
+023C/4  2.07+   hardware_subarch Hardware subarchitecture
+0240/8  2.07+   hardware_subarch_data Subarchitecture-specific data
 (1) For backwards compatibility, if the setup_sects field contains 0, the
    real value is 4.
@@ -204,7 +206,7 @@ boot loaders can ignore those fields.
 The byte order of all fields is littleendian (this is x86, after all.)
-Field name:     setup_secs
+Field name:     setup_sects
 Type:           read
 Offset/size:    0x1f1/1
 Protocol:       ALL
@@ -356,6 +358,13 @@ Protocol:	2.00+
        - If 0, the protected-mode code is loaded at 0x10000.
        - If 1, the protected-mode code is loaded at 0x100000.
+  Bit 6 (write): KEEP_SEGMENTS
+        Protocol: 2.07+
+        - if 0, reload the segment registers in the 32bit entry point.
+        - if 1, do not reload the segment registers in the 32bit entry point.
+                Assume that %cs %ds %ss %es are all set to flat segments with
+                a base of 0 (or the equivalent for their environment).
  Bit 7 (write): CAN_USE_HEAP
        Set this bit to 1 to indicate that the value entered in the
        heap_end_ptr is valid.  If this field is clear, some setup code
@@ -480,6 +489,29 @@ Protocol:	2.06+
  cmdline_size characters. With protocol version 2.05 and earlier, the
  maximum size was 255.
+Field name:     hardware_subarch
+Type:           write
+Offset/size:    0x23c/4
+Protocol:       2.07+
+  In a paravirtualized environment the hardware low level architectural
+  pieces such as interrupt handling, page table handling, and
+  accessing process control registers need to be done differently.
+  This field allows the bootloader to inform the kernel we are in one
+  of those environments.
+  0x00000000    The default x86/PC environment
+  0x00000001    lguest
+  0x00000002    Xen
+Field name:     hardware_subarch_data
+Type:           write
+Offset/size:    0x240/8
+Protocol:       2.07+
+  A pointer to data that is specific to hardware subarch
 **** THE KERNEL COMMAND LINE
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
index 6449a7090dbb..223e4f0582d0 100644
--- a/Documentation/ia64/err_inject.txt
+++ b/Documentation/ia64/err_inject.txt
@@ -21,10 +21,10 @@ software test suits to do stressful testing on IPF.
 Below is a sample application as part of the whole tool. The sample
 can be used as a working test tool. Or it can be expanded to include
-more features. It also can be a integrated into a libary or other user
+more features. It also can be integrated into a library or other user
 application to have more thorough test.
-The sample application takes err.conf as error configuation input. Gcc
+The sample application takes err.conf as error configuration input. GCC
 compiles the code. After you install err_inject driver, you can run
 this sample application to inject errors.
@@ -809,7 +809,7 @@ int err_inj()
        }
        /* Create semaphore: If one_lock, one semaphore for all processors.
-           Otherwise, one sempaphore for each processor. */
+           Otherwise, one semaphore for each processor. */
        if (one_lock) {
                if (create_sem(0)) {
                        printf("Can not create semaphore...exit\n");
diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt
index ab050621e20f..f3a3ba8847ba 100644
--- a/Documentation/input/atarikbd.txt
+++ b/Documentation/input/atarikbd.txt
@@ -170,7 +170,7 @@ major controller faults (ROM checksum and RAM test) and such things as stuck
 keys. Any keys down at power-up are presumed to be stuck, and their BREAK
 (sic) code is returned (which without the preceding MAKE code is a flag for a
 keyboard error). If the controller self-test completes without error, the code
-0xF0 is returned. (This code will be used to indicate the version/rlease of
+0xF0 is returned. (This code will be used to indicate the version/release of
 the ikbd controller. The first release of the ikbd is version 0xF0, should
 there be a second release it will be 0xF1, and so on.)
 The ikbd defaults to a mouse position reporting with threshold of 1 unit in
@@ -413,7 +413,7 @@ INTERROGATION MODE.
            %nnnnmmmm   ; where m is JOYSTICK1 state
                        ; and n is JOYSTICK0 state
-Sets the ikbd to do nothing but monitor the serial command lne, maintain the
+Sets the ikbd to do nothing but monitor the serial command line, maintain the
 time-of-day clock, and monitor the joystick. The rate sets the interval
 between joystick samples.
 N.B. The user should not set the rate higher than the serial communications
@@ -446,10 +446,10 @@ The sample interval should be as constant as possible.
                        ; until vertical cursor key is generated before RY
                        ; has elapsed
    VX                  ; length (in tenths of seconds) of joystick closure
-                        ; until horizontal cursor keystokes are generated
+                        ; until horizontal cursor keystrokes are generated
                        ; after RX has elapsed
    VY                  ; length (in tenths of seconds) of joystick closure
-                        ; until vertical cursor keystokes are generated
+                        ; until vertical cursor keystrokes are generated
                        ; after RY has elapsed
 In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes.
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt
index 085eb15b45b7..ded4d5f53109 100644
--- a/Documentation/input/ff.txt
+++ b/Documentation/input/ff.txt
@@ -1,5 +1,5 @@
 Force feedback for Linux.
-By Johann Deneux <deneux@ifrance.com> on 2001/04/22.
+By Johann Deneux <johann.deneux@gmail.com> on 2001/04/22.
 Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09.
 You may redistribute this file. Please remember to include shape.fig and
 interactive.fig as well.
diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt
index 8777d2d321e3..3ac92413c874 100644
--- a/Documentation/input/iforce-protocol.txt
+++ b/Documentation/input/iforce-protocol.txt
@@ -4,10 +4,10 @@ specify force effects to I-Force 2.0 devices.  None of this information comes
 from Immerse. That's why you should not trust what is written in this
 document. This document is intended to help understanding the protocol.
 This is not a reference. Comments and corrections are welcome.  To contact me,
-send an email to: deneux@ifrance.com
+send an email to: johann.deneux@gmail.com
 ** WARNING **
-I may not be held responsible for any dammage or harm caused if you try to
+I shall not be held responsible for any damage or harm caused if you try to
 send data to your I-Force device based on what you read in this document.
 ** Preliminary Notes:
@@ -151,13 +151,13 @@ OP=  ff
 Query command. Length varies according to the query type.
 The general format of this packet is:
 ff 01 QUERY [INDEX] CHECKSUM
-reponses are of the same form:
+responses are of the same form:
 FF LEN QUERY VALUE_QUERIED CHECKSUM2
 where LEN = 1 + length(VALUE_QUERIED)
 **** Query ram size ****
 QUERY = 42 ('B'uffer size)
-The device should reply with the same packet plus two additionnal bytes
+The device should reply with the same packet plus two additional bytes
 containing the size of the memory:
 ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available.
@@ -234,19 +234,23 @@ is the amount of memory apparently needed for every set of parameters:
 ** Appendix: How to study the protocol ? **
-1. Generate effects using the force editor provided with the DirectX SDK, or use Immersion Studio (freely available at their web site in the developer section: www.immersion.com)
+1. Generate effects using the force editor provided with the DirectX SDK, or 
-2. Start a soft spying RS232 or USB (depending on where you connected your joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
+use Immersion Studio (freely available at their web site in the developer section: 
+www.immersion.com)
+2. Start a soft spying RS232 or USB (depending on where you connected your 
+joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
 3. Play the effect, and watch what happens on the spy screen.
 A few words about ComPortSpy:
-At first glance, this soft seems, hum, well... buggy. In fact, data appear with a few seconds latency. Personnaly, I restart it every time I play an effect.
+At first glance, this software seems, hum, well... buggy. In fact, data appear with a
+few seconds latency. Personally, I restart it every time I play an effect.
 Remember it's free (as in free beer) and alpha!
 ** URLS **
 Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy.
 ** Author of this document **
-Johann Deneux <deneux@ifrance.com>
+Johann Deneux <johann.deneux@gmail.com>
 Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/
 Additions by Vojtech Pavlik.
diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt
index d9d523099bb7..47fc86830cd7 100644
--- a/Documentation/input/input-programming.txt
+++ b/Documentation/input/input-programming.txt
@@ -42,8 +42,8 @@ static int __init button_init(void)
                goto err_free_irq;
        }
-        button_dev->evbit[0] = BIT(EV_KEY);
+        button_dev->evbit[0] = BIT_MASK(EV_KEY);
-        button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0);
+        button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0);
        error = input_register_device(button_dev);
        if (error) {
@@ -79,7 +79,7 @@ In the _init function, which is called either upon module load or when
 booting the kernel, it grabs the required resources (it should also check
 for the presence of the device).
-Then it allocates a new input device structure with input_aloocate_device()
+Then it allocates a new input device structure with input_allocate_device()
 and sets up input bitfields. This way the device driver tells the other
 parts of the input systems what it is - what events can be generated or
 accepted by this input device. Our example device can only generate EV_KEY
@@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean
 that the thing is precise and always returns to exactly the center position
 (if it has any).
-1.4 NBITS(), LONG(), BIT()
+1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK()
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-These three macros from input.h help some bitfield computations:
+These three macros from bitops.h help some bitfield computations:
-        NBITS(x) - returns the length of a bitfield array in longs for x bits
+        BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for
-        LONG(x)  - returns the index in the array in longs for bit x
+                           x bits
-        BIT(x)   - returns the index in a long for bit x
+        BIT_WORD(x)      - returns the index in the array in longs for bit x
+        BIT_MASK(x)      - returns the bit mask within a long for bit x
 1.5 The id* and name fields
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/isdn/CREDITS b/Documentation/isdn/CREDITS
index 7c17c837064f..8cac6c2f23ee 100644
--- a/Documentation/isdn/CREDITS
+++ b/Documentation/isdn/CREDITS
@@ -40,7 +40,7 @@ Andreas Kool (akool@Kool.f.EUnet.de)
 Pedro Roque Marques (roque@di.fc.ul.pt)
  For lot of new ideas and the pcbit driver.
-Eberhard Moenkeberg (emoenke@gwdg.de)
+Eberhard Mönkeberg (emoenke@gwdg.de)
  For testing and help to get into kernel.
 Thomas Neumann (tn@ruhr.de)
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
index 2f114babe4b6..a76d74845a4c 100644
--- a/Documentation/isdn/README.concap
+++ b/Documentation/isdn/README.concap
@@ -111,7 +111,7 @@ struct concap_proto_ops{
        struct concap_proto *  (*proto_new) (void);
        /* delete encapsulation protocol instance and free all its resources.
-           cprot may no loger be referenced after calling this */
+           cprot may no longer be referenced after calling this */
        void (*proto_del)(struct concap_proto *cprot);
        /* initialize the protocol's data. To be called at interface startup
diff --git a/Documentation/java.txt b/Documentation/java.txt
index 3cce3fbb6644..e6a723281547 100644
--- a/Documentation/java.txt
+++ b/Documentation/java.txt
@@ -37,7 +37,7 @@ other program after you have done the following:
   or the following, if you want to be more selective:
     ':Applet:M::<!--applet::/usr/bin/appletviewer:'
-   Of cause you have to fix the path names. Given path/file names in this
+   Of course you have to fix the path names. The path/file names given in this
   document match the Debian 2.1 system. (i.e. jdk installed in /usr,
   custom wrappers from this document in /usr/local)
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index fe8b0c4892cf..616043a6da99 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -77,7 +77,12 @@ applicable everywhere (see syntax).
  Optionally, dependencies only for this default value can be added with
  "if".
- dependencies: "depends on"/"requires" <expr>
+- type definition + default value:
+        "def_bool"/"def_tristate" <expr> ["if" <expr>]
+  This is a shorthand notation for a type definition plus a value.
+  Optionally dependencies for this default value can be added with "if".
+- dependencies: "depends on" <expr>
  This defines a dependency for this menu entry. If multiple
  dependencies are defined, they are connected with '&&'. Dependencies
  are applied to all other options within this menu entry (which also
@@ -289,3 +294,10 @@ source:
        "source" <prompt>
 This reads the specified configuration file. This file is always parsed.
+mainmenu:
+        "mainmenu" <prompt>
+This sets the config program's title bar if the config program chooses
+to use it.
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index f099b814d383..7a7753321a26 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -518,6 +518,28 @@ more details, with real examples.
        In this example for a specific GCC version the build will error out explaining
        to the user why it stops.
+    cc-cross-prefix
+        cc-cross-prefix is used to check if there exists a $(CC) in path with
+        one of the listed prefixes. The first prefix where there exists a
+        prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found
+        then nothing is returned.
+        Additional prefixes are separated by a single space in the
+        call of cc-cross-prefix.
+        This functionality is useful for architecture Makefiles that try
+        to set CROSS_COMPILE to well-known values but may have several
+        values to select between.
+        It is recommended only to try to set CROSS_COMPILE if it is a cross
+        build (host arch is different from target arch). And if CROSS_COMPILE
+        is already set then leave it with the old value.
+        Example:
+                #arch/m68k/Makefile
+                ifneq ($(SUBARCH),$(ARCH))
+                        ifeq ($(CROSS_COMPILE),)
+                               CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-)
+                        endif
+                endif
 === 4 Host Program support
 Kbuild supports building executables on the host for use during the
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 1b37b28cc234..d0ac72cc19ff 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -231,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64)
  any space below the alignment point will be wasted.
+Extended crashkernel syntax
+===========================
+While the "crashkernel=size[@offset]" syntax is sufficient for most
+configurations, sometimes it's handy to have the reserved memory dependent
+on the value of System RAM -- that's mostly for distributors that pre-setup
+the kernel command line to avoid an unbootable system after some memory has
+been removed from the machine.
+The syntax is:
+    crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
+    range=start-[end]
+For example:
+    crashkernel=512M-2G:64M,2G-:128M
+This would mean:
+    1) if the RAM is smaller than 512M, then don't reserve anything
+       (this is the "rescue" case)
+    2) if the RAM size is between 512M and 2G, then reserve 64M
+    3) if the RAM size is larger than 2G, then reserve 128M
 Boot into System Kernel
 =======================
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index d9e3b199929b..5a4ef48224ae 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -76,9 +76,9 @@
     * Title: "Conceptual Architecture of the Linux Kernel"
       Author: Ivan T. Bowman.
       URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html
-       Keywords: conceptual software arquitecture, extracted design,
+       Keywords: conceptual software architecture, extracted design,
       reverse engineering, system structure.
-       Description: Conceptual software arquitecture of the Linux kernel,
+       Description: Conceptual software architecture of the Linux kernel,
       automatically extracted from the source code. Very detailed. Good
       figures. Gives good overall kernel understanding.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 98cf90f2631d..b2361667839f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -222,9 +222,6 @@ and is between 256 and 4096 characters. It is defined in the file
                        Warning: Many of these options can produce a lot of
                        output and make your system unusable. Be very careful.
-        acpi_fake_ecdt  [HW,ACPI] Workaround failure due to BIOS lacking ECDT
        acpi_pm_good    [X86-32,X86-64]
                        Override the pmtimer bug detection: force the kernel
                        to assume that this machine's pmtimer latches its value
@@ -297,9 +294,6 @@ and is between 256 and 4096 characters. It is defined in the file
        apm=            [APM] Advanced Power Management
                        See header of arch/i386/kernel/apm.c.
-        applicom=       [HW]
-                        Format: <mem>,<irq>
        arcrimi=        [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
                        Format: <io>,<irq>,<nodeID>
@@ -345,12 +339,6 @@ and is between 256 and 4096 characters. It is defined in the file
                        Format: <io>,<irq>,<mode>
                        See header of drivers/net/hamradio/baycom_ser_hdx.c.
-        blkmtd_device=  [HW,MTD]
-        blkmtd_erasesz=
-        blkmtd_ro=
-        blkmtd_bs=
-        blkmtd_count=
        boot_delay=     Milliseconds to delay each printk during boot.
                        Values larger than 10 seconds (10000) are changed to
                        no delay (0).
@@ -431,8 +419,10 @@ and is between 256 and 4096 characters. It is defined in the file
                        over the 8254 in addition to over the IO-APIC. The
                        kernel tries to set a sensible default.
-        hpet=           [X86-32,HPET] option to disable HPET and use PIT.
+        hpet=           [X86-32,HPET] option to control HPET usage
-                        Format: disable
+                        Format: { enable (default) | disable | force }
+                        disable: disable HPET and use PIT instead
+                        force: allow forced enabling of undocumented chips (ICH4, VIA)
        com20020=       [HW,NET] ARCnet - COM20020 chipset
                        Format:
@@ -479,6 +469,16 @@ and is between 256 and 4096 characters. It is defined in the file
                        UART at the specified I/O port or MMIO address.
                        The options are the same as for ttyS, above.
+        no_console_suspend
+                        [HW] Never suspend the console
+                        Disable suspending of consoles during suspend and
+                        hibernate operations.  Once disabled, debugging
+                        messages can reach various consoles while the rest
+                        of the system is being put to sleep (ie, while
+                        debugging driver suspend/resume hooks).  This may
+                        not work reliably with all consoles, but is known
+                        to work with serial and VGA consoles.
        cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
                        Format:
                        <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
@@ -487,6 +487,13 @@ and is between 256 and 4096 characters. It is defined in the file
                        [KNL] Reserve a chunk of physical memory to
                        hold a kernel to switch to with kexec on panic.
+        crashkernel=range1:size1[,range2:size2,...][@offset]
+                        [KNL] Same as above, but depends on the memory
+                        in the running system. The syntax of range is
+                        start-[end] where start and end are both
+                        a memory unit (amount[KMG]). See also
+                        Documentation/kdump/kdump.txt for an example.
        cs4232=         [HW,OSS]
                        Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
@@ -496,8 +503,6 @@ and is between 256 and 4096 characters. It is defined in the file
        cs89x0_media=   [HW,NET]
                        Format: { rj45 | aui | bnc }
-        cyclades=       [HW,SERIAL] Cyclades multi-serial port adapter.
        dasd=           [HW,NET]
                        See header of drivers/s390/block/dasd_devmap.c.
@@ -555,10 +560,6 @@ and is between 256 and 4096 characters. It is defined in the file
                        See drivers/char/README.epca and
                        Documentation/digiepca.txt.
-        dmascc=         [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA
-                        support available.
-                        Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]]
        dmasound=       [HW,OSS] Sound subsystem buffers
        dscc4.setup=    [NET]
@@ -589,17 +590,10 @@ and is between 256 and 4096 characters. It is defined in the file
                        0: polling mode
                        non-0: interrupt mode (default)
-        eda=            [HW,PS2]
-        edb=            [HW,PS2]
        edd=            [EDD]
                        Format: {"of[f]" | "sk[ipmbr]"}
                        See comment in arch/i386/boot/edd.S
-        eicon=          [HW,ISDN]
-                        Format: <id>,<membase>,<irq>
        eisa_irq_edge=  [PARISC,HW]
                        See header of drivers/parisc/eisa.c.
@@ -778,6 +772,23 @@ and is between 256 and 4096 characters. It is defined in the file
        inttest=        [IA64]
+        intel_iommu=    [DMAR] Intel IOMMU driver (DMAR) option
+                off
+                        Disable intel iommu driver.
+                igfx_off [Default Off]
+                        By default, gfx is mapped as normal device. If a gfx
+                        device has a dedicated DMAR unit, the DMAR unit is
+                        bypassed by not enabling DMAR with this option. In
+                        this case, gfx device will use physical address for
+                        DMA.
+                forcedac [x86_64]
+                        With this option iommu will not optimize to look
+                        for io virtual address below 32 bit forcing dual
+                        address cycle on pci bus for cards supporting greater
+                        than 32 bit addressing. The default is to look
+                        for translation below 32 bit and if not available
+                        then look in the higher range.
        io7=            [HW] IO7 for Marvel based alpha systems
                        See comment before marvel_specify_io7 in
                        arch/alpha/kernel/core_marvel.c.
@@ -875,9 +886,6 @@ and is between 256 and 4096 characters. It is defined in the file
        lapic_timer_c2_ok       [X86-32,x86-64,APIC] trust the local apic timer in
                        C2 power state.
-        lasi=           [HW,SCSI] PARISC LASI driver for the 53c700 chip
-                        Format: addr:<io>,irq:<irq>
        libata.noacpi   [LIBATA] Disables use of ACPI in libata suspend/resume
                        when set.
                        Format: <int>
@@ -1125,9 +1133,6 @@ and is between 256 and 4096 characters. It is defined in the file
        noapic          [SMP,APIC] Tells the kernel to not make use of any
                        IOAPICs that may be present in the system.
-        noasync         [HW,M68K] Disables async and sync negotiation for
-                        all devices.
        nobats          [PPC] Do not use BATs for mapping kernel lowmem
                        on "Classic" PPC cores.
@@ -1439,6 +1444,7 @@ and is between 256 and 4096 characters. It is defined in the file
                        Param: <number> - step/bucket size as a power of 2 for
                                statistical time based profiling.
                        Param: "sleep" - profile D-state sleeping (millisecs)
+                        Param: "kvm" - profile VM exits.
        processor.max_cstate=   [HW,ACPI]
                        Limit processor to maximum C-state
@@ -1565,9 +1571,6 @@ and is between 256 and 4096 characters. It is defined in the file
        sa1100ir        [NET]
                        See drivers/net/irda/sa1100_ir.c.
-        sb=             [HW,OSS]
-                        Format: <io>,<irq>,<dma>,<dma2>
        sbni=           [NET] Granch SBNI12 leased line adapter
        sc1200wdt=      [HW,WDT] SC1200 WDT (watchdog) driver
@@ -1611,8 +1614,6 @@ and is between 256 and 4096 characters. It is defined in the file
        serialnumber    [BUGS=X86-32]
-        sg_def_reserved_size=   [SCSI]
        shapers=        [NET]
                        Maximal number of shapers.
@@ -2003,10 +2004,6 @@ and is between 256 and 4096 characters. It is defined in the file
        norandmaps      Don't use address space randomization
                        Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
-        unwind_debug=N  N > 0 will enable dwarf2 unwinder debugging
-                        This is useful to get more information why
-                        you got a "dwarf2 unwinder stuck"
 ______________________________________________________________________
 TODO:
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index c0b7a4556390..bac037eb1cda 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,28 +1,8 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
-# For those people that have a separate object dir, look there for .config
-KBUILD_OUTPUT := ../..
-ifdef O
-  ifeq ("$(origin O)", "command line")
-    KBUILD_OUTPUT := $(O)
-  endif
-endif
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-include $(KBUILD_OUTPUT)/.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
 LDLIBS:=-lz
-# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
-# not others (eg. FC7).
-LDFLAGS+=-static
-all: lguest.lds lguest
-# The linker script on x86 is so complex the only way of creating one
+all: lguest
-# which will link our binary in the right place is to mangle the
-# default one.
-lguest.lds:
-        $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
 clean:
-        rm -f lguest.lds lguest
+        rm -f lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 103e346c8b6a..5bdc37f81842 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,10 +1,7 @@
 /*P:100 This is the Launcher code, a simple program which lays out the
 * "physical" memory for the new Guest by mapping the kernel image and the
 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
- *
+:*/
- * The only trick: the Makefile links it at a high address so it will be clear
- * of the guest memory region.  It means that each Guest cannot have more than
- * about 2.5G of memory on a normally configured Host. :*/
 #define _LARGEFILE64_SOURCE
 #define _GNU_SOURCE
 #include <stdio.h>
@@ -15,6 +12,7 @@
 #include <stdlib.h>
 #include <elf.h>
 #include <sys/mman.h>
+#include <sys/param.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
@@ -34,7 +32,9 @@
 #include <termios.h>
 #include <getopt.h>
 #include <zlib.h>
-/*L:110 We can ignore the 28 include files we need for this program, but I do
+#include <assert.h>
+#include <sched.h>
+/*L:110 We can ignore the 30 include files we need for this program, but I do
 * want to draw attention to the use of kernel-style types.
 *
 * As Linus said, "C is a Spartan language, and so should your naming be."  I
@@ -45,8 +45,14 @@ typedef unsigned long long u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
+#include "linux/lguest_launcher.h"
-#include "../../include/asm-x86/e820_32.h"
+#include "linux/pci_ids.h"
+#include "linux/virtio_config.h"
+#include "linux/virtio_net.h"
+#include "linux/virtio_blk.h"
+#include "linux/virtio_console.h"
+#include "linux/virtio_ring.h"
+#include "asm-x86/bootparam.h"
 /*:*/
 #define PAGE_PRESENT 0x7        /* Present, RW, Execute */
@@ -55,6 +61,10 @@ typedef uint8_t u8;
 #ifndef SIOCBRADDIF
 #define SIOCBRADDIF     0x89a2          /* add interface to bridge      */
 #endif
+/* We can have up to 256 pages for devices. */
+#define DEVICE_PAGES 256
+/* This fits nicely in a single 4096-byte page. */
+#define VIRTQUEUE_NUM 127
 /*L:120 verbose is both a global flag and a macro.  The C preprocessor allows
 * this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -65,8 +75,10 @@ static bool verbose;
 /* The pipe to send commands to the waker process */
 static int waker_fd;
-/* The top of guest physical memory. */
+/* The pointer to the start of guest memory. */
-static u32 top;
+static void *guest_base;
+/* The maximum guest physical address allowed, and maximum possible. */
+static unsigned long guest_limit, guest_max;
 /* This is our list of devices. */
 struct device_list
@@ -76,8 +88,17 @@ struct device_list
        fd_set infds;
        int max_infd;
+        /* Counter to assign interrupt numbers. */
+        unsigned int next_irq;
+        /* Counter to print out convenient device numbers. */
+        unsigned int device_num;
        /* The descriptor page for the devices. */
-        struct lguest_device_desc *descs;
+        u8 *descpage;
+        /* The tail of the last descriptor. */
+        unsigned int desc_used;
        /* A single linked list of devices. */
        struct device *dev;
@@ -85,31 +106,111 @@ struct device_list
        struct device **lastdev;
 };
+/* The list of Guest devices, based on command line arguments. */
+static struct device_list devices;
 /* The device structure describes a single device. */
 struct device
 {
        /* The linked-list pointer. */
        struct device *next;
-        /* The descriptor for this device, as mapped into the Guest. */
+        /* This device's descriptor, as mapped into the Guest. */
        struct lguest_device_desc *desc;
-        /* The memory page(s) of this device, if any.  Also mapped in Guest. */
-        void *mem;
+        /* The name of this device, for --verbose. */
+        const char *name;
        /* If handle_input is set, it wants to be called when this file
         * descriptor is ready. */
        int fd;
        bool (*handle_input)(int fd, struct device *me);
-        /* If handle_output is set, it wants to be called when the Guest sends
+        /* Any queues attached to this device */
-         * DMA to this key. */
+        struct virtqueue *vq;
-        unsigned long watch_key;
-        u32 (*handle_output)(int fd, const struct iovec *iov,
-                             unsigned int num, struct device *me);
        /* Device-specific data. */
        void *priv;
 };
+/* The virtqueue structure describes a queue attached to a device. */
+struct virtqueue
+{
+        struct virtqueue *next;
+        /* Which device owns me. */
+        struct device *dev;
+        /* The configuration for this queue. */
+        struct lguest_vqconfig config;
+        /* The actual ring of buffers. */
+        struct vring vring;
+        /* Last available index we saw. */
+        u16 last_avail_idx;
+        /* The routine to call when the Guest pings us. */
+        void (*handle_output)(int fd, struct virtqueue *me);
+};
+/* Since guest is UP and we don't run at the same time, we don't need barriers.
+ * But I include them in the code in case others copy it. */
+#define wmb()
+/* Convert an iovec element to the given type.
+ *
+ * This is a fairly ugly trick: we need to know the size of the type and
+ * alignment requirement to check the pointer is kosher.  It's also nice to
+ * have the name of the type in case we report failure.
+ *
+ * Typing those three things all the time is cumbersome and error prone, so we
+ * have a macro which sets them all up and passes to the real function. */
+#define convert(iov, type) \
+        ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
+static void *_convert(struct iovec *iov, size_t size, size_t align,
+                      const char *name)
+{
+        if (iov->iov_len != size)
+                errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
+        if ((unsigned long)iov->iov_base % align != 0)
+                errx(1, "Bad alignment %p for %s", iov->iov_base, name);
+        return iov->iov_base;
+}
+/* The virtio configuration space is defined to be little-endian.  x86 is
+ * little-endian too, but it's nice to be explicit so we have these helpers. */
+#define cpu_to_le16(v16) (v16)
+#define cpu_to_le32(v32) (v32)
+#define cpu_to_le64(v64) (v64)
+#define le16_to_cpu(v16) (v16)
+#define le32_to_cpu(v32) (v32)
+#define le64_to_cpu(v64) (v64)
+/*L:100 The Launcher code itself takes us out into userspace, that scary place
+ * where pointers run wild and free!  Unfortunately, like most userspace
+ * programs, it's quite boring (which is why everyone likes to hack on the
+ * kernel!).  Perhaps if you make up an Lguest Drinking Game at this point, it
+ * will get you through this section.  Or, maybe not.
+ *
+ * The Launcher sets up a big chunk of memory to be the Guest's "physical"
+ * memory and stores it in "guest_base".  In other words, Guest physical ==
+ * Launcher virtual with an offset.
+ *
+ * This can be tough to get your head around, but usually it just means that we
+ * use these trivial conversion functions when the Guest gives us its
+ * "physical" addresses: */
+static void *from_guest_phys(unsigned long addr)
+{
+        return guest_base + addr;
+}
+static unsigned long to_guest_phys(const void *addr)
+{
+        return (addr - guest_base);
+}
 /*L:130
 * Loading the Kernel.
 *
@@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags)
        return fd;
 }
-/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */
+/* map_zeroed_pages() takes a number of pages. */
-static void *map_zeroed_pages(unsigned long addr, unsigned int num)
+static void *map_zeroed_pages(unsigned int num)
 {
-        /* We cache the /dev/zero file-descriptor so we only open it once. */
+        int fd = open_or_die("/dev/zero", O_RDONLY);
-        static int fd = -1;
+        void *addr;
-        if (fd == -1)
-                fd = open_or_die("/dev/zero", O_RDONLY);
        /* We use a private mapping (ie. if we write to the page, it will be
-         * copied), and obviously we insist that it be mapped where we ask. */
+         * copied). */
-        if (mmap((void *)addr, getpagesize() * num,
+        addr = mmap(NULL, getpagesize() * num,
-                 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
+                    PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
-            != (void *)addr)
+        if (addr == MAP_FAILED)
-                err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
+                err(1, "Mmaping %u pages of /dev/zero", num);
-        /* Returning the address is just a courtesy: can simplify callers. */
+        return addr;
-        return (void *)addr;
 }
-/* To find out where to start we look for the magic Guest string, which marks
+/* Get some more pages for a device. */
- * the code we see in lguest_asm.S.  This is a hack which we are currently
+static void *get_pages(unsigned int num)
- * plotting to replace with the normal Linux entry point. */
-static unsigned long entry_point(void *start, void *end,
-                                 unsigned long page_offset)
 {
-        void *p;
+        void *addr = from_guest_phys(guest_limit);
-        /* The scan gives us the physical starting address.  We want the
+        guest_limit += num * getpagesize();
-         * virtual address in this case, and fortunately, we already figured
+        if (guest_limit > guest_max)
-         * out the physical-virtual difference and passed it here in
+                errx(1, "Not enough memory for devices");
-         * "page_offset". */
+        return addr;
-        for (p = start; p < end; p++)
+}
-                if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
-                        return (long)p + strlen("GenuineLguest") + page_offset;
-        err(1, "Is this image a genuine lguest?");
+/* This routine is used to load the kernel or initrd.  It tries mmap, but if
+ * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
+ * it falls back to reading the memory in. */
+static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
+{
+        ssize_t r;
+        /* We map writable even though some segments are marked read-only.
+         * The kernel really wants to be writable: it patches its own
+         * instructions.
+         *
+         * MAP_PRIVATE means that the page won't be copied until a write is
+         * done to it.  This allows us to share untouched memory between
+         * Guests. */
+        if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
+                 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
+                return;
+        /* pread does a seek and a read in one shot: saves a few lines. */
+        r = pread(fd, addr, len, offset);
+        if (r != len)
+                err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
 }
 /* This routine takes an open vmlinux image, which is in ELF, and maps it into
@@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end,
 * by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
- * address.  The Guest kernel expects to be placed in memory at the physical
+ * address.  We use the physical address; the Guest will map itself to the
- * address, and the page tables set up so it will correspond to that virtual
+ * virtual address.
- * address.  We return the difference between the virtual and physical
- * addresses in the "page_offset" pointer.
 *
 * We return the starting address. */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
-                             unsigned long *page_offset)
 {
-        void *addr;
        Elf32_Phdr phdr[ehdr->e_phnum];
        unsigned int i;
-        unsigned long start = -1UL, end = 0;
        /* Sanity checks on the main ELF header: an x86 executable with a
         * reasonable number of correctly-sized program headers. */
@@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
        if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
                err(1, "Reading program headers");
-        /* We don't know page_offset yet. */
-        *page_offset = 0;
        /* Try all the headers: there are usually only three.  A read-only one,
         * a read-write one, and a "note" section which isn't loadable. */
        for (i = 0; i < ehdr->e_phnum; i++) {
@@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
                verbose("Section %i: size %i addr %p\n",
                        i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-                /* We expect a simple linear address space: every segment must
+                /* We map this section of the file at its physical address. */
-                 * have the same difference between virtual (p_vaddr) and
+                map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
-                 * physical (p_paddr) address. */
+                       phdr[i].p_offset, phdr[i].p_filesz);
-                if (!*page_offset)
-                        *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-                else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
-                        errx(1, "Page offset of section %i different", i);
-                /* We track the first and last address we mapped, so we can
-                 * tell entry_point() where to scan. */
-                if (phdr[i].p_paddr < start)
-                        start = phdr[i].p_paddr;
-                if (phdr[i].p_paddr + phdr[i].p_filesz > end)
-                        end = phdr[i].p_paddr + phdr[i].p_filesz;
-                /* We map this section of the file at its physical address.  We
-                 * map it read & write even if the header says this segment is
-                 * read-only.  The kernel really wants to be writable: it
-                 * patches its own instructions which would normally be
-                 * read-only.
-                 *
-                 * MAP_PRIVATE means that the page won't be copied until a
-                 * write is done to it.  This allows us to share much of the
-                 * kernel memory between Guests. */
-                addr = mmap((void *)phdr[i].p_paddr,
-                            phdr[i].p_filesz,
-                            PROT_READ|PROT_WRITE|PROT_EXEC,
-                            MAP_FIXED|MAP_PRIVATE,
-                            elf_fd, phdr[i].p_offset);
-                if (addr != (void *)phdr[i].p_paddr)
-                        err(1, "Mmaping vmlinux seg %i gave %p not %p",
-                            i, addr, (void *)phdr[i].p_paddr);
        }
-        return entry_point((void *)start, (void *)end, *page_offset);
+        /* The entry point is given in the ELF header. */
+        return ehdr->e_entry;
 }
-/*L:170 Prepare to be SHOCKED and AMAZED.  And possibly a trifle nauseated.
+/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
- *
+ * supposed to jump into it and it will unpack itself.  We used to have to
- * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
+ * perform some hairy magic because the unpacking code scared me.
- * to be.  We don't know what that option was, but we can figure it out
- * approximately by looking at the addresses in the code.  I chose the common
- * case of reading a memory location into the %eax register:
- *
- *  movl <some-address>, %eax
- *
- * This gets encoded as five bytes: "0xA1 <4-byte-address>".  For example,
- * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
- *
- * In this example can guess that the kernel was compiled with
- * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number).  If the
- * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
- * kernel isn't that bloated yet.
- *
- * Unfortunately, x86 has variable-length instructions, so finding this
- * particular instruction properly involves writing a disassembler.  Instead,
- * we rely on statistics.  We look for "0xA1" and tally the different bytes
- * which occur 4 bytes later (the "0xC0" in our example above).  When one of
- * those bytes appears three times, we can be reasonably confident that it
- * forms the start of CONFIG_PAGE_OFFSET.
 *
- * This is amazingly reliable. */
+ * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
-static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
+ * a small patch to jump over the tricky bits in the Guest, so now we just read
+ * the funky header so we know where in the file to load, and away we go! */
+static unsigned long load_bzimage(int fd)
 {
-        unsigned int i, possibilities[256] = { 0 };
+        struct boot_params boot;
+        int r;
+        /* Modern bzImages get loaded at 1M. */
+        void *p = from_guest_phys(0x100000);
-        for (i = 0; i + 4 < len; i++) {
+        /* Go back to the start of the file and read the header.  It should be
-                /* mov 0xXXXXXXXX,%eax */
+         * a Linux boot header (see Documentation/i386/boot.txt) */
-                if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
+        lseek(fd, 0, SEEK_SET);
-                        return (unsigned long)img[i+4] << 24;
+        read(fd, &boot, sizeof(boot));
-        }
-        errx(1, "could not determine page offset");
-}
-/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
+        /* Inside the setup_hdr, we expect the magic "HdrS" */
- * which need loading are extracted and compressed raw.  This denies us the
+        if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
- * information we need to make a fully-general loader. */
+                errx(1, "This doesn't look like a bzImage to me");
-static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
-{
-        gzFile f;
-        int ret, len = 0;
-        /* A bzImage always gets loaded at physical address 1M.  This is
-         * actually configurable as CONFIG_PHYSICAL_START, but as the comment
-         * there says, "Don't change this unless you know what you are doing".
-         * Indeed. */
-        void *img = (void *)0x100000;
-        /* gzdopen takes our file descriptor (carefully placed at the start of
-         * the GZIP header we found) and returns a gzFile. */
-        f = gzdopen(fd, "rb");
-        /* We read it into memory in 64k chunks until we hit the end. */
-        while ((ret = gzread(f, img + len, 65536)) > 0)
-                len += ret;
-        if (ret < 0)
-                err(1, "reading image from bzImage");
-        verbose("Unpacked size %i addr %p\n", len, img);
-        /* Without the ELF header, we can't tell virtual-physical gap.  This is
-         * CONFIG_PAGE_OFFSET, and people do actually change it.  Fortunately,
-         * I have a clever way of figuring it out from the code itself.  */
-        *page_offset = intuit_page_offset(img, len);
-        return entry_point(img, img + len, *page_offset);
-}
-/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
+        /* Skip over the extra sectors of the header. */
- * supposed to jump into it and it will unpack itself.  We can't do that
+        lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
- * because the Guest can't run the unpacking code, and adding features to
- * lguest kills puppies, so we don't want to.
+        /* Now read everything into memory in nice big chunks. */
- *
+        while ((r = read(fd, p, 65536)) > 0)
- * The bzImage is formed by putting the decompressing code in front of the
+                p += r;
- * compressed kernel code.  So we can simple scan through it looking for the
- * first "gzip" header, and start decompressing from there. */
+        /* Finally, code32_start tells us where to enter the kernel. */
-static unsigned long load_bzimage(int fd, unsigned long *page_offset)
+        return boot.hdr.code32_start;
-{
-        unsigned char c;
-        int state = 0;
-        /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
-        while (read(fd, &c, 1) == 1) {
-                switch (state) {
-                case 0:
-                        if (c == 0x1F)
-                                state++;
-                        break;
-                case 1:
-                        if (c == 0x8B)
-                                state++;
-                        else
-                                state = 0;
-                        break;
-                case 2 ... 8:
-                        state++;
-                        break;
-                case 9:
-                        /* Seek back to the start of the gzip header. */
-                        lseek(fd, -10, SEEK_CUR);
-                        /* One final check: "compressed under UNIX". */
-                        if (c != 0x03)
-                                state = -1;
-                        else
-                                return unpack_bzimage(fd, page_offset);
-                }
-        }
-        errx(1, "Could not find kernel in bzImage");
 }
 /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
 * come wrapped up in the self-decompressing "bzImage" format.  With some funky
 * coding, we can load those, too. */
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+static unsigned long load_kernel(int fd)
 {
        Elf32_Ehdr hdr;
@@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
        /* If it's an ELF file, it starts with "\177ELF" */
        if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-                return map_elf(fd, &hdr, page_offset);
+                return map_elf(fd, &hdr);
        /* Otherwise we assume it's a bzImage, and try to unpack it */
-        return load_bzimage(fd, page_offset);
+        return load_bzimage(fd);
 }
 /* This is a trivial little helper to align pages.  Andi Kleen hated it because
@@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
        int ifd;
        struct stat st;
        unsigned long len;
-        void *iaddr;
        ifd = open_or_die(name, O_RDONLY);
        /* fstat() is needed to get the file size. */
        if (fstat(ifd, &st) < 0)
                err(1, "fstat() on initrd '%s'", name);
-        /* The length needs to be rounded up to a page size: mmap needs the
+        /* We map the initrd at the top of memory, but mmap wants it to be
-         * address to be page aligned. */
+         * page-aligned, so we round the size up for that. */
        len = page_align(st.st_size);
-        /* We map the initrd at the top of memory. */
+        map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
-        iaddr = mmap((void *)mem - len, st.st_size,
-                     PROT_READ|PROT_EXEC|PROT_WRITE,
-                     MAP_FIXED|MAP_PRIVATE, ifd, 0);
-        if (iaddr != (void *)mem - len)
-                err(1, "Mmaping initrd '%s' returned %p not %p",
-                    name, iaddr, (void *)mem - len);
        /* Once a file is mapped, you can close the file descriptor.  It's a
         * little odd, but quite useful. */
        close(ifd);
-        verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+        verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
        /* We return the initrd size. */
        return len;
 }
-/* Once we know how much memory we have, and the address the Guest kernel
+/* Once we know how much memory we have, we can construct simple linear page
- * expects, we can construct simple linear page tables which will get the Guest
+ * tables which set virtual == physical and will get the Guest far enough
- * far enough into the boot to create its own.
+ * into the boot to create its own.
 *
 * We lay them out of the way, just below the initrd (which is why we need to
 * know its size). */
 static unsigned long setup_pagetables(unsigned long mem,
-                                      unsigned long initrd_size,
+                                      unsigned long initrd_size)
-                                      unsigned long page_offset)
 {
-        u32 *pgdir, *linear;
+        unsigned long *pgdir, *linear;
        unsigned int mapped_pages, i, linear_pages;
-        unsigned int ptes_per_page = getpagesize()/sizeof(u32);
+        unsigned int ptes_per_page = getpagesize()/sizeof(void *);
-        /* Ideally we map all physical memory starting at page_offset.
+        mapped_pages = mem/getpagesize();
-         * However, if page_offset is 0xC0000000 we can only map 1G of physical
-         * (0xC0000000 + 1G overflows). */
-        if (mem <= -page_offset)
-                mapped_pages = mem/getpagesize();
-        else
-                mapped_pages = -page_offset/getpagesize();
        /* Each PTE page can map ptes_per_page pages: how many do we need? */
        linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
        /* We put the toplevel page directory page at the top of memory. */
-        pgdir = (void *)mem - initrd_size - getpagesize();
+        pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
        /* Now we use the next linear_pages pages as pte pages */
        linear = (void *)pgdir - linear_pages*getpagesize();
@@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem,
        for (i = 0; i < mapped_pages; i++)
                linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
-        /* The top level points to the linear page table pages above.  The
+        /* The top level points to the linear page table pages above. */
-         * entry representing page_offset points to the first one, and they
-         * continue from there. */
        for (i = 0; i < mapped_pages; i += ptes_per_page) {
-                pgdir[(i + page_offset/getpagesize())/ptes_per_page]
+                pgdir[i/ptes_per_page]
-                        = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+                        = ((to_guest_phys(linear) + i*sizeof(void *))
+                           | PAGE_PRESENT);
        }
-        verbose("Linear mapping of %u pages in %u pte pages at %p\n",
+        verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
-                mapped_pages, linear_pages, linear);
+                mapped_pages, linear_pages, to_guest_phys(linear));
        /* We return the top level (guest-physical) address: the kernel needs
         * to know where it is. */
-        return (unsigned long)pgdir;
+        return to_guest_phys(pgdir);
 }
 /* Simple routine to roll all the commandline arguments together with spaces
@@ -498,14 +483,17 @@ static void concat(char *dst, char *args[])
 /* This is where we actually tell the kernel to initialize the Guest.  We saw
 * the arguments it expects when we looked at initialize() in lguest_user.c:
- * the top physical page to allow, the top level pagetable, the entry point and
+ * the base of guest "physical" memory, the top physical page to allow, the
- * the page_offset constant for the Guest. */
+ * top level pagetable and the entry point for the Guest. */
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(unsigned long pgdir, unsigned long start)
 {
-        u32 args[] = { LHREQ_INITIALIZE,
+        unsigned long args[] = { LHREQ_INITIALIZE,
-                       top/getpagesize(), pgdir, start, page_offset };
+                                 (unsigned long)guest_base,
+                                 guest_limit / getpagesize(), pgdir, start };
        int fd;
+        verbose("Guest: %p - %p (%#lx)\n",
+                guest_base, guest_base + guest_limit, guest_limit);
        fd = open_or_die("/dev/lguest", O_RDWR);
        if (write(fd, args, sizeof(args)) < 0)
                err(1, "Writing to /dev/lguest");
@@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
 }
 /*:*/
-static void set_fd(int fd, struct device_list *devices)
+static void add_device_fd(int fd)
 {
-        FD_SET(fd, &devices->infds);
+        FD_SET(fd, &devices.infds);
-        if (fd > devices->max_infd)
+        if (fd > devices.max_infd)
-                devices->max_infd = fd;
+                devices.max_infd = fd;
 }
 /*L:200
@@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices)
 *
 * This, of course, is merely a different *kind* of icky.
 */
-static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
+static void wake_parent(int pipefd, int lguest_fd)
 {
        /* Add the pipe from the Launcher to the fdset in the device_list, so
         * we watch it, too. */
-        set_fd(pipefd, devices);
+        add_device_fd(pipefd);
        for (;;) {
-                fd_set rfds = devices->infds;
+                fd_set rfds = devices.infds;
-                u32 args[] = { LHREQ_BREAK, 1 };
+                unsigned long args[] = { LHREQ_BREAK, 1 };
                /* Wait until input is ready from one of the devices. */
-                select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
+                select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
                /* Is it a message from the Launcher? */
                if (FD_ISSET(pipefd, &rfds)) {
-                        int ignorefd;
+                        int fd;
                        /* If read() returns 0, it means the Launcher has
                         * exited.  We silently follow. */
-                        if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
+                        if (read(pipefd, &fd, sizeof(fd)) == 0)
                                exit(0);
-                        /* Otherwise it's telling us there's a problem with one
+                        /* Otherwise it's telling us to change what file
-                         * of the devices, and we should ignore that file
+                         * descriptors we're to listen to. */
-                         * descriptor from now on. */
+                        if (fd >= 0)
-                        FD_CLR(ignorefd, &devices->infds);
+                                FD_SET(fd, &devices.infds);
+                        else
+                                FD_CLR(-fd - 1, &devices.infds);
                } else /* Send LHREQ_BREAK command. */
                        write(lguest_fd, args, sizeof(args));
        }
 }
 /* This routine just sets up a pipe to the Waker process. */
-static int setup_waker(int lguest_fd, struct device_list *device_list)
+static int setup_waker(int lguest_fd)
 {
        int pipefd[2], child;
@@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list)
        if (child == 0) {
                /* Close the "writing" end of our copy of the pipe */
                close(pipefd[1]);
-                wake_parent(pipefd[0], lguest_fd, device_list);
+                wake_parent(pipefd[0], lguest_fd);
        }
        /* Close the reading end of our copy of the pipe. */
        close(pipefd[0]);
@@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
 {
        /* We have to separately check addr and addr+size, because size could
         * be huge and addr + size might wrap around. */
-        if (addr >= top || addr + size >= top)
+        if (addr >= guest_limit || addr + size >= guest_limit)
-                errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+                errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
        /* We return a pointer for the caller's convenience, now we know it's
         * safe to use. */
-        return (void *)addr;
+        return from_guest_phys(addr);
 }
 /* A macro which transparently hands the line number to the real function. */
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
-/* The Guest has given us the address of a "struct lguest_dma".  We check it's
+/* This function returns the next descriptor in the chain, or vq->vring.num. */
- * OK and convert it to an iovec (which is a simple array of ptr/size
+static unsigned next_desc(struct virtqueue *vq, unsigned int i)
- * pairs). */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
 {
-        unsigned int i;
+        unsigned int next;
-        struct lguest_dma *udma;
-        /* First we make sure that the array memory itself is valid. */
-        udma = check_pointer(dma, sizeof(*udma));
-        /* Now we check each element */
-        for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
-                /* A zero length ends the array. */
-                if (!udma->len[i])
-                        break;
-                iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
+        /* If this descriptor says it doesn't chain, we're done. */
-                iov[i].iov_len = udma->len[i];
+        if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
-        }
+                return vq->vring.num;
-        *num = i;
+        /* Check they're not leading us off end of descriptors. */
+        next = vq->vring.desc[i].next;
+        /* Make sure compiler knows to grab that: we don't want it changing! */
+        wmb();
-        /* We return the pointer to where the caller should write the amount of
+        if (next >= vq->vring.num)
-         * the buffer used. */
+                errx(1, "Desc next is %u", next);
-        return &udma->used_len;
+        return next;
+}
+/* This looks in the virtqueue for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->vring.num (which
+ * is never a valid descriptor number) if none was found. */
+static unsigned get_vq_desc(struct virtqueue *vq,
+                            struct iovec iov[],
+                            unsigned int *out_num, unsigned int *in_num)
+{
+        unsigned int i, head;
+        /* Check it isn't doing very strange things with descriptor numbers. */
+        if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
+                errx(1, "Guest moved used index from %u to %u",
+                     vq->last_avail_idx, vq->vring.avail->idx);
+        /* If there's nothing new since last we looked, return invalid. */
+        if (vq->vring.avail->idx == vq->last_avail_idx)
+                return vq->vring.num;
+        /* Grab the next descriptor number they're advertising, and increment
+         * the index we've seen. */
+        head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
+        /* If their number is silly, that's a fatal mistake. */
+        if (head >= vq->vring.num)
+                errx(1, "Guest says index %u is available", head);
+        /* When we start there are neither input nor output descriptors. */
+        *out_num = *in_num = 0;
+        i = head;
+        do {
+                /* Grab the first descriptor, and check it's OK. */
+                iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+                iov[*out_num + *in_num].iov_base
+                        = check_pointer(vq->vring.desc[i].addr,
+                                        vq->vring.desc[i].len);
+                /* If this is an input descriptor, increment that count. */
+                if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+                        (*in_num)++;
+                else {
+                        /* If it's an output descriptor, they're all supposed
+                         * to come before any input descriptors. */
+                        if (*in_num)
+                                errx(1, "Descriptor has out after in");
+                        (*out_num)++;
+                }
+                /* If we've got too many, that implies a descriptor loop. */
+                if (*out_num + *in_num > vq->vring.num)
+                        errx(1, "Looped descriptor");
+        } while ((i = next_desc(vq, i)) != vq->vring.num);
+        return head;
 }
-/* This routine gets a DMA buffer from the Guest for a given key, and converts
+/* Once we've used one of their buffers, we tell them about it.  We'll then
- * it to an iovec array.  It returns the interrupt the Guest wants when we're
+ * want to send them an interrupt, using trigger_irq(). */
- * finished, and a pointer to the "used_len" field to fill in. */
+static void add_used(struct virtqueue *vq, unsigned int head, int len)
-static u32 *get_dma_buffer(int fd, void *key,
-                           struct iovec iov[], unsigned int *num, u32 *irq)
 {
-        u32 buf[] = { LHREQ_GETDMA, (u32)key };
+        struct vring_used_elem *used;
-        unsigned long udma;
-        u32 *res;
+        /* Get a pointer to the next entry in the used ring. */
+        used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
-        /* Ask the kernel for a DMA buffer corresponding to this key. */
+        used->id = head;
-        udma = write(fd, buf, sizeof(buf));
+        used->len = len;
-        /* They haven't registered any, or they're all used? */
+        /* Make sure buffer is written before we update index. */
-        if (udma == (unsigned long)-1)
+        wmb();
-                return NULL;
+        vq->vring.used->idx++;
-        /* Convert it into our iovec array */
-        res = dma2iov(udma, iov, num);
-        /* The kernel stashes irq in ->used_len to get it out to us. */
-        *irq = *res;
-        /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
-        return res;
 }
-/* This is a convenient routine to send the Guest an interrupt. */
+/* This actually sends the interrupt for this virtqueue */
-static void trigger_irq(int fd, u32 irq)
+static void trigger_irq(int fd, struct virtqueue *vq)
 {
-        u32 buf[] = { LHREQ_IRQ, irq };
+        unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+        if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+                return;
+        /* Send the Guest an interrupt to tell them we used something up. */
        if (write(fd, buf, sizeof(buf)) != 0)
-                err(1, "Triggering irq %i", irq);
+                err(1, "Triggering irq %i", vq->config.irq);
 }
-/* This simply sets up an iovec array where we can put data to be discarded.
+/* And here's the combo meal deal.  Supersize me! */
- * This happens when the Guest doesn't want or can't handle the input: we have
+static void add_used_and_trigger(int fd, struct virtqueue *vq,
- * to get rid of it somewhere, and if we bury it in the ceiling space it will
+                                 unsigned int head, int len)
- * start to smell after a week. */
-static void discard_iovec(struct iovec *iov, unsigned int *num)
 {
-        static char discard_buf[1024];
+        add_used(vq, head, len);
-        *num = 1;
+        trigger_irq(fd, vq);
-        iov->iov_base = discard_buf;
-        iov->iov_len = sizeof(discard_buf);
 }
 /* Here is the input terminal setting we save, and the routine to restore them
@@ -701,38 +736,39 @@ struct console_abort
 /* This is the routine which handles console input (ie. stdin). */
 static bool handle_console_input(int fd, struct device *dev)
 {
-        u32 irq = 0, *lenp;
        int len;
-        unsigned int num;
+        unsigned int head, in_num, out_num;
-        struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+        struct iovec iov[dev->vq->vring.num];
        struct console_abort *abort = dev->priv;
-        /* First we get the console buffer from the Guest.  The key is dev->mem
+        /* First we need a console buffer from the Guest's input virtqueue. */
-         * which was set to 0 in setup_console(). */
+        head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-        lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
-        if (!lenp) {
+        /* If they're not ready for input, stop listening to this file
-                /* If it's not ready for input, warn and set up to discard. */
+         * descriptor.  We'll start again once they add an input buffer. */
-                warn("console: no dma buffer!");
+        if (head == dev->vq->vring.num)
-                discard_iovec(iov, &num);
+                return false;
-        }
+        if (out_num)
+                errx(1, "Output buffers in console in queue?");
        /* This is why we convert to iovecs: the readv() call uses them, and so
         * it reads straight into the Guest's buffer. */
-        len = readv(dev->fd, iov, num);
+        len = readv(dev->fd, iov, in_num);
        if (len <= 0) {
                /* This implies that the console is closed, is /dev/null, or
-                 * something went terribly wrong.  We still go through the rest
+                 * something went terribly wrong. */
-                 * of the logic, though, especially the exit handling below. */
                warnx("Failed to get console input, ignoring console.");
-                len = 0;
+                /* Put the input terminal back. */
+                restore_term();
+                /* Remove callback from input vq, so it doesn't restart us. */
+                dev->vq->handle_output = NULL;
+                /* Stop listening to this fd: don't call us again. */
+                return false;
        }
-        /* If we read the data into the Guest, fill in the length and send the
+        /* Tell the Guest about the new input. */
-         * interrupt. */
+        add_used_and_trigger(fd, dev->vq, head, len);
-        if (lenp) {
-                *lenp = len;
-                trigger_irq(fd, irq);
-        }
        /* Three ^C within one second?  Exit.
         *
@@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev)
                        struct timeval now;
                        gettimeofday(&now, NULL);
                        if (now.tv_sec <= abort->start.tv_sec+1) {
-                                u32 args[] = { LHREQ_BREAK, 0 };
+                                unsigned long args[] = { LHREQ_BREAK, 0 };
                                /* Close the fd so Waker will know it has to
                                 * exit. */
                                close(waker_fd);
@@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev)
                /* Any other key resets the abort counter. */
                abort->count = 0;
-        /* Now, if we didn't read anything, put the input terminal back and
-         * return failure (meaning, don't call us again). */
-        if (!len) {
-                restore_term();
-                return false;
-        }
        /* Everything went OK! */
        return true;
 }
-/* Handling console output is much simpler than input. */
+/* Handling output for console is simple: we just get all the output buffers
-static u32 handle_console_output(int fd, const struct iovec *iov,
+ * and write them to stdout. */
-                                 unsigned num, struct device*dev)
+static void handle_console_output(int fd, struct virtqueue *vq)
 {
-        /* Whatever the Guest sends, write it to standard output.  Return the
+        unsigned int head, out, in;
-         * number of bytes written. */
+        int len;
-        return writev(STDOUT_FILENO, iov, num);
+        struct iovec iov[vq->vring.num];
-}
+        /* Keep getting output buffers from the Guest until we run out. */
-/* Guest->Host network output is also pretty easy. */
+        while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
-static u32 handle_tun_output(int fd, const struct iovec *iov,
+                if (in)
-                             unsigned num, struct device *dev)
+                        errx(1, "Input buffers in output queue?");
-{
+                len = writev(STDOUT_FILENO, iov, out);
-        /* We put a flag in the "priv" pointer of the network device, and set
+                add_used_and_trigger(fd, vq, head, len);
-         * it as soon as we see output.  We'll see why in handle_tun_input() */
+        }
-        *(bool *)dev->priv = true;
-        /* Whatever packet the Guest sent us, write it out to the tun
-         * device. */
-        return writev(dev->fd, iov, num);
 }
-/* This matches the peer_key() in lguest_net.c.  The key for any given slot
+/* Handling output for network is also simple: we get all the output buffers
- * is the address of the network device's page plus 4 * the slot number. */
+ * and write them (ignoring the first element) to this device's file descriptor
-static unsigned long peer_offset(unsigned int peernum)
+ * (vq->dev->fd). */
+static void handle_net_output(int fd, struct virtqueue *vq)
 {
-        return 4 * peernum;
+        unsigned int head, out, in;
+        int len;
+        struct iovec iov[vq->vring.num];
+        /* Keep getting output buffers from the Guest until we run out. */
+        while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+                if (in)
+                        errx(1, "Input buffers in output queue?");
+                /* Check header, but otherwise ignore it (we said we supported
+                 * no features). */
+                (void)convert(&iov[0], struct virtio_net_hdr);
+                len = writev(vq->dev->fd, iov+1, out-1);
+                add_used_and_trigger(fd, vq, head, len);
+        }
 }
-/* This is where we handle a packet coming in from the tun device */
+/* This is where we handle a packet coming in from the tun device to our
+ * Guest. */
 static bool handle_tun_input(int fd, struct device *dev)
 {
-        u32 irq = 0, *lenp;
+        unsigned int head, in_num, out_num;
        int len;
-        unsigned num;
+        struct iovec iov[dev->vq->vring.num];
-        struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+        struct virtio_net_hdr *hdr;
-        /* First we get a buffer the Guest has bound to its key. */
+        /* First we need a network buffer from the Guest's recv virtqueue. */
-        lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
+        head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-                              &irq);
+        if (head == dev->vq->vring.num) {
-        if (!lenp) {
                /* Now, it's expected that if we try to send a packet too
-                 * early, the Guest won't be ready yet.  This is why we set a
+                 * early, the Guest won't be ready yet.  Wait until the device
-                 * flag when the Guest sends its first packet.  If it's sent a
+                 * status says it's ready. */
-                 * packet we assume it should be ready to receive them.
+                /* FIXME: Actually want DRIVER_ACTIVE here. */
-                 *
+                if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
-                 * Actually, this is what the status bits in the descriptor are
-                 * for: we should *use* them.  FIXME! */
-                if (*(bool *)dev->priv)
                        warn("network: no dma buffer!");
-                discard_iovec(iov, &num);
+                /* We'll turn this back on if input buffers are registered. */
-        }
+                return false;
+        } else if (out_num)
+                errx(1, "Output buffers in network recv queue?");
+        /* First element is the header: we set it to 0 (no features). */
+        hdr = convert(&iov[0], struct virtio_net_hdr);
+        hdr->flags = 0;
+        hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
        /* Read the packet from the device directly into the Guest's buffer. */
-        len = readv(dev->fd, iov, num);
+        len = readv(dev->fd, iov+1, in_num-1);
        if (len <= 0)
                err(1, "reading network");
-        /* Write the used_len, and trigger the interrupt for the Guest */
+        /* Tell the Guest about the new packet. */
-        if (lenp) {
+        add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
-                *lenp = len;
-                trigger_irq(fd, irq);
-        }
        verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
-                ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
+                ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
-                lenp ? "sent" : "discarded");
+                head != dev->vq->vring.num ? "sent" : "discarded");
        /* All good. */
        return true;
 }
-/* The last device handling routine is block output: the Guest has sent a DMA
+/* This callback ensures we try again, in case we stopped console or net
- * to the block device.  It will have placed the command it wants in the
+ * delivery because Guest didn't have any buffers. */
- * "struct lguest_block_page". */
+static void enable_fd(int fd, struct virtqueue *vq)
-static u32 handle_block_output(int fd, const struct iovec *iov,
-                               unsigned num, struct device *dev)
 {
-        struct lguest_block_page *p = dev->mem;
+        add_device_fd(vq->dev->fd);
-        u32 irq, *lenp;
+        /* Tell waker to listen to it again */
-        unsigned int len, reply_num;
+        write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
-        struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
-        off64_t device_len, off = (off64_t)p->sector * 512;
-        /* First we extract the device length from the dev->priv pointer. */
-        device_len = *(off64_t *)dev->priv;
-        /* We first check that the read or write is within the length of the
-         * block file. */
-        if (off >= device_len)
-                err(1, "Bad offset %llu vs %llu", off, device_len);
-        /* Move to the right location in the block file.  This shouldn't fail,
-         * but best to check. */
-        if (lseek64(dev->fd, off, SEEK_SET) != off)
-                err(1, "Bad seek to sector %i", p->sector);
-        verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
-        /* They were supposed to bind a reply buffer at key equal to the start
-         * of the block device memory.  We need this to tell them when the
-         * request is finished. */
-        lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
-        if (!lenp)
-                err(1, "Block request didn't give us a dma buffer");
-        if (p->type) {
-                /* A write request.  The DMA they sent contained the data, so
-                 * write it out. */
-                len = writev(dev->fd, iov, num);
-                /* Grr... Now we know how long the "struct lguest_dma" they
-                 * sent was, we make sure they didn't try to write over the end
-                 * of the block file (possibly extending it). */
-                if (off + len > device_len) {
-                        /* Trim it back to the correct length */
-                        ftruncate64(dev->fd, device_len);
-                        /* Die, bad Guest, die. */
-                        errx(1, "Write past end %llu+%u", off, len);
-                }
-                /* The reply length is 0: we just send back an empty DMA to
-                 * interrupt them and tell them the write is finished. */
-                *lenp = 0;
-        } else {
-                /* A read request.  They sent an empty DMA to start the
-                 * request, and we put the read contents into the reply
-                 * buffer. */
-                len = readv(dev->fd, reply, reply_num);
-                *lenp = len;
-        }
-        /* The result is 1 (done), 2 if there was an error (short read or
-         * write). */
-        p->result = 1 + (p->bytes != len);
-        /* Now tell them we've used their reply buffer. */
-        trigger_irq(fd, irq);
-        /* We're supposed to return the number of bytes of the output buffer we
-         * used.  But the block device uses the "result" field instead, so we
-         * don't bother. */
-        return 0;
 }
-/* This is the generic routine we call when the Guest sends some DMA out. */
+/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
-static void handle_output(int fd, unsigned long dma, unsigned long key,
+static void handle_output(int fd, unsigned long addr)
-                          struct device_list *devices)
 {
        struct device *i;
-        u32 *lenp;
+        struct virtqueue *vq;
-        struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
-        unsigned num = 0;
+        /* Check each virtqueue. */
+        for (i = devices.dev; i; i = i->next) {
-        /* Convert the "struct lguest_dma" they're sending to a "struct
+                for (vq = i->vq; vq; vq = vq->next) {
-         * iovec". */
+                        if (vq->config.pfn == addr/getpagesize()
-        lenp = dma2iov(dma, iov, &num);
+                            && vq->handle_output) {
+                                verbose("Output to %s\n", vq->dev->name);
-        /* Check each device: if they expect output to this key, tell them to
+                                vq->handle_output(fd, vq);
-         * handle it. */
+                                return;
-        for (i = devices->dev; i; i = i->next) {
+                        }
-                if (i->handle_output && key == i->watch_key) {
-                        /* We write the result straight into the used_len field
-                         * for them. */
-                        *lenp = i->handle_output(fd, iov, num, i);
-                        return;
                }
        }
-        /* This can happen: the kernel sends any SEND_DMA which doesn't match
+        /* Early console write is done using notify on a nul-terminated string
-         * another Guest to us.  It could be that another Guest just left a
+         * in Guest memory. */
-         * network, for example.  But it's unusual. */
+        if (addr >= guest_limit)
-        warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
+                errx(1, "Bad NOTIFY %#lx", addr);
+        write(STDOUT_FILENO, from_guest_phys(addr),
+              strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 /* This is called when the waker wakes us up: check for incoming file
 * descriptors. */
-static void handle_input(int fd, struct device_list *devices)
+static void handle_input(int fd)
 {
        /* select() wants a zeroed timeval to mean "don't wait". */
        struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
        for (;;) {
                struct device *i;
-                fd_set fds = devices->infds;
+                fd_set fds = devices.infds;
                /* If nothing is ready, we're done. */
-                if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
+                if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
                        break;
                /* Otherwise, call the device(s) which have readable
                 * file descriptors and a method of handling them.  */
-                for (i = devices->dev; i; i = i->next) {
+                for (i = devices.dev; i; i = i->next) {
                        if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+                                int dev_fd;
+                                if (i->handle_input(fd, i))
+                                        continue;
                                /* If handle_input() returns false, it means we
-                                 * should no longer service it.
+                                 * should no longer service it.  Networking and
-                                 * handle_console_input() does this. */
+                                 * console do this when there are no input
-                                if (!i->handle_input(fd, i)) {
+                                 * buffers to deliver into.  Console also uses
-                                        /* Clear it from the set of input file
+                                 * it when it discovers that stdin is
-                                         * descriptors kept at the head of the
+                                 * closed. */
-                                         * device list. */
+                                FD_CLR(i->fd, &devices.infds);
-                                        FD_CLR(i->fd, &devices->infds);
+                                /* Tell waker to ignore it too, by sending a
-                                        /* Tell waker to ignore it too... */
+                                 * negative fd number (-1, since 0 is a valid
-                                        write(waker_fd, &i->fd, sizeof(i->fd));
+                                 * FD number). */
-                                }
+                                dev_fd = -i->fd - 1;
+                                write(waker_fd, &dev_fd, sizeof(dev_fd));
                        }
                }
        }
@@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices)
 * routines to allocate them.
 *
 * This routine allocates a new "struct lguest_device_desc" from descriptor
- * table in the devices array just above the Guest's normal memory. */
+ * table just above the Guest's normal memory.  It returns a pointer to that
-static struct lguest_device_desc *
+ * descriptor. */
-new_dev_desc(struct lguest_device_desc *descs,
+static struct lguest_device_desc *new_dev_desc(u16 type)
-             u16 type, u16 features, u16 num_pages)
 {
-        unsigned int i;
+        struct lguest_device_desc *d;
-        for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
+        /* We only have one page for all the descriptors. */
-                if (!descs[i].type) {
+        if (devices.desc_used + sizeof(*d) > getpagesize())
-                        descs[i].type = type;
+                errx(1, "Too many devices");
-                        descs[i].features = features;
-                        descs[i].num_pages = num_pages;
+        /* We don't need to set config_len or status: page is 0 already. */
-                        /* If they said the device needs memory, we allocate
+        d = (void *)devices.descpage + devices.desc_used;
-                         * that now, bumping up the top of Guest memory. */
+        d->type = type;
-                        if (num_pages) {
+        devices.desc_used += sizeof(*d);
-                                map_zeroed_pages(top, num_pages);
-                                descs[i].pfn = top/getpagesize();
+        return d;
-                                top += num_pages*getpagesize();
-                        }
-                        return &descs[i];
-                }
-        }
-        errx(1, "too many devices");
 }
-/* This monster routine does all the creation and setup of a new device,
+/* Each device descriptor is followed by some configuration information.
- * including caling new_dev_desc() to allocate the descriptor and device
+ * The first byte is a "status" byte for the Guest to report what's happening.
- * memory. */
+ * After that are fields: u8 type, u8 len, [... len bytes...].
-static struct device *new_device(struct device_list *devices,
+ *
-                                 u16 type, u16 num_pages, u16 features,
+ * This routine adds a new field to an existing device's descriptor.  It only
-                                 int fd,
+ * works for the last device, but that's OK because that's how we use it. */
-                                 bool (*handle_input)(int, struct device *),
+static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
-                                 unsigned long watch_off,
+{
-                                 u32 (*handle_output)(int,
+        /* This is the last descriptor, right? */
-                                                      const struct iovec *,
+        assert(devices.descpage + devices.desc_used
-                                                      unsigned,
+               == (u8 *)(dev->desc + 1) + dev->desc->config_len);
-                                                      struct device *))
+        /* We only have one page of device descriptions. */
+        if (devices.desc_used + 2 + len > getpagesize())
+                errx(1, "Too many devices");
+        /* Copy in the new config header: type then length. */
+        devices.descpage[devices.desc_used++] = type;
+        devices.descpage[devices.desc_used++] = len;
+        memcpy(devices.descpage + devices.desc_used, c, len);
+        devices.desc_used += len;
+        /* Update the device descriptor length: two byte head then data. */
+        dev->desc->config_len += 2 + len;
+}
+/* This routine adds a virtqueue to a device.  We specify how many descriptors
+ * the virtqueue is to have. */
+static void add_virtqueue(struct device *dev, unsigned int num_descs,
+                          void (*handle_output)(int fd, struct virtqueue *me))
+{
+        unsigned int pages;
+        struct virtqueue **i, *vq = malloc(sizeof(*vq));
+        void *p;
+        /* First we need some pages for this virtqueue. */
+        pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize();
+        p = get_pages(pages);
+        /* Initialize the configuration. */
+        vq->config.num = num_descs;
+        vq->config.irq = devices.next_irq++;
+        vq->config.pfn = to_guest_phys(p) / getpagesize();
+        /* Initialize the vring. */
+        vring_init(&vq->vring, num_descs, p);
+        /* Add the configuration information to this device's descriptor. */
+        add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
+                       sizeof(vq->config), &vq->config);
+        /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
+         * second.  */
+        for (i = &dev->vq; *i; i = &(*i)->next);
+        *i = vq;
+        /* Link virtqueue back to device. */
+        vq->dev = dev;
+        /* Set up handler. */
+        vq->handle_output = handle_output;
+        if (!handle_output)
+                vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
+}
+/* This routine does all the creation and setup of a new device, including
+ * calling new_dev_desc() to allocate the descriptor and device memory. */
+static struct device *new_device(const char *name, u16 type, int fd,
+                                 bool (*handle_input)(int, struct device *))
 {
        struct device *dev = malloc(sizeof(*dev));
@@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices,
         * easier, but the user expects the devices to be arranged on the bus
         * in command-line order.  The first network device on the command line
         * is eth0, the first block device /dev/lgba, etc. */
-        *devices->lastdev = dev;
+        *devices.lastdev = dev;
        dev->next = NULL;
-        devices->lastdev = &dev->next;
+        devices.lastdev = &dev->next;
        /* Now we populate the fields one at a time. */
        dev->fd = fd;
        /* If we have an input handler for this file descriptor, then we add it
         * to the device_list's fdset and maxfd. */
        if (handle_input)
-                set_fd(dev->fd, devices);
+                add_device_fd(dev->fd);
-        dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
+        dev->desc = new_dev_desc(type);
-        dev->mem = (void *)(dev->desc->pfn * getpagesize());
        dev->handle_input = handle_input;
-        dev->watch_key = (unsigned long)dev->mem + watch_off;
+        dev->name = name;
-        dev->handle_output = handle_output;
        return dev;
 }
 /* Our first setup routine is the console.  It's a fairly simple device, but
 * UNIX tty handling makes it uglier than it could be. */
-static void setup_console(struct device_list *devices)
+static void setup_console(void)
 {
        struct device *dev;
@@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices)
                atexit(restore_term);
        }
-        /* We don't currently require any memory for the console, so we ask for
+        dev = new_device("console", VIRTIO_ID_CONSOLE,
-         * 0 pages. */
+                         STDIN_FILENO, handle_console_input);
-        dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
-                         STDIN_FILENO, handle_console_input,
-                         LGUEST_CONSOLE_DMA_KEY, handle_console_output);
        /* We store the console state in dev->priv, and initialize it. */
        dev->priv = malloc(sizeof(struct console_abort));
        ((struct console_abort *)dev->priv)->count = 0;
-        verbose("device %p: console\n",
-                (void *)(dev->desc->pfn * getpagesize()));
-}
-/* Setting up a block file is also fairly straightforward. */
+        /* The console needs two virtqueues: the input then the output.  When
-static void setup_block_file(const char *filename, struct device_list *devices)
+         * they put something in the input queue, we make sure we're listening to
-{
+         * stdin.  When they put something in the output queue, we write it to
-        int fd;
+         * stdout.  */
-        struct device *dev;
+        add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
-        off64_t *device_len;
+        add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
-        struct lguest_block_page *p;
+        verbose("device %u: console\n", devices.device_num++);
-        /* We open with O_LARGEFILE because otherwise we get stuck at 2G.  We
-         * open with O_DIRECT because otherwise our benchmarks go much too
-         * fast. */
-        fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
-        /* We want one page, and have no input handler (the block file never
-         * has anything interesting to say to us).  Our timing will be quite
-         * random, so it should be a reasonable randomness source. */
-        dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
-                         LGUEST_DEVICE_F_RANDOMNESS,
-                         fd, NULL, 0, handle_block_output);
-        /* We store the device size in the private area */
-        device_len = dev->priv = malloc(sizeof(*device_len));
-        /* This is the safe way of establishing the size of our device: it
-         * might be a normal file or an actual block device like /dev/hdb. */
-        *device_len = lseek64(fd, 0, SEEK_END);
-        /* The device memory is a "struct lguest_block_page".  It's zeroed
-         * already, we just need to put in the device size.  Block devices
-         * think in sectors (ie. 512 byte chunks), so we translate here. */
-        p = dev->mem;
-        p->num_sectors = *device_len/512;
-        verbose("device %p: block %i sectors\n",
-                (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
 }
+/*:*/
-/*
+/*M:010 Inter-guest networking is an interesting area.  Simplest is to have a
- * Network Devices.
+ * --sharenet=<name> option which opens or creates a named pipe.  This can be
+ * used to send packets to another guest in a 1:1 manner.
 *
- * Setting up network devices is quite a pain, because we have three types.
+ * More sophisticated is to use one of the tools developed for projects like UML
- * First, we have the inter-Guest network.  This is a file which is mapped into
+ * to do networking.
- * the address space of the Guests who are on the network.  Because it is a
- * shared mapping, the same page underlies all the devices, and they can send
- * DMA to each other.
 *
- * Remember from our network driver, the Guest is told what slot in the page it
+ * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
- * is to use.  We use exclusive fnctl locks to reserve a slot.  If another
+ * completely generic ("here's my vring, attach to your vring") and would work
- * Guest is using a slot, the lock will fail and we try another.  Because fnctl
+ * for any traffic.  Of course, namespace and permissions issues need to be
- * locks are cleaned up automatically when we die, this cleverly means that our
+ * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
- * reservation on the slot will vanish if we crash. */
+ * multiple inter-guest channels behind one interface, although it would
-static unsigned int find_slot(int netfd, const char *filename)
+ * require some manner of hotplugging new virtio channels.
-{
+ *
-        struct flock fl;
+ * Finally, we could implement a virtio network switch in the kernel. :*/
-        fl.l_type = F_WRLCK;
-        fl.l_whence = SEEK_SET;
-        fl.l_len = 1;
-        /* Try a 1 byte lock in each possible position number */
-        for (fl.l_start = 0;
-             fl.l_start < getpagesize()/sizeof(struct lguest_net);
-             fl.l_start++) {
-                /* If we succeed, return the slot number. */
-                if (fcntl(netfd, F_SETLK, &fl) == 0)
-                        return fl.l_start;
-        }
-        errx(1, "No free slots in network file %s", filename);
-}
-/* This function sets up the network file */
-static void setup_net_file(const char *filename,
-                           struct device_list *devices)
-{
-        int netfd;
-        struct device *dev;
-        /* We don't use open_or_die() here: for friendliness we create the file
-         * if it doesn't already exist. */
-        netfd = open(filename, O_RDWR, 0);
-        if (netfd < 0) {
-                if (errno == ENOENT) {
-                        netfd = open(filename, O_RDWR|O_CREAT, 0600);
-                        if (netfd >= 0) {
-                                /* If we succeeded, initialize the file with a
-                                 * blank page. */
-                                char page[getpagesize()];
-                                memset(page, 0, sizeof(page));
-                                write(netfd, page, sizeof(page));
-                        }
-                }
-                if (netfd < 0)
-                        err(1, "cannot open net file '%s'", filename);
-        }
-        /* We need 1 page, and the features indicate the slot to use and that
-         * no checksum is needed.  We never touch this device again; it's
-         * between the Guests on the network, so we don't register input or
-         * output handlers. */
-        dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
-                         find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
-                         -1, NULL, 0, NULL);
-        /* Map the shared file. */
-        if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
-                         MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
-                        err(1, "could not mmap '%s'", filename);
-        verbose("device %p: shared net %s, peer %i\n",
-                (void *)(dev->desc->pfn * getpagesize()), filename,
-                dev->desc->features & ~LGUEST_NET_F_NOCSUM);
-}
-/*:*/
 static u32 str2ip(const char *ipaddr)
 {
@@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
 /* This sets up the Host end of the network device with an IP address, brings
 * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer (in practice, the Host's slot in the network device's memory). */
+ * pointer. */
 static void configure_device(int fd, const char *devname, u32 ipaddr,
                             unsigned char hwaddr[6])
 {
@@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
        memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
 }
-/*L:195 The other kind of network is a Host<->Guest network.  This can either
+/*L:195 Our network is a Host<->Guest network.  This can either use bridging or
- * use briding or routing, but the principle is the same: it uses the "tun"
+ * routing, but the principle is the same: it uses the "tun" device to inject
- * device to inject packets into the Host as if they came in from a normal
+ * packets into the Host as if they came in from a normal network card.  We
- * network card.  We just shunt packets between the Guest and the tun
+ * just shunt packets between the Guest and the tun device. */
- * device. */
+static void setup_tun_net(const char *arg)
-static void setup_tun_net(const char *arg, struct device_list *devices)
 {
        struct device *dev;
        struct ifreq ifr;
        int netfd, ipfd;
        u32 ip;
        const char *br_name = NULL;
+        u8 hwaddr[6];
        /* We open the /dev/net/tun device and tell it we want a tap device.  A
         * tap device is like a tun device, only somehow different.  To tell
@@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
         * device: trust us! */
        ioctl(netfd, TUNSETNOCSUM, 1);
-        /* We create the net device with 1 page, using the features field of
+        /* First we create a new network device. */
-         * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and
+        dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
-         * that the device has fairly random timing.  We do *not* specify
-         * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
-         *
-         * We will put our MAC address is slot 0 for the Guest to see, so
-         * it will send packets to us using the key "peer_offset(0)": */
-        dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
-                         NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
-                         handle_tun_input, peer_offset(0), handle_tun_output);
-        /* We keep a flag which says whether we've seen packets come out from
+        /* Network devices need a receive and a send queue, just like
-         * this network device. */
+         * console. */
-        dev->priv = malloc(sizeof(bool));
+        add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
-        *(bool *)dev->priv = false;
+        add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
        /* We need a socket to perform the magic network ioctls to bring up the
         * tap interface, connect to the bridge etc.  Any socket will do! */
@@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
        } else /* It is an IP address to set up the device with */
                ip = str2ip(arg);
-        /* We are peer 0, ie. first slot, so we hand dev->mem to this routine
+        /* Set up the tun device, and get the mac address for the interface. */
-         * to write the MAC address at the start of the device memory.  */
+        configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
-        configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
-        /* Set "promisc" bit: we want every single packet if we're going to
+        /* Tell Guest what MAC address to use. */
-         * bridge to other machines (and otherwise it doesn't matter). */
+        add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
-        *((u8 *)dev->mem) |= 0x1;
+        /* We don't need the socket any more; setup is done. */
        close(ipfd);
-        verbose("device %p: tun net %u.%u.%u.%u\n",
+        verbose("device %u: tun net %u.%u.%u.%u\n",
-                (void *)(dev->desc->pfn * getpagesize()),
+                devices.device_num++,
-                (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
+                (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
        if (br_name)
                verbose("attached to bridge: %s\n", br_name);
 }
+/*
+ * Block device.
+ *
+ * Serving a block device is really easy: the Guest asks for a block number and
+ * we read or write that position in the file.
+ *
+ * Unfortunately, this is amazingly slow: the Guest waits until the read is
+ * finished before running anything else, even if it could be doing useful
+ * work.  We could use async I/O, except it's reputed to suck so hard that
+ * characters actually go missing from your code when you try to use it.
+ *
+ * So we farm the I/O out to a thread, and communicate with it via a pipe. */
+/* This hangs off device->priv, with the data. */
+struct vblk_info
+{
+        /* The size of the file. */
+        off64_t len;
+        /* The file descriptor for the file. */
+        int fd;
+        /* IO thread listens on this file descriptor [0]. */
+        int workpipe[2];
+        /* IO thread writes to this file descriptor to mark it done, then
+         * Launcher triggers interrupt to Guest. */
+        int done_fd;
+};
+/* This is the core of the I/O thread.  It returns true if it did something. */
+static bool service_io(struct device *dev)
+{
+        struct vblk_info *vblk = dev->priv;
+        unsigned int head, out_num, in_num, wlen;
+        int ret;
+        struct virtio_blk_inhdr *in;
+        struct virtio_blk_outhdr *out;
+        struct iovec iov[dev->vq->vring.num];
+        off64_t off;
+        head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+        if (head == dev->vq->vring.num)
+                return false;
+        if (out_num == 0 || in_num == 0)
+                errx(1, "Bad virtblk cmd %u out=%u in=%u",
+                     head, out_num, in_num);
+        out = convert(&iov[0], struct virtio_blk_outhdr);
+        in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
+        off = out->sector * 512;
+        /* This is how we implement barriers.  Pretty poor, no? */
+        if (out->type & VIRTIO_BLK_T_BARRIER)
+                fdatasync(vblk->fd);
+        if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
+                fprintf(stderr, "Scsi commands unsupported\n");
+                in->status = VIRTIO_BLK_S_UNSUPP;
+                wlen = sizeof(in);
+        } else if (out->type & VIRTIO_BLK_T_OUT) {
+                /* Write */
+                /* Move to the right location in the block file.  This can fail
+                 * if they try to write past end. */
+                if (lseek64(vblk->fd, off, SEEK_SET) != off)
+                        err(1, "Bad seek to sector %llu", out->sector);
+                ret = writev(vblk->fd, iov+1, out_num-1);
+                verbose("WRITE to sector %llu: %i\n", out->sector, ret);
+                /* Grr... Now we know how long the descriptor they sent was, we
+                 * make sure they didn't try to write over the end of the block
+                 * file (possibly extending it). */
+                if (ret > 0 && off + ret > vblk->len) {
+                        /* Trim it back to the correct length */
+                        ftruncate64(vblk->fd, vblk->len);
+                        /* Die, bad Guest, die. */
+                        errx(1, "Write past end %llu+%u", off, ret);
+                }
+                wlen = sizeof(in);
+                in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+        } else {
+                /* Read */
+                /* Move to the right location in the block file.  This can fail
+                 * if they try to read past end. */
+                if (lseek64(vblk->fd, off, SEEK_SET) != off)
+                        err(1, "Bad seek to sector %llu", out->sector);
+                ret = readv(vblk->fd, iov+1, in_num-1);
+                verbose("READ from sector %llu: %i\n", out->sector, ret);
+                if (ret >= 0) {
+                        wlen = sizeof(in) + ret;
+                        in->status = VIRTIO_BLK_S_OK;
+                } else {
+                        wlen = sizeof(in);
+                        in->status = VIRTIO_BLK_S_IOERR;
+                }
+        }
+        /* We can't trigger an IRQ, because we're not the Launcher.  It does
+         * that when we tell it we're done. */
+        add_used(dev->vq, head, wlen);
+        return true;
+}
+/* This is the thread which actually services the I/O. */
+static int io_thread(void *_dev)
+{
+        struct device *dev = _dev;
+        struct vblk_info *vblk = dev->priv;
+        char c;
+        /* Close other side of workpipe so we get 0 read when main dies. */
+        close(vblk->workpipe[1]);
+        /* Close the other side of the done_fd pipe. */
+        close(dev->fd);
+        /* When this read fails, it means Launcher died, so we follow. */
+        while (read(vblk->workpipe[0], &c, 1) == 1) {
+                /* We acknowledge each request immediately, to reduce latency,
+                 * rather than waiting until we've done them all.  I haven't
+                 * measured to see if it makes any difference. */
+                while (service_io(dev))
+                        write(vblk->done_fd, &c, 1);
+        }
+        return 0;
+}
+/* When the thread says some I/O is done, we interrupt the Guest. */
+static bool handle_io_finish(int fd, struct device *dev)
+{
+        char c;
+        /* If child died, presumably it printed message. */
+        if (read(dev->fd, &c, 1) != 1)
+                exit(1);
+        /* It did some work, so trigger the irq. */
+        trigger_irq(fd, dev->vq);
+        return true;
+}
+/* When the Guest submits some I/O, we wake the I/O thread. */
+static void handle_virtblk_output(int fd, struct virtqueue *vq)
+{
+        struct vblk_info *vblk = vq->dev->priv;
+        char c = 0;
+        /* Wake up I/O thread and tell it to go to work! */
+        if (write(vblk->workpipe[1], &c, 1) != 1)
+                /* Presumably it indicated why it died. */
+                exit(1);
+}
+/* This creates a virtual block device. */
+static void setup_block_file(const char *filename)
+{
+        int p[2];
+        struct device *dev;
+        struct vblk_info *vblk;
+        void *stack;
+        u64 cap;
+        unsigned int val;
+        /* This is the pipe the I/O thread will use to tell us I/O is done. */
+        pipe(p);
+        /* The device responds to return from I/O thread. */
+        dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
+        /* The device has a virtqueue. */
+        add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
+        /* Allocate the room for our own bookkeeping */
+        vblk = dev->priv = malloc(sizeof(*vblk));
+        /* First we open the file and store the length. */
+        vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
+        vblk->len = lseek64(vblk->fd, 0, SEEK_END);
+        /* Tell Guest how many sectors this device has. */
+        cap = cpu_to_le64(vblk->len / 512);
+        add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
+        /* Tell Guest not to put in too many descriptors at once: two are used
+         * for the in and out elements. */
+        val = cpu_to_le32(VIRTQUEUE_NUM - 2);
+        add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);
+        /* The I/O thread writes to this end of the pipe when done. */
+        vblk->done_fd = p[1];
+        /* This is how we tell the I/O thread about more work. */
+        pipe(vblk->workpipe);
+        /* Create stack for thread and run it */
+        stack = malloc(32768);
+        if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
+                err(1, "Creating clone");
+        /* We don't need to keep the I/O thread's end of the pipes open. */
+        close(vblk->done_fd);
+        close(vblk->workpipe[0]);
+        verbose("device %u: virtblock %llu sectors\n",
+                devices.device_num, cap);
+}
 /* That's the end of device setup. */
 /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
 * its input and output, and finally, lays it to rest. */
-static void __attribute__((noreturn))
+static void __attribute__((noreturn)) run_guest(int lguest_fd)
-run_guest(int lguest_fd, struct device_list *device_list)
 {
        for (;;) {
-                u32 args[] = { LHREQ_BREAK, 0 };
+                unsigned long args[] = { LHREQ_BREAK, 0 };
-                unsigned long arr[2];
+                unsigned long notify_addr;
                int readval;
                /* We read from the /dev/lguest device to run the Guest. */
-                readval = read(lguest_fd, arr, sizeof(arr));
+                readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
-                /* The read can only really return sizeof(arr) (the Guest did a
-                 * SEND_DMA to us), or an error. */
-                /* For a successful read, arr[0] is the address of the "struct
+                /* One unsigned long means the Guest did HCALL_NOTIFY */
-                 * lguest_dma", and arr[1] is the key the Guest sent to. */
+                if (readval == sizeof(notify_addr)) {
-                if (readval == sizeof(arr)) {
+                        verbose("Notify on address %#lx\n", notify_addr);
-                        handle_output(lguest_fd, arr[0], arr[1], device_list);
+                        handle_output(lguest_fd, notify_addr);
                        continue;
                /* ENOENT means the Guest died.  Reading tells us why. */
                } else if (errno == ENOENT) {
@@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list)
                /* Service input, then unset the BREAK which releases
                 * the Waker. */
-                handle_input(lguest_fd, device_list);
+                handle_input(lguest_fd);
                if (write(lguest_fd, args, sizeof(args)) < 0)
                        err(1, "Resetting break");
        }
@@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list)
 static struct option opts[] = {
        { "verbose", 0, NULL, 'v' },
-        { "sharenet", 1, NULL, 's' },
        { "tunnet", 1, NULL, 't' },
        { "block", 1, NULL, 'b' },
        { "initrd", 1, NULL, 'i' },
@@ -1374,37 +1516,21 @@ static struct option opts[] = {
 static void usage(void)
 {
        errx(1, "Usage: lguest [--verbose] "
-             "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
+             "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
             "|--block=<filename>|--initrd=<filename>]...\n"
             "<mem-in-mb> vmlinux [args...]");
 }
-/*L:100 The Launcher code itself takes us out into userspace, that scary place
+/*L:105 The main routine is where the real work begins: */
- * where pointers run wild and free!  Unfortunately, like most userspace
- * programs, it's quite boring (which is why everyone like to hack on the
- * kernel!).  Perhaps if you make up an Lguest Drinking Game at this point, it
- * will get you through this section.  Or, maybe not.
- *
- * The Launcher binary sits up high, usually starting at address 0xB8000000.
- * Everything below this is the "physical" memory for the Guest.  For example,
- * if the Guest were to write a "1" at physical address 0, we would see a "1"
- * in the Launcher at "(int *)0".  Guest physical == Launcher virtual.
- *
- * This can be tough to get your head around, but usually it just means that we
- * don't need to do any conversion when the Guest gives us it's "physical"
- * addresses.
- */
 int main(int argc, char *argv[])
 {
-        /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
+        /* Memory, top-level pagetable, code startpoint and size of the
-         * of the (optional) initrd. */
+         * (optional) initrd. */
-        unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0;
+        unsigned long mem = 0, pgdir, start, initrd_size = 0;
        /* A temporary and the /dev/lguest file descriptor. */
        int i, c, lguest_fd;
-        /* The list of Guest devices, based on command line arguments. */
+        /* The boot information for the Guest. */
-        struct device_list device_list;
+        struct boot_params *boot;
-        /* The boot information for the Guest: at guest-physical address 0. */
-        void *boot = (void *)0;
        /* If they specify an initrd file to load. */
        const char *initrd_name = NULL;
@@ -1412,11 +1538,12 @@ int main(int argc, char *argv[])
         * device receive input from a file descriptor, we keep an fdset
         * (infds) and the maximum fd number (max_infd) with the head of the
         * list.  We also keep a pointer to the last device, for easy appending
-         * to the list. */
+         * to the list.  Finally, we keep the next interrupt number to hand out
-        device_list.max_infd = -1;
+         * (1: remember that 0 is used by the timer). */
-        device_list.dev = NULL;
+        FD_ZERO(&devices.infds);
-        device_list.lastdev = &device_list.dev;
+        devices.max_infd = -1;
-        FD_ZERO(&device_list.infds);
+        devices.lastdev = &devices.dev;
+        devices.next_irq = 1;
        /* We need to know how much memory so we can set up the device
         * descriptor and memory pages for the devices as we parse the command
@@ -1424,9 +1551,16 @@ int main(int argc, char *argv[])
         * of memory now. */
        for (i = 1; i < argc; i++) {
                if (argv[i][0] != '-') {
-                        mem = top = atoi(argv[i]) * 1024 * 1024;
+                        mem = atoi(argv[i]) * 1024 * 1024;
-                        device_list.descs = map_zeroed_pages(top, 1);
+                        /* We start by mapping anonymous pages over all of
-                        top += getpagesize();
+                         * guest-physical memory range.  This fills it with 0,
+                         * and ensures that the Guest won't be killed when it
+                         * tries to access it. */
+                        guest_base = map_zeroed_pages(mem / getpagesize()
+                                                      + DEVICE_PAGES);
+                        guest_limit = mem;
+                        guest_max = mem + DEVICE_PAGES*getpagesize();
+                        devices.descpage = get_pages(1);
                        break;
                }
        }
@@ -1437,14 +1571,11 @@ int main(int argc, char *argv[])
                case 'v':
                        verbose = true;
                        break;
-                case 's':
-                        setup_net_file(optarg, &device_list);
-                        break;
                case 't':
-                        setup_tun_net(optarg, &device_list);
+                        setup_tun_net(optarg);
                        break;
                case 'b':
-                        setup_block_file(optarg, &device_list);
+                        setup_block_file(optarg);
                        break;
                case 'i':
                        initrd_name = optarg;
@@ -1459,56 +1590,60 @@ int main(int argc, char *argv[])
        if (optind + 2 > argc)
                usage();
-        /* We always have a console device */
+        verbose("Guest base is at %p\n", guest_base);
-        setup_console(&device_list);
-        /* We start by mapping anonymous pages over all of guest-physical
+        /* We always have a console device */
-         * memory range.  This fills it with 0, and ensures that the Guest
+        setup_console();
-         * won't be killed when it tries to access it. */
-        map_zeroed_pages(0, mem / getpagesize());
        /* Now we load the kernel */
-        start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
+        start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
-                            &page_offset);
+        /* Boot information is stashed at physical address 0 */
+        boot = from_guest_phys(0);
        /* Map the initrd image if requested (at top of physical memory) */
        if (initrd_name) {
                initrd_size = load_initrd(initrd_name, mem);
                /* These are the location in the Linux boot header where the
                 * start and size of the initrd are expected to be found. */
-                *(unsigned long *)(boot+0x218) = mem - initrd_size;
+                boot->hdr.ramdisk_image = mem - initrd_size;
-                *(unsigned long *)(boot+0x21c) = initrd_size;
+                boot->hdr.ramdisk_size = initrd_size;
                /* The bootloader type 0xFF means "unknown"; that's OK. */
-                *(unsigned char *)(boot+0x210) = 0xFF;
+                boot->hdr.type_of_loader = 0xFF;
        }
        /* Set up the initial linear pagetables, starting below the initrd. */
-        pgdir = setup_pagetables(mem, initrd_size, page_offset);
+        pgdir = setup_pagetables(mem, initrd_size);
        /* The Linux boot header contains an "E820" memory map: ours is a
         * simple, single region. */
-        *(char*)(boot+E820NR) = 1;
+        boot->e820_entries = 1;
-        *((struct e820entry *)(boot+E820MAP))
+        boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
-                = ((struct e820entry) { 0, mem, E820_RAM });
        /* The boot header contains a command line pointer: we put the command
-         * line after the boot header (at address 4096) */
+         * line after the boot header. */
-        *(void **)(boot + 0x228) = boot + 4096;
+        boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
-        concat(boot + 4096, argv+optind+2);
+        concat((char *)(boot + 1), argv+optind+2);
+        /* Boot protocol version: 2.07 supports the fields for lguest. */
+        boot->hdr.version = 0x207;
+        /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
+        boot->hdr.hardware_subarch = 1;
-        /* The guest type value of "1" tells the Guest it's under lguest. */
+        /* Tell the entry path not to try to reload segment registers. */
-        *(int *)(boot + 0x23c) = 1;
+        boot->hdr.loadflags |= KEEP_SEGMENTS;
        /* We tell the kernel to initialize the Guest: this returns the open
         * /dev/lguest file descriptor. */
-        lguest_fd = tell_kernel(pgdir, start, page_offset);
+        lguest_fd = tell_kernel(pgdir, start);
        /* We fork off a child process, which wakes the Launcher whenever one
         * of the input file descriptors needs attention.  Otherwise we would
         * run the Guest until it tries to output something. */
-        waker_fd = setup_waker(lguest_fd, &device_list);
+        waker_fd = setup_waker(lguest_fd);
        /* Finally, run the Guest.  This doesn't return. */
-        run_guest(lguest_fd, &device_list);
+        run_guest(lguest_fd);
 }
 /*:*/
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 821617bd6c04..7885ab2d5f53 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
 Linux developers and users to experiment with virtualization with the
 minimum of complexity.  Nonetheless, it should have sufficient
 features to make it useful for specific tasks, and, of course, you are
-encouraged to fork and enhance it.
+encouraged to fork and enhance it (see drivers/lguest/README).
 Features:
@@ -23,19 +23,30 @@ Developer features:
 Running Lguest:
- Lguest runs the same kernel as guest and host.  You can configure
+- The easiest way to run lguest is to use the same kernel as guest and host.
-  them differently, but usually it's easiest not to.
+  You can configure them differently, but usually it's easiest not to.
  You will need to configure your kernel with the following options:
-  CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
+  "General setup":
-  CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
+     "Prompt for development and/or incomplete code/drivers" = Y
-  CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
+        (CONFIG_EXPERIMENTAL=y)
-  CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
-  CONFIG_LGUEST=y/m ("Linux hypervisor example code")
+  "Processor type and features":
+     "Paravirtualized guest support" = Y
-  and I recommend:
+        "Lguest guest support" = Y
-  CONFIG_HZ=100 ("Timer frequency")[2]
+     "High Memory Support" = off/4GB
+     "Alignment value to which kernel should be aligned" = 0x100000
+        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
+         CONFIG_PHYSICAL_ALIGN=0x100000)
+  "Device Drivers":
+     "Network device support"
+        "Universal TUN/TAP device driver support" = M/Y
+           (CONFIG_TUN=m)
+     "Virtualization"
+        "Linux hypervisor example code" = M/Y
+           (CONFIG_LGUEST=m)
 - A tool called "lguest" is available in this directory: type "make"
  to build it.  If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
          dd if=/dev/zero of=rootfile bs=1M count=2048
          qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
+  Make sure that you install a getty on /dev/hvc0 if you want to log in on the
+  console!
 - "modprobe lg" if you built it as a module.
 - Run an lguest as root:
-      Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
+      Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
   Explanation:
-    64m: the amount of memory to use.
+    64: the amount of memory to use, in MB.
    vmlinux: the kernel image found in the top of your build directory.  You
       can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
    --tunnet=192.168.19.1: configures a "tap" device for networking with this
       IP address.
-    --block=rootfile: a file or block device which becomes /dev/lgba
+    --block=rootfile: a file or block device which becomes /dev/vda
       inside the guest.
-    root=/dev/lgba: this (and anything else on the command line) are
+    root=/dev/vda: this (and anything else on the command line) are
       kernel boot parameters.
 - Configuring networking.  I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
  "--sharenet=<filename>": any two guests using the same file are on
  the same network.  This file is created if it does not exist.
-Lguest I/O model:
+There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-Lguest uses a simplified DMA model plus shared memory for I/O.  Guests
-can communicate with each other if they share underlying memory
-(usually by the lguest program mmaping the same file), but they can
-use any non-shared memory to communicate with the lguest process.
-Guests can register DMA buffers at any key (must be a valid physical
-address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
-hypercall.  "dmabufs" is the physical address of an array of "num"
-"struct lguest_dma": each contains a used_len, and an array of
-physical addresses and lengths.  When a transfer occurs, the
-"used_len" field of one of the buffers which has used_len 0 will be
-set to the length transferred and the irq will fire.
-Using an irq value of 0 unbinds the dma buffers.
+Good luck!
-To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
-and the bytes used is written to the used_len field.  This can be 0 if
-noone else has bound a DMA buffer to that key or some other error.
-DMA buffers bound by the same guest are ignored.
-Cheers!
 Rusty Russell rusty@rustcorp.com.au.
-[1] These are on various places on the TODO list, waiting for you to
-    get annoyed enough at the limitation to fix it.
-[2] Lguest is not yet tickless when idle.  See [1].
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
index 8a523f6af48a..248589e8bcf5 100644
--- a/Documentation/m68k/kernel-options.txt
+++ b/Documentation/m68k/kernel-options.txt
@@ -890,10 +890,7 @@ Syntax: nosync:0
 5.5.2) noasync
 --------------
-Syntax: noasync:0
+[OBSOLETE, REMOVED]
-  Disables async and sync negotiation for all devices.  Any value
-  after the colon is acceptable (and has the same effect).
 5.5.3) nodisconnect
 -------------------
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
new file mode 100644
index 000000000000..295a71bc301e
--- /dev/null
+++ b/Documentation/markers.txt
@@ -0,0 +1,81 @@
+                     Using the Linux Kernel Markers
+                            Mathieu Desnoyers
+This document introduces Linux Kernel Markers and their use. It provides
+examples of how to insert markers in the kernel and connect probe functions to
+them and provides some examples of probe functions.
+* Purpose of markers
+A marker placed in code provides a hook to call a function (probe) that you can
+provide at runtime. A marker can be "on" (a probe is connected to it) or "off"
+(no probe is attached). When a marker is "off" it has no effect, except for
+adding a tiny time penalty (checking a condition for a branch) and space
+penalty (adding a few bytes for the function call at the end of the
+instrumented function plus a data structure in a separate section).  When a
+marker is "on", the function you provide is called each time the marker is
+executed, in the execution context of the caller. When the function provided
+ends its execution, it returns to the caller (continuing from the marker site).
+You can put markers at important locations in the code. Markers are
+lightweight hooks that can pass an arbitrary number of parameters,
+described in a printk-like format string, to the attached probe function.
+They can be used for tracing and performance accounting.
+* Usage
+In order to use the macro trace_mark, you should include linux/marker.h.
+#include <linux/marker.h>
+And,
+trace_mark(subsystem_event, "%d %s", someint, somestring);
+Where :
+- subsystem_event is an identifier unique to your event
+    - subsystem is the name of your subsystem.
+    - event is the name of the event to mark.
+- "%d %s" is the formatted string for the serializer.
+- someint is an integer.
+- somestring is a char pointer.
+Connecting a function (probe) to a marker is done by providing a probe (function
+to call) for the specific marker through marker_probe_register() and can be
+activated by calling marker_arm(). Marker deactivation can be done by calling
+marker_disarm() as many times as marker_arm() has been called. Removing a probe
+is done through marker_probe_unregister(); it will disarm the probe and make
+sure there is no caller left using the probe when it returns. Probe removal is
+preempt-safe because preemption is disabled around the probe call. See the
+"Probe example" section below for a sample probe module.
+The marker mechanism supports inserting multiple instances of the same marker.
+Markers can be put in inline functions, inlined static functions, and
+unrolled loops as well as regular functions.
+The naming scheme "subsystem_event" is suggested here as a convention intended
+to limit collisions. Marker names are global to the kernel: they are considered
+as being the same whether they are in the core kernel image or in modules.
+If markers with the same name have conflicting format strings, the markers
+detected to have a different format string will not be armed, and a printk
+warning which identifies the inconsistency will be output:
+"Format mismatch for probe probe_name (format), marker (format)"
+* Probe / marker example
+See the example provided in samples/markers/src
+Compile them with your kernel.
+Run, as root :
+modprobe marker-example (insmod order is not important)
+modprobe probe-example
+cat /proc/marker-example (returns an expected error)
+rmmod marker-example probe-example
+dmesg
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 650657c54733..4e17beba2379 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1479,7 +1479,8 @@ kernel.
 Any atomic operation that modifies some state in memory and returns information
 about the state (old or new) implies an SMP-conditional general memory barrier
-(smp_mb()) on each side of the actual operation.  These include:
+(smp_mb()) on each side of the actual operation (with the exception of
+explicit lock operations, described later).  These include:
        xchg();
        cmpxchg();
@@ -1536,10 +1537,19 @@ If they're used for constructing a lock of some description, then they probably
 do need memory barriers as a lock primitive generally has to do things in a
 specific order.
 Basically, each usage case has to be carefully considered as to whether memory
 barriers are needed or not.
+The following operations are special locking primitives:
+        test_and_set_bit_lock();
+        clear_bit_unlock();
+        __clear_bit_unlock();
+These implement LOCK-class and UNLOCK-class operations. These should be used in
+preference to other operations when implementing locking primitives, because
+their implementations can be optimised on many architectures.
 [!] Note that special memory barrier primitives are available for these
 situations because on some CPUs the atomic instructions used imply full memory
 barriers, and so barrier instructions are superfluous in conjunction with them,
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 5fbcc22c98e9..168117bd6ee8 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -2,7 +2,8 @@
 Memory Hotplug
 ==============
-Last Updated: Jul 28 2007
+Created:                                        Jul 28 2007
+Add description of notifier of memory hotplug   Oct 11 2007
 This document is about memory hotplug including how-to-use and current status.
 Because Memory Hotplug is still under development, contents of this text will
@@ -24,7 +25,8 @@ be changed often.
  6.1 Memory offline and ZONE_MOVABLE
  6.2. How to offline memory
 7. Physical memory remove
-8. Future Work List
+8. Memory hotplug event notifier
+9. Future Work List
 Note(1): x86_64's has special implementation for memory hotplug.
         This text does not describe it.
@@ -307,8 +309,58 @@ Need more implementation yet....
 - Notification completion of remove works by OS to firmware.
 - Guard from remove if not yet.
+--------------------------------
+8. Memory hotplug event notifier
+--------------------------------
+Memory hotplug has an event notifier. There are 6 types of notification.
+MEMORY_GOING_ONLINE
+  Generated before new memory becomes available in order to be able to
+  prepare subsystems to handle memory. The page allocator is still unable
+  to allocate from the new memory.
+MEMORY_CANCEL_ONLINE
+  Generated if MEMORY_GOING_ONLINE fails.
+MEMORY_ONLINE
+  Generated when memory has been successfully brought online. The callback may
+  allocate pages from the new memory.
+MEMORY_GOING_OFFLINE
+  Generated to begin the process of offlining memory. Allocations are no
+  longer possible from the memory but some of the memory to be offlined
+  is still in use. The callback can be used to free memory known to a
+  subsystem from the indicated memory section.
+MEMORY_CANCEL_OFFLINE
+  Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
+  the section that we attempted to offline.
+MEMORY_OFFLINE
+  Generated after offlining memory is complete.
+A callback routine can be registered by
+  hotplug_memory_notifier(callback_func, priority)
+The second argument of the callback function (action) is one of the event
+types above. The third argument is a pointer to a struct memory_notify.
+struct memory_notify {
+       unsigned long start_pfn;
+       unsigned long nr_pages;
+       int status_change_nid;
+}
+start_pfn is start_pfn of online/offline memory.
+nr_pages is # of pages of online/offline memory.
+status_change_nid is set to the node id when N_HIGH_MEMORY of the nodemask is
+(will be) set/cleared, i.e. when a new (memoryless) node gets memory by online
+or a node loses all memory. If this is -1, the nodemask status is not changed.
+If status_change_nid >= 0, callback should create/discard structures for the
+node if necessary.
 --------------
-8. Future Work
+9. Future Work
 --------------
  - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
    sysctl or new control file.
diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX
index 9df8a2eac7b4..3f13bf8043d2 100644
--- a/Documentation/mips/00-INDEX
+++ b/Documentation/mips/00-INDEX
@@ -4,5 +4,3 @@ AU1xxx_IDE.README
        - README for MIPS AU1XXX IDE driver.
 GT64120.README
        - README for dir with info on MIPS boards using GT-64120 or GT-64120A.
-time.README
-        - README for MIPS time services.
diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README
index afb31c141d9d..5c8334123f4f 100644
--- a/Documentation/mips/AU1xxx_IDE.README
+++ b/Documentation/mips/AU1xxx_IDE.README
@@ -59,7 +59,7 @@ Four configs variables are introduced:
  CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA    - enable the PIO+DBDMA mode
  CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA  - enable the MWDMA mode
  CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA
-                                           controler
+                                           controller
  CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size
                                           per descriptor
diff --git a/Documentation/mips/time.README b/Documentation/mips/time.README
deleted file mode 100644
index a4ce603ed3b3..000000000000
--- a/Documentation/mips/time.README
+++ /dev/null
@@ -1,173 +0,0 @@
-README for MIPS time services
-Jun Sun
-jsun@mvista.com or jsun@junsun.net
-ABOUT
-----
-This file describes the new arch/mips/kernel/time.c, related files and the 
-services they provide. 
-If you are short in patience and just want to know how to use time.c for a 
-new board or convert an existing board, go to the last section.
-FILES, COMPATABILITY AND CONFIGS
---------------------------------
-The old arch/mips/kernel/time.c is renamed to old-time.c.
-A new time.c is put there, together with include/asm-mips/time.h.
-Two configs variables are introduced, CONFIG_OLD_TIME_C and CONFIG_NEW_TIME_C.
-So we allow boards using 
-        1) old time.c (CONFIG_OLD_TIME_C)
-        2) new time.c (CONFIG_NEW_TIME_C)
-        3) neither (their own private time.c)
-However, it is expected every board will move to the new time.c in the near
-future.
-WHAT THE NEW CODE PROVIDES?
--------------------------- 
-The new time code provide the following services:
-  a) Implements functions required by Linux common code:
-        time_init
-  b) provides an abstraction of RTC and null RTC implementation as default.
-        extern unsigned long (*rtc_get_time)(void);
-        extern int (*rtc_set_time)(unsigned long);
-  c) high-level and low-level timer interrupt routines where the timer
-     interrupt source  may or may not be the CPU timer.  The high-level
-     routine is dispatched through do_IRQ() while the low-level is
-     dispatched in assemably code (usually int-handler.S)
-WHAT THE NEW CODE REQUIRES?
---------------------------
-For the new code to work properly, each board implementation needs to supply
-the following functions or values:
-  a) board_time_init - a function pointer.  Invoked at the beginnig of
-     time_init().  It is optional.
-        1. (optional) set up RTC routines
-        2. (optional) calibrate and set the mips_hpt_frequency
-  b) plat_timer_setup - a function pointer.  Invoked at the end of time_init()
-        1. (optional) over-ride any decisions made in time_init()
-        2. set up the irqaction for timer interrupt.
-        3. enable the timer interrupt
-  c) (optional) board-specific RTC routines.
-  d) (optional) mips_hpt_frequency - It must be definied if the board
-     is using CPU counter for timer interrupt.
-PORTING GUIDE
-------------
-Step 1: decide how you like to implement the time services.
-  a) does this board have a RTC?  If yes, implement the two RTC funcs.
-  b) does the CPU have counter/compare registers? 
-     If the answer is no, you need a timer to provide the timer interrupt
-     at 100 HZ speed.
-  c) The following sub steps assume your CPU has counter register.
-     Do you plan to use the CPU counter register as the timer interrupt
-     or use an exnternal timer?
-     In order to use CPU counter register as the timer interrupt source, you
-     must know the counter speed (mips_hpt_frequency).  It is usually the
-     same as the CPU speed or an integral divisor of it.
-  d) decide on whether you want to use high-level or low-level timer
-     interrupt routines.  The low-level one is presumably faster, but should
-     not make too mcuh difference.
-Step 2:  the machine setup() function
-  If you supply board_time_init(), set the function poointer.
-Step 3: implement rtc routines, board_time_init() and plat_timer_setup()
-  if needed.
-  board_time_init() -
-        a) (optional) set up RTC routines,
-        b) (optional) calibrate and set the mips_hpt_frequency
-            (only needed if you intended to use cpu counter as timer interrupt
-             source)
-  plat_timer_setup() -
-        a) (optional) over-write any choices made above by time_init().
-        b) machine specific code should setup the timer irqaction.
-        c) enable the timer interrupt
-  If the RTC chip is a common chip, I suggest the routines are put under
-  arch/mips/libs.  For example, for DS1386 chip, one would create
-  rtc-ds1386.c under arch/mips/lib directory.  Add the following line to
-  the arch/mips/lib/Makefile:
-        obj-$(CONFIG_DDB5476) += rtc-ds1386.o
-Step 4: if you are using low-level timer interrupt, change your interrupt
-  dispathcing code to check for timer interrupt and jump to 
-  ll_timer_interrupt() directly  if one is detected.
-Step 5: Modify arch/mips/config.in and add CONFIG_NEW_TIME_C to your machine.
-  Modify the appropriate defconfig if applicable.
-Final notes: 
-For some tricky cases, you may need to add your own wrapper functions 
-for some of the functions in time.c.  
-For example, you may define your own timer interrupt routine, which does
-some of its own processing and then calls timer_interrupt().
-You can also over-ride any of the built-in functions (RTC routines
-and/or timer interrupt routine).
-PORTING NOTES FOR SMP
----------------------
-If you have a SMP box, things are slightly more complicated.
-The time service running every jiffy is logically divided into two parts:
-  1) the one for the whole system  (defined in timer_interrupt())
-  2) the one that should run for each CPU (defined in local_timer_interrupt())
-You need to decide on your timer interrupt sources.
-  case 1) - whole system has only one timer interrupt delivered to one CPU
-        In this case, you set up timer interrupt as in UP systems.  In addtion,
-        you need to set emulate_local_timer_interrupt to 1 so that other
-        CPUs get to call local_timer_interrupt().
-        THIS IS CURRENTLY NOT IMPLEMNETED.  However, it is rather easy to write
-        one should such a need arise.  You simply make a IPI call.
-  case 2) - each CPU has a separate timer interrupt
-        In this case, you need to set up IRQ such that each of them will
-        call local_timer_interrupt().  In addition, you need to arrange
-        one and only one of them to call timer_interrupt().
-        You can also do the low-level version of those interrupt routines,
-        following similar dispatching routes described above.
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
index 51f935191ae5..aa60d1f627e5 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/mutex-design.txt
@@ -133,4 +133,6 @@ the APIs of 'struct mutex' have been streamlined:
 int  mutex_trylock(struct mutex *lock);
 void mutex_unlock(struct mutex *lock);
 int  mutex_is_locked(struct mutex *lock);
+ void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+ int  mutex_lock_interruptible_nested(struct mutex *lock,
+                                      unsigned int subclass);
diff --git a/Documentation/networking/bcm43xx.txt b/Documentation/networking/bcm43xx.txt
index a136721499bf..d602c8d6ff3e 100644
--- a/Documentation/networking/bcm43xx.txt
+++ b/Documentation/networking/bcm43xx.txt
@@ -37,7 +37,7 @@ all, distributions.  There is, however, additional software that is
 required. The firmware used by the chip is the intellectual property
 of Broadcom and they have not given the bcm43xx team redistribution
 rights to this firmware.  Since we cannot legally redistribute
-the firwmare we cannot include it with the driver. Furthermore, it
+the firmware we cannot include it with the driver. Furthermore, it
 cannot be placed in the downloadable archives of any distributing
 organization; therefore, the user is responsible for obtaining the
 firmware and placing it in the appropriate location so that the driver
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 6ae2feff3087..747a5d15d529 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -293,7 +293,7 @@ tcp_no_metrics_save - BOOLEAN
        when the connection closes, so that connections established in the
        near future can use these to set initial conditions.  Usually, this
        increases overall performance, but may sometimes cause performance
-        degredation.  If set, TCP will not cache metrics on closing
+        degradation.  If set, TCP will not cache metrics on closing
        connections.
 tcp_orphan_retries - INTEGER
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
index c36b64b0020f..c3669a3fb4af 100644
--- a/Documentation/networking/rxrpc.txt
+++ b/Documentation/networking/rxrpc.txt
@@ -689,7 +689,7 @@ such as the AFS filesystem.  This permits such a utility to:
     buffers manipulated directly.
 To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket,
-bind an addess as appropriate and listen if it's to be a server socket, but
+bind an address as appropriate and listen if it's to be a server socket, but
 then it passes this to the kernel interface functions.
 The kernel interface functions are as follows:
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt
index 6be09ba24a36..b6409cab075c 100644
--- a/Documentation/networking/udplite.txt
+++ b/Documentation/networking/udplite.txt
@@ -12,7 +12,7 @@
  For in-depth information, you can consult:
   o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/
-       Fom here you can also download some example application source code.
+       From here you can also download some example application source code.
   o The UDP-Lite HOWTO on
       http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt
@@ -223,7 +223,7 @@
  While it is important that such cases are dealt with correctly, they
  are (annoyingly) rare: UDP-Lite is designed for optimising multimedia
  performance over wireless (or generally noisy) links and thus smaller
-  coverage lenghts are likely to be expected.
+  coverage lengths are likely to be expected.
  V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING
@@ -259,7 +259,7 @@
  VI) IPTABLES
  There is packet match support for UDP-Lite as well as support for the LOG target.
-  If you copy and paste the following line into /etc/protcols,
+  If you copy and paste the following line into /etc/protocols,
  udplite 136     UDP-Lite        # UDP-Lite [RFC 3828]
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt
index 8f2302415eff..265fcdcb8e5f 100644
--- a/Documentation/parport-lowlevel.txt
+++ b/Documentation/parport-lowlevel.txt
@@ -25,7 +25,6 @@ Global functions:
  parport_open
  parport_close
  parport_device_id
-  parport_device_num
  parport_device_coords
  parport_find_class
  parport_find_device
@@ -735,7 +734,7 @@ NULL is returned.
 SEE ALSO
-parport_register_device, parport_device_num
+parport_register_device
 parport_close - unregister device for particular device number
 -------------
@@ -787,29 +786,7 @@ Many devices have ill-formed IEEE 1284 Device IDs.
 SEE ALSO
-parport_find_class, parport_find_device, parport_device_num
+parport_find_class, parport_find_device
-parport_device_num - convert device coordinates to device number
------------------
-SYNOPSIS
-#include <linux/parport.h>
-int parport_device_num (int parport, int mux, int daisy);
-DESCRIPTION
-Convert between device coordinates (port, multiplexor, daisy chain
-address) and device number (zero-based).
-RETURN VALUE
-Device number, or -1 if no device at given coordinates.
-SEE ALSO
-parport_device_coords, parport_open, parport_device_id
 parport_device_coords - convert device number to device coordinates
 ------------------
@@ -833,7 +810,7 @@ Zero on success, in which case the coordinates are (*parport, *mux,
 SEE ALSO
-parport_device_num, parport_open, parport_device_id
+parport_open, parport_device_id
 parport_find_class - find a device by its class
 ------------------
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
index 1a85e2b964dc..57aef2f6e0de 100644
--- a/Documentation/power/basic-pm-debugging.txt
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -78,8 +78,8 @@ c) Advanced debugging
 In case the STD does not work on your system even in the minimal configuration
 and compiling more drivers as modules is not practical or some modules cannot
 be unloaded, you can use one of the more advanced debugging techniques to find
-the problem.  First, if there is a serial port in your box, you can set the
+the problem.  First, if there is a serial port in your box, you can boot the
-CONFIG_DISABLE_CONSOLE_SUSPEND kernel configuration option and try to log kernel
+kernel with the 'no_console_suspend' parameter and try to log kernel
 messages using the serial console.  This may provide you with some information
 about the reasons of the suspend (resume) failure.  Alternatively, it may be
 possible to use a FireWire port for debugging with firescope
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
index 04dc1cf9d215..38b57248fd61 100644
--- a/Documentation/power/freezing-of-tasks.txt
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -19,12 +19,13 @@ we only consider hibernation, but the description also applies to suspend).
 Namely, as the first step of the hibernation procedure the function
 freeze_processes() (defined in kernel/power/process.c) is called.  It executes
 try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
-sends a fake signal to each of them.  A task that receives such a signal and has
+either wakes them up, if they are kernel threads, or sends fake signals to them,
-TIF_FREEZE set, should react to it by calling the refrigerator() function
+if they are user space processes.  A task that has TIF_FREEZE set, should react
-(defined in kernel/power/process.c), which sets the task's PF_FROZEN flag,
+to it by calling the refrigerator() function (defined in
-changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is
+kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state
-cleared for it.  Then, we say that the task is 'frozen' and therefore the set of
+to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it.
-functions handling this mechanism is called 'the freezer' (these functions are
+Then, we say that the task is 'frozen' and therefore the set of functions
+handling this mechanism is referred to as 'the freezer' (these functions are
 defined in kernel/power/process.c and include/linux/freezer.h).  User space
 processes are generally frozen before kernel threads.
@@ -35,21 +36,27 @@ task enter refrigerator() if the flag is set.
 For user space processes try_to_freeze() is called automatically from the
 signal-handling code, but the freezable kernel threads need to call it
-explicitly in suitable places.  The code to do this may look like the following:
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if TIF_FREEZE is set and calling
+try_to_freeze().  The main loop of a freezable kernel thread may look like the
+following one:
+        set_freezable();
        do {
                hub_events();
-                wait_event_interruptible(khubd_wait,
+                wait_event_freezable(khubd_wait,
-                                        !list_empty(&hub_event_list));
+                                !list_empty(&hub_event_list) ||
-                try_to_freeze();
+                                kthread_should_stop());
-        } while (!signal_pending(current));
+        } while (!kthread_should_stop() || !list_empty(&hub_event_list));
 (from drivers/usb/core/hub.c::hub_thread()).
 If a freezable kernel thread fails to call try_to_freeze() after the freezer has
 set TIF_FREEZE for it, the freezing of tasks will fail and the entire
 hibernation operation will be cancelled.  For this reason, freezable kernel
-threads must call try_to_freeze() somewhere.
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
 After the system memory state has been restored from a hibernation image and
 devices have been reinitialized, the function thaw_processes() is called in
@@ -81,7 +88,16 @@ hibernation image has been created and before the system is finally powered off.
 The majority of these are user space processes, but if any of the kernel threads
 may cause something like this to happen, they have to be freezable.
-2. The second reason is to prevent user space processes and some kernel threads
+2. Next, to create the hibernation image we need to free a sufficient amount of
+memory (approximately 50% of available RAM) and we need to do that before
+devices are deactivated, because we generally need them for swapping out.  Then,
+after the memory for the image has been freed, we don't want tasks to allocate
+additional memory and we prevent them from doing that by freezing them earlier.
+[Of course, this also means that device drivers should not allocate substantial
+amounts of memory from their .suspend() callbacks before hibernation, but this
is a separate issue.]
+3. The third reason is to prevent user space processes and some kernel threads
 from interfering with the suspending and resuming of devices.  A user space
 process running on a second CPU while we are suspending devices may, for
 example, be troublesome and without the freezing of tasks we would need some
@@ -111,7 +127,7 @@ frozen before the driver's .suspend() callback is executed and it will be
 thawed after the driver's .resume() callback has run, so it won't be accessing
 the device while it's suspended.
-3. Another reason for freezing tasks is to prevent user space processes from
+4. Another reason for freezing tasks is to prevent user space processes from
 realizing that hibernation (or suspend) operation takes place.  Ideally, user
 space processes should not notice that such a system-wide operation has occurred
 and should continue running without any problems after the restore (or resume
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index fd5192a8fa8a..e67211fe0ee2 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -20,7 +20,7 @@ states.
 /sys/power/disk controls the operating mode of the suspend-to-disk
 mechanism. Suspend-to-disk can be handled in several ways. We have a
 few options for putting the system to sleep - using the platform driver
-(e.g. ACPI or other pm_ops), powering off the system or rebooting the
+(e.g. ACPI or other suspend_ops), powering off the system or rebooting the
 system (for testing).
 Additionally, /sys/power/disk can be used to turn on one of the two testing
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
index 06f911a5f885..f281886de490 100644
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ b/Documentation/power/swsusp-and-swap-files.txt
@@ -39,7 +39,7 @@ resume=<swap_file_partition> resume_offset=<swap_file_offset>
 where <swap_file_partition> is the partition on which the swap file is located
 and <swap_file_offset> is the offset of the swap header determined by the
 application in 2) (of course, this step may be carried out automatically
-by the same application that determies the swap file's header offset using the
+by the same application that determines the swap file's header offset using the
 FIBMAP ioctl)
 OR
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
index 4530d1bf0286..df7afe43d462 100644
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -36,8 +36,8 @@ Causes of EEH Errors
 EEH was originally designed to guard against hardware failure, such
 as PCI cards dying from heat, humidity, dust, vibration and bad
 electrical connections. The vast majority of EEH errors seen in
-"real life" are due to eithr poorly seated PCI cards, or,
+"real life" are due to either poorly seated PCI cards, or,
-unfortunately quite commonly, due device driver bugs, device firmware
+unfortunately quite commonly, due to device driver bugs, device firmware
 bugs, and sometimes PCI card hardware bugs.
 The most common software bug, is one that causes the device to
diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
index e59fcbbe338c..5e03610e186f 100644
--- a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
+++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
@@ -17,12 +17,12 @@ passed by the boot loader to the kernel at boot time.  The device tree
 describes what devices are present on the board and how they are
 connected.  The device tree can either be passed as a binary blob (as
 described in Documentation/powerpc/booting-without-of.txt), or passed
-by Open Firmare (IEEE 1275) compatible firmware using an OF compatible
+by Open Firmware (IEEE 1275) compatible firmware using an OF compatible
 client interface API.
 This document specifies the requirements on the device-tree for mpc5200
 based boards.  These requirements are above and beyond the details
-specified in either the OpenFirmware spec or booting-without-of.txt
+specified in either the Open Firmware spec or booting-without-of.txt
 All new mpc5200-based boards are expected to match this document.  In
 cases where this document is not sufficient to support a new board port,
@@ -73,8 +73,8 @@ match on the compatible list; the 'most compatible' driver should be
 selected.
 The split between the MPC5200 and the MPC5200B leaves a bit of a
-connundrum.  How should the compatible property be set up to provide
+conundrum.  How should the compatible property be set up to provide
-maximum compatability information; but still acurately describe the
+maximum compatibility information; but still accurately describe the
 chip?  For the MPC5200; the answer is easy.  Most of the SoC devices
 originally appeared on the MPC5200.  Since they didn't exist anywhere
 else; the 5200 compatible properties will contain only one item;
@@ -84,7 +84,7 @@ The 5200B is almost the same as the 5200, but not quite.  It fixes
 silicon bugs and it adds a small number of enhancements.  Most of the
 devices either provide exactly the same interface as on the 5200.  A few
 devices have extra functions but still have a backwards compatible mode.
-To express this infomation as completely as possible, 5200B device trees
+To express this information as completely as possible, 5200B device trees
 should have two items in the compatible list;
 "mpc5200b-<device>\0mpc5200-<device>".  It is *strongly* recommended
 that 5200B device trees follow this convention (instead of only listing
@@ -185,7 +185,7 @@ bestcomm@<addr>	dma-controller		mpc5200-bestcomm 5200 pic also requires
 Recommended soc5200 child nodes; populate as needed for your board
 name            device_type     compatible        Description
 ----            -----------     ----------        -----------
-gpt@<addr>      gpt             mpc5200-gpt       General purpose timers
+gpt@<addr>      gpt             fsl,mpc5200-gpt   General purpose timers
 rtc@<addr>      rtc             mpc5200-rtc       Real time clock
 mscan@<addr>    mscan           mpc5200-mscan     CAN bus controller
 pci@<addr>      pci             mpc5200-pci       PCI bridge
@@ -199,7 +199,7 @@ ethernet@<addr>	network		mpc5200-fec	  MPC5200 ethernet device
 ata@<addr>      ata             mpc5200-ata       IDE ATA interface
 i2c@<addr>      i2c             mpc5200-i2c       I2C controller
 usb@<addr>      usb-ohci-be     mpc5200-ohci,ohci-be    USB controller
-xlb@<addr>      xlb             mpc5200-xlb       XLB arbritrator
+xlb@<addr>      xlb             mpc5200-xlb       XLB arbitrator
 Important child node properties
 name            type            description
@@ -213,7 +213,7 @@ cell-index	int		When multiple devices are present, is the
 5) General Purpose Timer nodes (child of soc5200 node)
 On the mpc5200 and 5200b, GPT0 has a watchdog timer function.  If the board
 design supports the internal wdt, then the device node for GPT0 should
-include the empty property 'has-wdt'.
+include the empty property 'fsl,has-wdt'.
 6) PSC nodes (child of soc5200 node)
 PSC nodes can define the optional 'port-number' property to force assignment
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt
index 6aa9a891f3d0..683ccae00ad4 100644
--- a/Documentation/scsi/aic79xx.txt
+++ b/Documentation/scsi/aic79xx.txt
@@ -120,7 +120,7 @@ The following information is available in this file:
          list size to avoid SCSI malloc pool fragmentation.
        - Cleanup channel display in our /proc output.
        - Workaround duplicate device entries in the mid-layer
-          devlice list during add-single-device.
+          device list during add-single-device.
   1.3.6 (March 28th, 2003)
        - Correct a double free in the Domain Validation code.
diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt
index 5f34d2ba69b4..b7e238cbb5a7 100644
--- a/Documentation/scsi/aic7xxx.txt
+++ b/Documentation/scsi/aic7xxx.txt
@@ -159,7 +159,7 @@ The following information is available in this file:
        - Add support for 2.5.X's scsi_report_device_reset().
   6.2.34 (May 5th, 2003)
-        - Fix locking regression instroduced in 6.2.29 that
+        - Fix locking regression introduced in 6.2.29 that
          could cause a lock order reversal between the io_request_lock
          and our per-softc lock.  This was only possible on RH9,
          SuSE, and kernel.org 2.4.X kernels.
@@ -264,7 +264,7 @@ The following information is available in this file:
              Option: tag_info:{{value[,value...]}[,{value[,value...]}...]}
          Definition: Set the per-target tagged queue depth on a
                      per controller basis.  Both controllers and targets
-                      may be ommitted indicating that they should retain
+                      may be omitted indicating that they should retain
                      the default tag depth.
            Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32}
                        On Controller 0
@@ -290,7 +290,7 @@ The following information is available in this file:
   -----------------------------------------------------------------
              Option: dv: {value[,value...]} 
          Definition: Set Domain Validation Policy on a per-controller basis.
-                      Controllers may be ommitted indicating that
+                      Controllers may be omitted indicating that
                      they should retain the default read streaming setting.
             Example: dv:{-1,0,,1,1,0}
                        On Controller 0 leave DV at its default setting.
diff --git a/Documentation/scsi/arcmsr_spec.txt b/Documentation/scsi/arcmsr_spec.txt
index 5e0042340fd3..45d9482c1517 100644
--- a/Documentation/scsi/arcmsr_spec.txt
+++ b/Documentation/scsi/arcmsr_spec.txt
@@ -3,7 +3,7 @@
 *******************************************************************************
 **      Usage of IOP331 adapter
 **      (All In/Out is in IOP331's view)
-**      1. Message 0 --> InitThread message and retrun code
+**      1. Message 0 --> InitThread message and return code
 **      2. Doorbell is used for RS-232 emulation
 **              inDoorBell :    bit0 -- data in ready
 **                      (DRIVER DATA WRITE OK)
diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt
index a08e225653d6..a810421f1fb3 100644
--- a/Documentation/scsi/ibmmca.txt
+++ b/Documentation/scsi/ibmmca.txt
@@ -21,7 +21,7 @@
   versions older than 4.0 do not work with kernels 2.4.0 or later! If you
   try to compile your kernel with the wrong driver source, the 
   compilation is aborted and you get a corresponding error message. This is
-   no bug in the driver. It prevents you from using the wrong sourcecode
+   no bug in the driver; it prevents you from using the wrong source code
   with the wrong kernel version.
   Authors of this Driver
@@ -58,7 +58,7 @@
   5 Users' Manual
     5.1 Commandline Parameters
     5.2 Troubleshooting
-     5.3 Bugreports
+     5.3 Bug reports
     5.4 Support WWW-page
   6 References
   7 Credits to
@@ -71,13 +71,13 @@
   1 Abstract
   ----------
-   This README-file describes the IBM SCSI-subsystem low level driver for 
+   This README-file describes the IBM SCSI-subsystem low level driver for
-   Linux. The descriptions which were formerly kept in the source-code have 
+   Linux. The descriptions which were formerly kept in the source code have
-   been taken out to this file to easify the codes' readability. The driver 
+   been moved into this file to improve the code's readability. The driver
   description has been updated, as most of the former description was already
-   quite outdated. The history of the driver development is also kept inside 
+   quite outdated. The history of the driver development is also kept inside
-   here. Multiple historical developments have been summarized to shorten the 
+   here. Multiple historical developments have been summarized to shorten the
-   textsize a bit. At the end of this file you can find a small manual for 
+   text size a bit. At the end of this file you can find a small manual for
   this driver and hints to get it running on your machine.
   2 Driver Description
@@ -186,7 +186,7 @@
   between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two
   busses and provides support for 30 logical devices at the same time, where
   in wide-addressing mode you can have 16 puns with 32 luns on each device.
-   This section dexribes you the handling of devices on non-F/W adapters.
+   This section describes the handling of devices on non-F/W adapters.
   Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter
   which means a lot of possible devices for such a small machine.
@@ -209,10 +209,10 @@
   --------------------------------------------------------
   One consequence of information hiding is that the real (pun,lun)    
   numbers are also hidden. The two possibilities to get around this problem
-   is to offer fake pun/lun combinations to the operating system or to 
+   are to offer fake pun/lun combinations to the operating system or to 
   delete the whole mapping of the adapter and to reassign the ldns, using
   the immediate assign command of the SCSI-subsystem for probing through
-   all possible pun/lun combinations. a ldn is a "logical device number"
+   all possible pun/lun combinations.  An ldn is a "logical device number"
   which is used by IBM SCSI-subsystems to access some valid SCSI-device.
   At the beginning of the development of this driver, the following approach 
   was used:
@@ -251,9 +251,9 @@
   lun>0 or to non-existing devices, in order to satisfy the subsystem, if 
   there are less than 15 SCSI-devices connected. In the case of more than 15 
   devices, the dynamical mapping goes active. If the get_scsi[][] reports a 
-   device to be existant, but it has no ldn assigned, it gets a ldn out of 7 
+   device to be existent, but it has no ldn assigned, it gets an ldn out of 7
-   to 14. The numbers are assigned in cyclic order. Therefore it takes 8 
+   to 14. The numbers are assigned in cyclic order, therefore it takes 8 
-   dynamical reassignments on the SCSI-devices, until a certain device 
+   dynamical reassignments on the SCSI-devices until a certain device 
   loses its ldn again. This assures that dynamical remapping is avoided 
   during intense I/O between up to 15 SCSI-devices (means pun,lun 
   combinations). A further advantage of this method is that people who
@@ -551,7 +551,7 @@
      than devices are available, they are assigned to non existing pun,lun
      combinations to satisfy the adapter. With this, the dynamical mapping
      was possible to implement. (For further info see the text in the 
-      source-code and in the description below. Read the description
+      source code and in the description below. Read the description
      below BEFORE installing this driver on your system!)
   2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION.
   3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID
@@ -762,9 +762,9 @@
   - Michael Lang
   Apr 23, 2000 (v3.2pre1)
-   1) During a very long time, I collected a huge amount of bugreports from
+   1) During a very long time, I collected a huge amount of bug reports from
      various people, trying really quite different things on their SCSI-
-      PS/2s. Today, all these bugreports are taken into account and should be
+      PS/2s. Today, all these bug reports are taken into account and should be
      mostly solved. The major topics were:
      - Driver crashes during boottime by no obvious reason.
      - Driver panics while the midlevel-SCSI-driver is trying to inquire
@@ -819,7 +819,7 @@
   - Michael Lang
   
   July 17, 2000 (v3.2pre8)
-   A long period of collecting bugreports from all corners of the world
+   A long period of collecting bug reports from all corners of the world
   now lead to the following corrections to the code:
   1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this 
      was that it is possible to disable Fast-SCSI for the external bus.
@@ -873,7 +873,7 @@
   July 26, 2000 (v3.2pre11)
   1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and
      a model 9595. Asking around in the community, nobody except of me has
-      seen such errors. Weired, but I am trying to recompile everything on
+      seen such errors. Weird, but I am trying to recompile everything on
      the model 9595. Maybe, as I use a specially modified gcc, that could
      cause problems. But, it was not the reason. The true background was,
      that the kernel was compiled for i386 and the 9595 has a 486DX-2. 
@@ -886,7 +886,7 @@
      alive rotator during boottime. This makes sense, when no monitor is 
      connected to the system. You can get rid of all display activity, if
      you do not use any parameter or just ibmmcascsi=activity, for the 
-      harddrive activity LED, existant on all PS/2, except models 8595-XXX.
+      harddrive activity LED, existent on all PS/2, except models 8595-XXX.
      If no monitor is available, please use ibmmcascsi=display, which works
      fine together with the linuxinfo utility for the LED-panel.
   - Michael Lang
@@ -1115,7 +1115,7 @@
        If this really happens, do also send e-mail to the maintainer, as
        forced detection should be never necessary. Forced detection is in
        principal some flaw of the driver adapter detection and goes into 
-        bugreports.
+        bug reports.
     Q: The driver screws up, if it starts to probe SCSI-devices, is there
        some way out of it?
     A: Yes, that was some recognition problem of the correct SCSI-adapter
@@ -1172,7 +1172,7 @@
        recommended version is 3.2 or later. Here, the F/W support is in
        a stable and reliable condition. Wide-addressing is in addition 
        supported.
-     Q: I get a Ooops message and something like "killing interrupt".
+     Q: I get an Oops message and something like "killing interrupt".
     A: The reason for this is that the IBM SCSI-subsystem only sends a 
        termination status back, if some error appeared. In former releases
        of the driver, it was not checked, if the termination status block
@@ -1213,21 +1213,21 @@
        problem. Not yet tried, but guessing that it could work. To get this,
        set unchecked_isa_dma argument of ibmmca.h from 0 to 1.
-   5.3 Bugreports
+   5.3 Bug reports
   --------------
-   If you really find bugs in the sourcecode or the driver will successfully
+   If you really find bugs in the source code or the driver will successfully
   refuse to work on your machine, you should send a bug report to me. The
   best for this is to follow the instructions on the WWW-page for this
   driver. Fill out the bug-report form, placed on the WWW-page and ship it,
   so the bugs can be taken into account with maximum efforts. But, please
   do not send bug reports about this driver to Linus Torvalds or Leonard
-   Zubkoff, as Linus is burried in E-Mail and Leonard is supervising all
+   Zubkoff, as Linus is buried in E-Mail and Leonard is supervising all
   SCSI-drivers and won't have the time left to look inside every single
   driver to fix a bug and especially DO NOT send modified code to Linus
   Torvalds or Alan J. Cox which has not been checked here!!! They are both
-   quite burried in E-mail (as me, sometimes, too) and one should first check
+   quite buried in E-mail (as me, sometimes, too) and one should first check
   for problems on my local teststand. Recently, I got a lot of 
-   bugreports for errors in the ibmmca.c code, which I could not imagine, but
+   bug reports for errors in the ibmmca.c code, which I could not imagine, but
   a look inside some Linux-distribution showed me quite often some modified
   code, which did no longer work on most other machines than the one of the
   modifier. Ok, so now that there is maintenance service available for this
@@ -1261,7 +1261,7 @@
   some e-mail directly, but at least with the same information as required by
   the formular.
   
-   If you have extensive bugreports, including Ooops messages and 
+   If you have extensive bug reports, including Oops messages and
   screen-shots, please feel free to send it directly to the address
   of the maintainer, too. The current address of the maintainer is:
   
@@ -1318,7 +1318,7 @@
                detailed bug reports and ideas for this driver (and his 
                patience ;-)).
   Alan J. Cox  
-                for his bugreports and his bold activities in cross-checking
+                for his bug reports and his bold activities in cross-checking
                the driver-code with his teststand.
                
   7.2 Sponsors & Supporters
diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt
index ccf1cebe744f..736540045dc7 100644
--- a/Documentation/sharedsubtree.txt
+++ b/Documentation/sharedsubtree.txt
@@ -153,6 +153,7 @@ replicas continue to be exactly same.
        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>
+        #include <string.h>
        #include <sys/mount.h>
        #include <sys/fsuid.h>
diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt
index 58cbfd01ea8f..3feeb9ecdec4 100644
--- a/Documentation/sound/alsa/soc/DAI.txt
+++ b/Documentation/sound/alsa/soc/DAI.txt
@@ -20,12 +20,12 @@ I2S
 ===
 I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and
-Rx lines are used for audio transmision, whilst the bit clock (BCLK) and
+Rx lines are used for audio transmission, whilst the bit clock (BCLK) and
 left/right clock (LRC) synchronise the link. I2S is flexible in that either the
 controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock
 usually varies depending on the sample rate and the master system clock
 (SYSCLK). LRCLK is the same as the sample rate. A few devices support separate
-ADC and DAC LRCLK's, this allows for similtanious capture and playback at
+ADC and DAC LRCLK's, this allows for simultaneous capture and playback at
 different sample rates.
 I2S has several different operating modes:-
@@ -41,12 +41,12 @@ I2S has several different operating modes:-
 PCM
 ===
-PCM is another 4 wire interface, very similar to I2S, that can support a more
+PCM is another 4 wire interface, very similar to I2S, which can support a more
 flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used
 to synchronise the link whilst the Tx and Rx lines are used to transmit and
 receive the audio data. Bit clock usually varies depending on sample rate
 whilst sync runs at the sample rate. PCM also supports Time Division
-Multiplexing (TDM) in that several devices can use the bus similtaniuosly (This
+Multiplexing (TDM) in that several devices can use the bus simultaneously (this
 is sometimes referred to as network mode).
 Common PCM operating modes:-
diff --git a/Documentation/sound/alsa/soc/clocking.txt b/Documentation/sound/alsa/soc/clocking.txt
index e93960d53a1e..14930887c25f 100644
--- a/Documentation/sound/alsa/soc/clocking.txt
+++ b/Documentation/sound/alsa/soc/clocking.txt
@@ -2,20 +2,20 @@ Audio Clocking
 ==============
 This text describes the audio clocking terms in ASoC and digital audio in
-general. Note: Audio clocking can be complex !
+general. Note: Audio clocking can be complex!
 Master Clock
 ------------
-Every audio subsystem is driven by a master clock (sometimes refered to as MCLK
+Every audio subsystem is driven by a master clock (sometimes referred to as MCLK
 or SYSCLK). This audio master clock can be derived from a number of sources
 (e.g. crystal, PLL, CPU clock) and is responsible for producing the correct
 audio playback and capture sample rates.
-Some master clocks (e.g. PLL's and CPU based clocks) are configuarble in that
+Some master clocks (e.g. PLL's and CPU based clocks) are configurable in that
 their speed can be altered by software (depending on the system use and to save
-power). Other master clocks are fixed at at set frequency (i.e. crystals).
+power). Other master clocks are fixed at a set frequency (i.e. crystals).
 DAI Clocks
@@ -44,7 +44,7 @@ This relationship depends on the codec or SoC CPU in particular. In general
 it's best to configure BCLK to the lowest possible speed (depending on your
 rate, number of channels and wordsize) to save on power.
-It's also desireable to use the codec (if possible) to drive (or master) the
+It's also desirable to use the codec (if possible) to drive (or master) the
 audio clocks as it's usually gives more accurate sample rates than the CPU.
diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt
index 48983c75aad9..1e766ad0ebd1 100644
--- a/Documentation/sound/alsa/soc/codec.txt
+++ b/Documentation/sound/alsa/soc/codec.txt
@@ -19,7 +19,7 @@ Optionally, codec drivers can also provide:-
 6) DAPM event handler.
 7) DAC Digital mute control.
-It's probably best to use this guide in conjuction with the existing codec
+It's probably best to use this guide in conjunction with the existing codec
 driver code in sound/soc/codecs/
 ASoC Codec driver breakdown
@@ -28,7 +28,7 @@ ASoC Codec driver breakdown
 1 - Codec DAI and PCM configuration
 -----------------------------------
 Each codec driver must have a struct snd_soc_codec_dai to define it's DAI and
-PCM's capablities and operations. This struct is exported so that it can be
+PCM's capabilities and operations. This struct is exported so that it can be
 registered with the core by your machine driver.
 e.g.
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(wm8731_dai);
 2 - Codec control IO
 --------------------
-The codec can ususally be controlled via an I2C or SPI style interface (AC97
+The codec can usually be controlled via an I2C or SPI style interface (AC97
 combines control with data in the DAI). The codec drivers will have to provide
 functions to read and write the codec registers along with supplying a register
 cache:-
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index c11877f5b4a1..ab0766fd7869 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -11,7 +11,7 @@ other PM systems.
 DAPM is also completely transparent to all user space applications as all power
 switching is done within the ASoC core. No code changes or recompiling are
-required for user space applications. DAPM makes power switching descisions based
+required for user space applications. DAPM makes power switching decisions based
 upon any audio stream (capture/playback) activity and audio mixer settings
 within the device.
@@ -38,7 +38,7 @@ There are 4 power domains within DAPM
      Enabled and disabled when stream playback/capture is started and
      stopped respectively. e.g. aplay, arecord.
-All DAPM power switching descisons are made automatically by consulting an audio
+All DAPM power switching decisions are made automatically by consulting an audio
 routing map of the whole machine. This map is specific to each machine and
 consists of the interconnections between every audio component (including
 internal codec components). All audio components that effect power are called
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt
index 753c5cc5984a..c47ce9530677 100644
--- a/Documentation/sound/alsa/soc/overview.txt
+++ b/Documentation/sound/alsa/soc/overview.txt
@@ -2,18 +2,19 @@ ALSA SoC Layer
 ==============
 The overall project goal of the ALSA System on Chip (ASoC) layer is to provide
-better ALSA support for embedded system on chip procesors (e.g. pxa2xx, au1x00,
+better ALSA support for embedded system-on-chip processors (e.g. pxa2xx, au1x00,
 iMX, etc) and portable audio codecs. Currently there is some support in the
 kernel for SoC audio, however it has some limitations:-
  * Currently, codec drivers are often tightly coupled to the underlying SoC
-    cpu. This is not ideal and leads to code duplication i.e. Linux now has 4
+    CPU. This is not ideal and leads to code duplication e.g. Linux now has 4
    different wm8731 drivers for 4 different SoC platforms.
-  * There is no standard method to signal user initiated audio events.
+  * There is no standard method to signal user initiated audio events (e.g.
-    e.g. Headphone/Mic insertion, Headphone/Mic detection after an insertion
+    Headphone/Mic insertion, Headphone/Mic detection after an insertion
-    event. These are quite common events on portable devices and ofter require
+    event). These are quite common events on portable devices and often require
-    machine specific code to re route audio, enable amps etc after such an event.
+    machine specific code to re-route audio, enable amps, etc., after such an
+    event.
  * Current drivers tend to power up the entire codec when playing
    (or recording) audio. This is fine for a PC, but tends to waste a lot of
@@ -44,7 +45,7 @@ features :-
    signals the codec when to change power states.
  * Machine specific controls: Allow machines to add controls to the sound card
-    e.g. volume control for speaker amp.
+    (e.g. volume control for speaker amp).
 To achieve all this, ASoC basically splits an embedded audio system into 3
 components :-
@@ -57,7 +58,7 @@ components :-
    interface drivers (e.g. I2S, AC97, PCM) for that platform.
  * Machine driver: The machine driver handles any machine specific controls and
-    audio events. i.e. turing on an amp at start of playback.
+    audio events (e.g. turning on an amp at start of playback).
 Documentation
diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt
index e95b16d5a53b..d4678b4dc6c6 100644
--- a/Documentation/sound/alsa/soc/platform.txt
+++ b/Documentation/sound/alsa/soc/platform.txt
@@ -20,7 +20,7 @@ struct snd_soc_ops {
        int (*trigger)(struct snd_pcm_substream *, int);
 };
-The platform driver exports it's DMA functionailty via struct snd_soc_platform:-
+The platform driver exports its DMA functionality via struct snd_soc_platform:-
 struct snd_soc_platform {
        char *name;
diff --git a/Documentation/sound/alsa/soc/pops_clicks.txt b/Documentation/sound/alsa/soc/pops_clicks.txt
index 2cf7ee5b3d74..3371bd9d7cfa 100644
--- a/Documentation/sound/alsa/soc/pops_clicks.txt
+++ b/Documentation/sound/alsa/soc/pops_clicks.txt
@@ -2,7 +2,7 @@ Audio Pops and Clicks
 =====================
 Pops and clicks are unwanted audio artifacts caused by the powering up and down
-of components within the audio subsystem. This is noticable on PC's when an
+of components within the audio subsystem. This is noticeable on PCs when an
 audio module is either loaded or unloaded (at module load time the sound card is
 powered up and causes a popping noise on the speakers).
@@ -16,7 +16,7 @@ Minimising Playback Pops and Clicks
 ===================================
 Playback pops in portable audio subsystems cannot be completely eliminated atm,
-however future audio codec hardware will have better pop and click supression.
+however future audio codec hardware will have better pop and click suppression.
 Pops can be reduced within playback by powering the audio components in a
 specific order. This order is different for startup and shutdown and follows
 some basic rules:-
@@ -33,7 +33,7 @@ Minimising Capture Pops and Clicks
 ==================================
 Capture artifacts are somewhat easier to get rid as we can delay activating the
-ADC until all the pops have occured. This follows similar power rules to
+ADC until all the pops have occurred. This follows similar power rules to
 playback in that components are powered in a sequence depending upon stream
 startup or shutdown.
diff --git a/Documentation/sound/oss/es1371 b/Documentation/sound/oss/es1371
deleted file mode 100644
index c3151266771c..000000000000
--- a/Documentation/sound/oss/es1371
+++ /dev/null
@@ -1,64 +0,0 @@
-/proc/sound, /dev/sndstat
-------------------------
-/proc/sound and /dev/sndstat is not supported by the
-driver. To find out whether the driver succeeded loading,
-check the kernel log (dmesg).
-ALaw/uLaw sample formats
------------------------
-This driver does not support the ALaw/uLaw sample formats.
-ALaw is the default mode when opening a sound device
-using OSS/Free. The reason for the lack of support is
-that the hardware does not support these formats, and adding
-conversion routines to the kernel would lead to very ugly
-code in the presence of the mmap interface to the driver.
-And since xquake uses mmap, mmap is considered important :-)
-and no sane application uses ALaw/uLaw these days anyway.
-In short, playing a Sun .au file as follows:
-cat my_file.au > /dev/dsp
-does not work. Instead, you may use the play script from
-Chris Bagwell's sox-12.14 package (available from the URL
-below) to play many different audio file formats.
-The script automatically determines the audio format
-and does do audio conversions if necessary.
-http://home.sprynet.com/sprynet/cbagwell/projects.html
-Blocking vs. nonblocking IO
---------------------------
-Unlike OSS/Free this driver honours the O_NONBLOCK file flag
-not only during open, but also during read and write.
-This is an effort to make the sound driver interface more
-regular. Timidity has problems with this; a patch
-is available from http://www.ife.ee.ethz.ch/~sailer/linux/pciaudio.html.
-(Timidity patched will also run on OSS/Free).
-MIDI UART
---------
-The driver supports a simple MIDI UART interface, with
-no ioctl's supported.
-MIDI synthesizer
----------------
-This soundcard does not have any hardware MIDI synthesizer;
-MIDI synthesis has to be done in software. To allow this
-the driver/soundcard supports two PCM (/dev/dsp) interfaces.
-There is a freely available software package that allows
-MIDI file playback on this soundcard called Timidity.
-See http://www.cgs.fi/~tt/timidity/.
-Thomas Sailer
-t.sailer@alumni.ethz.ch
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx
index 215e3b8e7266..f3853cc37bde 100644
--- a/Documentation/spi/pxa2xx
+++ b/Documentation/spi/pxa2xx
@@ -1,4 +1,4 @@
-PXA2xx SPI on SSP driver HOWTO
+PXA2xx SPI on SSP driver HOWTO
 ===================================================
 This a mini howto on the pxa2xx_spi driver.  The driver turns a PXA2xx
 synchronous serial port into a SPI master controller
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt
index 60953d6c919d..ec499265deca 100644
--- a/Documentation/thinkpad-acpi.txt
+++ b/Documentation/thinkpad-acpi.txt
@@ -105,10 +105,15 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver
 as a driver attribute (see below).
 Sysfs driver attributes are on the driver's sysfs attribute space,
-for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/.
+for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and
+/sys/bus/platform/drivers/thinkpad_hwmon/
-Sysfs device attributes are on the driver's sysfs attribute space,
+Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
-for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/.
+space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/.
+Sysfs device attributes for the sensors and fan are on the
+thinkpad_hwmon device's sysfs attribute space, but you should locate it
+by looking for a hwmon device with the name attribute of "thinkpad".
 Driver version
 --------------
@@ -766,7 +771,7 @@ Temperature sensors
 -------------------
 procfs: /proc/acpi/ibm/thermal
-sysfs device attributes: (hwmon) temp*_input
+sysfs device attributes: (hwmon "thinkpad") temp*_input
 Most ThinkPads include six or more separate temperature sensors but only
 expose the CPU temperature through the standard ACPI methods.  This
@@ -989,7 +994,9 @@ Fan control and monitoring: fan speed, fan enable/disable
 ---------------------------------------------------------
 procfs: /proc/acpi/ibm/fan
-sysfs device attributes: (hwmon) fan_input, pwm1, pwm1_enable
+sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
+                          pwm1_enable
+sysfs hwmon driver attributes: fan_watchdog
 NOTE NOTE NOTE: fan control operations are disabled by default for
 safety reasons.  To enable them, the module parameter "fan_control=1"
@@ -1028,7 +1035,7 @@ enable it if necessary to avoid overheating.
 An enabled fan in level "auto" may stop spinning if the EC decides the
 ThinkPad is cool enough and doesn't need the extra airflow.  This is
-normal, and the EC will spin the fan up if the varios thermal readings
+normal, and the EC will spin the fan up if the various thermal readings
 rise too much.
 On the X40, this seems to depend on the CPU and HDD temperatures.
@@ -1131,7 +1138,7 @@ hwmon device attribute fan1_input:
        which can take up to two minutes.  May return rubbish on older
        ThinkPads.
-driver attribute fan_watchdog:
+hwmon driver attribute fan_watchdog:
        Fan safety watchdog timer interval, in seconds.  Minimum is
        1 second, maximum is 120 seconds.  0 disables the watchdog.
@@ -1196,7 +1203,7 @@ for example:
 Enabling debugging output
 -------------------------
-The module takes a debug paramater which can be used to selectively
+The module takes a debug parameter which can be used to selectively
 enable various classes of debugging output, for example:
         modprobe ibm_acpi debug=0xffff
@@ -1233,3 +1240,9 @@ Sysfs interface changelog:
                layer, the radio switch generates input event EV_RADIO,
                and the driver enables hot key handling by default in
                the firmware.
+0x020000:       ABI fix: added a separate hwmon platform device and
+                driver, which must be located by name (thinkpad)
+                and the hwmon class for libsensors4 (lm-sensors 3)
+                compatibility.  Moved all hwmon attributes to this
+                new platform device.
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt
index 4e0b62b8566f..8b077e43eee7 100644
--- a/Documentation/usb/usb-serial.txt
+++ b/Documentation/usb/usb-serial.txt
@@ -338,7 +338,7 @@ MCT USB Single Port Serial Adapter U232
  This driver is for the MCT USB-RS232 Converter (25 pin, Model No.
  U232-P25) from Magic Control Technology Corp. (there is also a 9 pin
  Model No. U232-P9). More information about this device can be found at
-  the manufacture's web-site: http://www.mct.com.tw.
+  the manufacturer's web-site: http://www.mct.com.tw.
  The driver is generally working, though it still needs some more testing.
  It is derived from the Belkin USB Serial Adapter F5U103 driver and its
diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c
index 47801bc7e742..4cf72f3fa8e9 100644
--- a/Documentation/watchdog/src/watchdog-simple.c
+++ b/Documentation/watchdog/src/watchdog-simple.c
@@ -3,15 +3,25 @@
 #include <unistd.h>
 #include <fcntl.h>
-int main(int argc, const char *argv[]) {
+int main(void)
+{
        int fd = open("/dev/watchdog", O_WRONLY);
+        int ret = 0;
        if (fd == -1) {
                perror("watchdog");
-                exit(1);
+                exit(EXIT_FAILURE);
        }
        while (1) {
-                write(fd, "\0", 1);
+                ret = write(fd, "\0", 1);
-                fsync(fd);
+                if (ret != 1) {
+                        ret = -1;
+                        break;
+                }
+                ret = fsync(fd);
+                if (ret)
+                        break;
                sleep(10);
        }
+        close(fd);
+        return ret;
 }