aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/DocBook/Makefile2
-rw-r--r--Documentation/DocBook/kernel-api.tmpl2
-rw-r--r--Documentation/DocBook/mtdnand.tmpl6
-rw-r--r--Documentation/IPMI.txt25
-rw-r--r--Documentation/Intel-IOMMU.txt115
-rw-r--r--Documentation/SubmitChecklist2
-rw-r--r--Documentation/SubmittingDrivers3
-rw-r--r--Documentation/accounting/cgroupstats.txt27
-rw-r--r--Documentation/arm/Samsung-S3C24XX/DMA.txt18
-rw-r--r--Documentation/atomic_ops.txt14
-rw-r--r--Documentation/cachetlb.txt27
-rw-r--r--Documentation/cdrom/cdrom-standard.tex2
-rw-r--r--Documentation/cgroups.txt545
-rw-r--r--Documentation/cpu-hotplug.txt4
-rw-r--r--Documentation/cpusets.txt226
-rw-r--r--Documentation/device-mapper/dm-uevent.txt97
-rw-r--r--Documentation/devices.txt2
-rw-r--r--Documentation/driver-model/devres.txt4
-rw-r--r--Documentation/fb/deferred_io.txt4
-rw-r--r--Documentation/feature-removal-schedule.txt66
-rw-r--r--Documentation/filesystems/9p.txt10
-rw-r--r--Documentation/filesystems/Exporting115
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--Documentation/filesystems/ext3.txt14
-rw-r--r--Documentation/filesystems/files.txt6
-rw-r--r--Documentation/filesystems/proc.txt7
-rw-r--r--Documentation/filesystems/sysfs.txt2
-rw-r--r--Documentation/filesystems/vfs.txt2
-rw-r--r--Documentation/i2c/i2c-protocol2
-rw-r--r--Documentation/i386/boot.txt34
-rw-r--r--Documentation/ia64/err_inject.txt6
-rw-r--r--Documentation/input/atarikbd.txt8
-rw-r--r--Documentation/input/ff.txt2
-rw-r--r--Documentation/input/iforce-protocol.txt20
-rw-r--r--Documentation/input/input-programming.txt17
-rw-r--r--Documentation/isdn/CREDITS2
-rw-r--r--Documentation/isdn/README.concap2
-rw-r--r--Documentation/java.txt2
-rw-r--r--Documentation/kbuild/kconfig-language.txt14
-rw-r--r--Documentation/kbuild/makefiles.txt22
-rw-r--r--Documentation/kdump/kdump.txt26
-rw-r--r--Documentation/kernel-docs.txt4
-rw-r--r--Documentation/kernel-parameters.txt81
-rw-r--r--Documentation/lguest/Makefile26
-rw-r--r--Documentation/lguest/lguest.c1629
-rw-r--r--Documentation/lguest/lguest.txt72
-rw-r--r--Documentation/m68k/kernel-options.txt5
-rw-r--r--Documentation/markers.txt81
-rw-r--r--Documentation/memory-barriers.txt14
-rw-r--r--Documentation/memory-hotplug.txt58
-rw-r--r--Documentation/mips/00-INDEX2
-rw-r--r--Documentation/mips/AU1xxx_IDE.README2
-rw-r--r--Documentation/mips/time.README173
-rw-r--r--Documentation/mutex-design.txt4
-rw-r--r--Documentation/networking/bcm43xx.txt2
-rw-r--r--Documentation/networking/ip-sysctl.txt2
-rw-r--r--Documentation/networking/rxrpc.txt2
-rw-r--r--Documentation/networking/udplite.txt6
-rw-r--r--Documentation/parport-lowlevel.txt29
-rw-r--r--Documentation/power/basic-pm-debugging.txt4
-rw-r--r--Documentation/power/freezing-of-tasks.txt44
-rw-r--r--Documentation/power/interface.txt2
-rw-r--r--Documentation/power/swsusp-and-swap-files.txt2
-rw-r--r--Documentation/powerpc/eeh-pci-error-recovery.txt4
-rw-r--r--Documentation/powerpc/mpc52xx-device-tree-bindings.txt16
-rw-r--r--Documentation/scsi/aic79xx.txt2
-rw-r--r--Documentation/scsi/aic7xxx.txt6
-rw-r--r--Documentation/scsi/arcmsr_spec.txt2
-rw-r--r--Documentation/scsi/ibmmca.txt58
-rw-r--r--Documentation/sharedsubtree.txt1
-rw-r--r--Documentation/sound/alsa/soc/DAI.txt8
-rw-r--r--Documentation/sound/alsa/soc/clocking.txt10
-rw-r--r--Documentation/sound/alsa/soc/codec.txt6
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt4
-rw-r--r--Documentation/sound/alsa/soc/overview.txt17
-rw-r--r--Documentation/sound/alsa/soc/platform.txt2
-rw-r--r--Documentation/sound/alsa/soc/pops_clicks.txt6
-rw-r--r--Documentation/sound/oss/es137164
-rw-r--r--Documentation/spi/pxa2xx2
-rw-r--r--Documentation/thinkpad-acpi.txt29
-rw-r--r--Documentation/usb/usb-serial.txt2
-rw-r--r--Documentation/watchdog/src/watchdog-simple.c18
82 files changed, 2515 insertions, 1492 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 1a7f53068ec2..054a7ecf64c6 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -165,7 +165,7 @@ quiet_cmd_db2man = MAN $@
165 @touch $@ 165 @touch $@
166 166
167### 167###
168# Rules to generate postscripts and PNG imgages from .fig format files 168# Rules to generate postscripts and PNG images from .fig format files
169quiet_cmd_fig2eps = FIG2EPS $@ 169quiet_cmd_fig2eps = FIG2EPS $@
170 cmd_fig2eps = fig2dev -Leps $< $@ 170 cmd_fig2eps = fig2dev -Leps $< $@
171 171
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index d3290c46af51..aa38cc5692a0 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -46,7 +46,7 @@
46 46
47 <sect1><title>Atomic and pointer manipulation</title> 47 <sect1><title>Atomic and pointer manipulation</title>
48!Iinclude/asm-x86/atomic_32.h 48!Iinclude/asm-x86/atomic_32.h
49!Iinclude/asm-x86/unaligned_32.h 49!Iinclude/asm-x86/unaligned.h
50 </sect1> 50 </sect1>
51 51
52 <sect1><title>Delaying, scheduling, and timer routines</title> 52 <sect1><title>Delaying, scheduling, and timer routines</title>
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 6fbc41d98c1e..957cf5c26831 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -282,7 +282,7 @@ int __init board_init (void)
282 goto out; 282 goto out;
283 } 283 }
284 284
285 /* map physical adress */ 285 /* map physical address */
286 baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024); 286 baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
287 if(!baseaddr){ 287 if(!baseaddr){
288 printk("Ioremap to access NAND chip failed\n"); 288 printk("Ioremap to access NAND chip failed\n");
@@ -306,7 +306,7 @@ int __init board_init (void)
306 this->dev_ready = board_dev_ready; 306 this->dev_ready = board_dev_ready;
307 this->eccmode = NAND_ECC_SOFT; 307 this->eccmode = NAND_ECC_SOFT;
308 308
309 /* Scan to find existance of the device */ 309 /* Scan to find existence of the device */
310 if (nand_scan (board_mtd, 1)) { 310 if (nand_scan (board_mtd, 1)) {
311 err = -ENXIO; 311 err = -ENXIO;
312 goto out_ior; 312 goto out_ior;
@@ -340,7 +340,7 @@ static void __exit board_cleanup (void)
340 /* Release resources, unregister device */ 340 /* Release resources, unregister device */
341 nand_release (board_mtd); 341 nand_release (board_mtd);
342 342
343 /* unmap physical adress */ 343 /* unmap physical address */
344 iounmap((void *)baseaddr); 344 iounmap((void *)baseaddr);
345 345
346 /* Free the MTD device structure */ 346 /* Free the MTD device structure */
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 24dc3fcf1594..bc38283379f0 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -441,17 +441,20 @@ ACPI, and if none of those then a KCS device at the spec-specified
4410xca2. If you want to turn this off, set the "trydefaults" option to 4410xca2. If you want to turn this off, set the "trydefaults" option to
442false. 442false.
443 443
444If you have high-res timers compiled into the kernel, the driver will 444If your IPMI interface does not support interrupts and is a KCS or
445use them to provide much better performance. Note that if you do not 445SMIC interface, the IPMI driver will start a kernel thread for the
446have high-res timers enabled in the kernel and you don't have 446interface to help speed things up. This is a low-priority kernel
447interrupts enabled, the driver will run VERY slowly. Don't blame me, 447thread that constantly polls the IPMI driver while an IPMI operation
448is in progress. The force_kipmid module parameter will allow the user to
449force this thread on or off. If you force it off and don't have
450interrupts, the driver will run VERY slowly. Don't blame me,
448these interfaces suck. 451these interfaces suck.
449 452
450The driver supports a hot add and remove of interfaces. This way, 453The driver supports a hot add and remove of interfaces. This way,
451interfaces can be added or removed after the kernel is up and running. 454interfaces can be added or removed after the kernel is up and running.
452This is done using /sys/modules/ipmi_si/hotmod, which is a write-only 455This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
453parameter. You write a string to this interface. The string has the 456write-only parameter. You write a string to this interface. The string
454format: 457has the format:
455 <op1>[:op2[:op3...]] 458 <op1>[:op2[:op3...]]
456The "op"s are: 459The "op"s are:
457 add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]] 460 add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
@@ -581,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it
581gets a pre-action. During a panic or a reboot, the watchdog will 584gets a pre-action. During a panic or a reboot, the watchdog will
582start a 120 timer if it is running to make sure the reboot occurs. 585start a 120 timer if it is running to make sure the reboot occurs.
583 586
584Note that if you use the NMI preaction for the watchdog, you MUST 587Note that if you use the NMI preaction for the watchdog, you MUST NOT
585NOT use nmi watchdog mode 1. If you use the NMI watchdog, you 588use the nmi watchdog. There is no reasonable way to tell if an NMI
586must use mode 2. 589comes from the IPMI controller, so it must assume that if it gets an
590otherwise unhandled NMI, it must be from IPMI and it will panic
591immediately.
587 592
588Once you open the watchdog timer, you must write a 'V' character to the 593Once you open the watchdog timer, you must write a 'V' character to the
589device to close it, or the timer will not stop. This is a new semantic 594device to close it, or the timer will not stop. This is a new semantic
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 000000000000..c2321903aa09
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,115 @@
1Linux IOMMU Support
2===================
3
4The architecture spec can be obtained from the below location.
5
6http://www.intel.com/technology/virtualization/
7
8This guide gives a quick cheat sheet for some basic understanding.
9
10Some Keywords
11
12DMAR - DMA remapping
13DRHD - DMA Engine Reporting Structure
14RMRR - Reserved memory Region Reporting Structure
15ZLR - Zero length reads from PCI devices
16IOVA - IO Virtual address.
17
18Basic stuff
19-----------
20
21ACPI enumerates and lists the different DMA engines in the platform, and
22device scope relationships between PCI devices and which DMA engine controls
23them.
24
25What is RMRR?
26-------------
27
28There are some devices the BIOS controls, e.g. USB devices to perform
29PS2 emulation. The regions of memory used for these devices are marked
30reserved in the e820 map. When we turn on DMA translation, DMA to those
31regions will fail. Hence BIOS uses RMRR to specify these regions along with
32devices that need to access these regions. OS is expected to setup
33unity mappings for these regions for these devices to access these regions.
34
35How is IOVA generated?
36---------------------
37
38Well behaved drivers call pci_map_*() calls before sending command to device
39that needs to perform DMA. Once DMA is completed and mapping is no longer
40required, device performs a pci_unmap_*() call to unmap the region.
41
42The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
43device has its own domain (hence protection). Devices under p2p bridges
44share the virtual address with all devices under the p2p bridge due to
45transaction id aliasing for p2p bridges.
46
47IOVA generation is pretty generic. We used the same technique as vmalloc()
48but these are not global address spaces, but separate for each domain.
49Different DMA engines may support different number of domains.
50
51We also allocate guard pages with each mapping, so we can attempt to catch
52any overflow that might happen.
53
54
55Graphics Problems?
56------------------
57If you encounter issues with graphics devices, you can try adding
58option intel_iommu=igfx_off to turn off the integrated graphics engine.
59
60If it happens to be a PCI device included in the INCLUDE_ALL Engine,
61then try enabling CONFIG_DMAR_GFX_WA to setup a 1-1 map. We hear
62graphics drivers may be in process of using DMA api's in the near
63future and at that time this option can be yanked out.
64
65Some exceptions to IOVA
66-----------------------
67Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
68The same is true for peer to peer transactions. Hence we reserve the
69address from PCI MMIO ranges so they are not allocated for IOVA addresses.
70
71
72Fault reporting
73---------------
74When errors are reported, the DMA engine signals via an interrupt. The fault
75reason and the device that caused it are printed on the console.
76
77See below for sample.
78
79
80Boot Message Sample
81-------------------
82
83Something like this gets printed indicating presence of DMAR tables
84in ACPI.
85
86ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
87
88When DMAR is being processed and initialized by ACPI, prints DMAR locations
89and any RMRR's processed.
90
91ACPI DMAR:Host address width 36
92ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
93ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
94ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
95ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
96ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
97
98When DMAR is enabled for use, you will notice..
99
100PCI-DMA: Using DMAR IOMMU
101
102Fault reporting
103---------------
104
105DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
106DMAR:[fault reason 05] PTE Write access is not set
107DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
108DMAR:[fault reason 05] PTE Write access is not set
109
110TBD
111----
112
113- For compatibility testing, could use unity map domain for all devices, just
114 provide a 1-1 for all useful memory under a single domain for all devices.
115- API for paravirt ops for abstracting functionality for VMM folks.
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index 19e7f65c269f..34e06d2f194f 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -67,7 +67,7 @@ kernel patches.
6720: Check that it all passes `make headers_check'. 6720: Check that it all passes `make headers_check'.
68 68
6921: Has been checked with injection of at least slab and page-allocation 6921: Has been checked with injection of at least slab and page-allocation
70 fauilures. See Documentation/fault-injection/. 70 failures. See Documentation/fault-injection/.
71 71
72 If the new code is substantial, addition of subsystem-specific fault 72 If the new code is substantial, addition of subsystem-specific fault
73 injection might be appropriate. 73 injection might be appropriate.
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index d7e26427e426..24f2eb40cae5 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -36,8 +36,7 @@ Linux 2.4:
36 If the code area has a general maintainer then please submit it to 36 If the code area has a general maintainer then please submit it to
37 the maintainer listed in MAINTAINERS in the kernel file. If the 37 the maintainer listed in MAINTAINERS in the kernel file. If the
38 maintainer does not respond or you cannot find the appropriate 38 maintainer does not respond or you cannot find the appropriate
39 maintainer then please contact Marcelo Tosatti 39 maintainer then please contact Willy Tarreau <w@1wt.eu>.
40 <marcelo.tosatti@cyclades.com>.
41 40
42Linux 2.6: 41Linux 2.6:
43 The same rules apply as 2.4 except that you should follow linux-kernel 42 The same rules apply as 2.4 except that you should follow linux-kernel
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
new file mode 100644
index 000000000000..eda40fd39cad
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.txt
@@ -0,0 +1,27 @@
1Control Groupstats is inspired by the discussion at
2http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
3suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
4
5Per cgroup statistics infrastructure re-uses code from the taskstats
6interface. A new set of cgroup operations are registered with commands
7and attributes specific to cgroups. It should be very easy to
8extend per cgroup statistics, by adding members to the cgroupstats
9structure.
10
11The current model for cgroupstats is a pull model; a push model (to post
12statistics on interesting events) should be very easy to add. Currently
13user space requests statistics by passing the cgroup path.
14Statistics about the state of all the tasks in the cgroup is returned to
15user space.
16
17NOTE: We currently rely on delay accounting for extracting information
18about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
19information will not be available.
20
21To extract cgroup statistics a utility very similar to getdelays.c
22has been developed, the sample output of the utility is shown below
23
24~/balbir/cgroupstats # ./getdelays -C "/cgroup/a"
25sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
26~/balbir/cgroupstats # ./getdelays -C "/cgroup"
27sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/arm/Samsung-S3C24XX/DMA.txt b/Documentation/arm/Samsung-S3C24XX/DMA.txt
index 37f4edcc5d87..3ed82383efea 100644
--- a/Documentation/arm/Samsung-S3C24XX/DMA.txt
+++ b/Documentation/arm/Samsung-S3C24XX/DMA.txt
@@ -5,7 +5,7 @@ Introduction
5------------ 5------------
6 6
7 The kernel provides an interface to manage DMA transfers 7 The kernel provides an interface to manage DMA transfers
8 using the DMA channels in the cpu, so that the central 8 using the DMA channels in the CPU, so that the central
9 duty of managing channel mappings, and programming the 9 duty of managing channel mappings, and programming the
10 channel generators is in one place. 10 channel generators is in one place.
11 11
@@ -17,24 +17,24 @@ DMA Channel Ordering
17 channels to all sources, which means that some devices 17 channels to all sources, which means that some devices
18 have a restricted number of channels that can be used. 18 have a restricted number of channels that can be used.
19 19
20 To allow flexibilty for each cpu type and board, the 20 To allow flexibility for each CPU type and board, the
21 dma code can be given an dma ordering structure which 21 DMA code can be given a DMA ordering structure which
22 allows the order of channel search to be specified, as 22 allows the order of channel search to be specified, as
23 well as allowing the prohibition of certain claims. 23 well as allowing the prohibition of certain claims.
24 24
25 struct s3c24xx_dma_order has a list of channels, and 25 struct s3c24xx_dma_order has a list of channels, and
26 each channel within has a slot for a list of dma 26 each channel within has a slot for a list of DMA
27 channel numbers. The slots are searched in order, for 27 channel numbers. The slots are searched in order for
28 the presence of a dma channel number with DMA_CH_VALID 28 the presence of a DMA channel number with DMA_CH_VALID
29 orred in. 29 or-ed in.
30 30
31 If the order has the flag DMA_CH_NEVER set, then after 31 If the order has the flag DMA_CH_NEVER set, then after
32 checking the channel list, the system will return no 32 checking the channel list, the system will return no
33 found channel, thus denying the request. 33 found channel, thus denying the request.
34 34
35 A board support file can call s3c24xx_dma_order_set() 35 A board support file can call s3c24xx_dma_order_set()
36 to register an complete ordering set. The routine will 36 to register a complete ordering set. The routine will
37 copy the data, so the original can be discared with 37 copy the data, so the original can be discarded with
38 __initdata. 38 __initdata.
39 39
40 40
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index d46306fea230..f20c10c2858f 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -418,6 +418,20 @@ brothers:
418 */ 418 */
419 smp_mb__after_clear_bit(); 419 smp_mb__after_clear_bit();
420 420
421There are two special bitops with lock barrier semantics (acquire/release,
422same as spinlocks). These operate in the same way as their non-_lock/unlock
423postfixed variants, except that they are to provide acquire/release semantics,
424respectively. This means they can be used for bit_spin_trylock and
425bit_spin_unlock type operations without specifying any more barriers.
426
427 int test_and_set_bit_lock(unsigned long nr, unsigned long *addr);
428 void clear_bit_unlock(unsigned long nr, unsigned long *addr);
429 void __clear_bit_unlock(unsigned long nr, unsigned long *addr);
430
431The __clear_bit_unlock version is non-atomic, however it still implements
432unlock barrier semantics. This can be useful if the lock itself is protecting
433the other bits in the word.
434
421Finally, there are non-atomic versions of the bitmask operations 435Finally, there are non-atomic versions of the bitmask operations
422provided. They are used in contexts where some other higher-level SMP 436provided. They are used in contexts where some other higher-level SMP
423locking scheme is being used to protect the bitmask, and thus less 437locking scheme is being used to protect the bitmask, and thus less
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 552cabac0608..da42ab414c48 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -87,30 +87,7 @@ changes occur:
87 87
88 This is used primarily during fault processing. 88 This is used primarily during fault processing.
89 89
905) void flush_tlb_pgtables(struct mm_struct *mm, 905) void update_mmu_cache(struct vm_area_struct *vma,
91 unsigned long start, unsigned long end)
92
93 The software page tables for address space 'mm' for virtual
94 addresses in the range 'start' to 'end-1' are being torn down.
95
96 Some platforms cache the lowest level of the software page tables
97 in a linear virtually mapped array, to make TLB miss processing
98 more efficient. On such platforms, since the TLB is caching the
99 software page table structure, it needs to be flushed when parts
100 of the software page table tree are unlinked/freed.
101
102 Sparc64 is one example of a platform which does this.
103
104 Usually, when munmap()'ing an area of user virtual address
105 space, the kernel leaves the page table parts around and just
106 marks the individual pte's as invalid. However, if very large
107 portions of the address space are unmapped, the kernel frees up
108 those portions of the software page tables to prevent potential
109 excessive kernel memory usage caused by erratic mmap/mmunmap
110 sequences. It is at these times that flush_tlb_pgtables will
111 be invoked.
112
1136) void update_mmu_cache(struct vm_area_struct *vma,
114 unsigned long address, pte_t pte) 91 unsigned long address, pte_t pte)
115 92
116 At the end of every page fault, this routine is invoked to 93 At the end of every page fault, this routine is invoked to
@@ -123,7 +100,7 @@ changes occur:
123 translations for software managed TLB configurations. 100 translations for software managed TLB configurations.
124 The sparc64 port currently does this. 101 The sparc64 port currently does this.
125 102
1267) void tlb_migrate_finish(struct mm_struct *mm) 1036) void tlb_migrate_finish(struct mm_struct *mm)
127 104
128 This interface is called at the end of an explicit 105 This interface is called at the end of an explicit
129 process migration. This interface provides a hook 106 process migration. This interface provides a hook
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex
index 92f94e597582..c713aeb020c4 100644
--- a/Documentation/cdrom/cdrom-standard.tex
+++ b/Documentation/cdrom/cdrom-standard.tex
@@ -1009,7 +1009,7 @@ taken over the torch in maintaining \cdromc\ and integrating much
1009\cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and 1009\cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and
1010Gerd Knorr, who were the first to implement this interface for SCSI 1010Gerd Knorr, who were the first to implement this interface for SCSI
1011and IDE-CD drivers and added many ideas for extension of the data 1011and IDE-CD drivers and added many ideas for extension of the data
1012structures relative to kernel~2.0. Further thanks to Heiko Eissfeldt, 1012structures relative to kernel~2.0. Further thanks to Heiko Ei{\sz}feldt,
1013Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew 1013Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
1014Kroll, the \linux\ \cdrom\ device driver developers who were kind 1014Kroll, the \linux\ \cdrom\ device driver developers who were kind
1015enough to give suggestions and criticisms during the writing. Finally 1015enough to give suggestions and criticisms during the writing. Finally
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
new file mode 100644
index 000000000000..98a26f81fa75
--- /dev/null
+++ b/Documentation/cgroups.txt
@@ -0,0 +1,545 @@
1 CGROUPS
2 -------
3
4Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
5
6Original copyright statements from cpusets.txt:
7Portions Copyright (C) 2004 BULL SA.
8Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
9Modified by Paul Jackson <pj@sgi.com>
10Modified by Christoph Lameter <clameter@sgi.com>
11
12CONTENTS:
13=========
14
151. Control Groups
16 1.1 What are cgroups ?
17 1.2 Why are cgroups needed ?
18 1.3 How are cgroups implemented ?
19 1.4 What does notify_on_release do ?
20 1.5 How do I use cgroups ?
212. Usage Examples and Syntax
22 2.1 Basic Usage
23 2.2 Attaching processes
243. Kernel API
25 3.1 Overview
26 3.2 Synchronization
27 3.3 Subsystem API
284. Questions
29
301. Control Groups
31==========
32
331.1 What are cgroups ?
34----------------------
35
36Control Groups provide a mechanism for aggregating/partitioning sets of
37tasks, and all their future children, into hierarchical groups with
38specialized behaviour.
39
40Definitions:
41
42A *cgroup* associates a set of tasks with a set of parameters for one
43or more subsystems.
44
45A *subsystem* is a module that makes use of the task grouping
46facilities provided by cgroups to treat groups of tasks in
47particular ways. A subsystem is typically a "resource controller" that
48schedules a resource or applies per-cgroup limits, but it may be
49anything that wants to act on a group of processes, e.g. a
50virtualization subsystem.
51
52A *hierarchy* is a set of cgroups arranged in a tree, such that
53every task in the system is in exactly one of the cgroups in the
54hierarchy, and a set of subsystems; each subsystem has system-specific
55state attached to each cgroup in the hierarchy. Each hierarchy has
56an instance of the cgroup virtual filesystem associated with it.
57
58At any one time there may be multiple active hierachies of task
59cgroups. Each hierarchy is a partition of all tasks in the system.
60
61User level code may create and destroy cgroups by name in an
62instance of the cgroup virtual file system, specify and query to
63which cgroup a task is assigned, and list the task pids assigned to
64a cgroup. Those creations and assignments only affect the hierarchy
65associated with that instance of the cgroup file system.
66
67On their own, the only use for cgroups is for simple job
68tracking. The intention is that other subsystems hook into the generic
69cgroup support to provide new attributes for cgroups, such as
70accounting/limiting the resources which processes in a cgroup can
71access. For example, cpusets (see Documentation/cpusets.txt) allows
72you to associate a set of CPUs and a set of memory nodes with the
73tasks in each cgroup.
74
751.2 Why are cgroups needed ?
76----------------------------
77
78There are multiple efforts to provide process aggregations in the
79Linux kernel, mainly for resource tracking purposes. Such efforts
80include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
81namespaces. These all require the basic notion of a
82grouping/partitioning of processes, with newly forked processes ending
83in the same group (cgroup) as their parent process.
84
85The kernel cgroup patch provides the minimum essential kernel
86mechanisms required to efficiently implement such groups. It has
87minimal impact on the system fast paths, and provides hooks for
88specific subsystems such as cpusets to provide additional behaviour as
89desired.
90
91Multiple hierarchy support is provided to allow for situations where
92the division of tasks into cgroups is distinctly different for
93different subsystems - having parallel hierarchies allows each
94hierarchy to be a natural division of tasks, without having to handle
95complex combinations of tasks that would be present if several
96unrelated subsystems needed to be forced into the same tree of
97cgroups.
98
99At one extreme, each resource controller or subsystem could be in a
100separate hierarchy; at the other extreme, all subsystems
101would be attached to the same hierarchy.
102
103As an example of a scenario (originally proposed by vatsa@in.ibm.com)
104that can benefit from multiple hierarchies, consider a large
105university server with various users - students, professors, system
106tasks etc. The resource planning for this server could be along the
107following lines:
108
109 CPU : Top cpuset
110 / \
111 CPUSet1 CPUSet2
112 | |
113 (Profs) (Students)
114
115 In addition (system tasks) are attached to topcpuset (so
116 that they can run anywhere) with a limit of 20%
117
118 Memory : Professors (50%), students (30%), system (20%)
119
120 Disk : Prof (50%), students (30%), system (20%)
121
122 Network : WWW browsing (20%), Network File System (60%), others (20%)
123 / \
124 Prof (15%) students (5%)
125
126Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
127into NFS network class.
128
129At the same time firefox/lynx will share an appropriate CPU/Memory class
130depending on who launched it (prof/student).
131
132With the ability to classify tasks differently for different resources
133(by putting those resource subsystems in different hierarchies) then
134the admin can easily set up a script which receives exec notifications
135and depending on who is launching the browser he can
136
137 # echo browser_pid > /mnt/<restype>/<userclass>/tasks
138
139With only a single hierarchy, he now would potentially have to create
140a separate cgroup for every browser launched and associate it with
141appropriate network and other resource class. This may lead to
142proliferation of such cgroups.
143
144Also let's say that the administrator would like to give enhanced network
145access temporarily to a student's browser (since it is night and the user
146wants to do online gaming :) OR give one of the students simulation
147apps enhanced CPU power,
148
149With the ability to write pids directly to resource classes, it's just a
150matter of :
151
152 # echo pid > /mnt/network/<new_class>/tasks
153 (after some time)
154 # echo pid > /mnt/network/<orig_class>/tasks
155
156Without this ability, he would have to split the cgroup into
157multiple separate ones and then associate the new cgroups with the
158new resource classes.
159
160
161
1621.3 How are cgroups implemented ?
163---------------------------------
164
165Control Groups extends the kernel as follows:
166
167 - Each task in the system has a reference-counted pointer to a
168 css_set.
169
170 - A css_set contains a set of reference-counted pointers to
171 cgroup_subsys_state objects, one for each cgroup subsystem
172 registered in the system. There is no direct link from a task to
173 the cgroup of which it's a member in each hierarchy, but this
174 can be determined by following pointers through the
175 cgroup_subsys_state objects. This is because accessing the
176 subsystem state is something that's expected to happen frequently
177 and in performance-critical code, whereas operations that require a
178 task's actual cgroup assignments (in particular, moving between
179 cgroups) are less common. A linked list runs through the cg_list
180 field of each task_struct using the css_set, anchored at
181 css_set->tasks.
182
183 - A cgroup hierarchy filesystem can be mounted for browsing and
184 manipulation from user space.
185
186 - You can list all the tasks (by pid) attached to any cgroup.
187
188The implementation of cgroups requires a few, simple hooks
189into the rest of the kernel, none in performance critical paths:
190
191 - in init/main.c, to initialize the root cgroups and initial
192 css_set at system boot.
193
194 - in fork and exit, to attach and detach a task from its css_set.
195
196In addition a new file system, of type "cgroup" may be mounted, to
197enable browsing and modifying the cgroups presently known to the
198kernel. When mounting a cgroup hierarchy, you may specify a
199comma-separated list of subsystems to mount as the filesystem mount
200options. By default, mounting the cgroup filesystem attempts to
201mount a hierarchy containing all registered subsystems.
202
203If an active hierarchy with exactly the same set of subsystems already
204exists, it will be reused for the new mount. If no existing hierarchy
205matches, and any of the requested subsystems are in use in an existing
206hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
207is activated, associated with the requested subsystems.
208
209It's not currently possible to bind a new subsystem to an active
210cgroup hierarchy, or to unbind a subsystem from an active cgroup
211hierarchy. This may be possible in future, but is fraught with nasty
212error-recovery issues.
213
214When a cgroup filesystem is unmounted, if there are any
215child cgroups created below the top-level cgroup, that hierarchy
216will remain active even though unmounted; if there are no
217child cgroups then the hierarchy will be deactivated.
218
219No new system calls are added for cgroups - all support for
220querying and modifying cgroups is via this cgroup file system.
221
222Each task under /proc has an added file named 'cgroup' displaying,
223for each active hierarchy, the subsystem names and the cgroup name
224as the path relative to the root of the cgroup file system.
225
226Each cgroup is represented by a directory in the cgroup file system
227containing the following files describing that cgroup:
228
229 - tasks: list of tasks (by pid) attached to that cgroup
230 - notify_on_release flag: run /sbin/cgroup_release_agent on exit?
231
232Other subsystems such as cpusets may add additional files in each
233cgroup dir.
234
235New cgroups are created using the mkdir system call or shell
236command. The properties of a cgroup, such as its flags, are
237modified by writing to the appropriate file in that cgroup's
238directory, as listed above.
239
240The named hierarchical structure of nested cgroups allows partitioning
241a large system into nested, dynamically changeable, "soft-partitions".
242
243The attachment of each task, automatically inherited at fork by any
244children of that task, to a cgroup allows organizing the work load
245on a system into related sets of tasks. A task may be re-attached to
246any other cgroup, if allowed by the permissions on the necessary
247cgroup file system directories.
248
249When a task is moved from one cgroup to another, it gets a new
250css_set pointer - if there's an already existing css_set with the
251desired collection of cgroups then that group is reused, else a new
252css_set is allocated. Note that the current implementation uses a
253linear search to locate an appropriate existing css_set, so isn't
254very efficient. A future version will use a hash table for better
255performance.
256
257To allow access from a cgroup to the css_sets (and hence tasks)
258that comprise it, a set of cg_cgroup_link objects form a lattice;
259each cg_cgroup_link is linked into a list of cg_cgroup_links for
260a single cgroup on its cont_link_list field, and a list of
261cg_cgroup_links for a single css_set on its cg_link_list.
262
263Thus the set of tasks in a cgroup can be listed by iterating over
264each css_set that references the cgroup, and sub-iterating over
265each css_set's task set.
266
267The use of a Linux virtual file system (vfs) to represent the
268cgroup hierarchy provides for a familiar permission and name space
269for cgroups, with a minimum of additional kernel code.
270
2711.4 What does notify_on_release do ?
272------------------------------------
273
274*** notify_on_release is disabled in the current patch set. It will be
275*** reactivated in a future patch in a less-intrusive manner
276
277If the notify_on_release flag is enabled (1) in a cgroup, then
278whenever the last task in the cgroup leaves (exits or attaches to
279some other cgroup) and the last child cgroup of that cgroup
280is removed, then the kernel runs the command specified by the contents
281of the "release_agent" file in that hierarchy's root directory,
282supplying the pathname (relative to the mount point of the cgroup
283file system) of the abandoned cgroup. This enables automatic
284removal of abandoned cgroups. The default value of
285notify_on_release in the root cgroup at system boot is disabled
286(0). The default value of other cgroups at creation is the current
287value of their parent's notify_on_release setting. The default value of
288a cgroup hierarchy's release_agent path is empty.
289
2901.5 How do I use cgroups ?
291--------------------------
292
293To start a new job that is to be contained within a cgroup, using
294the "cpuset" cgroup subsystem, the steps are something like:
295
296 1) mkdir /dev/cgroup
297 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
298 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
299 the /dev/cgroup virtual file system.
300 4) Start a task that will be the "founding father" of the new job.
301 5) Attach that task to the new cgroup by writing its pid to the
302 /dev/cgroup tasks file for that cgroup.
303 6) fork, exec or clone the job tasks from this founding father task.
304
305For example, the following sequence of commands will set up a cgroup
306named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
307and then start a subshell 'sh' in that cgroup:
308
309 mount -t cgroup cpuset -ocpuset /dev/cgroup
310 cd /dev/cgroup
311 mkdir Charlie
312 cd Charlie
313 /bin/echo 2-3 > cpus
314 /bin/echo 1 > mems
315 /bin/echo $$ > tasks
316 sh
317 # The subshell 'sh' is now running in cgroup Charlie
318 # The next line should display '/Charlie'
319 cat /proc/self/cgroup
320
3212. Usage Examples and Syntax
322============================
323
3242.1 Basic Usage
325---------------
326
327Creating, modifying, using the cgroups can be done through the cgroup
328virtual filesystem.
329
330To mount a cgroup hierarchy with all available subsystems, type:
331# mount -t cgroup xxx /dev/cgroup
332
333The "xxx" is not interpreted by the cgroup code, but will appear in
334/proc/mounts so may be any useful identifying string that you like.
335
336To mount a cgroup hierarchy with just the cpuset and numtasks
337subsystems, type:
338# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
339
340To change the set of subsystems bound to a mounted hierarchy, just
341remount with different options:
342
343# mount -o remount,cpuset,ns /dev/cgroup
344
345Note that changing the set of subsystems is currently only supported
346when the hierarchy consists of a single (root) cgroup. Supporting
347the ability to arbitrarily bind/unbind subsystems from an existing
348cgroup hierarchy is intended to be implemented in the future.
349
350Then under /dev/cgroup you can find a tree that corresponds to the
351tree of the cgroups in the system. For instance, /dev/cgroup
352is the cgroup that holds the whole system.
353
354If you want to create a new cgroup under /dev/cgroup:
355# cd /dev/cgroup
356# mkdir my_cgroup
357
358Now you want to do something with this cgroup.
359# cd my_cgroup
360
361In this directory you can find several files:
362# ls
363notify_on_release release_agent tasks
364(plus whatever files are added by the attached subsystems)
365
366Now attach your shell to this cgroup:
367# /bin/echo $$ > tasks
368
369You can also create cgroups inside your cgroup by using mkdir in this
370directory.
371# mkdir my_sub_cs
372
373To remove a cgroup, just use rmdir:
374# rmdir my_sub_cs
375
376This will fail if the cgroup is in use (has cgroups inside, or
377has processes attached, or is held alive by other subsystem-specific
378reference).
379
3802.2 Attaching processes
381-----------------------
382
383# /bin/echo PID > tasks
384
385Note that it is PID, not PIDs. You can only attach ONE task at a time.
386If you have several tasks to attach, you have to do it one after another:
387
388# /bin/echo PID1 > tasks
389# /bin/echo PID2 > tasks
390 ...
391# /bin/echo PIDn > tasks
392
3933. Kernel API
394=============
395
3963.1 Overview
397------------
398
399Each kernel subsystem that wants to hook into the generic cgroup
400system needs to create a cgroup_subsys object. This contains
401various methods, which are callbacks from the cgroup system, along
402with a subsystem id which will be assigned by the cgroup system.
403
404Other fields in the cgroup_subsys object include:
405
406- subsys_id: a unique array index for the subsystem, indicating which
407 entry in cgroup->subsys[] this subsystem should be
408 managing. Initialized by cgroup_register_subsys(); prior to this
409 it should be initialized to -1
410
411- hierarchy: an index indicating which hierarchy, if any, this
412 subsystem is currently attached to. If this is -1, then the
413 subsystem is not attached to any hierarchy, and all tasks should be
414 considered to be members of the subsystem's top_cgroup. It should
415 be initialized to -1.
416
417- name: should be initialized to a unique subsystem name prior to
418 calling cgroup_register_subsystem. Should be no longer than
419 MAX_CGROUP_TYPE_NAMELEN
420
421Each cgroup object created by the system has an array of pointers,
422indexed by subsystem id; this pointer is entirely managed by the
423subsystem; the generic cgroup code will never touch this pointer.
424
4253.2 Synchronization
426-------------------
427
428There is a global mutex, cgroup_mutex, used by the cgroup
429system. This should be taken by anything that wants to modify a
430cgroup. It may also be taken to prevent cgroups from being
431modified, but more specific locks may be more appropriate in that
432situation.
433
434See kernel/cgroup.c for more details.
435
436Subsystems can take/release the cgroup_mutex via the functions
437cgroup_lock()/cgroup_unlock(), and can
438take/release the callback_mutex via the functions
439cgroup_lock()/cgroup_unlock().
440
441Accessing a task's cgroup pointer may be done in the following ways:
442- while holding cgroup_mutex
443- while holding the task's alloc_lock (via task_lock())
444- inside an rcu_read_lock() section via rcu_dereference()
445
4463.3 Subsystem API
447--------------------------
448
449Each subsystem should:
450
451- add an entry in linux/cgroup_subsys.h
452- define a cgroup_subsys object called <name>_subsys
453
454Each subsystem may export the following methods. The only mandatory
455methods are create/destroy. Any others that are null are presumed to
456be successful no-ops.
457
458struct cgroup_subsys_state *create(struct cgroup *cont)
459LL=cgroup_mutex
460
461Called to create a subsystem state object for a cgroup. The
462subsystem should allocate its subsystem state object for the passed
463cgroup, returning a pointer to the new object on success or a
464negative error code. On success, the subsystem pointer should point to
465a structure of type cgroup_subsys_state (typically embedded in a
466larger subsystem-specific object), which will be initialized by the
467cgroup system. Note that this will be called at initialization to
468create the root subsystem state for this subsystem; this case can be
469identified by the passed cgroup object having a NULL parent (since
470it's the root of the hierarchy) and may be an appropriate place for
471initialization code.
472
473void destroy(struct cgroup *cont)
474LL=cgroup_mutex
475
476The cgroup system is about to destroy the passed cgroup; the
477subsystem should do any necessary cleanup
478
479int can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
480 struct task_struct *task)
481LL=cgroup_mutex
482
483Called prior to moving a task into a cgroup; if the subsystem
484returns an error, this will abort the attach operation. If a NULL
485task is passed, then a successful result indicates that *any*
486unspecified task can be moved into the cgroup. Note that this isn't
487called on a fork. If this method returns 0 (success) then this should
488remain valid while the caller holds cgroup_mutex.
489
490void attach(struct cgroup_subsys *ss, struct cgroup *cont,
491 struct cgroup *old_cont, struct task_struct *task)
492LL=cgroup_mutex
493
494
495Called after the task has been attached to the cgroup, to allow any
496post-attachment activity that requires memory allocations or blocking.
497
498void fork(struct cgroup_subsys *ss, struct task_struct *task)
499LL=callback_mutex, maybe read_lock(tasklist_lock)
500
501Called when a task is forked into a cgroup. Also called during
502registration for all existing tasks.
503
504void exit(struct cgroup_subsys *ss, struct task_struct *task)
505LL=callback_mutex
506
507Called during task exit
508
509int populate(struct cgroup_subsys *ss, struct cgroup *cont)
510LL=none
511
512Called after creation of a cgroup to allow a subsystem to populate
513the cgroup directory with file entries. The subsystem should make
514calls to cgroup_add_file() with objects of type cftype (see
515include/linux/cgroup.h for details). Note that although this
516method can return an error code, the error code is currently not
517always handled well.
518
519void post_clone(struct cgroup_subsys *ss, struct cgroup *cont)
520
521Called at the end of cgroup_clone() to do any parameter
522initialization which might be required before a task could attach. For
523example in cpusets, no task may attach before 'cpus' and 'mems' are set
524up.
525
526void bind(struct cgroup_subsys *ss, struct cgroup *root)
527LL=callback_mutex
528
529Called when a cgroup subsystem is rebound to a different hierarchy
530and root cgroup. Currently this will only involve movement between
531the default hierarchy (which never has sub-cgroups) and a hierarchy
532that is being created/destroyed (and hence has no sub-cgroups).
533
5344. Questions
535============
536
537Q: what's up with this '/bin/echo' ?
538A: bash's builtin 'echo' command does not check calls to write() against
539 errors. If you use it in the cgroup file system, you won't be
540 able to tell whether a command succeeded or failed.
541
542Q: When I attach processes, only the first of the line gets really attached !
543A: We can only return one error code per call to write(). So you should also
544 put only ONE pid.
545
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index b6d24c22274b..a741f658a3c9 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-)
220 CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the 220 CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
221 CPU is being offlined while tasks are frozen due to a suspend operation in 221 CPU is being offlined while tasks are frozen due to a suspend operation in
222 progress 222 progress
223- All process is migrated away from this outgoing CPU to a new CPU 223- All processes are migrated away from this outgoing CPU to new CPUs.
224 The new CPU is chosen from each process' current cpuset, which may be
225 a subset of all online CPUs.
224- All interrupts targeted to this CPU is migrated to a new CPU 226- All interrupts targeted to this CPU is migrated to a new CPU
225- timers/bottom half/task lets are also migrated to a new CPU 227- timers/bottom half/task lets are also migrated to a new CPU
226- Once all services are migrated, kernel calls an arch specific routine 228- Once all services are migrated, kernel calls an arch specific routine
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ec9de6917f01..141bef1c8599 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net
7Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. 7Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
8Modified by Paul Jackson <pj@sgi.com> 8Modified by Paul Jackson <pj@sgi.com>
9Modified by Christoph Lameter <clameter@sgi.com> 9Modified by Christoph Lameter <clameter@sgi.com>
10Modified by Paul Menage <menage@google.com>
10 11
11CONTENTS: 12CONTENTS:
12========= 13=========
@@ -16,9 +17,9 @@ CONTENTS:
16 1.2 Why are cpusets needed ? 17 1.2 Why are cpusets needed ?
17 1.3 How are cpusets implemented ? 18 1.3 How are cpusets implemented ?
18 1.4 What are exclusive cpusets ? 19 1.4 What are exclusive cpusets ?
19 1.5 What does notify_on_release do ? 20 1.5 What is memory_pressure ?
20 1.6 What is memory_pressure ? 21 1.6 What is memory spread ?
21 1.7 What is memory spread ? 22 1.7 What is sched_load_balance ?
22 1.8 How do I use cpusets ? 23 1.8 How do I use cpusets ?
232. Usage Examples and Syntax 242. Usage Examples and Syntax
24 2.1 Basic Usage 25 2.1 Basic Usage
@@ -44,18 +45,19 @@ hierarchy visible in a virtual file system. These are the essential
44hooks, beyond what is already present, required to manage dynamic 45hooks, beyond what is already present, required to manage dynamic
45job placement on large systems. 46job placement on large systems.
46 47
47Each task has a pointer to a cpuset. Multiple tasks may reference 48Cpusets use the generic cgroup subsystem described in
48the same cpuset. Requests by a task, using the sched_setaffinity(2) 49Documentation/cgroup.txt.
49system call to include CPUs in its CPU affinity mask, and using the 50
50mbind(2) and set_mempolicy(2) system calls to include Memory Nodes 51Requests by a task, using the sched_setaffinity(2) system call to
51in its memory policy, are both filtered through that tasks cpuset, 52include CPUs in its CPU affinity mask, and using the mbind(2) and
52filtering out any CPUs or Memory Nodes not in that cpuset. The 53set_mempolicy(2) system calls to include Memory Nodes in its memory
53scheduler will not schedule a task on a CPU that is not allowed in 54policy, are both filtered through that tasks cpuset, filtering out any
54its cpus_allowed vector, and the kernel page allocator will not 55CPUs or Memory Nodes not in that cpuset. The scheduler will not
55allocate a page on a node that is not allowed in the requesting tasks 56schedule a task on a CPU that is not allowed in its cpus_allowed
56mems_allowed vector. 57vector, and the kernel page allocator will not allocate a page on a
57 58node that is not allowed in the requesting tasks mems_allowed vector.
58User level code may create and destroy cpusets by name in the cpuset 59
60User level code may create and destroy cpusets by name in the cgroup
59virtual file system, manage the attributes and permissions of these 61virtual file system, manage the attributes and permissions of these
60cpusets and which CPUs and Memory Nodes are assigned to each cpuset, 62cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
61specify and query to which cpuset a task is assigned, and list the 63specify and query to which cpuset a task is assigned, and list the
@@ -115,7 +117,7 @@ Cpusets extends these two mechanisms as follows:
115 - Cpusets are sets of allowed CPUs and Memory Nodes, known to the 117 - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
116 kernel. 118 kernel.
117 - Each task in the system is attached to a cpuset, via a pointer 119 - Each task in the system is attached to a cpuset, via a pointer
118 in the task structure to a reference counted cpuset structure. 120 in the task structure to a reference counted cgroup structure.
119 - Calls to sched_setaffinity are filtered to just those CPUs 121 - Calls to sched_setaffinity are filtered to just those CPUs
120 allowed in that tasks cpuset. 122 allowed in that tasks cpuset.
121 - Calls to mbind and set_mempolicy are filtered to just 123 - Calls to mbind and set_mempolicy are filtered to just
@@ -145,15 +147,10 @@ into the rest of the kernel, none in performance critical paths:
145 - in page_alloc.c, to restrict memory to allowed nodes. 147 - in page_alloc.c, to restrict memory to allowed nodes.
146 - in vmscan.c, to restrict page recovery to the current cpuset. 148 - in vmscan.c, to restrict page recovery to the current cpuset.
147 149
148In addition a new file system, of type "cpuset" may be mounted, 150You should mount the "cgroup" filesystem type in order to enable
149typically at /dev/cpuset, to enable browsing and modifying the cpusets 151browsing and modifying the cpusets presently known to the kernel. No
150presently known to the kernel. No new system calls are added for 152new system calls are added for cpusets - all support for querying and
151cpusets - all support for querying and modifying cpusets is via 153modifying cpusets is via this cpuset file system.
152this cpuset file system.
153
154Each task under /proc has an added file named 'cpuset', displaying
155the cpuset name, as the path relative to the root of the cpuset file
156system.
157 154
158The /proc/<pid>/status file for each task has two added lines, 155The /proc/<pid>/status file for each task has two added lines,
159displaying the tasks cpus_allowed (on which CPUs it may be scheduled) 156displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
@@ -163,16 +160,15 @@ in the format seen in the following example:
163 Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff 160 Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
164 Mems_allowed: ffffffff,ffffffff 161 Mems_allowed: ffffffff,ffffffff
165 162
166Each cpuset is represented by a directory in the cpuset file system 163Each cpuset is represented by a directory in the cgroup file system
167containing the following files describing that cpuset: 164containing (on top of the standard cgroup files) the following
165files describing that cpuset:
168 166
169 - cpus: list of CPUs in that cpuset 167 - cpus: list of CPUs in that cpuset
170 - mems: list of Memory Nodes in that cpuset 168 - mems: list of Memory Nodes in that cpuset
171 - memory_migrate flag: if set, move pages to cpusets nodes 169 - memory_migrate flag: if set, move pages to cpusets nodes
172 - cpu_exclusive flag: is cpu placement exclusive? 170 - cpu_exclusive flag: is cpu placement exclusive?
173 - mem_exclusive flag: is memory placement exclusive? 171 - mem_exclusive flag: is memory placement exclusive?
174 - tasks: list of tasks (by pid) attached to that cpuset
175 - notify_on_release flag: run /sbin/cpuset_release_agent on exit?
176 - memory_pressure: measure of how much paging pressure in cpuset 172 - memory_pressure: measure of how much paging pressure in cpuset
177 173
178In addition, the root cpuset only has the following file: 174In addition, the root cpuset only has the following file:
@@ -237,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken
237outside even a mem_exclusive cpuset. 233outside even a mem_exclusive cpuset.
238 234
239 235
2401.5 What does notify_on_release do ? 2361.5 What is memory_pressure ?
241------------------------------------
242
243If the notify_on_release flag is enabled (1) in a cpuset, then whenever
244the last task in the cpuset leaves (exits or attaches to some other
245cpuset) and the last child cpuset of that cpuset is removed, then
246the kernel runs the command /sbin/cpuset_release_agent, supplying the
247pathname (relative to the mount point of the cpuset file system) of the
248abandoned cpuset. This enables automatic removal of abandoned cpusets.
249The default value of notify_on_release in the root cpuset at system
250boot is disabled (0). The default value of other cpusets at creation
251is the current value of their parents notify_on_release setting.
252
253
2541.6 What is memory_pressure ?
255----------------------------- 237-----------------------------
256The memory_pressure of a cpuset provides a simple per-cpuset metric 238The memory_pressure of a cpuset provides a simple per-cpuset metric
257of the rate that the tasks in a cpuset are attempting to free up in 239of the rate that the tasks in a cpuset are attempting to free up in
@@ -308,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second,
308times 1000. 290times 1000.
309 291
310 292
3111.7 What is memory spread ? 2931.6 What is memory spread ?
312--------------------------- 294---------------------------
313There are two boolean flag files per cpuset that control where the 295There are two boolean flag files per cpuset that control where the
314kernel allocates pages for the file system buffers and related in 296kernel allocates pages for the file system buffers and related in
@@ -378,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the
378data set, the memory allocation across the nodes in the jobs cpuset 360data set, the memory allocation across the nodes in the jobs cpuset
379can become very uneven. 361can become very uneven.
380 362
3631.7 What is sched_load_balance ?
364--------------------------------
365
366The kernel scheduler (kernel/sched.c) automatically load balances
367tasks. If one CPU is underutilized, kernel code running on that
368CPU will look for tasks on other more overloaded CPUs and move those
369tasks to itself, within the constraints of such placement mechanisms
370as cpusets and sched_setaffinity.
371
372The algorithmic cost of load balancing and its impact on key shared
373kernel data structures such as the task list increases more than
374linearly with the number of CPUs being balanced. So the scheduler
375has support to partition the system's CPUs into a number of sched
376domains such that it only load balances within each sched domain.
377Each sched domain covers some subset of the CPUs in the system;
378no two sched domains overlap; some CPUs might not be in any sched
379domain and hence won't be load balanced.
380
381Put simply, it costs less to balance between two smaller sched domains
382than one big one, but doing so means that overloads in one of the
383two domains won't be load balanced to the other one.
384
385By default, there is one sched domain covering all CPUs, except those
386marked isolated using the kernel boot time "isolcpus=" argument.
387
388This default load balancing across all CPUs is not well suited for
389the following two situations:
390 1) On large systems, load balancing across many CPUs is expensive.
391 If the system is managed using cpusets to place independent jobs
392 on separate sets of CPUs, full load balancing is unnecessary.
393 2) Systems supporting realtime on some CPUs need to minimize
394 system overhead on those CPUs, including avoiding task load
395 balancing if that is not needed.
396
397When the per-cpuset flag "sched_load_balance" is enabled (the default
398setting), it requests that all the CPUs in that cpuset's allowed 'cpus'
399be contained in a single sched domain, ensuring that load balancing
400can move a task (not otherwised pinned, as by sched_setaffinity)
401from any CPU in that cpuset to any other.
402
403When the per-cpuset flag "sched_load_balance" is disabled, then the
404scheduler will avoid load balancing across the CPUs in that cpuset,
405--except-- in so far as is necessary because some overlapping cpuset
406has "sched_load_balance" enabled.
407
408So, for example, if the top cpuset has the flag "sched_load_balance"
409enabled, then the scheduler will have one sched domain covering all
410CPUs, and the setting of the "sched_load_balance" flag in any other
411cpusets won't matter, as we're already fully load balancing.
412
413Therefore in the above two situations, the top cpuset flag
414"sched_load_balance" should be disabled, and only some of the smaller,
415child cpusets have this flag enabled.
416
417When doing this, you don't usually want to leave any unpinned tasks in
418the top cpuset that might use non-trivial amounts of CPU, as such tasks
419may be artificially constrained to some subset of CPUs, depending on
420the particulars of this flag setting in descendent cpusets. Even if
421such a task could use spare CPU cycles in some other CPUs, the kernel
422scheduler might not consider the possibility of load balancing that
423task to that underused CPU.
424
425Of course, tasks pinned to a particular CPU can be left in a cpuset
426that disables "sched_load_balance" as those tasks aren't going anywhere
427else anyway.
428
429There is an impedance mismatch here, between cpusets and sched domains.
430Cpusets are hierarchical and nest. Sched domains are flat; they don't
431overlap and each CPU is in at most one sched domain.
432
433It is necessary for sched domains to be flat because load balancing
434across partially overlapping sets of CPUs would risk unstable dynamics
435that would be beyond our understanding. So if each of two partially
436overlapping cpusets enables the flag 'sched_load_balance', then we
437form a single sched domain that is a superset of both. We won't move
438a task to a CPU outside its cpuset, but the scheduler load balancing
439code might waste some compute cycles considering that possibility.
440
441This mismatch is why there is not a simple one-to-one relation
442between which cpusets have the flag "sched_load_balance" enabled,
443and the sched domain configuration. If a cpuset enables the flag, it
444will get balancing across all its CPUs, but if it disables the flag,
445it will only be assured of no load balancing if no other overlapping
446cpuset enables the flag.
447
448If two cpusets have partially overlapping 'cpus' allowed, and only
449one of them has this flag enabled, then the other may find its
450tasks only partially load balanced, just on the overlapping CPUs.
451This is just the general case of the top_cpuset example given a few
452paragraphs above. In the general case, as in the top cpuset case,
453don't leave tasks that might use non-trivial amounts of CPU in
454such partially load balanced cpusets, as they may be artificially
455constrained to some subset of the CPUs allowed to them, for lack of
456load balancing to the other CPUs.
457
4581.7.1 sched_load_balance implementation details.
459------------------------------------------------
460
461The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
462to most cpuset flags.) When enabled for a cpuset, the kernel will
463ensure that it can load balance across all the CPUs in that cpuset
464(makes sure that all the CPUs in the cpus_allowed of that cpuset are
465in the same sched domain.)
466
467If two overlapping cpusets both have 'sched_load_balance' enabled,
468then they will be (must be) both in the same sched domain.
469
470If, as is the default, the top cpuset has 'sched_load_balance' enabled,
471then by the above that means there is a single sched domain covering
472the whole system, regardless of any other cpuset settings.
473
474The kernel commits to user space that it will avoid load balancing
475where it can. It will pick as fine a granularity partition of sched
476domains as it can while still providing load balancing for any set
477of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
478
479The internal kernel cpuset to scheduler interface passes from the
480cpuset code to the scheduler code a partition of the load balanced
481CPUs in the system. This partition is a set of subsets (represented
482as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all
483the CPUs that must be load balanced.
484
485Whenever the 'sched_load_balance' flag changes, or CPUs come or go
486from a cpuset with this flag enabled, or a cpuset with this flag
487enabled is removed, the cpuset code builds a new such partition and
488passes it to the scheduler sched domain setup code, to have the sched
489domains rebuilt as necessary.
490
491This partition exactly defines what sched domains the scheduler should
492setup - one sched domain for each element (cpumask_t) in the partition.
493
494The scheduler remembers the currently active sched domain partitions.
495When the scheduler routine partition_sched_domains() is invoked from
496the cpuset code to update these sched domains, it compares the new
497partition requested with the current, and updates its sched domains,
498removing the old and adding the new, for each change.
381 499
3821.8 How do I use cpusets ? 5001.8 How do I use cpusets ?
383-------------------------- 501--------------------------
@@ -469,7 +587,7 @@ than stress the kernel.
469To start a new job that is to be contained within a cpuset, the steps are: 587To start a new job that is to be contained within a cpuset, the steps are:
470 588
471 1) mkdir /dev/cpuset 589 1) mkdir /dev/cpuset
472 2) mount -t cpuset none /dev/cpuset 590 2) mount -t cgroup -ocpuset cpuset /dev/cpuset
473 3) Create the new cpuset by doing mkdir's and write's (or echo's) in 591 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
474 the /dev/cpuset virtual file system. 592 the /dev/cpuset virtual file system.
475 4) Start a task that will be the "founding father" of the new job. 593 4) Start a task that will be the "founding father" of the new job.
@@ -481,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset
481named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, 599named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
482and then start a subshell 'sh' in that cpuset: 600and then start a subshell 'sh' in that cpuset:
483 601
484 mount -t cpuset none /dev/cpuset 602 mount -t cgroup -ocpuset cpuset /dev/cpuset
485 cd /dev/cpuset 603 cd /dev/cpuset
486 mkdir Charlie 604 mkdir Charlie
487 cd Charlie 605 cd Charlie
@@ -513,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset
513virtual filesystem. 631virtual filesystem.
514 632
515To mount it, type: 633To mount it, type:
516# mount -t cpuset none /dev/cpuset 634# mount -t cgroup -o cpuset cpuset /dev/cpuset
517 635
518Then under /dev/cpuset you can find a tree that corresponds to the 636Then under /dev/cpuset you can find a tree that corresponds to the
519tree of the cpusets in the system. For instance, /dev/cpuset 637tree of the cpusets in the system. For instance, /dev/cpuset
@@ -556,6 +674,18 @@ To remove a cpuset, just use rmdir:
556This will fail if the cpuset is in use (has cpusets inside, or has 674This will fail if the cpuset is in use (has cpusets inside, or has
557processes attached). 675processes attached).
558 676
677Note that for legacy reasons, the "cpuset" filesystem exists as a
678wrapper around the cgroup filesystem.
679
680The command
681
682mount -t cpuset X /dev/cpuset
683
684is equivalent to
685
686mount -t cgroup -ocpuset X /dev/cpuset
687echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
688
5592.2 Adding/removing cpus 6892.2 Adding/removing cpus
560------------------------ 690------------------------
561 691
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt
new file mode 100644
index 000000000000..07edbd85c714
--- /dev/null
+++ b/Documentation/device-mapper/dm-uevent.txt
@@ -0,0 +1,97 @@
1The device-mapper uevent code adds the capability to device-mapper to create
2and send kobject uevents (uevents). Previously device-mapper events were only
3available through the ioctl interface. The advantage of the uevents interface
4is that the event contains environment attributes providing increased context for
5the event, avoiding the need to query the state of the device-mapper device after
6the event is received.
7
8There are two functions currently for device-mapper events. The first function
9listed creates the event and the second function sends the event(s).
10
11void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
12 const char *path, unsigned nr_valid_paths)
13
14void dm_send_uevents(struct list_head *events, struct kobject *kobj)
15
16
17The variables added to the uevent environment are:
18
19Variable Name: DM_TARGET
20Uevent Action(s): KOBJ_CHANGE
21Type: string
22Description:
23Value: Name of device-mapper target that generated the event.
24
25Variable Name: DM_ACTION
26Uevent Action(s): KOBJ_CHANGE
27Type: string
28Description:
29Value: Device-mapper specific action that caused the uevent action.
30 PATH_FAILED - A path has failed.
31 PATH_REINSTATED - A path has been reinstated.
32
33Variable Name: DM_SEQNUM
34Uevent Action(s): KOBJ_CHANGE
35Type: unsigned integer
36Description: A sequence number for this specific device-mapper device.
37Value: Valid unsigned integer range.
38
39Variable Name: DM_PATH
40Uevent Action(s): KOBJ_CHANGE
41Type: string
42Description: Major and minor number of the path device pertaining to this
43event.
44Value: Path name in the form of "Major:Minor"
45
46Variable Name: DM_NR_VALID_PATHS
47Uevent Action(s): KOBJ_CHANGE
48Type: unsigned integer
49Description:
50Value: Valid unsigned integer range.
51
52Variable Name: DM_NAME
53Uevent Action(s): KOBJ_CHANGE
54Type: string
55Description: Name of the device-mapper device.
56Value: Name
57
58Variable Name: DM_UUID
59Uevent Action(s): KOBJ_CHANGE
60Type: string
61Description: UUID of the device-mapper device.
62Value: UUID. (Empty string if there isn't one.)
63
64An example of the uevents generated as captured by udevmonitor is shown
65below.
66
671.) Path failure.
68UEVENT[1192521009.711215] change@/block/dm-3
69ACTION=change
70DEVPATH=/block/dm-3
71SUBSYSTEM=block
72DM_TARGET=multipath
73DM_ACTION=PATH_FAILED
74DM_SEQNUM=1
75DM_PATH=8:32
76DM_NR_VALID_PATHS=0
77DM_NAME=mpath2
78DM_UUID=mpath-35333333000002328
79MINOR=3
80MAJOR=253
81SEQNUM=1130
82
832.) Path reinstate.
84UEVENT[1192521132.989927] change@/block/dm-3
85ACTION=change
86DEVPATH=/block/dm-3
87SUBSYSTEM=block
88DM_TARGET=multipath
89DM_ACTION=PATH_REINSTATED
90DM_SEQNUM=2
91DM_PATH=8:32
92DM_NR_VALID_PATHS=1
93DM_NAME=mpath2
94DM_UUID=mpath-35333333000002328
95MINOR=3
96MAJOR=253
97SEQNUM=1131
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 6c46730c631a..e6244cde26e9 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -2188,7 +2188,7 @@ Your cooperation is appreciated.
2188 2188
2189136-143 char Unix98 PTY slaves 2189136-143 char Unix98 PTY slaves
2190 0 = /dev/pts/0 First Unix98 pseudo-TTY 2190 0 = /dev/pts/0 First Unix98 pseudo-TTY
2191 1 = /dev/pts/1 Second Unix98 pesudo-TTY 2191 1 = /dev/pts/1 Second Unix98 pseudo-TTY
2192 ... 2192 ...
2193 2193
2194 These device nodes are automatically generated with 2194 These device nodes are automatically generated with
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 8569072fa387..387b8a720f4a 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -32,7 +32,7 @@ braindamaged document, if it's finally working, well, it's working.
32 32
33For one reason or another, low level drivers don't receive as much 33For one reason or another, low level drivers don't receive as much
34attention or testing as core code, and bugs on driver detach or 34attention or testing as core code, and bugs on driver detach or
35initilaization failure doesn't happen often enough to be noticeable. 35initialization failure don't happen often enough to be noticeable.
36Init failure path is worse because it's much less travelled while 36Init failure path is worse because it's much less travelled while
37needs to handle multiple entry points. 37needs to handle multiple entry points.
38 38
@@ -160,7 +160,7 @@ resources on failure. For example,
160 devres_release_group(dev, NULL); 160 devres_release_group(dev, NULL);
161 return err_code; 161 return err_code;
162 162
163As resource acquision failure usually means probe failure, constructs 163As resource acquisition failure usually means probe failure, constructs
164like above are usually useful in midlayer driver (e.g. libata core 164like above are usually useful in midlayer driver (e.g. libata core
165layer) where interface function shouldn't have side effect on failure. 165layer) where interface function shouldn't have side effect on failure.
166For LLDs, just returning error code suffices in most cases. 166For LLDs, just returning error code suffices in most cases.
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
index 73cf9fb7cf60..63883a892120 100644
--- a/Documentation/fb/deferred_io.txt
+++ b/Documentation/fb/deferred_io.txt
@@ -3,7 +3,7 @@ Deferred IO
3 3
4Deferred IO is a way to delay and repurpose IO. It uses host memory as a 4Deferred IO is a way to delay and repurpose IO. It uses host memory as a
5buffer and the MMU pagefault as a pretrigger for when to perform the device 5buffer and the MMU pagefault as a pretrigger for when to perform the device
6IO. The following example may be a useful explaination of how one such setup 6IO. The following example may be a useful explanation of how one such setup
7works: 7works:
8 8
9- userspace app like Xfbdev mmaps framebuffer 9- userspace app like Xfbdev mmaps framebuffer
@@ -28,7 +28,7 @@ a relatively more expensive operation.
28 28
29For some types of nonvolatile high latency displays, the desired image is 29For some types of nonvolatile high latency displays, the desired image is
30the final image rather than the intermediate stages which is why it's okay 30the final image rather than the intermediate stages which is why it's okay
31to not update for each write that is occuring. 31to not update for each write that is occurring.
32 32
33It may be the case that this is useful in other scenarios as well. Paul Mundt 33It may be the case that this is useful in other scenarios as well. Paul Mundt
34has mentioned a case where it is beneficial to use the page count to decide 34has mentioned a case where it is beneficial to use the page count to decide
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 280ec06573e6..6bb9be54ab76 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -14,18 +14,6 @@ Who: Jiri Slaby <jirislaby@gmail.com>
14 14
15--------------------------- 15---------------------------
16 16
17What: V4L2 VIDIOC_G_MPEGCOMP and VIDIOC_S_MPEGCOMP
18When: October 2007
19Why: Broken attempt to set MPEG compression parameters. These ioctls are
20 not able to implement the wide variety of parameters that can be set
21 by hardware MPEG encoders. A new MPEG control mechanism was created
22 in kernel 2.6.18 that replaces these ioctls. See the V4L2 specification
23 (section 1.9: Extended controls) for more information on this topic.
24Who: Hans Verkuil <hverkuil@xs4all.nl> and
25 Mauro Carvalho Chehab <mchehab@infradead.org>
26
27---------------------------
28
29What: dev->power.power_state 17What: dev->power.power_state
30When: July 2007 18When: July 2007
31Why: Broken design for runtime control over driver power states, confusing 19Why: Broken design for runtime control over driver power states, confusing
@@ -49,10 +37,10 @@ Who: David Miller <davem@davemloft.net>
49--------------------------- 37---------------------------
50 38
51What: Video4Linux API 1 ioctls and video_decoder.h from Video devices. 39What: Video4Linux API 1 ioctls and video_decoder.h from Video devices.
52When: December 2006 40When: December 2008
53Files: include/linux/video_decoder.h 41Files: include/linux/video_decoder.h include/linux/videodev.h
54Check: include/linux/video_decoder.h 42Check: include/linux/video_decoder.h include/linux/videodev.h
55Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6 43Why: V4L1 API was replaced by V4L2 API during migration from 2.4 to 2.6
56 series. The old API have lots of drawbacks and don't provide enough 44 series. The old API have lots of drawbacks and don't provide enough
57 means to work with all video and audio standards. The newer API is 45 means to work with all video and audio standards. The newer API is
58 already available on the main drivers and should be used instead. 46 already available on the main drivers and should be used instead.
@@ -61,7 +49,9 @@ Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6
61 Decoder iocts are using internally to allow video drivers to 49 Decoder iocts are using internally to allow video drivers to
62 communicate with video decoders. This should also be improved to allow 50 communicate with video decoders. This should also be improved to allow
63 V4L2 calls being translated into compatible internal ioctls. 51 V4L2 calls being translated into compatible internal ioctls.
64Who: Mauro Carvalho Chehab <mchehab@brturbo.com.br> 52 Compatibility ioctls will be provided, for a while, via
53 v4l1-compat module.
54Who: Mauro Carvalho Chehab <mchehab@infradead.org>
65 55
66--------------------------- 56---------------------------
67 57
@@ -82,6 +72,41 @@ Who: Dominik Brodowski <linux@brodo.de>
82 72
83--------------------------- 73---------------------------
84 74
75What: sys_sysctl
76When: September 2010
77Option: CONFIG_SYSCTL_SYSCALL
78Why: The same information is available in a more convenient form from
79 /proc/sys, and none of the sysctl variables appear to be
80 important performance wise.
81
82 Binary sysctls are a long standing source of subtle kernel
83 bugs and security issues.
84
85 When I looked several months ago all I could find after
86 searching several distributions were 5 user space programs and
87 glibc (which falls back to /proc/sys) using this syscall.
88
89 The man page for sysctl(2) documents it as unusable for user
90 space programs.
91
92 sysctl(2) is not generally ABI compatible to a 32bit user
93 space application on a 64bit and a 32bit kernel.
94
95 For the last several months the policy has been no new binary
96 sysctls and no one has put forward an argument to use them.
97
98 Binary sysctl issues seem to keep appearing, so
99 properly deprecating them (with a warning to user space) and a
100 2 year grace warning period will mean eventually we can kill
101 them and end the pain.
102
103 In the meantime, individual binary sysctls can be dealt with
104 in a piecewise fashion.
105
106Who: Eric Biederman <ebiederm@xmission.com>
107
108---------------------------
109
85What: a.out interpreter support for ELF executables 110What: a.out interpreter support for ELF executables
86When: 2.6.25 111When: 2.6.25
87Files: fs/binfmt_elf.c 112Files: fs/binfmt_elf.c
@@ -184,13 +209,6 @@ Who: Jean Delvare <khali@linux-fr.org>,
184 209
185--------------------------- 210---------------------------
186 211
187What: drivers depending on OBSOLETE_OSS
188When: options in 2.6.22, code in 2.6.24
189Why: OSS drivers with ALSA replacements
190Who: Adrian Bunk <bunk@stusta.de>
191
192---------------------------
193
194What: ACPI procfs interface 212What: ACPI procfs interface
195When: July 2008 213When: July 2008
196Why: ACPI sysfs conversion should be finished by January 2008. 214Why: ACPI sysfs conversion should be finished by January 2008.
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index d6fd6c6e4244..bf8080640eba 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -42,10 +42,12 @@ OPTIONS
42 42
43 trans=name select an alternative transport. Valid options are 43 trans=name select an alternative transport. Valid options are
44 currently: 44 currently:
45 unix - specifying a named pipe mount point 45 unix - specifying a named pipe mount point
46 tcp - specifying a normal TCP/IP connection 46 tcp - specifying a normal TCP/IP connection
47 fd - used passed file descriptors for connection 47 fd - used passed file descriptors for connection
48 (see rfdno and wfdno) 48 (see rfdno and wfdno)
49 virtio - connect to the next virtio channel available
50 (from lguest or KVM with trans_virtio module)
49 51
50 uname=name user name to attempt mount as on the remote server. The 52 uname=name user name to attempt mount as on the remote server. The
51 server may override or ignore this value. Certain user 53 server may override or ignore this value. Certain user
@@ -54,7 +56,7 @@ OPTIONS
54 aname=name aname specifies the file tree to access when the server is 56 aname=name aname specifies the file tree to access when the server is
55 offering several exported file systems. 57 offering several exported file systems.
56 58
57 cache=mode specifies a cacheing policy. By default, no caches are used. 59 cache=mode specifies a caching policy. By default, no caches are used.
58 loose = no attempts are made at consistency, 60 loose = no attempts are made at consistency,
59 intended for exclusive, read-only mounts 61 intended for exclusive, read-only mounts
60 62
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting
index 31047e0fe14b..87019d2b5981 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/Exporting
@@ -2,9 +2,12 @@
2Making Filesystems Exportable 2Making Filesystems Exportable
3============================= 3=============================
4 4
5Most filesystem operations require a dentry (or two) as a starting 5Overview
6--------
7
8All filesystem operations require a dentry (or two) as a starting
6point. Local applications have a reference-counted hold on suitable 9point. Local applications have a reference-counted hold on suitable
7dentrys via open file descriptors or cwd/root. However remote 10dentries via open file descriptors or cwd/root. However remote
8applications that access a filesystem via a remote filesystem protocol 11applications that access a filesystem via a remote filesystem protocol
9such as NFS may not be able to hold such a reference, and so need a 12such as NFS may not be able to hold such a reference, and so need a
10different way to refer to a particular dentry. As the alternative 13different way to refer to a particular dentry. As the alternative
@@ -13,14 +16,14 @@ server-reboot (among other things, though these tend to be the most
13problematic), there is no simple answer like 'filename'. 16problematic), there is no simple answer like 'filename'.
14 17
15The mechanism discussed here allows each filesystem implementation to 18The mechanism discussed here allows each filesystem implementation to
16specify how to generate an opaque (out side of the filesystem) byte 19specify how to generate an opaque (outside of the filesystem) byte
17string for any dentry, and how to find an appropriate dentry for any 20string for any dentry, and how to find an appropriate dentry for any
18given opaque byte string. 21given opaque byte string.
19This byte string will be called a "filehandle fragment" as it 22This byte string will be called a "filehandle fragment" as it
20corresponds to part of an NFS filehandle. 23corresponds to part of an NFS filehandle.
21 24
22A filesystem which supports the mapping between filehandle fragments 25A filesystem which supports the mapping between filehandle fragments
23and dentrys will be termed "exportable". 26and dentries will be termed "exportable".
24 27
25 28
26 29
@@ -89,11 +92,9 @@ For a filesystem to be exportable it must:
89 1/ provide the filehandle fragment routines described below. 92 1/ provide the filehandle fragment routines described below.
90 2/ make sure that d_splice_alias is used rather than d_add 93 2/ make sure that d_splice_alias is used rather than d_add
91 when ->lookup finds an inode for a given parent and name. 94 when ->lookup finds an inode for a given parent and name.
92 Typically the ->lookup routine will end: 95 Typically the ->lookup routine will end with a:
93 if (inode) 96
94 return d_splice(inode, dentry); 97 return d_splice_alias(inode, dentry);
95 d_add(dentry, inode);
96 return NULL;
97 } 98 }
98 99
99 100
@@ -101,67 +102,39 @@ For a filesystem to be exportable it must:
101 A file system implementation declares that instances of the filesystem 102 A file system implementation declares that instances of the filesystem
102are exportable by setting the s_export_op field in the struct 103are exportable by setting the s_export_op field in the struct
103super_block. This field must point to a "struct export_operations" 104super_block. This field must point to a "struct export_operations"
104struct which could potentially be full of NULLs, though normally at 105struct which has the following members:
105least get_parent will be set. 106
106 107 encode_fh (optional)
107 The primary operations are decode_fh and encode_fh. 108 Takes a dentry and creates a filehandle fragment which can later be used
108decode_fh takes a filehandle fragment and tries to find or create a 109 to find or create a dentry for the same object. The default
109dentry for the object referred to by the filehandle. 110 implementation creates a filehandle fragment that encodes a 32bit inode
110encode_fh takes a dentry and creates a filehandle fragment which can 111 and generation number for the inode encoded, and if necessary the
111later be used to find/create a dentry for the same object. 112 same information for the parent.
112 113
113decode_fh will probably make use of "find_exported_dentry". 114 fh_to_dentry (mandatory)
114This function lives in the "exportfs" module which a filesystem does 115 Given a filehandle fragment, this should find the implied object and
115not need unless it is being exported. So rather that calling 116 create a dentry for it (possibly with d_alloc_anon).
116find_exported_dentry directly, each filesystem should call it through 117
117the find_exported_dentry pointer in it's export_operations table. 118 fh_to_parent (optional but strongly recommended)
118This field is set correctly by the exporting agent (e.g. nfsd) when a 119 Given a filehandle fragment, this should find the parent of the
119filesystem is exported, and before any export operations are called. 120 implied object and create a dentry for it (possibly with d_alloc_anon).
120 121 May fail if the filehandle fragment is too small.
121find_exported_dentry needs three support functions from the 122
122filesystem: 123 get_parent (optional but strongly recommended)
123 get_name. When given a parent dentry and a child dentry, this 124 When given a dentry for a directory, this should return a dentry for
124 should find a name in the directory identified by the parent 125 the parent. Quite possibly the parent dentry will have been allocated
125 dentry, which leads to the object identified by the child dentry. 126 by d_alloc_anon. The default get_parent function just returns an error
126 If no get_name function is supplied, a default implementation is 127 so any filehandle lookup that requires finding a parent will fail.
127 provided which uses vfs_readdir to find potential names, and 128 ->lookup("..") is *not* used as a default as it can leave ".." entries
128 matches inode numbers to find the correct match. 129 in the dcache which are too messy to work with.
129 130
130 get_parent. When given a dentry for a directory, this should return 131 get_name (optional)
131 a dentry for the parent. Quite possibly the parent dentry will 132 When given a parent dentry and a child dentry, this should find a name
132 have been allocated by d_alloc_anon. 133 in the directory identified by the parent dentry, which leads to the
133 The default get_parent function just returns an error so any 134 object identified by the child dentry. If no get_name function is
134 filehandle lookup that requires finding a parent will fail. 135 supplied, a default implementation is provided which uses vfs_readdir
135 ->lookup("..") is *not* used as a default as it can leave ".." 136 to find potential names, and matches inode numbers to find the correct
136 entries in the dcache which are too messy to work with. 137 match.
137
138 get_dentry. When given an opaque datum, this should find the
139 implied object and create a dentry for it (possibly with
140 d_alloc_anon).
141 The opaque datum is whatever is passed down by the decode_fh
142 function, and is often simply a fragment of the filehandle
143 fragment.
144 decode_fh passes two datums through find_exported_dentry. One that
145 should be used to identify the target object, and one that can be
146 used to identify the object's parent, should that be necessary.
147 The default get_dentry function assumes that the datum contains an
148 inode number and a generation number, and it attempts to get the
149 inode using "iget" and check it's validity by matching the
150 generation number. A filesystem should only depend on the default
151 if iget can safely be used this way.
152
153If decode_fh and/or encode_fh are left as NULL, then default
154implementations are used. These defaults are suitable for ext2 and
155extremely similar filesystems (like ext3).
156
157The default encode_fh creates a filehandle fragment from the inode
158number and generation number of the target together with the inode
159number and generation number of the parent (if the parent is
160required).
161
162The default decode_fh extract the target and parent datums from the
163filehandle assuming the format used by the default encode_fh and
164passed them to find_exported_dentry.
165 138
166 139
167A filehandle fragment consists of an array of 1 or more 4byte words, 140A filehandle fragment consists of an array of 1 or more 4byte words,
@@ -172,5 +145,3 @@ generated by encode_fh, in which case it will have been padded with
172nuls. Rather, the encode_fh routine should choose a "type" which 145nuls. Rather, the encode_fh routine should choose a "type" which
173indicates the decode_fh how much of the filehandle is valid, and how 146indicates the decode_fh how much of the filehandle is valid, and how
174it should be interpreted. 147it should be interpreted.
175
176
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index fe26cc978523..37c10cba7177 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -224,7 +224,7 @@ against the page the filesystem should redirty the page with
224redirty_page_for_writepage(), then unlock the page and return zero. 224redirty_page_for_writepage(), then unlock the page and return zero.
225This may also be done to avoid internal deadlocks, but rarely. 225This may also be done to avoid internal deadlocks, but rarely.
226 226
227If the filesytem is called for sync then it must wait on any 227If the filesystem is called for sync then it must wait on any
228in-progress I/O and then start new I/O. 228in-progress I/O and then start new I/O.
229 229
230The filesystem should unlock the page synchronously, before returning to the 230The filesystem should unlock the page synchronously, before returning to the
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 4aecc9bdb273..b45f3c1b8b43 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -130,12 +130,12 @@ Device layer.
130 130
131Journaling Block Device layer 131Journaling Block Device layer
132----------------------------- 132-----------------------------
133The Journaling Block Device layer (JBD) isn't ext3 specific. It was design to 133The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed
134add journaling capabilities on a block device. The ext3 filesystem code will 134to add journaling capabilities to a block device. The ext3 filesystem code
135inform the JBD of modifications it is performing (called a transaction). The 135will inform the JBD of modifications it is performing (called a transaction).
136journal supports the transactions start and stop, and in case of crash, the 136The journal supports the transactions start and stop, and in case of a crash,
137journal can replayed the transactions to put the partition back in a 137the journal can replay the transactions to quickly put the partition back into
138consistent state fast. 138a consistent state.
139 139
140Handles represent a single atomic update to a filesystem. JBD can handle an 140Handles represent a single atomic update to a filesystem. JBD can handle an
141external journal on a block device. 141external journal on a block device.
@@ -164,7 +164,7 @@ written to the journal first, and then to its final location.
164In the event of a crash, the journal can be replayed, bringing both data and 164In the event of a crash, the journal can be replayed, bringing both data and
165metadata into a consistent state. This mode is the slowest except when data 165metadata into a consistent state. This mode is the slowest except when data
166needs to be read from and written to disk at the same time where it 166needs to be read from and written to disk at the same time where it
167outperforms all others modes. 167outperforms all other modes.
168 168
169Compatibility 169Compatibility
170------------- 170-------------
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
index 133e213ebb72..bb0142f61084 100644
--- a/Documentation/filesystems/files.txt
+++ b/Documentation/filesystems/files.txt
@@ -76,13 +76,13 @@ the fdtable structure -
765. Handling of the file structures is special. Since the look-up 765. Handling of the file structures is special. Since the look-up
77 of the fd (fget()/fget_light()) are lock-free, it is possible 77 of the fd (fget()/fget_light()) are lock-free, it is possible
78 that look-up may race with the last put() operation on the 78 that look-up may race with the last put() operation on the
79 file structure. This is avoided using the rcuref APIs 79 file structure. This is avoided using atomic_inc_not_zero()
80 on ->f_count : 80 on ->f_count :
81 81
82 rcu_read_lock(); 82 rcu_read_lock();
83 file = fcheck_files(files, fd); 83 file = fcheck_files(files, fd);
84 if (file) { 84 if (file) {
85 if (rcuref_inc_lf(&file->f_count)) 85 if (atomic_inc_not_zero(&file->f_count))
86 *fput_needed = 1; 86 *fput_needed = 1;
87 else 87 else
88 /* Didn't get the reference, someone's freed */ 88 /* Didn't get the reference, someone's freed */
@@ -92,7 +92,7 @@ the fdtable structure -
92 .... 92 ....
93 return file; 93 return file;
94 94
95 rcuref_inc_lf() detects if refcounts is already zero or 95 atomic_inc_not_zero() detects if refcounts is already zero or
96 goes to zero during increment. If it does, we fail 96 goes to zero during increment. If it does, we fail
97 fget()/fget_light(). 97 fget()/fget_light().
98 98
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index e5c1df52a876..dec99455321f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -813,9 +813,9 @@ Various pieces of information about kernel activity are available in the
813since the system first booted. For a quick look, simply cat the file: 813since the system first booted. For a quick look, simply cat the file:
814 814
815 > cat /proc/stat 815 > cat /proc/stat
816 cpu 2255 34 2290 22625563 6290 127 456 816 cpu 2255 34 2290 22625563 6290 127 456 0
817 cpu0 1132 34 1441 11311718 3675 127 438 817 cpu0 1132 34 1441 11311718 3675 127 438 0
818 cpu1 1123 0 849 11313845 2614 0 18 818 cpu1 1123 0 849 11313845 2614 0 18 0
819 intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] 819 intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...]
820 ctxt 1990473 820 ctxt 1990473
821 btime 1062191376 821 btime 1062191376
@@ -835,6 +835,7 @@ second). The meanings of the columns are as follows, from left to right:
835- iowait: waiting for I/O to complete 835- iowait: waiting for I/O to complete
836- irq: servicing interrupts 836- irq: servicing interrupts
837- softirq: servicing softirqs 837- softirq: servicing softirqs
838- steal: involuntary wait
838 839
839The "intr" line gives counts of interrupts serviced since boot time, for each 840The "intr" line gives counts of interrupts serviced since boot time, for each
840of the possible system interrupts. The first column is the total of all 841of the possible system interrupts. The first column is the total of all
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 4b5ca26e5048..4598ef7b622b 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -51,7 +51,7 @@ for the attributes, providing a means to read and write kernel
51attributes. 51attributes.
52 52
53Attributes should be ASCII text files, preferably with only one value 53Attributes should be ASCII text files, preferably with only one value
54per file. It is noted that it may not be efficient to contain only 54per file. It is noted that it may not be efficient to contain only one
55value per file, so it is socially acceptable to express an array of 55value per file, so it is socially acceptable to express an array of
56values of the same type. 56values of the same type.
57 57
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 6f8e16e3d6c0..9d019d35728f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -706,7 +706,7 @@ struct address_space_operations {
706 wants to make it a free page. If ->releasepage succeeds, the 706 wants to make it a free page. If ->releasepage succeeds, the
707 page will be removed from the address_space and become free. 707 page will be removed from the address_space and become free.
708 708
709 The second case if when a request has been made to invalidate 709 The second case is when a request has been made to invalidate
710 some or all pages in an address_space. This can happen 710 some or all pages in an address_space. This can happen
711 through the fadvice(POSIX_FADV_DONTNEED) system call or by the 711 through the fadvice(POSIX_FADV_DONTNEED) system call or by the
712 filesystem explicitly requesting it as nfs and 9fs do (when 712 filesystem explicitly requesting it as nfs and 9fs do (when
diff --git a/Documentation/i2c/i2c-protocol b/Documentation/i2c/i2c-protocol
index 579b92d5f3a3..10518dd58814 100644
--- a/Documentation/i2c/i2c-protocol
+++ b/Documentation/i2c/i2c-protocol
@@ -68,7 +68,7 @@ We have found some I2C devices that needs the following modifications:
68 68
69 Flags I2C_M_IGNORE_NAK 69 Flags I2C_M_IGNORE_NAK
70 Normally message is interrupted immediately if there is [NA] from the 70 Normally message is interrupted immediately if there is [NA] from the
71 client. Setting this flag treats any [NA] as [A], and all of 71 client. Setting this flag treats any [NA] as [A], and all of
72 message is sent. 72 message is sent.
73 These messages may still fail to SCL lo->hi timeout. 73 These messages may still fail to SCL lo->hi timeout.
74 74
diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt
index 35985b34d5a6..2f75e750e4f5 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/i386/boot.txt
@@ -168,6 +168,8 @@ Offset Proto Name Meaning
1680234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 1680234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
1690235/3 N/A pad2 Unused 1690235/3 N/A pad2 Unused
1700238/4 2.06+ cmdline_size Maximum size of the kernel command line 1700238/4 2.06+ cmdline_size Maximum size of the kernel command line
171023C/4 2.07+ hardware_subarch Hardware subarchitecture
1720240/8 2.07+ hardware_subarch_data Subarchitecture-specific data
171 173
172(1) For backwards compatibility, if the setup_sects field contains 0, the 174(1) For backwards compatibility, if the setup_sects field contains 0, the
173 real value is 4. 175 real value is 4.
@@ -204,7 +206,7 @@ boot loaders can ignore those fields.
204 206
205The byte order of all fields is littleendian (this is x86, after all.) 207The byte order of all fields is littleendian (this is x86, after all.)
206 208
207Field name: setup_secs 209Field name: setup_sects
208Type: read 210Type: read
209Offset/size: 0x1f1/1 211Offset/size: 0x1f1/1
210Protocol: ALL 212Protocol: ALL
@@ -356,6 +358,13 @@ Protocol: 2.00+
356 - If 0, the protected-mode code is loaded at 0x10000. 358 - If 0, the protected-mode code is loaded at 0x10000.
357 - If 1, the protected-mode code is loaded at 0x100000. 359 - If 1, the protected-mode code is loaded at 0x100000.
358 360
361 Bit 6 (write): KEEP_SEGMENTS
362 Protocol: 2.07+
363 - if 0, reload the segment registers in the 32bit entry point.
364 - if 1, do not reload the segment registers in the 32bit entry point.
365 Assume that %cs %ds %ss %es are all set to flat segments with
366 a base of 0 (or the equivalent for their environment).
367
359 Bit 7 (write): CAN_USE_HEAP 368 Bit 7 (write): CAN_USE_HEAP
360 Set this bit to 1 to indicate that the value entered in the 369 Set this bit to 1 to indicate that the value entered in the
361 heap_end_ptr is valid. If this field is clear, some setup code 370 heap_end_ptr is valid. If this field is clear, some setup code
@@ -480,6 +489,29 @@ Protocol: 2.06+
480 cmdline_size characters. With protocol version 2.05 and earlier, the 489 cmdline_size characters. With protocol version 2.05 and earlier, the
481 maximum size was 255. 490 maximum size was 255.
482 491
492Field name: hardware_subarch
493Type: write
494Offset/size: 0x23c/4
495Protocol: 2.07+
496
497 In a paravirtualized environment the hardware low level architectural
498 pieces such as interrupt handling, page table handling, and
499 accessing process control registers need to be done differently.
500
501 This field allows the bootloader to inform the kernel we are in one
502 of those environments.
503
504 0x00000000 The default x86/PC environment
505 0x00000001 lguest
506 0x00000002 Xen
507
508Field name: hardware_subarch_data
509Type: write
510Offset/size: 0x240/8
511Protocol: 2.07+
512
513 A pointer to data that is specific to hardware subarch
514
483 515
484**** THE KERNEL COMMAND LINE 516**** THE KERNEL COMMAND LINE
485 517
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
index 6449a7090dbb..223e4f0582d0 100644
--- a/Documentation/ia64/err_inject.txt
+++ b/Documentation/ia64/err_inject.txt
@@ -21,10 +21,10 @@ software test suits to do stressful testing on IPF.
21 21
22Below is a sample application as part of the whole tool. The sample 22Below is a sample application as part of the whole tool. The sample
23can be used as a working test tool. Or it can be expanded to include 23can be used as a working test tool. Or it can be expanded to include
24more features. It also can be a integrated into a libary or other user 24more features. It also can be a integrated into a library or other user
25application to have more thorough test. 25application to have more thorough test.
26 26
27The sample application takes err.conf as error configuation input. Gcc 27The sample application takes err.conf as error configuration input. GCC
28compiles the code. After you install err_inject driver, you can run 28compiles the code. After you install err_inject driver, you can run
29this sample application to inject errors. 29this sample application to inject errors.
30 30
@@ -809,7 +809,7 @@ int err_inj()
809 } 809 }
810 810
811 /* Create semaphore: If one_lock, one semaphore for all processors. 811 /* Create semaphore: If one_lock, one semaphore for all processors.
812 Otherwise, one sempaphore for each processor. */ 812 Otherwise, one semaphore for each processor. */
813 if (one_lock) { 813 if (one_lock) {
814 if (create_sem(0)) { 814 if (create_sem(0)) {
815 printf("Can not create semaphore...exit\n"); 815 printf("Can not create semaphore...exit\n");
diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt
index ab050621e20f..f3a3ba8847ba 100644
--- a/Documentation/input/atarikbd.txt
+++ b/Documentation/input/atarikbd.txt
@@ -170,7 +170,7 @@ major controller faults (ROM checksum and RAM test) and such things as stuck
170keys. Any keys down at power-up are presumed to be stuck, and their BREAK 170keys. Any keys down at power-up are presumed to be stuck, and their BREAK
171(sic) code is returned (which without the preceding MAKE code is a flag for a 171(sic) code is returned (which without the preceding MAKE code is a flag for a
172keyboard error). If the controller self-test completes without error, the code 172keyboard error). If the controller self-test completes without error, the code
1730xF0 is returned. (This code will be used to indicate the version/rlease of 1730xF0 is returned. (This code will be used to indicate the version/release of
174the ikbd controller. The first release of the ikbd is version 0xF0, should 174the ikbd controller. The first release of the ikbd is version 0xF0, should
175there be a second release it will be 0xF1, and so on.) 175there be a second release it will be 0xF1, and so on.)
176The ikbd defaults to a mouse position reporting with threshold of 1 unit in 176The ikbd defaults to a mouse position reporting with threshold of 1 unit in
@@ -413,7 +413,7 @@ INTERROGATION MODE.
413 %nnnnmmmm ; where m is JOYSTICK1 state 413 %nnnnmmmm ; where m is JOYSTICK1 state
414 ; and n is JOYSTICK0 state 414 ; and n is JOYSTICK0 state
415 415
416Sets the ikbd to do nothing but monitor the serial command lne, maintain the 416Sets the ikbd to do nothing but monitor the serial command line, maintain the
417time-of-day clock, and monitor the joystick. The rate sets the interval 417time-of-day clock, and monitor the joystick. The rate sets the interval
418between joystick samples. 418between joystick samples.
419N.B. The user should not set the rate higher than the serial communications 419N.B. The user should not set the rate higher than the serial communications
@@ -446,10 +446,10 @@ The sample interval should be as constant as possible.
446 ; until vertical cursor key is generated before RY 446 ; until vertical cursor key is generated before RY
447 ; has elapsed 447 ; has elapsed
448 VX ; length (in tenths of seconds) of joystick closure 448 VX ; length (in tenths of seconds) of joystick closure
449 ; until horizontal cursor keystokes are generated 449 ; until horizontal cursor keystrokes are generated
450 ; after RX has elapsed 450 ; after RX has elapsed
451 VY ; length (in tenths of seconds) of joystick closure 451 VY ; length (in tenths of seconds) of joystick closure
452 ; until vertical cursor keystokes are generated 452 ; until vertical cursor keystrokes are generated
453 ; after RY has elapsed 453 ; after RY has elapsed
454 454
455In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes. 455In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes.
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt
index 085eb15b45b7..ded4d5f53109 100644
--- a/Documentation/input/ff.txt
+++ b/Documentation/input/ff.txt
@@ -1,5 +1,5 @@
1Force feedback for Linux. 1Force feedback for Linux.
2By Johann Deneux <deneux@ifrance.com> on 2001/04/22. 2By Johann Deneux <johann.deneux@gmail.com> on 2001/04/22.
3Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09. 3Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09.
4You may redistribute this file. Please remember to include shape.fig and 4You may redistribute this file. Please remember to include shape.fig and
5interactive.fig as well. 5interactive.fig as well.
diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt
index 8777d2d321e3..3ac92413c874 100644
--- a/Documentation/input/iforce-protocol.txt
+++ b/Documentation/input/iforce-protocol.txt
@@ -4,10 +4,10 @@ specify force effects to I-Force 2.0 devices. None of this information comes
4from Immerse. That's why you should not trust what is written in this 4from Immerse. That's why you should not trust what is written in this
5document. This document is intended to help understanding the protocol. 5document. This document is intended to help understanding the protocol.
6This is not a reference. Comments and corrections are welcome. To contact me, 6This is not a reference. Comments and corrections are welcome. To contact me,
7send an email to: deneux@ifrance.com 7send an email to: johann.deneux@gmail.com
8 8
9** WARNING ** 9** WARNING **
10I may not be held responsible for any dammage or harm caused if you try to 10I shall not be held responsible for any damage or harm caused if you try to
11send data to your I-Force device based on what you read in this document. 11send data to your I-Force device based on what you read in this document.
12 12
13** Preliminary Notes: 13** Preliminary Notes:
@@ -151,13 +151,13 @@ OP= ff
151Query command. Length varies according to the query type. 151Query command. Length varies according to the query type.
152The general format of this packet is: 152The general format of this packet is:
153ff 01 QUERY [INDEX] CHECKSUM 153ff 01 QUERY [INDEX] CHECKSUM
154reponses are of the same form: 154responses are of the same form:
155FF LEN QUERY VALUE_QUERIED CHECKSUM2 155FF LEN QUERY VALUE_QUERIED CHECKSUM2
156where LEN = 1 + length(VALUE_QUERIED) 156where LEN = 1 + length(VALUE_QUERIED)
157 157
158**** Query ram size **** 158**** Query ram size ****
159QUERY = 42 ('B'uffer size) 159QUERY = 42 ('B'uffer size)
160The device should reply with the same packet plus two additionnal bytes 160The device should reply with the same packet plus two additional bytes
161containing the size of the memory: 161containing the size of the memory:
162ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available. 162ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available.
163 163
@@ -234,19 +234,23 @@ is the amount of memory apparently needed for every set of parameters:
234 234
235** Appendix: How to study the protocol ? ** 235** Appendix: How to study the protocol ? **
236 236
2371. Generate effects using the force editor provided with the DirectX SDK, or use Immersion Studio (freely available at their web site in the developer section: www.immersion.com) 2371. Generate effects using the force editor provided with the DirectX SDK, or
2382. Start a soft spying RS232 or USB (depending on where you connected your joystick/wheel). I used ComPortSpy from fCoder (alpha version!) 238use Immersion Studio (freely available at their web site in the developer section:
239www.immersion.com)
2402. Start a soft spying RS232 or USB (depending on where you connected your
241joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
2393. Play the effect, and watch what happens on the spy screen. 2423. Play the effect, and watch what happens on the spy screen.
240 243
241A few words about ComPortSpy: 244A few words about ComPortSpy:
242At first glance, this soft seems, hum, well... buggy. In fact, data appear with a few seconds latency. Personnaly, I restart it every time I play an effect. 245At first glance, this software seems, hum, well... buggy. In fact, data appear with a
246few seconds latency. Personally, I restart it every time I play an effect.
243Remember it's free (as in free beer) and alpha! 247Remember it's free (as in free beer) and alpha!
244 248
245** URLS ** 249** URLS **
246Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy. 250Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy.
247 251
248** Author of this document ** 252** Author of this document **
249Johann Deneux <deneux@ifrance.com> 253Johann Deneux <johann.deneux@gmail.com>
250Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/ 254Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/
251 255
252Additions by Vojtech Pavlik. 256Additions by Vojtech Pavlik.
diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt
index d9d523099bb7..47fc86830cd7 100644
--- a/Documentation/input/input-programming.txt
+++ b/Documentation/input/input-programming.txt
@@ -42,8 +42,8 @@ static int __init button_init(void)
42 goto err_free_irq; 42 goto err_free_irq;
43 } 43 }
44 44
45 button_dev->evbit[0] = BIT(EV_KEY); 45 button_dev->evbit[0] = BIT_MASK(EV_KEY);
46 button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); 46 button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0);
47 47
48 error = input_register_device(button_dev); 48 error = input_register_device(button_dev);
49 if (error) { 49 if (error) {
@@ -79,7 +79,7 @@ In the _init function, which is called either upon module load or when
79booting the kernel, it grabs the required resources (it should also check 79booting the kernel, it grabs the required resources (it should also check
80for the presence of the device). 80for the presence of the device).
81 81
82Then it allocates a new input device structure with input_aloocate_device() 82Then it allocates a new input device structure with input_allocate_device()
83and sets up input bitfields. This way the device driver tells the other 83and sets up input bitfields. This way the device driver tells the other
84parts of the input systems what it is - what events can be generated or 84parts of the input systems what it is - what events can be generated or
85accepted by this input device. Our example device can only generate EV_KEY 85accepted by this input device. Our example device can only generate EV_KEY
@@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean
217that the thing is precise and always returns to exactly the center position 217that the thing is precise and always returns to exactly the center position
218(if it has any). 218(if it has any).
219 219
2201.4 NBITS(), LONG(), BIT() 2201.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK()
221~~~~~~~~~~~~~~~~~~~~~~~~~~ 221~~~~~~~~~~~~~~~~~~~~~~~~~~
222 222
223These three macros from input.h help some bitfield computations: 223These three macros from bitops.h help some bitfield computations:
224 224
225 NBITS(x) - returns the length of a bitfield array in longs for x bits 225 BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for
226 LONG(x) - returns the index in the array in longs for bit x 226 x bits
227 BIT(x) - returns the index in a long for bit x 227 BIT_WORD(x) - returns the index in the array in longs for bit x
228 BIT_MASK(x) - returns the bit mask within a long for bit x
228 229
2291.5 The id* and name fields 2301.5 The id* and name fields
230~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 231~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/isdn/CREDITS b/Documentation/isdn/CREDITS
index 7c17c837064f..8cac6c2f23ee 100644
--- a/Documentation/isdn/CREDITS
+++ b/Documentation/isdn/CREDITS
@@ -40,7 +40,7 @@ Andreas Kool (akool@Kool.f.EUnet.de)
40Pedro Roque Marques (roque@di.fc.ul.pt) 40Pedro Roque Marques (roque@di.fc.ul.pt)
41 For lot of new ideas and the pcbit driver. 41 For lot of new ideas and the pcbit driver.
42 42
43Eberhard Moenkeberg (emoenke@gwdg.de) 43Eberhard Mönkeberg (emoenke@gwdg.de)
44 For testing and help to get into kernel. 44 For testing and help to get into kernel.
45 45
46Thomas Neumann (tn@ruhr.de) 46Thomas Neumann (tn@ruhr.de)
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
index 2f114babe4b6..a76d74845a4c 100644
--- a/Documentation/isdn/README.concap
+++ b/Documentation/isdn/README.concap
@@ -111,7 +111,7 @@ struct concap_proto_ops{
111 struct concap_proto * (*proto_new) (void); 111 struct concap_proto * (*proto_new) (void);
112 112
113 /* delete encapsulation protocol instance and free all its resources. 113 /* delete encapsulation protocol instance and free all its resources.
114 cprot may no loger be referenced after calling this */ 114 cprot may no longer be referenced after calling this */
115 void (*proto_del)(struct concap_proto *cprot); 115 void (*proto_del)(struct concap_proto *cprot);
116 116
117 /* initialize the protocol's data. To be called at interface startup 117 /* initialize the protocol's data. To be called at interface startup
diff --git a/Documentation/java.txt b/Documentation/java.txt
index 3cce3fbb6644..e6a723281547 100644
--- a/Documentation/java.txt
+++ b/Documentation/java.txt
@@ -37,7 +37,7 @@ other program after you have done the following:
37 or the following, if you want to be more selective: 37 or the following, if you want to be more selective:
38 ':Applet:M::<!--applet::/usr/bin/appletviewer:' 38 ':Applet:M::<!--applet::/usr/bin/appletviewer:'
39 39
40 Of cause you have to fix the path names. Given path/file names in this 40 Of course you have to fix the path names. The path/file names given in this
41 document match the Debian 2.1 system. (i.e. jdk installed in /usr, 41 document match the Debian 2.1 system. (i.e. jdk installed in /usr,
42 custom wrappers from this document in /usr/local) 42 custom wrappers from this document in /usr/local)
43 43
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index fe8b0c4892cf..616043a6da99 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -77,7 +77,12 @@ applicable everywhere (see syntax).
77 Optionally, dependencies only for this default value can be added with 77 Optionally, dependencies only for this default value can be added with
78 "if". 78 "if".
79 79
80- dependencies: "depends on"/"requires" <expr> 80- type definition + default value:
81 "def_bool"/"def_tristate" <expr> ["if" <expr>]
82 This is a shorthand notation for a type definition plus a value.
83 Optionally dependencies for this default value can be added with "if".
84
85- dependencies: "depends on" <expr>
81 This defines a dependency for this menu entry. If multiple 86 This defines a dependency for this menu entry. If multiple
82 dependencies are defined, they are connected with '&&'. Dependencies 87 dependencies are defined, they are connected with '&&'. Dependencies
83 are applied to all other options within this menu entry (which also 88 are applied to all other options within this menu entry (which also
@@ -289,3 +294,10 @@ source:
289 "source" <prompt> 294 "source" <prompt>
290 295
291This reads the specified configuration file. This file is always parsed. 296This reads the specified configuration file. This file is always parsed.
297
298mainmenu:
299
300 "mainmenu" <prompt>
301
302This sets the config program's title bar if the config program chooses
303to use it.
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index f099b814d383..7a7753321a26 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -518,6 +518,28 @@ more details, with real examples.
518 In this example for a specific GCC version the build will error out explaining 518 In this example for a specific GCC version the build will error out explaining
519 to the user why it stops. 519 to the user why it stops.
520 520
521 cc-cross-prefix
522 cc-cross-prefix is used to check if there exists a $(CC) in path with
523 one of the listed prefixes. The first prefix where there exists a
524 prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found
525 then nothing is returned.
526 Additional prefixes are separated by a single space in the
527 call of cc-cross-prefix.
528 This functionality is useful for architecture Makefiles that try
529 to set CROSS_COMPILE to well-known values but may have several
530 values to select between.
531 It is recommended only to try to set CROSS_COMPILE if it is a cross
532 build (host arch is different from target arch). And if CROSS_COMPILE
533 is already set then leave it with the old value.
534
535 Example:
536 #arch/m68k/Makefile
537 ifneq ($(SUBARCH),$(ARCH))
538 ifeq ($(CROSS_COMPILE),)
539 CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-)
540 endif
541 endif
542
521=== 4 Host Program support 543=== 4 Host Program support
522 544
523Kbuild supports building executables on the host for use during the 545Kbuild supports building executables on the host for use during the
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 1b37b28cc234..d0ac72cc19ff 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -231,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64)
231 any space below the alignment point will be wasted. 231 any space below the alignment point will be wasted.
232 232
233 233
234Extended crashkernel syntax
235===========================
236
237While the "crashkernel=size[@offset]" syntax is sufficient for most
238configurations, sometimes it's handy to have the reserved memory dependent
239on the value of System RAM -- that's mostly for distributors that pre-setup
240the kernel command line to avoid an unbootable system after some memory has
241been removed from the machine.
242
243The syntax is:
244
245 crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
246 range=start-[end]
247
248For example:
249
250 crashkernel=512M-2G:64M,2G-:128M
251
252This would mean:
253
254 1) if the RAM is smaller than 512M, then don't reserve anything
255 (this is the "rescue" case)
256 2) if the RAM size is between 512M and 2G, then reserve 64M
257 3) if the RAM size is larger than 2G, then reserve 128M
258
259
234Boot into System Kernel 260Boot into System Kernel
235======================= 261=======================
236 262
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index d9e3b199929b..5a4ef48224ae 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -76,9 +76,9 @@
76 * Title: "Conceptual Architecture of the Linux Kernel" 76 * Title: "Conceptual Architecture of the Linux Kernel"
77 Author: Ivan T. Bowman. 77 Author: Ivan T. Bowman.
78 URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html 78 URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html
79 Keywords: conceptual software arquitecture, extracted design, 79 Keywords: conceptual software architecture, extracted design,
80 reverse engineering, system structure. 80 reverse engineering, system structure.
81 Description: Conceptual software arquitecture of the Linux kernel, 81 Description: Conceptual software architecture of the Linux kernel,
82 automatically extracted from the source code. Very detailed. Good 82 automatically extracted from the source code. Very detailed. Good
83 figures. Gives good overall kernel understanding. 83 figures. Gives good overall kernel understanding.
84 84
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 98cf90f2631d..b2361667839f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -222,9 +222,6 @@ and is between 256 and 4096 characters. It is defined in the file
222 Warning: Many of these options can produce a lot of 222 Warning: Many of these options can produce a lot of
223 output and make your system unusable. Be very careful. 223 output and make your system unusable. Be very careful.
224 224
225
226 acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT
227
228 acpi_pm_good [X86-32,X86-64] 225 acpi_pm_good [X86-32,X86-64]
229 Override the pmtimer bug detection: force the kernel 226 Override the pmtimer bug detection: force the kernel
230 to assume that this machine's pmtimer latches its value 227 to assume that this machine's pmtimer latches its value
@@ -297,9 +294,6 @@ and is between 256 and 4096 characters. It is defined in the file
297 apm= [APM] Advanced Power Management 294 apm= [APM] Advanced Power Management
298 See header of arch/i386/kernel/apm.c. 295 See header of arch/i386/kernel/apm.c.
299 296
300 applicom= [HW]
301 Format: <mem>,<irq>
302
303 arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards 297 arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
304 Format: <io>,<irq>,<nodeID> 298 Format: <io>,<irq>,<nodeID>
305 299
@@ -345,12 +339,6 @@ and is between 256 and 4096 characters. It is defined in the file
345 Format: <io>,<irq>,<mode> 339 Format: <io>,<irq>,<mode>
346 See header of drivers/net/hamradio/baycom_ser_hdx.c. 340 See header of drivers/net/hamradio/baycom_ser_hdx.c.
347 341
348 blkmtd_device= [HW,MTD]
349 blkmtd_erasesz=
350 blkmtd_ro=
351 blkmtd_bs=
352 blkmtd_count=
353
354 boot_delay= Milliseconds to delay each printk during boot. 342 boot_delay= Milliseconds to delay each printk during boot.
355 Values larger than 10 seconds (10000) are changed to 343 Values larger than 10 seconds (10000) are changed to
356 no delay (0). 344 no delay (0).
@@ -431,8 +419,10 @@ and is between 256 and 4096 characters. It is defined in the file
431 over the 8254 in addition to over the IO-APIC. The 419 over the 8254 in addition to over the IO-APIC. The
432 kernel tries to set a sensible default. 420 kernel tries to set a sensible default.
433 421
434 hpet= [X86-32,HPET] option to disable HPET and use PIT. 422 hpet= [X86-32,HPET] option to control HPET usage
435 Format: disable 423 Format: { enable (default) | disable | force }
424 disable: disable HPET and use PIT instead
425 force: allow force-enabling of undocumented chips (ICH4, VIA)
436 426
437 com20020= [HW,NET] ARCnet - COM20020 chipset 427 com20020= [HW,NET] ARCnet - COM20020 chipset
438 Format: 428 Format:
@@ -479,6 +469,16 @@ and is between 256 and 4096 characters. It is defined in the file
479 UART at the specified I/O port or MMIO address. 469 UART at the specified I/O port or MMIO address.
480 The options are the same as for ttyS, above. 470 The options are the same as for ttyS, above.
481 471
472 no_console_suspend
473 [HW] Never suspend the console
474 Disable suspending of consoles during suspend and
475 hibernate operations. Once disabled, debugging
476 messages can reach various consoles while the rest
477 of the system is being put to sleep (ie, while
478 debugging driver suspend/resume hooks). This may
479 not work reliably with all consoles, but is known
480 to work with serial and VGA consoles.
481
482 cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver 482 cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
483 Format: 483 Format:
484 <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>] 484 <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
@@ -487,6 +487,13 @@ and is between 256 and 4096 characters. It is defined in the file
487 [KNL] Reserve a chunk of physical memory to 487 [KNL] Reserve a chunk of physical memory to
488 hold a kernel to switch to with kexec on panic. 488 hold a kernel to switch to with kexec on panic.
489 489
490 crashkernel=range1:size1[,range2:size2,...][@offset]
491 [KNL] Same as above, but depends on the memory
492 in the running system. The syntax of range is
493 start-[end] where start and end are both
494 a memory unit (amount[KMG]). See also
495 Documentation/kdump/kdump.txt for an example.
496
490 cs4232= [HW,OSS] 497 cs4232= [HW,OSS]
491 Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq> 498 Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
492 499
@@ -496,8 +503,6 @@ and is between 256 and 4096 characters. It is defined in the file
496 cs89x0_media= [HW,NET] 503 cs89x0_media= [HW,NET]
497 Format: { rj45 | aui | bnc } 504 Format: { rj45 | aui | bnc }
498 505
499 cyclades= [HW,SERIAL] Cyclades multi-serial port adapter.
500
501 dasd= [HW,NET] 506 dasd= [HW,NET]
502 See header of drivers/s390/block/dasd_devmap.c. 507 See header of drivers/s390/block/dasd_devmap.c.
503 508
@@ -555,10 +560,6 @@ and is between 256 and 4096 characters. It is defined in the file
555 See drivers/char/README.epca and 560 See drivers/char/README.epca and
556 Documentation/digiepca.txt. 561 Documentation/digiepca.txt.
557 562
558 dmascc= [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA
559 support available.
560 Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]]
561
562 dmasound= [HW,OSS] Sound subsystem buffers 563 dmasound= [HW,OSS] Sound subsystem buffers
563 564
564 dscc4.setup= [NET] 565 dscc4.setup= [NET]
@@ -589,17 +590,10 @@ and is between 256 and 4096 characters. It is defined in the file
589 0: polling mode 590 0: polling mode
590 non-0: interrupt mode (default) 591 non-0: interrupt mode (default)
591 592
592 eda= [HW,PS2]
593
594 edb= [HW,PS2]
595
596 edd= [EDD] 593 edd= [EDD]
597 Format: {"of[f]" | "sk[ipmbr]"} 594 Format: {"of[f]" | "sk[ipmbr]"}
598 See comment in arch/i386/boot/edd.S 595 See comment in arch/i386/boot/edd.S
599 596
600 eicon= [HW,ISDN]
601 Format: <id>,<membase>,<irq>
602
603 eisa_irq_edge= [PARISC,HW] 597 eisa_irq_edge= [PARISC,HW]
604 See header of drivers/parisc/eisa.c. 598 See header of drivers/parisc/eisa.c.
605 599
@@ -778,6 +772,23 @@ and is between 256 and 4096 characters. It is defined in the file
778 772
779 inttest= [IA64] 773 inttest= [IA64]
780 774
775 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
776 off
777 Disable intel iommu driver.
778 igfx_off [Default Off]
779 By default, gfx is mapped as normal device. If a gfx
780 device has a dedicated DMAR unit, the DMAR unit is
781 bypassed by not enabling DMAR with this option. In
782 this case, gfx device will use physical address for
783 DMA.
784 forcedac [x86_64]
785 With this option iommu will not optimize to look
786 for io virtual address below 32 bit forcing dual
787 address cycle on pci bus for cards supporting greater
788 than 32 bit addressing. The default is to look
789 for translation below 32 bit and if not available
790 then look in the higher range.
791
781 io7= [HW] IO7 for Marvel based alpha systems 792 io7= [HW] IO7 for Marvel based alpha systems
782 See comment before marvel_specify_io7 in 793 See comment before marvel_specify_io7 in
783 arch/alpha/kernel/core_marvel.c. 794 arch/alpha/kernel/core_marvel.c.
@@ -875,9 +886,6 @@ and is between 256 and 4096 characters. It is defined in the file
875 lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in 886 lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in
876 C2 power state. 887 C2 power state.
877 888
878 lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip
879 Format: addr:<io>,irq:<irq>
880
881 libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume 889 libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume
882 when set. 890 when set.
883 Format: <int> 891 Format: <int>
@@ -1125,9 +1133,6 @@ and is between 256 and 4096 characters. It is defined in the file
1125 noapic [SMP,APIC] Tells the kernel to not make use of any 1133 noapic [SMP,APIC] Tells the kernel to not make use of any
1126 IOAPICs that may be present in the system. 1134 IOAPICs that may be present in the system.
1127 1135
1128 noasync [HW,M68K] Disables async and sync negotiation for
1129 all devices.
1130
1131 nobats [PPC] Do not use BATs for mapping kernel lowmem 1136 nobats [PPC] Do not use BATs for mapping kernel lowmem
1132 on "Classic" PPC cores. 1137 on "Classic" PPC cores.
1133 1138
@@ -1439,6 +1444,7 @@ and is between 256 and 4096 characters. It is defined in the file
1439 Param: <number> - step/bucket size as a power of 2 for 1444 Param: <number> - step/bucket size as a power of 2 for
1440 statistical time based profiling. 1445 statistical time based profiling.
1441 Param: "sleep" - profile D-state sleeping (millisecs) 1446 Param: "sleep" - profile D-state sleeping (millisecs)
1447 Param: "kvm" - profile VM exits.
1442 1448
1443 processor.max_cstate= [HW,ACPI] 1449 processor.max_cstate= [HW,ACPI]
1444 Limit processor to maximum C-state 1450 Limit processor to maximum C-state
@@ -1565,9 +1571,6 @@ and is between 256 and 4096 characters. It is defined in the file
1565 sa1100ir [NET] 1571 sa1100ir [NET]
1566 See drivers/net/irda/sa1100_ir.c. 1572 See drivers/net/irda/sa1100_ir.c.
1567 1573
1568 sb= [HW,OSS]
1569 Format: <io>,<irq>,<dma>,<dma2>
1570
1571 sbni= [NET] Granch SBNI12 leased line adapter 1574 sbni= [NET] Granch SBNI12 leased line adapter
1572 1575
1573 sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver 1576 sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
@@ -1611,8 +1614,6 @@ and is between 256 and 4096 characters. It is defined in the file
1611 1614
1612 serialnumber [BUGS=X86-32] 1615 serialnumber [BUGS=X86-32]
1613 1616
1614 sg_def_reserved_size= [SCSI]
1615
1616 shapers= [NET] 1617 shapers= [NET]
1617 Maximal number of shapers. 1618 Maximal number of shapers.
1618 1619
@@ -2003,10 +2004,6 @@ and is between 256 and 4096 characters. It is defined in the file
2003 norandmaps Don't use address space randomization 2004 norandmaps Don't use address space randomization
2004 Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space 2005 Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
2005 2006
2006 unwind_debug=N N > 0 will enable dwarf2 unwinder debugging
2007 This is useful to get more information why
2008 you got a "dwarf2 unwinder stuck"
2009
2010______________________________________________________________________ 2007______________________________________________________________________
2011 2008
2012TODO: 2009TODO:
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index c0b7a4556390..bac037eb1cda 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,28 +1,8 @@
1# This creates the demonstration utility "lguest" which runs a Linux guest. 1# This creates the demonstration utility "lguest" which runs a Linux guest.
2 2CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
3# For those people that have a separate object dir, look there for .config
4KBUILD_OUTPUT := ../..
5ifdef O
6 ifeq ("$(origin O)", "command line")
7 KBUILD_OUTPUT := $(O)
8 endif
9endif
10# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
11include $(KBUILD_OUTPUT)/.config
12LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
13
14CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
15LDLIBS:=-lz 3LDLIBS:=-lz
16# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
17# not others (eg. FC7).
18LDFLAGS+=-static
19all: lguest.lds lguest
20 4
21# The linker script on x86 is so complex the only way of creating one 5all: lguest
22# which will link our binary in the right place is to mangle the
23# default one.
24lguest.lds:
25 $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
26 6
27clean: 7clean:
28 rm -f lguest.lds lguest 8 rm -f lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 103e346c8b6a..5bdc37f81842 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,10 +1,7 @@
1/*P:100 This is the Launcher code, a simple program which lays out the 1/*P:100 This is the Launcher code, a simple program which lays out the
2 * "physical" memory for the new Guest by mapping the kernel image and the 2 * "physical" memory for the new Guest by mapping the kernel image and the
3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. 3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
4 * 4:*/
5 * The only trick: the Makefile links it at a high address so it will be clear
6 * of the guest memory region. It means that each Guest cannot have more than
7 * about 2.5G of memory on a normally configured Host. :*/
8#define _LARGEFILE64_SOURCE 5#define _LARGEFILE64_SOURCE
9#define _GNU_SOURCE 6#define _GNU_SOURCE
10#include <stdio.h> 7#include <stdio.h>
@@ -15,6 +12,7 @@
15#include <stdlib.h> 12#include <stdlib.h>
16#include <elf.h> 13#include <elf.h>
17#include <sys/mman.h> 14#include <sys/mman.h>
15#include <sys/param.h>
18#include <sys/types.h> 16#include <sys/types.h>
19#include <sys/stat.h> 17#include <sys/stat.h>
20#include <sys/wait.h> 18#include <sys/wait.h>
@@ -34,7 +32,9 @@
34#include <termios.h> 32#include <termios.h>
35#include <getopt.h> 33#include <getopt.h>
36#include <zlib.h> 34#include <zlib.h>
37/*L:110 We can ignore the 28 include files we need for this program, but I do 35#include <assert.h>
36#include <sched.h>
37/*L:110 We can ignore the 30 include files we need for this program, but I do
38 * want to draw attention to the use of kernel-style types. 38 * want to draw attention to the use of kernel-style types.
39 * 39 *
40 * As Linus said, "C is a Spartan language, and so should your naming be." I 40 * As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -45,8 +45,14 @@ typedef unsigned long long u64;
45typedef uint32_t u32; 45typedef uint32_t u32;
46typedef uint16_t u16; 46typedef uint16_t u16;
47typedef uint8_t u8; 47typedef uint8_t u8;
48#include "../../include/linux/lguest_launcher.h" 48#include "linux/lguest_launcher.h"
49#include "../../include/asm-x86/e820_32.h" 49#include "linux/pci_ids.h"
50#include "linux/virtio_config.h"
51#include "linux/virtio_net.h"
52#include "linux/virtio_blk.h"
53#include "linux/virtio_console.h"
54#include "linux/virtio_ring.h"
55#include "asm-x86/bootparam.h"
50/*:*/ 56/*:*/
51 57
52#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 58#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
@@ -55,6 +61,10 @@ typedef uint8_t u8;
55#ifndef SIOCBRADDIF 61#ifndef SIOCBRADDIF
56#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 62#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
57#endif 63#endif
64/* We can have up to 256 pages for devices. */
65#define DEVICE_PAGES 256
66/* This fits nicely in a single 4096-byte page. */
67#define VIRTQUEUE_NUM 127
58 68
59/*L:120 verbose is both a global flag and a macro. The C preprocessor allows 69/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
60 * this, and although I wouldn't recommend it, it works quite nicely here. */ 70 * this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -65,8 +75,10 @@ static bool verbose;
65 75
66/* The pipe to send commands to the waker process */ 76/* The pipe to send commands to the waker process */
67static int waker_fd; 77static int waker_fd;
68/* The top of guest physical memory. */ 78/* The pointer to the start of guest memory. */
69static u32 top; 79static void *guest_base;
80/* The maximum guest physical address allowed, and maximum possible. */
81static unsigned long guest_limit, guest_max;
70 82
71/* This is our list of devices. */ 83/* This is our list of devices. */
72struct device_list 84struct device_list
@@ -76,8 +88,17 @@ struct device_list
76 fd_set infds; 88 fd_set infds;
77 int max_infd; 89 int max_infd;
78 90
91 /* Counter to assign interrupt numbers. */
92 unsigned int next_irq;
93
94 /* Counter to print out convenient device numbers. */
95 unsigned int device_num;
96
79 /* The descriptor page for the devices. */ 97 /* The descriptor page for the devices. */
80 struct lguest_device_desc *descs; 98 u8 *descpage;
99
100 /* The tail of the last descriptor. */
101 unsigned int desc_used;
81 102
82 /* A single linked list of devices. */ 103 /* A single linked list of devices. */
83 struct device *dev; 104 struct device *dev;
@@ -85,31 +106,111 @@ struct device_list
85 struct device **lastdev; 106 struct device **lastdev;
86}; 107};
87 108
109/* The list of Guest devices, based on command line arguments. */
110static struct device_list devices;
111
88/* The device structure describes a single device. */ 112/* The device structure describes a single device. */
89struct device 113struct device
90{ 114{
91 /* The linked-list pointer. */ 115 /* The linked-list pointer. */
92 struct device *next; 116 struct device *next;
93 /* The descriptor for this device, as mapped into the Guest. */ 117
118 /* This device's descriptor, as mapped into the Guest. */
94 struct lguest_device_desc *desc; 119 struct lguest_device_desc *desc;
95 /* The memory page(s) of this device, if any. Also mapped in Guest. */ 120
96 void *mem; 121 /* The name of this device, for --verbose. */
122 const char *name;
97 123
98 /* If handle_input is set, it wants to be called when this file 124 /* If handle_input is set, it wants to be called when this file
99 * descriptor is ready. */ 125 * descriptor is ready. */
100 int fd; 126 int fd;
101 bool (*handle_input)(int fd, struct device *me); 127 bool (*handle_input)(int fd, struct device *me);
102 128
103 /* If handle_output is set, it wants to be called when the Guest sends 129 /* Any queues attached to this device */
104 * DMA to this key. */ 130 struct virtqueue *vq;
105 unsigned long watch_key;
106 u32 (*handle_output)(int fd, const struct iovec *iov,
107 unsigned int num, struct device *me);
108 131
109 /* Device-specific data. */ 132 /* Device-specific data. */
110 void *priv; 133 void *priv;
111}; 134};
112 135
136/* The virtqueue structure describes a queue attached to a device. */
137struct virtqueue
138{
139 struct virtqueue *next;
140
141 /* Which device owns me. */
142 struct device *dev;
143
144 /* The configuration for this queue. */
145 struct lguest_vqconfig config;
146
147 /* The actual ring of buffers. */
148 struct vring vring;
149
150 /* Last available index we saw. */
151 u16 last_avail_idx;
152
153 /* The routine to call when the Guest pings us. */
154 void (*handle_output)(int fd, struct virtqueue *me);
155};
156
157/* Since guest is UP and we don't run at the same time, we don't need barriers.
158 * But I include them in the code in case others copy it. */
159#define wmb()
160
161/* Convert an iovec element to the given type.
162 *
163 * This is a fairly ugly trick: we need to know the size of the type and
164 * alignment requirement to check the pointer is kosher. It's also nice to
165 * have the name of the type in case we report failure.
166 *
167 * Typing those three things all the time is cumbersome and error prone, so we
168 * have a macro which sets them all up and passes to the real function. */
169#define convert(iov, type) \
170 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
171
172static void *_convert(struct iovec *iov, size_t size, size_t align,
173 const char *name)
174{
175 if (iov->iov_len != size)
176 errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
177 if ((unsigned long)iov->iov_base % align != 0)
178 errx(1, "Bad alignment %p for %s", iov->iov_base, name);
179 return iov->iov_base;
180}
181
182/* The virtio configuration space is defined to be little-endian. x86 is
183 * little-endian too, but it's nice to be explicit so we have these helpers. */
184#define cpu_to_le16(v16) (v16)
185#define cpu_to_le32(v32) (v32)
186#define cpu_to_le64(v64) (v64)
187#define le16_to_cpu(v16) (v16)
188#define le32_to_cpu(v32) (v32)
189#define le64_to_cpu(v32) (v64)
190
191/*L:100 The Launcher code itself takes us out into userspace, that scary place
192 * where pointers run wild and free! Unfortunately, like most userspace
193 * programs, it's quite boring (which is why everyone likes to hack on the
194 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
195 * will get you through this section. Or, maybe not.
196 *
197 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
198 * memory and stores it in "guest_base". In other words, Guest physical ==
199 * Launcher virtual with an offset.
200 *
201 * This can be tough to get your head around, but usually it just means that we
202 * use these trivial conversion functions when the Guest gives us its
203 * "physical" addresses: */
204static void *from_guest_phys(unsigned long addr)
205{
206 return guest_base + addr;
207}
208
209static unsigned long to_guest_phys(const void *addr)
210{
211 return (addr - guest_base);
212}
213
113/*L:130 214/*L:130
114 * Loading the Kernel. 215 * Loading the Kernel.
115 * 216 *
@@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags)
123 return fd; 224 return fd;
124} 225}
125 226
126/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ 227/* map_zeroed_pages() takes a number of pages. */
127static void *map_zeroed_pages(unsigned long addr, unsigned int num) 228static void *map_zeroed_pages(unsigned int num)
128{ 229{
129 /* We cache the /dev/zero file-descriptor so we only open it once. */ 230 int fd = open_or_die("/dev/zero", O_RDONLY);
130 static int fd = -1; 231 void *addr;
131
132 if (fd == -1)
133 fd = open_or_die("/dev/zero", O_RDONLY);
134 232
135 /* We use a private mapping (ie. if we write to the page, it will be 233 /* We use a private mapping (ie. if we write to the page, it will be
136 * copied), and obviously we insist that it be mapped where we ask. */ 234 * copied). */
137 if (mmap((void *)addr, getpagesize() * num, 235 addr = mmap(NULL, getpagesize() * num,
138 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) 236 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
139 != (void *)addr) 237 if (addr == MAP_FAILED)
140 err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); 238 err(1, "Mmaping %u pages of /dev/zero", num);
141 239
142 /* Returning the address is just a courtesy: can simplify callers. */ 240 return addr;
143 return (void *)addr;
144} 241}
145 242
146/* To find out where to start we look for the magic Guest string, which marks 243/* Get some more pages for a device. */
147 * the code we see in lguest_asm.S. This is a hack which we are currently 244static void *get_pages(unsigned int num)
148 * plotting to replace with the normal Linux entry point. */
149static unsigned long entry_point(void *start, void *end,
150 unsigned long page_offset)
151{ 245{
152 void *p; 246 void *addr = from_guest_phys(guest_limit);
153 247
154 /* The scan gives us the physical starting address. We want the 248 guest_limit += num * getpagesize();
155 * virtual address in this case, and fortunately, we already figured 249 if (guest_limit > guest_max)
156 * out the physical-virtual difference and passed it here in 250 errx(1, "Not enough memory for devices");
157 * "page_offset". */ 251 return addr;
158 for (p = start; p < end; p++) 252}
159 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
160 return (long)p + strlen("GenuineLguest") + page_offset;
161 253
162 err(1, "Is this image a genuine lguest?"); 254/* This routine is used to load the kernel or initrd. It tries mmap, but if
255 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
256 * it falls back to reading the memory in. */
257static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
258{
259 ssize_t r;
260
261 /* We map writable even though some segments are marked read-only.
262 * The kernel really wants to be writable: it patches its own
263 * instructions.
264 *
265 * MAP_PRIVATE means that the page won't be copied until a write is
266 * done to it. This allows us to share untouched memory between
267 * Guests. */
268 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
269 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
270 return;
271
272 /* pread does a seek and a read in one shot: saves a few lines. */
273 r = pread(fd, addr, len, offset);
274 if (r != len)
275 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
163} 276}
164 277
165/* This routine takes an open vmlinux image, which is in ELF, and maps it into 278/* This routine takes an open vmlinux image, which is in ELF, and maps it into
@@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end,
167 * by all modern binaries on Linux including the kernel. 280 * by all modern binaries on Linux including the kernel.
168 * 281 *
169 * The ELF headers give *two* addresses: a physical address, and a virtual 282 * The ELF headers give *two* addresses: a physical address, and a virtual
170 * address. The Guest kernel expects to be placed in memory at the physical 283 * address. We use the physical address; the Guest will map itself to the
171 * address, and the page tables set up so it will correspond to that virtual 284 * virtual address.
172 * address. We return the difference between the virtual and physical
173 * addresses in the "page_offset" pointer.
174 * 285 *
175 * We return the starting address. */ 286 * We return the starting address. */
176static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, 287static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
177 unsigned long *page_offset)
178{ 288{
179 void *addr;
180 Elf32_Phdr phdr[ehdr->e_phnum]; 289 Elf32_Phdr phdr[ehdr->e_phnum];
181 unsigned int i; 290 unsigned int i;
182 unsigned long start = -1UL, end = 0;
183 291
184 /* Sanity checks on the main ELF header: an x86 executable with a 292 /* Sanity checks on the main ELF header: an x86 executable with a
185 * reasonable number of correctly-sized program headers. */ 293 * reasonable number of correctly-sized program headers. */
@@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
199 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 307 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
200 err(1, "Reading program headers"); 308 err(1, "Reading program headers");
201 309
202 /* We don't know page_offset yet. */
203 *page_offset = 0;
204
205 /* Try all the headers: there are usually only three. A read-only one, 310 /* Try all the headers: there are usually only three. A read-only one,
206 * a read-write one, and a "note" section which isn't loadable. */ 311 * a read-write one, and a "note" section which isn't loadable. */
207 for (i = 0; i < ehdr->e_phnum; i++) { 312 for (i = 0; i < ehdr->e_phnum; i++) {
@@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
212 verbose("Section %i: size %i addr %p\n", 317 verbose("Section %i: size %i addr %p\n",
213 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 318 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
214 319
215 /* We expect a simple linear address space: every segment must 320 /* We map this section of the file at its physical address. */
216 * have the same difference between virtual (p_vaddr) and 321 map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
217 * physical (p_paddr) address. */ 322 phdr[i].p_offset, phdr[i].p_filesz);
218 if (!*page_offset)
219 *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
220 else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
221 errx(1, "Page offset of section %i different", i);
222
223 /* We track the first and last address we mapped, so we can
224 * tell entry_point() where to scan. */
225 if (phdr[i].p_paddr < start)
226 start = phdr[i].p_paddr;
227 if (phdr[i].p_paddr + phdr[i].p_filesz > end)
228 end = phdr[i].p_paddr + phdr[i].p_filesz;
229
230 /* We map this section of the file at its physical address. We
231 * map it read & write even if the header says this segment is
232 * read-only. The kernel really wants to be writable: it
233 * patches its own instructions which would normally be
234 * read-only.
235 *
236 * MAP_PRIVATE means that the page won't be copied until a
237 * write is done to it. This allows us to share much of the
238 * kernel memory between Guests. */
239 addr = mmap((void *)phdr[i].p_paddr,
240 phdr[i].p_filesz,
241 PROT_READ|PROT_WRITE|PROT_EXEC,
242 MAP_FIXED|MAP_PRIVATE,
243 elf_fd, phdr[i].p_offset);
244 if (addr != (void *)phdr[i].p_paddr)
245 err(1, "Mmaping vmlinux seg %i gave %p not %p",
246 i, addr, (void *)phdr[i].p_paddr);
247 } 323 }
248 324
249 return entry_point((void *)start, (void *)end, *page_offset); 325 /* The entry point is given in the ELF header. */
326 return ehdr->e_entry;
250} 327}
251 328
252/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. 329/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
253 * 330 * supposed to jump into it and it will unpack itself. We used to have to
254 * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects 331 * perform some hairy magic because the unpacking code scared me.
255 * to be. We don't know what that option was, but we can figure it out
256 * approximately by looking at the addresses in the code. I chose the common
257 * case of reading a memory location into the %eax register:
258 *
259 * movl <some-address>, %eax
260 *
261 * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
262 * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
263 *
264 * In this example can guess that the kernel was compiled with
265 * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
266 * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
267 * kernel isn't that bloated yet.
268 *
269 * Unfortunately, x86 has variable-length instructions, so finding this
270 * particular instruction properly involves writing a disassembler. Instead,
271 * we rely on statistics. We look for "0xA1" and tally the different bytes
272 * which occur 4 bytes later (the "0xC0" in our example above). When one of
273 * those bytes appears three times, we can be reasonably confident that it
274 * forms the start of CONFIG_PAGE_OFFSET.
275 * 332 *
276 * This is amazingly reliable. */ 333 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
277static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) 334 * a small patch to jump over the tricky bits in the Guest, so now we just read
335 * the funky header so we know where in the file to load, and away we go! */
336static unsigned long load_bzimage(int fd)
278{ 337{
279 unsigned int i, possibilities[256] = { 0 }; 338 struct boot_params boot;
339 int r;
340 /* Modern bzImages get loaded at 1M. */
341 void *p = from_guest_phys(0x100000);
280 342
281 for (i = 0; i + 4 < len; i++) { 343 /* Go back to the start of the file and read the header. It should be
282 /* mov 0xXXXXXXXX,%eax */ 344 * a Linux boot header (see Documentation/i386/boot.txt) */
283 if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) 345 lseek(fd, 0, SEEK_SET);
284 return (unsigned long)img[i+4] << 24; 346 read(fd, &boot, sizeof(boot));
285 }
286 errx(1, "could not determine page offset");
287}
288 347
289/*L:160 Unfortunately the entire ELF image isn't compressed: the segments 348 /* Inside the setup_hdr, we expect the magic "HdrS" */
290 * which need loading are extracted and compressed raw. This denies us the 349 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
291 * information we need to make a fully-general loader. */ 350 errx(1, "This doesn't look like a bzImage to me");
292static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
293{
294 gzFile f;
295 int ret, len = 0;
296 /* A bzImage always gets loaded at physical address 1M. This is
297 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
298 * there says, "Don't change this unless you know what you are doing".
299 * Indeed. */
300 void *img = (void *)0x100000;
301
302 /* gzdopen takes our file descriptor (carefully placed at the start of
303 * the GZIP header we found) and returns a gzFile. */
304 f = gzdopen(fd, "rb");
305 /* We read it into memory in 64k chunks until we hit the end. */
306 while ((ret = gzread(f, img + len, 65536)) > 0)
307 len += ret;
308 if (ret < 0)
309 err(1, "reading image from bzImage");
310
311 verbose("Unpacked size %i addr %p\n", len, img);
312
313 /* Without the ELF header, we can't tell virtual-physical gap. This is
314 * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
315 * I have a clever way of figuring it out from the code itself. */
316 *page_offset = intuit_page_offset(img, len);
317
318 return entry_point(img, img + len, *page_offset);
319}
320 351
321/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 352 /* Skip over the extra sectors of the header. */
322 * supposed to jump into it and it will unpack itself. We can't do that 353 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
323 * because the Guest can't run the unpacking code, and adding features to 354
324 * lguest kills puppies, so we don't want to. 355 /* Now read everything into memory, in nice big chunks. */
325 * 356 while ((r = read(fd, p, 65536)) > 0)
326 * The bzImage is formed by putting the decompressing code in front of the 357 p += r;
327 * compressed kernel code. So we can simple scan through it looking for the 358
328 * first "gzip" header, and start decompressing from there. */ 359 /* Finally, code32_start tells us where to enter the kernel. */
329static unsigned long load_bzimage(int fd, unsigned long *page_offset) 360 return boot.hdr.code32_start;
330{
331 unsigned char c;
332 int state = 0;
333
334 /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
335 while (read(fd, &c, 1) == 1) {
336 switch (state) {
337 case 0:
338 if (c == 0x1F)
339 state++;
340 break;
341 case 1:
342 if (c == 0x8B)
343 state++;
344 else
345 state = 0;
346 break;
347 case 2 ... 8:
348 state++;
349 break;
350 case 9:
351 /* Seek back to the start of the gzip header. */
352 lseek(fd, -10, SEEK_CUR);
353 /* One final check: "compressed under UNIX". */
354 if (c != 0x03)
355 state = -1;
356 else
357 return unpack_bzimage(fd, page_offset);
358 }
359 }
360 errx(1, "Could not find kernel in bzImage");
361} 361}
362 362
363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
364 * come wrapped up in the self-decompressing "bzImage" format. With some funky 364 * come wrapped up in the self-decompressing "bzImage" format. With some funky
365 * coding, we can load those, too. */ 365 * coding, we can load those, too. */
366static unsigned long load_kernel(int fd, unsigned long *page_offset) 366static unsigned long load_kernel(int fd)
367{ 367{
368 Elf32_Ehdr hdr; 368 Elf32_Ehdr hdr;
369 369
@@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
373 373
374 /* If it's an ELF file, it starts with "\177ELF" */ 374 /* If it's an ELF file, it starts with "\177ELF" */
375 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 375 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
376 return map_elf(fd, &hdr, page_offset); 376 return map_elf(fd, &hdr);
377 377
378 /* Otherwise we assume it's a bzImage, and try to unpack it */ 378 /* Otherwise we assume it's a bzImage, and try to unpack it */
379 return load_bzimage(fd, page_offset); 379 return load_bzimage(fd);
380} 380}
381 381
382/* This is a trivial little helper to align pages. Andi Kleen hated it because 382/* This is a trivial little helper to align pages. Andi Kleen hated it because
@@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
402 int ifd; 402 int ifd;
403 struct stat st; 403 struct stat st;
404 unsigned long len; 404 unsigned long len;
405 void *iaddr;
406 405
407 ifd = open_or_die(name, O_RDONLY); 406 ifd = open_or_die(name, O_RDONLY);
408 /* fstat() is needed to get the file size. */ 407 /* fstat() is needed to get the file size. */
409 if (fstat(ifd, &st) < 0) 408 if (fstat(ifd, &st) < 0)
410 err(1, "fstat() on initrd '%s'", name); 409 err(1, "fstat() on initrd '%s'", name);
411 410
412 /* The length needs to be rounded up to a page size: mmap needs the 411 /* We map the initrd at the top of memory, but mmap wants it to be
413 * address to be page aligned. */ 412 * page-aligned, so we round the size up for that. */
414 len = page_align(st.st_size); 413 len = page_align(st.st_size);
415 /* We map the initrd at the top of memory. */ 414 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
416 iaddr = mmap((void *)mem - len, st.st_size,
417 PROT_READ|PROT_EXEC|PROT_WRITE,
418 MAP_FIXED|MAP_PRIVATE, ifd, 0);
419 if (iaddr != (void *)mem - len)
420 err(1, "Mmaping initrd '%s' returned %p not %p",
421 name, iaddr, (void *)mem - len);
422 /* Once a file is mapped, you can close the file descriptor. It's a 415 /* Once a file is mapped, you can close the file descriptor. It's a
423 * little odd, but quite useful. */ 416 * little odd, but quite useful. */
424 close(ifd); 417 close(ifd);
425 verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); 418 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
426 419
427 /* We return the initrd size. */ 420 /* We return the initrd size. */
428 return len; 421 return len;
429} 422}
430 423
431/* Once we know how much memory we have, and the address the Guest kernel 424/* Once we know how much memory we have, we can construct simple linear page
432 * expects, we can construct simple linear page tables which will get the Guest 425 * tables which set virtual == physical which will get the Guest far enough
433 * far enough into the boot to create its own. 426 * into the boot to create its own.
434 * 427 *
435 * We lay them out of the way, just below the initrd (which is why we need to 428 * We lay them out of the way, just below the initrd (which is why we need to
436 * know its size). */ 429 * know its size). */
437static unsigned long setup_pagetables(unsigned long mem, 430static unsigned long setup_pagetables(unsigned long mem,
438 unsigned long initrd_size, 431 unsigned long initrd_size)
439 unsigned long page_offset)
440{ 432{
441 u32 *pgdir, *linear; 433 unsigned long *pgdir, *linear;
442 unsigned int mapped_pages, i, linear_pages; 434 unsigned int mapped_pages, i, linear_pages;
443 unsigned int ptes_per_page = getpagesize()/sizeof(u32); 435 unsigned int ptes_per_page = getpagesize()/sizeof(void *);
444 436
445 /* Ideally we map all physical memory starting at page_offset. 437 mapped_pages = mem/getpagesize();
446 * However, if page_offset is 0xC0000000 we can only map 1G of physical
447 * (0xC0000000 + 1G overflows). */
448 if (mem <= -page_offset)
449 mapped_pages = mem/getpagesize();
450 else
451 mapped_pages = -page_offset/getpagesize();
452 438
453 /* Each PTE page can map ptes_per_page pages: how many do we need? */ 439 /* Each PTE page can map ptes_per_page pages: how many do we need? */
454 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; 440 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
455 441
456 /* We put the toplevel page directory page at the top of memory. */ 442 /* We put the toplevel page directory page at the top of memory. */
457 pgdir = (void *)mem - initrd_size - getpagesize(); 443 pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
458 444
459 /* Now we use the next linear_pages pages as pte pages */ 445 /* Now we use the next linear_pages pages as pte pages */
460 linear = (void *)pgdir - linear_pages*getpagesize(); 446 linear = (void *)pgdir - linear_pages*getpagesize();
@@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem,
465 for (i = 0; i < mapped_pages; i++) 451 for (i = 0; i < mapped_pages; i++)
466 linear[i] = ((i * getpagesize()) | PAGE_PRESENT); 452 linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
467 453
468 /* The top level points to the linear page table pages above. The 454 /* The top level points to the linear page table pages above. */
469 * entry representing page_offset points to the first one, and they
470 * continue from there. */
471 for (i = 0; i < mapped_pages; i += ptes_per_page) { 455 for (i = 0; i < mapped_pages; i += ptes_per_page) {
472 pgdir[(i + page_offset/getpagesize())/ptes_per_page] 456 pgdir[i/ptes_per_page]
473 = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); 457 = ((to_guest_phys(linear) + i*sizeof(void *))
458 | PAGE_PRESENT);
474 } 459 }
475 460
476 verbose("Linear mapping of %u pages in %u pte pages at %p\n", 461 verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
477 mapped_pages, linear_pages, linear); 462 mapped_pages, linear_pages, to_guest_phys(linear));
478 463
479 /* We return the top level (guest-physical) address: the kernel needs 464 /* We return the top level (guest-physical) address: the kernel needs
480 * to know where it is. */ 465 * to know where it is. */
481 return (unsigned long)pgdir; 466 return to_guest_phys(pgdir);
482} 467}
483 468
484/* Simple routine to roll all the commandline arguments together with spaces 469/* Simple routine to roll all the commandline arguments together with spaces
@@ -498,14 +483,17 @@ static void concat(char *dst, char *args[])
498 483
499/* This is where we actually tell the kernel to initialize the Guest. We saw 484/* This is where we actually tell the kernel to initialize the Guest. We saw
500 * the arguments it expects when we looked at initialize() in lguest_user.c: 485 * the arguments it expects when we looked at initialize() in lguest_user.c:
501 * the top physical page to allow, the top level pagetable, the entry point and 486 * the base of guest "physical" memory, the top physical page to allow, the
502 * the page_offset constant for the Guest. */ 487 * top level pagetable and the entry point for the Guest. */
503static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) 488static int tell_kernel(unsigned long pgdir, unsigned long start)
504{ 489{
505 u32 args[] = { LHREQ_INITIALIZE, 490 unsigned long args[] = { LHREQ_INITIALIZE,
506 top/getpagesize(), pgdir, start, page_offset }; 491 (unsigned long)guest_base,
492 guest_limit / getpagesize(), pgdir, start };
507 int fd; 493 int fd;
508 494
495 verbose("Guest: %p - %p (%#lx)\n",
496 guest_base, guest_base + guest_limit, guest_limit);
509 fd = open_or_die("/dev/lguest", O_RDWR); 497 fd = open_or_die("/dev/lguest", O_RDWR);
510 if (write(fd, args, sizeof(args)) < 0) 498 if (write(fd, args, sizeof(args)) < 0)
511 err(1, "Writing to /dev/lguest"); 499 err(1, "Writing to /dev/lguest");
@@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
515} 503}
516/*:*/ 504/*:*/
517 505
518static void set_fd(int fd, struct device_list *devices) 506static void add_device_fd(int fd)
519{ 507{
520 FD_SET(fd, &devices->infds); 508 FD_SET(fd, &devices.infds);
521 if (fd > devices->max_infd) 509 if (fd > devices.max_infd)
522 devices->max_infd = fd; 510 devices.max_infd = fd;
523} 511}
524 512
525/*L:200 513/*L:200
@@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices)
537 * 525 *
538 * This, of course, is merely a different *kind* of icky. 526 * This, of course, is merely a different *kind* of icky.
539 */ 527 */
540static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) 528static void wake_parent(int pipefd, int lguest_fd)
541{ 529{
542 /* Add the pipe from the Launcher to the fdset in the device_list, so 530 /* Add the pipe from the Launcher to the fdset in the device_list, so
543 * we watch it, too. */ 531 * we watch it, too. */
544 set_fd(pipefd, devices); 532 add_device_fd(pipefd);
545 533
546 for (;;) { 534 for (;;) {
547 fd_set rfds = devices->infds; 535 fd_set rfds = devices.infds;
548 u32 args[] = { LHREQ_BREAK, 1 }; 536 unsigned long args[] = { LHREQ_BREAK, 1 };
549 537
550 /* Wait until input is ready from one of the devices. */ 538 /* Wait until input is ready from one of the devices. */
551 select(devices->max_infd+1, &rfds, NULL, NULL, NULL); 539 select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
552 /* Is it a message from the Launcher? */ 540 /* Is it a message from the Launcher? */
553 if (FD_ISSET(pipefd, &rfds)) { 541 if (FD_ISSET(pipefd, &rfds)) {
554 int ignorefd; 542 int fd;
555 /* If read() returns 0, it means the Launcher has 543 /* If read() returns 0, it means the Launcher has
556 * exited. We silently follow. */ 544 * exited. We silently follow. */
557 if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) 545 if (read(pipefd, &fd, sizeof(fd)) == 0)
558 exit(0); 546 exit(0);
559 /* Otherwise it's telling us there's a problem with one 547 /* Otherwise it's telling us to change what file
560 * of the devices, and we should ignore that file 548 * descriptors we're to listen to. */
561 * descriptor from now on. */ 549 if (fd >= 0)
562 FD_CLR(ignorefd, &devices->infds); 550 FD_SET(fd, &devices.infds);
551 else
552 FD_CLR(-fd - 1, &devices.infds);
563 } else /* Send LHREQ_BREAK command. */ 553 } else /* Send LHREQ_BREAK command. */
564 write(lguest_fd, args, sizeof(args)); 554 write(lguest_fd, args, sizeof(args));
565 } 555 }
566} 556}
567 557
568/* This routine just sets up a pipe to the Waker process. */ 558/* This routine just sets up a pipe to the Waker process. */
569static int setup_waker(int lguest_fd, struct device_list *device_list) 559static int setup_waker(int lguest_fd)
570{ 560{
571 int pipefd[2], child; 561 int pipefd[2], child;
572 562
@@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list)
580 if (child == 0) { 570 if (child == 0) {
581 /* Close the "writing" end of our copy of the pipe */ 571 /* Close the "writing" end of our copy of the pipe */
582 close(pipefd[1]); 572 close(pipefd[1]);
583 wake_parent(pipefd[0], lguest_fd, device_list); 573 wake_parent(pipefd[0], lguest_fd);
584 } 574 }
585 /* Close the reading end of our copy of the pipe. */ 575 /* Close the reading end of our copy of the pipe. */
586 close(pipefd[0]); 576 close(pipefd[0]);
@@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
602{ 592{
603 /* We have to separately check addr and addr+size, because size could 593 /* We have to separately check addr and addr+size, because size could
604 * be huge and addr + size might wrap around. */ 594 * be huge and addr + size might wrap around. */
605 if (addr >= top || addr + size >= top) 595 if (addr >= guest_limit || addr + size >= guest_limit)
606 errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); 596 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
607 /* We return a pointer for the caller's convenience, now we know it's 597 /* We return a pointer for the caller's convenience, now we know it's
608 * safe to use. */ 598 * safe to use. */
609 return (void *)addr; 599 return from_guest_phys(addr);
610} 600}
611/* A macro which transparently hands the line number to the real function. */ 601/* A macro which transparently hands the line number to the real function. */
612#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 602#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
613 603
614/* The Guest has given us the address of a "struct lguest_dma". We check it's 604/* This function returns the next descriptor in the chain, or vq->vring.num. */
615 * OK and convert it to an iovec (which is a simple array of ptr/size 605static unsigned next_desc(struct virtqueue *vq, unsigned int i)
616 * pairs). */
617static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
618{ 606{
619 unsigned int i; 607 unsigned int next;
620 struct lguest_dma *udma;
621
622 /* First we make sure that the array memory itself is valid. */
623 udma = check_pointer(dma, sizeof(*udma));
624 /* Now we check each element */
625 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
626 /* A zero length ends the array. */
627 if (!udma->len[i])
628 break;
629 608
630 iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); 609 /* If this descriptor says it doesn't chain, we're done. */
631 iov[i].iov_len = udma->len[i]; 610 if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
632 } 611 return vq->vring.num;
633 *num = i; 612
613 /* Check they're not leading us off end of descriptors. */
614 next = vq->vring.desc[i].next;
615 /* Make sure compiler knows to grab that: we don't want it changing! */
616 wmb();
634 617
635 /* We return the pointer to where the caller should write the amount of 618 if (next >= vq->vring.num)
636 * the buffer used. */ 619 errx(1, "Desc next is %u", next);
637 return &udma->used_len; 620
621 return next;
622}
623
624/* This looks in the virtqueue and for the first available buffer, and converts
625 * it to an iovec for convenient access. Since descriptors consist of some
626 * number of output then some number of input descriptors, it's actually two
627 * iovecs, but we pack them into one and note how many of each there were.
628 *
629 * This function returns the descriptor number found, or vq->vring.num (which
630 * is never a valid descriptor number) if none was found. */
631static unsigned get_vq_desc(struct virtqueue *vq,
632 struct iovec iov[],
633 unsigned int *out_num, unsigned int *in_num)
634{
635 unsigned int i, head;
636
637 /* Check it isn't doing very strange things with descriptor numbers. */
638 if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
639 errx(1, "Guest moved used index from %u to %u",
640 vq->last_avail_idx, vq->vring.avail->idx);
641
642 /* If there's nothing new since last we looked, return invalid. */
643 if (vq->vring.avail->idx == vq->last_avail_idx)
644 return vq->vring.num;
645
646 /* Grab the next descriptor number they're advertising, and increment
647 * the index we've seen. */
648 head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
649
650 /* If their number is silly, that's a fatal mistake. */
651 if (head >= vq->vring.num)
652 errx(1, "Guest says index %u is available", head);
653
654 /* When we start there are none of either input nor output. */
655 *out_num = *in_num = 0;
656
657 i = head;
658 do {
659 /* Grab the first descriptor, and check it's OK. */
660 iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
661 iov[*out_num + *in_num].iov_base
662 = check_pointer(vq->vring.desc[i].addr,
663 vq->vring.desc[i].len);
664 /* If this is an input descriptor, increment that count. */
665 if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
666 (*in_num)++;
667 else {
668 /* If it's an output descriptor, they're all supposed
669 * to come before any input descriptors. */
670 if (*in_num)
671 errx(1, "Descriptor has out after in");
672 (*out_num)++;
673 }
674
675 /* If we've got too many, that implies a descriptor loop. */
676 if (*out_num + *in_num > vq->vring.num)
677 errx(1, "Looped descriptor");
678 } while ((i = next_desc(vq, i)) != vq->vring.num);
679
680 return head;
638} 681}
639 682
640/* This routine gets a DMA buffer from the Guest for a given key, and converts 683/* Once we've used one of their buffers, we tell them about it. We'll then
641 * it to an iovec array. It returns the interrupt the Guest wants when we're 684 * want to send them an interrupt, using trigger_irq(). */
642 * finished, and a pointer to the "used_len" field to fill in. */ 685static void add_used(struct virtqueue *vq, unsigned int head, int len)
643static u32 *get_dma_buffer(int fd, void *key,
644 struct iovec iov[], unsigned int *num, u32 *irq)
645{ 686{
646 u32 buf[] = { LHREQ_GETDMA, (u32)key }; 687 struct vring_used_elem *used;
647 unsigned long udma; 688
648 u32 *res; 689 /* Get a pointer to the next entry in the used ring. */
649 690 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
650 /* Ask the kernel for a DMA buffer corresponding to this key. */ 691 used->id = head;
651 udma = write(fd, buf, sizeof(buf)); 692 used->len = len;
652 /* They haven't registered any, or they're all used? */ 693 /* Make sure buffer is written before we update index. */
653 if (udma == (unsigned long)-1) 694 wmb();
654 return NULL; 695 vq->vring.used->idx++;
655
656 /* Convert it into our iovec array */
657 res = dma2iov(udma, iov, num);
658 /* The kernel stashes irq in ->used_len to get it out to us. */
659 *irq = *res;
660 /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
661 return res;
662} 696}
663 697
664/* This is a convenient routine to send the Guest an interrupt. */ 698/* This actually sends the interrupt for this virtqueue */
665static void trigger_irq(int fd, u32 irq) 699static void trigger_irq(int fd, struct virtqueue *vq)
666{ 700{
667 u32 buf[] = { LHREQ_IRQ, irq }; 701 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
702
703 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
704 return;
705
706 /* Send the Guest an interrupt tell them we used something up. */
668 if (write(fd, buf, sizeof(buf)) != 0) 707 if (write(fd, buf, sizeof(buf)) != 0)
669 err(1, "Triggering irq %i", irq); 708 err(1, "Triggering irq %i", vq->config.irq);
670} 709}
671 710
672/* This simply sets up an iovec array where we can put data to be discarded. 711/* And here's the combo meal deal. Supersize me! */
673 * This happens when the Guest doesn't want or can't handle the input: we have 712static void add_used_and_trigger(int fd, struct virtqueue *vq,
674 * to get rid of it somewhere, and if we bury it in the ceiling space it will 713 unsigned int head, int len)
675 * start to smell after a week. */
676static void discard_iovec(struct iovec *iov, unsigned int *num)
677{ 714{
678 static char discard_buf[1024]; 715 add_used(vq, head, len);
679 *num = 1; 716 trigger_irq(fd, vq);
680 iov->iov_base = discard_buf;
681 iov->iov_len = sizeof(discard_buf);
682} 717}
683 718
684/* Here is the input terminal setting we save, and the routine to restore them 719/* Here is the input terminal setting we save, and the routine to restore them
@@ -701,38 +736,39 @@ struct console_abort
701/* This is the routine which handles console input (ie. stdin). */ 736/* This is the routine which handles console input (ie. stdin). */
702static bool handle_console_input(int fd, struct device *dev) 737static bool handle_console_input(int fd, struct device *dev)
703{ 738{
704 u32 irq = 0, *lenp;
705 int len; 739 int len;
706 unsigned int num; 740 unsigned int head, in_num, out_num;
707 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 741 struct iovec iov[dev->vq->vring.num];
708 struct console_abort *abort = dev->priv; 742 struct console_abort *abort = dev->priv;
709 743
710 /* First we get the console buffer from the Guest. The key is dev->mem 744 /* First we need a console buffer from the Guests's input virtqueue. */
711 * which was set to 0 in setup_console(). */ 745 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
712 lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); 746
713 if (!lenp) { 747 /* If they're not ready for input, stop listening to this file
714 /* If it's not ready for input, warn and set up to discard. */ 748 * descriptor. We'll start again once they add an input buffer. */
715 warn("console: no dma buffer!"); 749 if (head == dev->vq->vring.num)
716 discard_iovec(iov, &num); 750 return false;
717 } 751
752 if (out_num)
753 errx(1, "Output buffers in console in queue?");
718 754
719 /* This is why we convert to iovecs: the readv() call uses them, and so 755 /* This is why we convert to iovecs: the readv() call uses them, and so
720 * it reads straight into the Guest's buffer. */ 756 * it reads straight into the Guest's buffer. */
721 len = readv(dev->fd, iov, num); 757 len = readv(dev->fd, iov, in_num);
722 if (len <= 0) { 758 if (len <= 0) {
723 /* This implies that the console is closed, is /dev/null, or 759 /* This implies that the console is closed, is /dev/null, or
724 * something went terribly wrong. We still go through the rest 760 * something went terribly wrong. */
725 * of the logic, though, especially the exit handling below. */
726 warnx("Failed to get console input, ignoring console."); 761 warnx("Failed to get console input, ignoring console.");
727 len = 0; 762 /* Put the input terminal back. */
763 restore_term();
764 /* Remove callback from input vq, so it doesn't restart us. */
765 dev->vq->handle_output = NULL;
766 /* Stop listening to this fd: don't call us again. */
767 return false;
728 } 768 }
729 769
730 /* If we read the data into the Guest, fill in the length and send the 770 /* Tell the Guest about the new input. */
731 * interrupt. */ 771 add_used_and_trigger(fd, dev->vq, head, len);
732 if (lenp) {
733 *lenp = len;
734 trigger_irq(fd, irq);
735 }
736 772
737 /* Three ^C within one second? Exit. 773 /* Three ^C within one second? Exit.
738 * 774 *
@@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev)
746 struct timeval now; 782 struct timeval now;
747 gettimeofday(&now, NULL); 783 gettimeofday(&now, NULL);
748 if (now.tv_sec <= abort->start.tv_sec+1) { 784 if (now.tv_sec <= abort->start.tv_sec+1) {
749 u32 args[] = { LHREQ_BREAK, 0 }; 785 unsigned long args[] = { LHREQ_BREAK, 0 };
750 /* Close the fd so Waker will know it has to 786 /* Close the fd so Waker will know it has to
751 * exit. */ 787 * exit. */
752 close(waker_fd); 788 close(waker_fd);
@@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev)
761 /* Any other key resets the abort counter. */ 797 /* Any other key resets the abort counter. */
762 abort->count = 0; 798 abort->count = 0;
763 799
764 /* Now, if we didn't read anything, put the input terminal back and
765 * return failure (meaning, don't call us again). */
766 if (!len) {
767 restore_term();
768 return false;
769 }
770 /* Everything went OK! */ 800 /* Everything went OK! */
771 return true; 801 return true;
772} 802}
773 803
774/* Handling console output is much simpler than input. */ 804/* Handling output for console is simple: we just get all the output buffers
775static u32 handle_console_output(int fd, const struct iovec *iov, 805 * and write them to stdout. */
776 unsigned num, struct device*dev) 806static void handle_console_output(int fd, struct virtqueue *vq)
777{ 807{
778 /* Whatever the Guest sends, write it to standard output. Return the 808 unsigned int head, out, in;
779 * number of bytes written. */ 809 int len;
780 return writev(STDOUT_FILENO, iov, num); 810 struct iovec iov[vq->vring.num];
781} 811
782 812 /* Keep getting output buffers from the Guest until we run out. */
783/* Guest->Host network output is also pretty easy. */ 813 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
784static u32 handle_tun_output(int fd, const struct iovec *iov, 814 if (in)
785 unsigned num, struct device *dev) 815 errx(1, "Input buffers in output queue?");
786{ 816 len = writev(STDOUT_FILENO, iov, out);
787 /* We put a flag in the "priv" pointer of the network device, and set 817 add_used_and_trigger(fd, vq, head, len);
788 * it as soon as we see output. We'll see why in handle_tun_input() */ 818 }
789 *(bool *)dev->priv = true;
790 /* Whatever packet the Guest sent us, write it out to the tun
791 * device. */
792 return writev(dev->fd, iov, num);
793} 819}
794 820
795/* This matches the peer_key() in lguest_net.c. The key for any given slot 821/* Handling output for network is also simple: we get all the output buffers
796 * is the address of the network device's page plus 4 * the slot number. */ 822 * and write them (ignoring the first element) to this device's file descriptor
797static unsigned long peer_offset(unsigned int peernum) 823 * (stdout). */
824static void handle_net_output(int fd, struct virtqueue *vq)
798{ 825{
799 return 4 * peernum; 826 unsigned int head, out, in;
827 int len;
828 struct iovec iov[vq->vring.num];
829
830 /* Keep getting output buffers from the Guest until we run out. */
831 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
832 if (in)
833 errx(1, "Input buffers in output queue?");
834 /* Check header, but otherwise ignore it (we said we supported
835 * no features). */
836 (void)convert(&iov[0], struct virtio_net_hdr);
837 len = writev(vq->dev->fd, iov+1, out-1);
838 add_used_and_trigger(fd, vq, head, len);
839 }
800} 840}
801 841
802/* This is where we handle a packet coming in from the tun device */ 842/* This is where we handle a packet coming in from the tun device to our
843 * Guest. */
803static bool handle_tun_input(int fd, struct device *dev) 844static bool handle_tun_input(int fd, struct device *dev)
804{ 845{
805 u32 irq = 0, *lenp; 846 unsigned int head, in_num, out_num;
806 int len; 847 int len;
807 unsigned num; 848 struct iovec iov[dev->vq->vring.num];
808 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 849 struct virtio_net_hdr *hdr;
809 850
810 /* First we get a buffer the Guest has bound to its key. */ 851 /* First we need a network buffer from the Guests's recv virtqueue. */
811 lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, 852 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
812 &irq); 853 if (head == dev->vq->vring.num) {
813 if (!lenp) {
814 /* Now, it's expected that if we try to send a packet too 854 /* Now, it's expected that if we try to send a packet too
815 * early, the Guest won't be ready yet. This is why we set a 855 * early, the Guest won't be ready yet. Wait until the device
816 * flag when the Guest sends its first packet. If it's sent a 856 * status says it's ready. */
817 * packet we assume it should be ready to receive them. 857 /* FIXME: Actually want DRIVER_ACTIVE here. */
818 * 858 if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
819 * Actually, this is what the status bits in the descriptor are
820 * for: we should *use* them. FIXME! */
821 if (*(bool *)dev->priv)
822 warn("network: no dma buffer!"); 859 warn("network: no dma buffer!");
823 discard_iovec(iov, &num); 860 /* We'll turn this back on if input buffers are registered. */
824 } 861 return false;
862 } else if (out_num)
863 errx(1, "Output buffers in network recv queue?");
864
865 /* First element is the header: we set it to 0 (no features). */
866 hdr = convert(&iov[0], struct virtio_net_hdr);
867 hdr->flags = 0;
868 hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
825 869
826 /* Read the packet from the device directly into the Guest's buffer. */ 870 /* Read the packet from the device directly into the Guest's buffer. */
827 len = readv(dev->fd, iov, num); 871 len = readv(dev->fd, iov+1, in_num-1);
828 if (len <= 0) 872 if (len <= 0)
829 err(1, "reading network"); 873 err(1, "reading network");
830 874
831 /* Write the used_len, and trigger the interrupt for the Guest */ 875 /* Tell the Guest about the new packet. */
832 if (lenp) { 876 add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
833 *lenp = len; 877
834 trigger_irq(fd, irq);
835 }
836 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 878 verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
837 ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], 879 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
838 lenp ? "sent" : "discarded"); 880 head != dev->vq->vring.num ? "sent" : "discarded");
881
839 /* All good. */ 882 /* All good. */
840 return true; 883 return true;
841} 884}
842 885
843/* The last device handling routine is block output: the Guest has sent a DMA 886/* This callback ensures we try again, in case we stopped console or net
844 * to the block device. It will have placed the command it wants in the 887 * delivery because Guest didn't have any buffers. */
845 * "struct lguest_block_page". */ 888static void enable_fd(int fd, struct virtqueue *vq)
846static u32 handle_block_output(int fd, const struct iovec *iov,
847 unsigned num, struct device *dev)
848{ 889{
849 struct lguest_block_page *p = dev->mem; 890 add_device_fd(vq->dev->fd);
850 u32 irq, *lenp; 891 /* Tell waker to listen to it again */
851 unsigned int len, reply_num; 892 write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
852 struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
853 off64_t device_len, off = (off64_t)p->sector * 512;
854
855 /* First we extract the device length from the dev->priv pointer. */
856 device_len = *(off64_t *)dev->priv;
857
858 /* We first check that the read or write is within the length of the
859 * block file. */
860 if (off >= device_len)
861 err(1, "Bad offset %llu vs %llu", off, device_len);
862 /* Move to the right location in the block file. This shouldn't fail,
863 * but best to check. */
864 if (lseek64(dev->fd, off, SEEK_SET) != off)
865 err(1, "Bad seek to sector %i", p->sector);
866
867 verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
868
869 /* They were supposed to bind a reply buffer at key equal to the start
870 * of the block device memory. We need this to tell them when the
871 * request is finished. */
872 lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
873 if (!lenp)
874 err(1, "Block request didn't give us a dma buffer");
875
876 if (p->type) {
877 /* A write request. The DMA they sent contained the data, so
878 * write it out. */
879 len = writev(dev->fd, iov, num);
880 /* Grr... Now we know how long the "struct lguest_dma" they
881 * sent was, we make sure they didn't try to write over the end
882 * of the block file (possibly extending it). */
883 if (off + len > device_len) {
884 /* Trim it back to the correct length */
885 ftruncate64(dev->fd, device_len);
886 /* Die, bad Guest, die. */
887 errx(1, "Write past end %llu+%u", off, len);
888 }
889 /* The reply length is 0: we just send back an empty DMA to
890 * interrupt them and tell them the write is finished. */
891 *lenp = 0;
892 } else {
893 /* A read request. They sent an empty DMA to start the
894 * request, and we put the read contents into the reply
895 * buffer. */
896 len = readv(dev->fd, reply, reply_num);
897 *lenp = len;
898 }
899
900 /* The result is 1 (done), 2 if there was an error (short read or
901 * write). */
902 p->result = 1 + (p->bytes != len);
903 /* Now tell them we've used their reply buffer. */
904 trigger_irq(fd, irq);
905
906 /* We're supposed to return the number of bytes of the output buffer we
907 * used. But the block device uses the "result" field instead, so we
908 * don't bother. */
909 return 0;
910} 893}
911 894
912/* This is the generic routine we call when the Guest sends some DMA out. */ 895/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
913static void handle_output(int fd, unsigned long dma, unsigned long key, 896static void handle_output(int fd, unsigned long addr)
914 struct device_list *devices)
915{ 897{
916 struct device *i; 898 struct device *i;
917 u32 *lenp; 899 struct virtqueue *vq;
918 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 900
919 unsigned num = 0; 901 /* Check each virtqueue. */
920 902 for (i = devices.dev; i; i = i->next) {
921 /* Convert the "struct lguest_dma" they're sending to a "struct 903 for (vq = i->vq; vq; vq = vq->next) {
922 * iovec". */ 904 if (vq->config.pfn == addr/getpagesize()
923 lenp = dma2iov(dma, iov, &num); 905 && vq->handle_output) {
924 906 verbose("Output to %s\n", vq->dev->name);
925 /* Check each device: if they expect output to this key, tell them to 907 vq->handle_output(fd, vq);
926 * handle it. */ 908 return;
927 for (i = devices->dev; i; i = i->next) { 909 }
928 if (i->handle_output && key == i->watch_key) {
929 /* We write the result straight into the used_len field
930 * for them. */
931 *lenp = i->handle_output(fd, iov, num, i);
932 return;
933 } 910 }
934 } 911 }
935 912
936 /* This can happen: the kernel sends any SEND_DMA which doesn't match 913 /* Early console write is done using notify on a nul-terminated string
937 * another Guest to us. It could be that another Guest just left a 914 * in Guest memory. */
938 * network, for example. But it's unusual. */ 915 if (addr >= guest_limit)
939 warnx("Pending dma %p, key %p", (void *)dma, (void *)key); 916 errx(1, "Bad NOTIFY %#lx", addr);
917
918 write(STDOUT_FILENO, from_guest_phys(addr),
919 strnlen(from_guest_phys(addr), guest_limit - addr));
940} 920}
941 921
942/* This is called when the waker wakes us up: check for incoming file 922/* This is called when the waker wakes us up: check for incoming file
943 * descriptors. */ 923 * descriptors. */
944static void handle_input(int fd, struct device_list *devices) 924static void handle_input(int fd)
945{ 925{
946 /* select() wants a zeroed timeval to mean "don't wait". */ 926 /* select() wants a zeroed timeval to mean "don't wait". */
947 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; 927 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
948 928
949 for (;;) { 929 for (;;) {
950 struct device *i; 930 struct device *i;
951 fd_set fds = devices->infds; 931 fd_set fds = devices.infds;
952 932
953 /* If nothing is ready, we're done. */ 933 /* If nothing is ready, we're done. */
954 if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) 934 if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
955 break; 935 break;
956 936
957 /* Otherwise, call the device(s) which have readable 937 /* Otherwise, call the device(s) which have readable
958 * file descriptors and a method of handling them. */ 938 * file descriptors and a method of handling them. */
959 for (i = devices->dev; i; i = i->next) { 939 for (i = devices.dev; i; i = i->next) {
960 if (i->handle_input && FD_ISSET(i->fd, &fds)) { 940 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
941 int dev_fd;
942 if (i->handle_input(fd, i))
943 continue;
944
961 /* If handle_input() returns false, it means we 945 /* If handle_input() returns false, it means we
962 * should no longer service it. 946 * should no longer service it. Networking and
963 * handle_console_input() does this. */ 947 * console do this when there's no input
964 if (!i->handle_input(fd, i)) { 948 * buffers to deliver into. Console also uses
965 /* Clear it from the set of input file 949 * it when it discovers that stdin is
966 * descriptors kept at the head of the 950 * closed. */
967 * device list. */ 951 FD_CLR(i->fd, &devices.infds);
968 FD_CLR(i->fd, &devices->infds); 952 /* Tell waker to ignore it too, by sending a
969 /* Tell waker to ignore it too... */ 953 * negative fd number (-1, since 0 is a valid
970 write(waker_fd, &i->fd, sizeof(i->fd)); 954 * FD number). */
971 } 955 dev_fd = -i->fd - 1;
956 write(waker_fd, &dev_fd, sizeof(dev_fd));
972 } 957 }
973 } 958 }
974 } 959 }
@@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices)
982 * routines to allocate them. 967 * routines to allocate them.
983 * 968 *
984 * This routine allocates a new "struct lguest_device_desc" from descriptor 969 * This routine allocates a new "struct lguest_device_desc" from descriptor
985 * table in the devices array just above the Guest's normal memory. */ 970 * table just above the Guest's normal memory. It returns a pointer to that
986static struct lguest_device_desc * 971 * descriptor. */
987new_dev_desc(struct lguest_device_desc *descs, 972static struct lguest_device_desc *new_dev_desc(u16 type)
988 u16 type, u16 features, u16 num_pages)
989{ 973{
990 unsigned int i; 974 struct lguest_device_desc *d;
991 975
992 for (i = 0; i < LGUEST_MAX_DEVICES; i++) { 976 /* We only have one page for all the descriptors. */
993 if (!descs[i].type) { 977 if (devices.desc_used + sizeof(*d) > getpagesize())
994 descs[i].type = type; 978 errx(1, "Too many devices");
995 descs[i].features = features; 979
996 descs[i].num_pages = num_pages; 980 /* We don't need to set config_len or status: page is 0 already. */
997 /* If they said the device needs memory, we allocate 981 d = (void *)devices.descpage + devices.desc_used;
998 * that now, bumping up the top of Guest memory. */ 982 d->type = type;
999 if (num_pages) { 983 devices.desc_used += sizeof(*d);
1000 map_zeroed_pages(top, num_pages); 984
1001 descs[i].pfn = top/getpagesize(); 985 return d;
1002 top += num_pages*getpagesize();
1003 }
1004 return &descs[i];
1005 }
1006 }
1007 errx(1, "too many devices");
1008} 986}
1009 987
1010/* This monster routine does all the creation and setup of a new device, 988/* Each device descriptor is followed by some configuration information.
1011 * including caling new_dev_desc() to allocate the descriptor and device 989 * The first byte is a "status" byte for the Guest to report what's happening.
1012 * memory. */ 990 * After that are fields: u8 type, u8 len, [... len bytes...].
1013static struct device *new_device(struct device_list *devices, 991 *
1014 u16 type, u16 num_pages, u16 features, 992 * This routine adds a new field to an existing device's descriptor. It only
1015 int fd, 993 * works for the last device, but that's OK because that's how we use it. */
1016 bool (*handle_input)(int, struct device *), 994static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
1017 unsigned long watch_off, 995{
1018 u32 (*handle_output)(int, 996 /* This is the last descriptor, right? */
1019 const struct iovec *, 997 assert(devices.descpage + devices.desc_used
1020 unsigned, 998 == (u8 *)(dev->desc + 1) + dev->desc->config_len);
1021 struct device *)) 999
1000 /* We only have one page of device descriptions. */
1001 if (devices.desc_used + 2 + len > getpagesize())
1002 errx(1, "Too many devices");
1003
1004 /* Copy in the new config header: type then length. */
1005 devices.descpage[devices.desc_used++] = type;
1006 devices.descpage[devices.desc_used++] = len;
1007 memcpy(devices.descpage + devices.desc_used, c, len);
1008 devices.desc_used += len;
1009
1010 /* Update the device descriptor length: two byte head then data. */
1011 dev->desc->config_len += 2 + len;
1012}
1013
1014/* This routine adds a virtqueue to a device. We specify how many descriptors
1015 * the virtqueue is to have. */
1016static void add_virtqueue(struct device *dev, unsigned int num_descs,
1017 void (*handle_output)(int fd, struct virtqueue *me))
1018{
1019 unsigned int pages;
1020 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1021 void *p;
1022
1023 /* First we need some pages for this virtqueue. */
1024 pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize();
1025 p = get_pages(pages);
1026
1027 /* Initialize the configuration. */
1028 vq->config.num = num_descs;
1029 vq->config.irq = devices.next_irq++;
1030 vq->config.pfn = to_guest_phys(p) / getpagesize();
1031
1032 /* Initialize the vring. */
1033 vring_init(&vq->vring, num_descs, p);
1034
1035 /* Add the configuration information to this device's descriptor. */
1036 add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
1037 sizeof(vq->config), &vq->config);
1038
1039 /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
1040 * second. */
1041 for (i = &dev->vq; *i; i = &(*i)->next);
1042 *i = vq;
1043
1044 /* Link virtqueue back to device. */
1045 vq->dev = dev;
1046
1047 /* Set up handler. */
1048 vq->handle_output = handle_output;
1049 if (!handle_output)
1050 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1051}
1052
1053/* This routine does all the creation and setup of a new device, including
1054 * calling new_dev_desc() to allocate the descriptor and device memory. */
1055static struct device *new_device(const char *name, u16 type, int fd,
1056 bool (*handle_input)(int, struct device *))
1022{ 1057{
1023 struct device *dev = malloc(sizeof(*dev)); 1058 struct device *dev = malloc(sizeof(*dev));
1024 1059
@@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices,
1026 * easier, but the user expects the devices to be arranged on the bus 1061 * easier, but the user expects the devices to be arranged on the bus
1027 * in command-line order. The first network device on the command line 1062 * in command-line order. The first network device on the command line
1028 * is eth0, the first block device /dev/lgba, etc. */ 1063 * is eth0, the first block device /dev/lgba, etc. */
1029 *devices->lastdev = dev; 1064 *devices.lastdev = dev;
1030 dev->next = NULL; 1065 dev->next = NULL;
1031 devices->lastdev = &dev->next; 1066 devices.lastdev = &dev->next;
1032 1067
1033 /* Now we populate the fields one at a time. */ 1068 /* Now we populate the fields one at a time. */
1034 dev->fd = fd; 1069 dev->fd = fd;
1035 /* If we have an input handler for this file descriptor, then we add it 1070 /* If we have an input handler for this file descriptor, then we add it
1036 * to the device_list's fdset and maxfd. */ 1071 * to the device_list's fdset and maxfd. */
1037 if (handle_input) 1072 if (handle_input)
1038 set_fd(dev->fd, devices); 1073 add_device_fd(dev->fd);
1039 dev->desc = new_dev_desc(devices->descs, type, features, num_pages); 1074 dev->desc = new_dev_desc(type);
1040 dev->mem = (void *)(dev->desc->pfn * getpagesize());
1041 dev->handle_input = handle_input; 1075 dev->handle_input = handle_input;
1042 dev->watch_key = (unsigned long)dev->mem + watch_off; 1076 dev->name = name;
1043 dev->handle_output = handle_output;
1044 return dev; 1077 return dev;
1045} 1078}
1046 1079
1047/* Our first setup routine is the console. It's a fairly simple device, but 1080/* Our first setup routine is the console. It's a fairly simple device, but
1048 * UNIX tty handling makes it uglier than it could be. */ 1081 * UNIX tty handling makes it uglier than it could be. */
1049static void setup_console(struct device_list *devices) 1082static void setup_console(void)
1050{ 1083{
1051 struct device *dev; 1084 struct device *dev;
1052 1085
@@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices)
1062 atexit(restore_term); 1095 atexit(restore_term);
1063 } 1096 }
1064 1097
1065 /* We don't currently require any memory for the console, so we ask for 1098 dev = new_device("console", VIRTIO_ID_CONSOLE,
1066 * 0 pages. */ 1099 STDIN_FILENO, handle_console_input);
1067 dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
1068 STDIN_FILENO, handle_console_input,
1069 LGUEST_CONSOLE_DMA_KEY, handle_console_output);
1070 /* We store the console state in dev->priv, and initialize it. */ 1100 /* We store the console state in dev->priv, and initialize it. */
1071 dev->priv = malloc(sizeof(struct console_abort)); 1101 dev->priv = malloc(sizeof(struct console_abort));
1072 ((struct console_abort *)dev->priv)->count = 0; 1102 ((struct console_abort *)dev->priv)->count = 0;
1073 verbose("device %p: console\n",
1074 (void *)(dev->desc->pfn * getpagesize()));
1075}
1076 1103
1077/* Setting up a block file is also fairly straightforward. */ 1104 /* The console needs two virtqueues: the input then the output. When
1078static void setup_block_file(const char *filename, struct device_list *devices) 1105 * they put something in the input queue, we make sure we're listening to
1079{ 1106 * stdin. When they put something in the output queue, we write it to
1080 int fd; 1107 * stdout. */
1081 struct device *dev; 1108 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1082 off64_t *device_len; 1109 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
1083 struct lguest_block_page *p; 1110
1084 1111 verbose("device %u: console\n", devices.device_num++);
1085 /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
1086 * open with O_DIRECT because otherwise our benchmarks go much too
1087 * fast. */
1088 fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
1089
1090 /* We want one page, and have no input handler (the block file never
1091 * has anything interesting to say to us). Our timing will be quite
1092 * random, so it should be a reasonable randomness source. */
1093 dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
1094 LGUEST_DEVICE_F_RANDOMNESS,
1095 fd, NULL, 0, handle_block_output);
1096
1097 /* We store the device size in the private area */
1098 device_len = dev->priv = malloc(sizeof(*device_len));
1099 /* This is the safe way of establishing the size of our device: it
1100 * might be a normal file or an actual block device like /dev/hdb. */
1101 *device_len = lseek64(fd, 0, SEEK_END);
1102
1103 /* The device memory is a "struct lguest_block_page". It's zeroed
1104 * already, we just need to put in the device size. Block devices
1105 * think in sectors (ie. 512 byte chunks), so we translate here. */
1106 p = dev->mem;
1107 p->num_sectors = *device_len/512;
1108 verbose("device %p: block %i sectors\n",
1109 (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
1110} 1112}
1113/*:*/
1111 1114
1112/* 1115/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1113 * Network Devices. 1116 * --sharenet=<name> option which opens or creates a named pipe. This can be
1117 * used to send packets to another guest in a 1:1 manner.
1114 * 1118 *
1115 * Setting up network devices is quite a pain, because we have three types. 1119 * More sophisticated is to use one of the tools developed for projects like UML
1116 * First, we have the inter-Guest network. This is a file which is mapped into 1120 * to do networking.
1117 * the address space of the Guests who are on the network. Because it is a
1118 * shared mapping, the same page underlies all the devices, and they can send
1119 * DMA to each other.
1120 * 1121 *
1121 * Remember from our network driver, the Guest is told what slot in the page it 1122 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1122 * is to use. We use exclusive fnctl locks to reserve a slot. If another 1123 * completely generic ("here's my vring, attach to your vring") and would work
1123 * Guest is using a slot, the lock will fail and we try another. Because fnctl 1124 * for any traffic. Of course, namespace and permissions issues need to be
1124 * locks are cleaned up automatically when we die, this cleverly means that our 1125 * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1125 * reservation on the slot will vanish if we crash. */ 1126 * multiple inter-guest channels behind one interface, although it would
1126static unsigned int find_slot(int netfd, const char *filename) 1127 * require some manner of hotplugging new virtio channels.
1127{ 1128 *
1128 struct flock fl; 1129 * Finally, we could implement a virtio network switch in the kernel. :*/
1129
1130 fl.l_type = F_WRLCK;
1131 fl.l_whence = SEEK_SET;
1132 fl.l_len = 1;
1133 /* Try a 1 byte lock in each possible position number */
1134 for (fl.l_start = 0;
1135 fl.l_start < getpagesize()/sizeof(struct lguest_net);
1136 fl.l_start++) {
1137 /* If we succeed, return the slot number. */
1138 if (fcntl(netfd, F_SETLK, &fl) == 0)
1139 return fl.l_start;
1140 }
1141 errx(1, "No free slots in network file %s", filename);
1142}
1143
1144/* This function sets up the network file */
1145static void setup_net_file(const char *filename,
1146 struct device_list *devices)
1147{
1148 int netfd;
1149 struct device *dev;
1150
1151 /* We don't use open_or_die() here: for friendliness we create the file
1152 * if it doesn't already exist. */
1153 netfd = open(filename, O_RDWR, 0);
1154 if (netfd < 0) {
1155 if (errno == ENOENT) {
1156 netfd = open(filename, O_RDWR|O_CREAT, 0600);
1157 if (netfd >= 0) {
1158 /* If we succeeded, initialize the file with a
1159 * blank page. */
1160 char page[getpagesize()];
1161 memset(page, 0, sizeof(page));
1162 write(netfd, page, sizeof(page));
1163 }
1164 }
1165 if (netfd < 0)
1166 err(1, "cannot open net file '%s'", filename);
1167 }
1168
1169 /* We need 1 page, and the features indicate the slot to use and that
1170 * no checksum is needed. We never touch this device again; it's
1171 * between the Guests on the network, so we don't register input or
1172 * output handlers. */
1173 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1174 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
1175 -1, NULL, 0, NULL);
1176
1177 /* Map the shared file. */
1178 if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
1179 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
1180 err(1, "could not mmap '%s'", filename);
1181 verbose("device %p: shared net %s, peer %i\n",
1182 (void *)(dev->desc->pfn * getpagesize()), filename,
1183 dev->desc->features & ~LGUEST_NET_F_NOCSUM);
1184}
1185/*:*/
1186 1130
1187static u32 str2ip(const char *ipaddr) 1131static u32 str2ip(const char *ipaddr)
1188{ 1132{
@@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1217 1161
1218/* This sets up the Host end of the network device with an IP address, brings 1162/* This sets up the Host end of the network device with an IP address, brings
1219 * it up so packets will flow, then copies the MAC address into the hwaddr 1163 * it up so packets will flow, then copies the MAC address into the hwaddr
1220 * pointer (in practice, the Host's slot in the network device's memory). */ 1164 * pointer. */
1221static void configure_device(int fd, const char *devname, u32 ipaddr, 1165static void configure_device(int fd, const char *devname, u32 ipaddr,
1222 unsigned char hwaddr[6]) 1166 unsigned char hwaddr[6])
1223{ 1167{
@@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
1243 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); 1187 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
1244} 1188}
1245 1189
1246/*L:195 The other kind of network is a Host<->Guest network. This can either 1190/*L:195 Our network is a Host<->Guest network. This can either use bridging or
1247 * use briding or routing, but the principle is the same: it uses the "tun" 1191 * routing, but the principle is the same: it uses the "tun" device to inject
1248 * device to inject packets into the Host as if they came in from a normal 1192 * packets into the Host as if they came in from a normal network card. We
1249 * network card. We just shunt packets between the Guest and the tun 1193 * just shunt packets between the Guest and the tun device. */
1250 * device. */ 1194static void setup_tun_net(const char *arg)
1251static void setup_tun_net(const char *arg, struct device_list *devices)
1252{ 1195{
1253 struct device *dev; 1196 struct device *dev;
1254 struct ifreq ifr; 1197 struct ifreq ifr;
1255 int netfd, ipfd; 1198 int netfd, ipfd;
1256 u32 ip; 1199 u32 ip;
1257 const char *br_name = NULL; 1200 const char *br_name = NULL;
1201 u8 hwaddr[6];
1258 1202
1259 /* We open the /dev/net/tun device and tell it we want a tap device. A 1203 /* We open the /dev/net/tun device and tell it we want a tap device. A
1260 * tap device is like a tun device, only somehow different. To tell 1204 * tap device is like a tun device, only somehow different. To tell
@@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
1270 * device: trust us! */ 1214 * device: trust us! */
1271 ioctl(netfd, TUNSETNOCSUM, 1); 1215 ioctl(netfd, TUNSETNOCSUM, 1);
1272 1216
1273 /* We create the net device with 1 page, using the features field of 1217 /* First we create a new network device. */
1274 * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and 1218 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
1275 * that the device has fairly random timing. We do *not* specify
1276 * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
1277 *
1278 * We will put our MAC address is slot 0 for the Guest to see, so
1279 * it will send packets to us using the key "peer_offset(0)": */
1280 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1281 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
1282 handle_tun_input, peer_offset(0), handle_tun_output);
1283 1219
1284 /* We keep a flag which says whether we've seen packets come out from 1220 /* Network devices need a receive and a send queue, just like
1285 * this network device. */ 1221 * console. */
1286 dev->priv = malloc(sizeof(bool)); 1222 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1287 *(bool *)dev->priv = false; 1223 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
1288 1224
1289 /* We need a socket to perform the magic network ioctls to bring up the 1225 /* We need a socket to perform the magic network ioctls to bring up the
1290 * tap interface, connect to the bridge etc. Any socket will do! */ 1226 * tap interface, connect to the bridge etc. Any socket will do! */
@@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
1300 } else /* It is an IP address to set up the device with */ 1236 } else /* It is an IP address to set up the device with */
1301 ip = str2ip(arg); 1237 ip = str2ip(arg);
1302 1238
1303 /* We are peer 0, ie. first slot, so we hand dev->mem to this routine 1239 /* Set up the tun device, and get the mac address for the interface. */
1304 * to write the MAC address at the start of the device memory. */ 1240 configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
1305 configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
1306 1241
1307 /* Set "promisc" bit: we want every single packet if we're going to 1242 /* Tell Guest what MAC address to use. */
1308 * bridge to other machines (and otherwise it doesn't matter). */ 1243 add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
1309 *((u8 *)dev->mem) |= 0x1;
1310 1244
1245 /* We don't need the socket any more; setup is done. */
1311 close(ipfd); 1246 close(ipfd);
1312 1247
1313 verbose("device %p: tun net %u.%u.%u.%u\n", 1248 verbose("device %u: tun net %u.%u.%u.%u\n",
1314 (void *)(dev->desc->pfn * getpagesize()), 1249 devices.device_num++,
1315 (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); 1250 (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
1316 if (br_name) 1251 if (br_name)
1317 verbose("attached to bridge: %s\n", br_name); 1252 verbose("attached to bridge: %s\n", br_name);
1318} 1253}
1254
1255
1256/*
1257 * Block device.
1258 *
1259 * Serving a block device is really easy: the Guest asks for a block number and
1260 * we read or write that position in the file.
1261 *
1262 * Unfortunately, this is amazingly slow: the Guest waits until the read is
1263 * finished before running anything else, even if it could be doing useful
1264 * work. We could use async I/O, except it's reputed to suck so hard that
1265 * characters actually go missing from your code when you try to use it.
1266 *
1267 * So we farm the I/O out to a thread, and communicate with it via a pipe. */
1268
1269/* This hangs off device->priv, with the data. */
1270struct vblk_info
1271{
1272 /* The size of the file. */
1273 off64_t len;
1274
1275 /* The file descriptor for the file. */
1276 int fd;
1277
1278 /* IO thread listens on this file descriptor [0]. */
1279 int workpipe[2];
1280
1281 /* IO thread writes to this file descriptor to mark it done, then
1282 * Launcher triggers interrupt to Guest. */
1283 int done_fd;
1284};
1285
1286/* This is the core of the I/O thread. It returns true if it did something. */
1287static bool service_io(struct device *dev)
1288{
1289 struct vblk_info *vblk = dev->priv;
1290 unsigned int head, out_num, in_num, wlen;
1291 int ret;
1292 struct virtio_blk_inhdr *in;
1293 struct virtio_blk_outhdr *out;
1294 struct iovec iov[dev->vq->vring.num];
1295 off64_t off;
1296
1297 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
1298 if (head == dev->vq->vring.num)
1299 return false;
1300
1301 if (out_num == 0 || in_num == 0)
1302 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1303 head, out_num, in_num);
1304
1305 out = convert(&iov[0], struct virtio_blk_outhdr);
1306 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
1307 off = out->sector * 512;
1308
1309 /* This is how we implement barriers. Pretty poor, no? */
1310 if (out->type & VIRTIO_BLK_T_BARRIER)
1311 fdatasync(vblk->fd);
1312
1313 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1314 fprintf(stderr, "Scsi commands unsupported\n");
1315 in->status = VIRTIO_BLK_S_UNSUPP;
1316 wlen = sizeof(in);
1317 } else if (out->type & VIRTIO_BLK_T_OUT) {
1318 /* Write */
1319
1320 /* Move to the right location in the block file. This can fail
1321 * if they try to write past end. */
1322 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1323 err(1, "Bad seek to sector %llu", out->sector);
1324
1325 ret = writev(vblk->fd, iov+1, out_num-1);
1326 verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1327
1328 /* Grr... Now we know how long the descriptor they sent was, we
1329 * make sure they didn't try to write over the end of the block
1330 * file (possibly extending it). */
1331 if (ret > 0 && off + ret > vblk->len) {
1332 /* Trim it back to the correct length */
1333 ftruncate64(vblk->fd, vblk->len);
1334 /* Die, bad Guest, die. */
1335 errx(1, "Write past end %llu+%u", off, ret);
1336 }
1337 wlen = sizeof(in);
1338 in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1339 } else {
1340 /* Read */
1341
1342 /* Move to the right location in the block file. This can fail
1343 * if they try to read past end. */
1344 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1345 err(1, "Bad seek to sector %llu", out->sector);
1346
1347 ret = readv(vblk->fd, iov+1, in_num-1);
1348 verbose("READ from sector %llu: %i\n", out->sector, ret);
1349 if (ret >= 0) {
1350 wlen = sizeof(in) + ret;
1351 in->status = VIRTIO_BLK_S_OK;
1352 } else {
1353 wlen = sizeof(in);
1354 in->status = VIRTIO_BLK_S_IOERR;
1355 }
1356 }
1357
1358 /* We can't trigger an IRQ, because we're not the Launcher. It does
1359 * that when we tell it we're done. */
1360 add_used(dev->vq, head, wlen);
1361 return true;
1362}
1363
1364/* This is the thread which actually services the I/O. */
1365static int io_thread(void *_dev)
1366{
1367 struct device *dev = _dev;
1368 struct vblk_info *vblk = dev->priv;
1369 char c;
1370
1371 /* Close other side of workpipe so we get 0 read when main dies. */
1372 close(vblk->workpipe[1]);
1373 /* Close the other side of the done_fd pipe. */
1374 close(dev->fd);
1375
1376 /* When this read fails, it means Launcher died, so we follow. */
1377 while (read(vblk->workpipe[0], &c, 1) == 1) {
1378 /* We acknowledge each request immediately, to reduce latency,
1379 * rather than waiting until we've done them all. I haven't
1380 * measured to see if it makes any difference. */
1381 while (service_io(dev))
1382 write(vblk->done_fd, &c, 1);
1383 }
1384 return 0;
1385}
1386
1387/* When the thread says some I/O is done, we interrupt the Guest. */
1388static bool handle_io_finish(int fd, struct device *dev)
1389{
1390 char c;
1391
1392 /* If child died, presumably it printed message. */
1393 if (read(dev->fd, &c, 1) != 1)
1394 exit(1);
1395
1396 /* It did some work, so trigger the irq. */
1397 trigger_irq(fd, dev->vq);
1398 return true;
1399}
1400
1401/* When the Guest submits some I/O, we wake the I/O thread. */
1402static void handle_virtblk_output(int fd, struct virtqueue *vq)
1403{
1404 struct vblk_info *vblk = vq->dev->priv;
1405 char c = 0;
1406
1407 /* Wake up I/O thread and tell it to go to work! */
1408 if (write(vblk->workpipe[1], &c, 1) != 1)
1409 /* Presumably it indicated why it died. */
1410 exit(1);
1411}
1412
1413/* This creates a virtual block device. */
1414static void setup_block_file(const char *filename)
1415{
1416 int p[2];
1417 struct device *dev;
1418 struct vblk_info *vblk;
1419 void *stack;
1420 u64 cap;
1421 unsigned int val;
1422
1423 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1424 pipe(p);
1425
1426 /* The device responds to return from I/O thread. */
1427 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
1428
1429 /* The device has a virtqueue. */
1430 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
1431
1432 /* Allocate the room for our own bookkeeping */
1433 vblk = dev->priv = malloc(sizeof(*vblk));
1434
1435 /* First we open the file and store the length. */
1436 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1437 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1438
1439 /* Tell Guest how many sectors this device has. */
1440 cap = cpu_to_le64(vblk->len / 512);
1441 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
1442
1443 /* Tell Guest not to put in too many descriptors at once: two are used
1444 * for the in and out elements. */
1445 val = cpu_to_le32(VIRTQUEUE_NUM - 2);
1446 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);
1447
1448 /* The I/O thread writes to this end of the pipe when done. */
1449 vblk->done_fd = p[1];
1450
1451 /* This is how we tell the I/O thread about more work. */
1452 pipe(vblk->workpipe);
1453
1454 /* Create stack for thread and run it */
1455 stack = malloc(32768);
1456 if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
1457 err(1, "Creating clone");
1458
1459 /* We don't need to keep the I/O thread's end of the pipes open. */
1460 close(vblk->done_fd);
1461 close(vblk->workpipe[0]);
1462
1463 verbose("device %u: virtblock %llu sectors\n",
1464 devices.device_num, cap);
1465}
1319/* That's the end of device setup. */ 1466/* That's the end of device setup. */
1320 1467
1321/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves 1468/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
1322 * its input and output, and finally, lays it to rest. */ 1469 * its input and output, and finally, lays it to rest. */
1323static void __attribute__((noreturn)) 1470static void __attribute__((noreturn)) run_guest(int lguest_fd)
1324run_guest(int lguest_fd, struct device_list *device_list)
1325{ 1471{
1326 for (;;) { 1472 for (;;) {
1327 u32 args[] = { LHREQ_BREAK, 0 }; 1473 unsigned long args[] = { LHREQ_BREAK, 0 };
1328 unsigned long arr[2]; 1474 unsigned long notify_addr;
1329 int readval; 1475 int readval;
1330 1476
1331 /* We read from the /dev/lguest device to run the Guest. */ 1477 /* We read from the /dev/lguest device to run the Guest. */
1332 readval = read(lguest_fd, arr, sizeof(arr)); 1478 readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
1333
1334 /* The read can only really return sizeof(arr) (the Guest did a
1335 * SEND_DMA to us), or an error. */
1336 1479
1337 /* For a successful read, arr[0] is the address of the "struct 1480 /* One unsigned long means the Guest did HCALL_NOTIFY */
1338 * lguest_dma", and arr[1] is the key the Guest sent to. */ 1481 if (readval == sizeof(notify_addr)) {
1339 if (readval == sizeof(arr)) { 1482 verbose("Notify on address %#lx\n", notify_addr);
1340 handle_output(lguest_fd, arr[0], arr[1], device_list); 1483 handle_output(lguest_fd, notify_addr);
1341 continue; 1484 continue;
1342 /* ENOENT means the Guest died. Reading tells us why. */ 1485 /* ENOENT means the Guest died. Reading tells us why. */
1343 } else if (errno == ENOENT) { 1486 } else if (errno == ENOENT) {
@@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list)
1351 1494
1352 /* Service input, then unset the BREAK which releases 1495 /* Service input, then unset the BREAK which releases
1353 * the Waker. */ 1496 * the Waker. */
1354 handle_input(lguest_fd, device_list); 1497 handle_input(lguest_fd);
1355 if (write(lguest_fd, args, sizeof(args)) < 0) 1498 if (write(lguest_fd, args, sizeof(args)) < 0)
1356 err(1, "Resetting break"); 1499 err(1, "Resetting break");
1357 } 1500 }
@@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list)
1365 1508
1366static struct option opts[] = { 1509static struct option opts[] = {
1367 { "verbose", 0, NULL, 'v' }, 1510 { "verbose", 0, NULL, 'v' },
1368 { "sharenet", 1, NULL, 's' },
1369 { "tunnet", 1, NULL, 't' }, 1511 { "tunnet", 1, NULL, 't' },
1370 { "block", 1, NULL, 'b' }, 1512 { "block", 1, NULL, 'b' },
1371 { "initrd", 1, NULL, 'i' }, 1513 { "initrd", 1, NULL, 'i' },
@@ -1374,37 +1516,21 @@ static struct option opts[] = {
1374static void usage(void) 1516static void usage(void)
1375{ 1517{
1376 errx(1, "Usage: lguest [--verbose] " 1518 errx(1, "Usage: lguest [--verbose] "
1377 "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" 1519 "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
1378 "|--block=<filename>|--initrd=<filename>]...\n" 1520 "|--block=<filename>|--initrd=<filename>]...\n"
1379 "<mem-in-mb> vmlinux [args...]"); 1521 "<mem-in-mb> vmlinux [args...]");
1380} 1522}
1381 1523
1382/*L:100 The Launcher code itself takes us out into userspace, that scary place 1524/*L:105 The main routine is where the real work begins: */
1383 * where pointers run wild and free! Unfortunately, like most userspace
1384 * programs, it's quite boring (which is why everyone like to hack on the
1385 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
1386 * will get you through this section. Or, maybe not.
1387 *
1388 * The Launcher binary sits up high, usually starting at address 0xB8000000.
1389 * Everything below this is the "physical" memory for the Guest. For example,
1390 * if the Guest were to write a "1" at physical address 0, we would see a "1"
1391 * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
1392 *
1393 * This can be tough to get your head around, but usually it just means that we
1394 * don't need to do any conversion when the Guest gives us it's "physical"
1395 * addresses.
1396 */
1397int main(int argc, char *argv[]) 1525int main(int argc, char *argv[])
1398{ 1526{
1399 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size 1527 /* Memory, top-level pagetable, code startpoint and size of the
1400 * of the (optional) initrd. */ 1528 * (optional) initrd. */
1401 unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; 1529 unsigned long mem = 0, pgdir, start, initrd_size = 0;
1402 /* A temporary and the /dev/lguest file descriptor. */ 1530 /* A temporary and the /dev/lguest file descriptor. */
1403 int i, c, lguest_fd; 1531 int i, c, lguest_fd;
1404 /* The list of Guest devices, based on command line arguments. */ 1532 /* The boot information for the Guest. */
1405 struct device_list device_list; 1533 struct boot_params *boot;
1406 /* The boot information for the Guest: at guest-physical address 0. */
1407 void *boot = (void *)0;
1408 /* If they specify an initrd file to load. */ 1534 /* If they specify an initrd file to load. */
1409 const char *initrd_name = NULL; 1535 const char *initrd_name = NULL;
1410 1536
@@ -1412,11 +1538,12 @@ int main(int argc, char *argv[])
1412 * device receive input from a file descriptor, we keep an fdset 1538 * device receive input from a file descriptor, we keep an fdset
1413 * (infds) and the maximum fd number (max_infd) with the head of the 1539 * (infds) and the maximum fd number (max_infd) with the head of the
1414 * list. We also keep a pointer to the last device, for easy appending 1540 * list. We also keep a pointer to the last device, for easy appending
1415 * to the list. */ 1541 * to the list. Finally, we keep the next interrupt number to hand out
1416 device_list.max_infd = -1; 1542 * (1: remember that 0 is used by the timer). */
1417 device_list.dev = NULL; 1543 FD_ZERO(&devices.infds);
1418 device_list.lastdev = &device_list.dev; 1544 devices.max_infd = -1;
1419 FD_ZERO(&device_list.infds); 1545 devices.lastdev = &devices.dev;
1546 devices.next_irq = 1;
1420 1547
1421 /* We need to know how much memory so we can set up the device 1548 /* We need to know how much memory so we can set up the device
1422 * descriptor and memory pages for the devices as we parse the command 1549 * descriptor and memory pages for the devices as we parse the command
@@ -1424,9 +1551,16 @@ int main(int argc, char *argv[])
1424 * of memory now. */ 1551 * of memory now. */
1425 for (i = 1; i < argc; i++) { 1552 for (i = 1; i < argc; i++) {
1426 if (argv[i][0] != '-') { 1553 if (argv[i][0] != '-') {
1427 mem = top = atoi(argv[i]) * 1024 * 1024; 1554 mem = atoi(argv[i]) * 1024 * 1024;
1428 device_list.descs = map_zeroed_pages(top, 1); 1555 /* We start by mapping anonymous pages over all of
1429 top += getpagesize(); 1556 * guest-physical memory range. This fills it with 0,
1557 * and ensures that the Guest won't be killed when it
1558 * tries to access it. */
1559 guest_base = map_zeroed_pages(mem / getpagesize()
1560 + DEVICE_PAGES);
1561 guest_limit = mem;
1562 guest_max = mem + DEVICE_PAGES*getpagesize();
1563 devices.descpage = get_pages(1);
1430 break; 1564 break;
1431 } 1565 }
1432 } 1566 }
@@ -1437,14 +1571,11 @@ int main(int argc, char *argv[])
1437 case 'v': 1571 case 'v':
1438 verbose = true; 1572 verbose = true;
1439 break; 1573 break;
1440 case 's':
1441 setup_net_file(optarg, &device_list);
1442 break;
1443 case 't': 1574 case 't':
1444 setup_tun_net(optarg, &device_list); 1575 setup_tun_net(optarg);
1445 break; 1576 break;
1446 case 'b': 1577 case 'b':
1447 setup_block_file(optarg, &device_list); 1578 setup_block_file(optarg);
1448 break; 1579 break;
1449 case 'i': 1580 case 'i':
1450 initrd_name = optarg; 1581 initrd_name = optarg;
@@ -1459,56 +1590,60 @@ int main(int argc, char *argv[])
1459 if (optind + 2 > argc) 1590 if (optind + 2 > argc)
1460 usage(); 1591 usage();
1461 1592
1462 /* We always have a console device */ 1593 verbose("Guest base is at %p\n", guest_base);
1463 setup_console(&device_list);
1464 1594
1465 /* We start by mapping anonymous pages over all of guest-physical 1595 /* We always have a console device */
1466 * memory range. This fills it with 0, and ensures that the Guest 1596 setup_console();
1467 * won't be killed when it tries to access it. */
1468 map_zeroed_pages(0, mem / getpagesize());
1469 1597
1470 /* Now we load the kernel */ 1598 /* Now we load the kernel */
1471 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1599 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1472 &page_offset); 1600
1601 /* Boot information is stashed at physical address 0 */
1602 boot = from_guest_phys(0);
1473 1603
1474 /* Map the initrd image if requested (at top of physical memory) */ 1604 /* Map the initrd image if requested (at top of physical memory) */
1475 if (initrd_name) { 1605 if (initrd_name) {
1476 initrd_size = load_initrd(initrd_name, mem); 1606 initrd_size = load_initrd(initrd_name, mem);
1477 /* These are the location in the Linux boot header where the 1607 /* These are the location in the Linux boot header where the
1478 * start and size of the initrd are expected to be found. */ 1608 * start and size of the initrd are expected to be found. */
1479 *(unsigned long *)(boot+0x218) = mem - initrd_size; 1609 boot->hdr.ramdisk_image = mem - initrd_size;
1480 *(unsigned long *)(boot+0x21c) = initrd_size; 1610 boot->hdr.ramdisk_size = initrd_size;
1481 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1611 /* The bootloader type 0xFF means "unknown"; that's OK. */
1482 *(unsigned char *)(boot+0x210) = 0xFF; 1612 boot->hdr.type_of_loader = 0xFF;
1483 } 1613 }
1484 1614
1485 /* Set up the initial linear pagetables, starting below the initrd. */ 1615 /* Set up the initial linear pagetables, starting below the initrd. */
1486 pgdir = setup_pagetables(mem, initrd_size, page_offset); 1616 pgdir = setup_pagetables(mem, initrd_size);
1487 1617
1488 /* The Linux boot header contains an "E820" memory map: ours is a 1618 /* The Linux boot header contains an "E820" memory map: ours is a
1489 * simple, single region. */ 1619 * simple, single region. */
1490 *(char*)(boot+E820NR) = 1; 1620 boot->e820_entries = 1;
1491 *((struct e820entry *)(boot+E820MAP)) 1621 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
1492 = ((struct e820entry) { 0, mem, E820_RAM });
1493 /* The boot header contains a command line pointer: we put the command 1622 /* The boot header contains a command line pointer: we put the command
1494 * line after the boot header (at address 4096) */ 1623 * line after the boot header. */
1495 *(void **)(boot + 0x228) = boot + 4096; 1624 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1496 concat(boot + 4096, argv+optind+2); 1625 concat((char *)(boot + 1), argv+optind+2);
1626
1627 /* Boot protocol version: 2.07 supports the fields for lguest. */
1628 boot->hdr.version = 0x207;
1629
1630 /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
1631 boot->hdr.hardware_subarch = 1;
1497 1632
1498 /* The guest type value of "1" tells the Guest it's under lguest. */ 1633 /* Tell the entry path not to try to reload segment registers. */
1499 *(int *)(boot + 0x23c) = 1; 1634 boot->hdr.loadflags |= KEEP_SEGMENTS;
1500 1635
1501 /* We tell the kernel to initialize the Guest: this returns the open 1636 /* We tell the kernel to initialize the Guest: this returns the open
1502 * /dev/lguest file descriptor. */ 1637 * /dev/lguest file descriptor. */
1503 lguest_fd = tell_kernel(pgdir, start, page_offset); 1638 lguest_fd = tell_kernel(pgdir, start);
1504 1639
1505 /* We fork off a child process, which wakes the Launcher whenever one 1640 /* We fork off a child process, which wakes the Launcher whenever one
1506 * of the input file descriptors needs attention. Otherwise we would 1641 * of the input file descriptors needs attention. Otherwise we would
1507 * run the Guest until it tries to output something. */ 1642 * run the Guest until it tries to output something. */
1508 waker_fd = setup_waker(lguest_fd, &device_list); 1643 waker_fd = setup_waker(lguest_fd);
1509 1644
1510 /* Finally, run the Guest. This doesn't return. */ 1645 /* Finally, run the Guest. This doesn't return. */
1511 run_guest(lguest_fd, &device_list); 1646 run_guest(lguest_fd);
1512} 1647}
1513/*:*/ 1648/*:*/
1514 1649
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 821617bd6c04..7885ab2d5f53 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
6Linux developers and users to experiment with virtualization with the 6Linux developers and users to experiment with virtualization with the
7minimum of complexity. Nonetheless, it should have sufficient 7minimum of complexity. Nonetheless, it should have sufficient
8features to make it useful for specific tasks, and, of course, you are 8features to make it useful for specific tasks, and, of course, you are
9encouraged to fork and enhance it. 9encouraged to fork and enhance it (see drivers/lguest/README).
10 10
11Features: 11Features:
12 12
@@ -23,19 +23,30 @@ Developer features:
23 23
24Running Lguest: 24Running Lguest:
25 25
26- Lguest runs the same kernel as guest and host. You can configure 26- The easiest way to run lguest is to use the same kernel as guest and host.
27 them differently, but usually it's easiest not to. 27 You can configure them differently, but usually it's easiest not to.
28 28
29 You will need to configure your kernel with the following options: 29 You will need to configure your kernel with the following options:
30 30
31 CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] 31 "General setup":
32 CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") 32 "Prompt for development and/or incomplete code/drivers" = Y
33 CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") 33 (CONFIG_EXPERIMENTAL=y)
34 CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") 34
35 CONFIG_LGUEST=y/m ("Linux hypervisor example code") 35 "Processor type and features":
36 36 "Paravirtualized guest support" = Y
37 and I recommend: 37 "Lguest guest support" = Y
38 CONFIG_HZ=100 ("Timer frequency")[2] 38 "High Memory Support" = off/4GB
39 "Alignment value to which kernel should be aligned" = 0x100000
40 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
41 CONFIG_PHYSICAL_ALIGN=0x100000)
42
43 "Device Drivers":
44 "Network device support"
45 "Universal TUN/TAP device driver support" = M/Y
46 (CONFIG_TUN=m)
47 "Virtualization"
48 "Linux hypervisor example code" = M/Y
49 (CONFIG_LGUEST=m)
39 50
40- A tool called "lguest" is available in this directory: type "make" 51- A tool called "lguest" is available in this directory: type "make"
41 to build it. If you didn't build your kernel in-tree, use "make 52 to build it. If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
51 dd if=/dev/zero of=rootfile bs=1M count=2048 62 dd if=/dev/zero of=rootfile bs=1M count=2048
52 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d 63 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
53 64
65 Make sure that you install a getty on /dev/hvc0 if you want to log in on the
66 console!
67
54- "modprobe lg" if you built it as a module. 68- "modprobe lg" if you built it as a module.
55 69
56- Run an lguest as root: 70- Run an lguest as root:
57 71
58 Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba 72 Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
59 73
60 Explanation: 74 Explanation:
61 64m: the amount of memory to use. 75 64: the amount of memory to use, in MB.
62 76
63 vmlinux: the kernel image found in the top of your build directory. You 77 vmlinux: the kernel image found in the top of your build directory. You
64 can also use a standard bzImage. 78 can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
66 --tunnet=192.168.19.1: configures a "tap" device for networking with this 80 --tunnet=192.168.19.1: configures a "tap" device for networking with this
67 IP address. 81 IP address.
68 82
69 --block=rootfile: a file or block device which becomes /dev/lgba 83 --block=rootfile: a file or block device which becomes /dev/vda
70 inside the guest. 84 inside the guest.
71 85
72 root=/dev/lgba: this (and anything else on the command line) are 86 root=/dev/vda: this (and anything else on the command line) are
73 kernel boot parameters. 87 kernel boot parameters.
74 88
75- Configuring networking. I usually have the host masquerade, using 89- Configuring networking. I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
99 "--sharenet=<filename>": any two guests using the same file are on 113 "--sharenet=<filename>": any two guests using the same file are on
100 the same network. This file is created if it does not exist. 114 the same network. This file is created if it does not exist.
101 115
102Lguest I/O model: 116There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
103
104Lguest uses a simplified DMA model plus shared memory for I/O. Guests
105can communicate with each other if they share underlying memory
106(usually by the lguest program mmaping the same file), but they can
107use any non-shared memory to communicate with the lguest process.
108
109Guests can register DMA buffers at any key (must be a valid physical
110address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
111hypercall. "dmabufs" is the physical address of an array of "num"
112"struct lguest_dma": each contains a used_len, and an array of
113physical addresses and lengths. When a transfer occurs, the
114"used_len" field of one of the buffers which has used_len 0 will be
115set to the length transferred and the irq will fire.
116 117
117Using an irq value of 0 unbinds the dma buffers. 118Good luck!
118
119To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
120and the bytes used is written to the used_len field. This can be 0 if
121noone else has bound a DMA buffer to that key or some other error.
122DMA buffers bound by the same guest are ignored.
123
124Cheers!
125Rusty Russell rusty@rustcorp.com.au. 119Rusty Russell rusty@rustcorp.com.au.
126
127[1] These are on various places on the TODO list, waiting for you to
128 get annoyed enough at the limitation to fix it.
129[2] Lguest is not yet tickless when idle. See [1].
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
index 8a523f6af48a..248589e8bcf5 100644
--- a/Documentation/m68k/kernel-options.txt
+++ b/Documentation/m68k/kernel-options.txt
@@ -890,10 +890,7 @@ Syntax: nosync:0
8905.5.2) noasync 8905.5.2) noasync
891-------------- 891--------------
892 892
893Syntax: noasync:0 893[OBSOLETE, REMOVED]
894
895 Disables async and sync negotiation for all devices. Any value
896 after the colon is acceptable (and has the same effect).
897 894
8985.5.3) nodisconnect 8955.5.3) nodisconnect
899------------------- 896-------------------
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
new file mode 100644
index 000000000000..295a71bc301e
--- /dev/null
+++ b/Documentation/markers.txt
@@ -0,0 +1,81 @@
1 Using the Linux Kernel Markers
2
3 Mathieu Desnoyers
4
5
6This document introduces Linux Kernel Markers and their use. It provides
7examples of how to insert markers in the kernel and connect probe functions to
8them and provides some examples of probe functions.
9
10
11* Purpose of markers
12
13A marker placed in code provides a hook to call a function (probe) that you can
14provide at runtime. A marker can be "on" (a probe is connected to it) or "off"
15(no probe is attached). When a marker is "off" it has no effect, except for
16adding a tiny time penalty (checking a condition for a branch) and space
17penalty (adding a few bytes for the function call at the end of the
18instrumented function and adding a data structure in a separate section). When a
19marker is "on", the function you provide is called each time the marker is
20executed, in the execution context of the caller. When the function provided
21ends its execution, it returns to the caller (continuing from the marker site).
22
23You can put markers at important locations in the code. Markers are
24lightweight hooks that can pass an arbitrary number of parameters,
25described in a printk-like format string, to the attached probe function.
26
27They can be used for tracing and performance accounting.
28
29
30* Usage
31
32In order to use the macro trace_mark, you should include linux/marker.h.
33
34#include <linux/marker.h>
35
36And,
37
38trace_mark(subsystem_event, "%d %s", someint, somestring);
39Where :
40- subsystem_event is an identifier unique to your event
41 - subsystem is the name of your subsystem.
42 - event is the name of the event to mark.
43- "%d %s" is the formatted string for the serializer.
44- someint is an integer.
45- somestring is a char pointer.
46
47Connecting a function (probe) to a marker is done by providing a probe (function
48to call) for the specific marker through marker_probe_register() and can be
49activated by calling marker_arm(). Marker deactivation can be done by calling
50marker_disarm() as many times as marker_arm() has been called. Removing a probe
51is done through marker_probe_unregister(); it will disarm the probe and make
52sure there is no caller left using the probe when it returns. Probe removal is
53preempt-safe because preemption is disabled around the probe call. See the
54"Probe example" section below for a sample probe module.
55
56The marker mechanism supports inserting multiple instances of the same marker.
57Markers can be put in inline functions, inlined static functions, and
58unrolled loops as well as regular functions.
59
60The naming scheme "subsystem_event" is suggested here as a convention intended
61to limit collisions. Marker names are global to the kernel: they are considered
62as being the same whether they are in the core kernel image or in modules.
63Conflicting format strings for markers with the same name will cause the markers
64detected to have a different format string not to be armed, and will output
65a printk warning which identifies the inconsistency:
66
67"Format mismatch for probe probe_name (format), marker (format)"
68
69
70* Probe / marker example
71
72See the example provided in samples/markers/src
73
74Compile them with your kernel.
75
76Run, as root :
77modprobe marker-example (insmod order is not important)
78modprobe probe-example
79cat /proc/marker-example (returns an expected error)
80rmmod marker-example probe-example
81dmesg
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 650657c54733..4e17beba2379 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1479,7 +1479,8 @@ kernel.
1479 1479
1480Any atomic operation that modifies some state in memory and returns information 1480Any atomic operation that modifies some state in memory and returns information
1481about the state (old or new) implies an SMP-conditional general memory barrier 1481about the state (old or new) implies an SMP-conditional general memory barrier
1482(smp_mb()) on each side of the actual operation. These include: 1482(smp_mb()) on each side of the actual operation (with the exception of
1483explicit lock operations, described later). These include:
1483 1484
1484 xchg(); 1485 xchg();
1485 cmpxchg(); 1486 cmpxchg();
@@ -1536,10 +1537,19 @@ If they're used for constructing a lock of some description, then they probably
1536do need memory barriers as a lock primitive generally has to do things in a 1537do need memory barriers as a lock primitive generally has to do things in a
1537specific order. 1538specific order.
1538 1539
1539
1540Basically, each usage case has to be carefully considered as to whether memory 1540Basically, each usage case has to be carefully considered as to whether memory
1541barriers are needed or not. 1541barriers are needed or not.
1542 1542
1543The following operations are special locking primitives:
1544
1545 test_and_set_bit_lock();
1546 clear_bit_unlock();
1547 __clear_bit_unlock();
1548
1549These implement LOCK-class and UNLOCK-class operations. These should be used in
1550preference to other operations when implementing locking primitives, because
1551their implementations can be optimised on many architectures.
1552
1543[!] Note that special memory barrier primitives are available for these 1553[!] Note that special memory barrier primitives are available for these
1544situations because on some CPUs the atomic instructions used imply full memory 1554situations because on some CPUs the atomic instructions used imply full memory
1545barriers, and so barrier instructions are superfluous in conjunction with them, 1555barriers, and so barrier instructions are superfluous in conjunction with them,
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 5fbcc22c98e9..168117bd6ee8 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -2,7 +2,8 @@
2Memory Hotplug 2Memory Hotplug
3============== 3==============
4 4
5Last Updated: Jul 28 2007 5Created: Jul 28 2007
6Add description of notifier of memory hotplug Oct 11 2007
6 7
7This document is about memory hotplug including how-to-use and current status. 8This document is about memory hotplug including how-to-use and current status.
8Because Memory Hotplug is still under development, contents of this text will 9Because Memory Hotplug is still under development, contents of this text will
@@ -24,7 +25,8 @@ be changed often.
24 6.1 Memory offline and ZONE_MOVABLE 25 6.1 Memory offline and ZONE_MOVABLE
25 6.2. How to offline memory 26 6.2. How to offline memory
267. Physical memory remove 277. Physical memory remove
278. Future Work List 288. Memory hotplug event notifier
299. Future Work List
28 30
29Note(1): x86_64's has special implementation for memory hotplug. 31Note(1): x86_64's has special implementation for memory hotplug.
30 This text does not describe it. 32 This text does not describe it.
@@ -307,8 +309,58 @@ Need more implementation yet....
307 - Notification completion of remove works by OS to firmware. 309 - Notification completion of remove works by OS to firmware.
308 - Guard from remove if not yet. 310 - Guard from remove if not yet.
309 311
312--------------------------------
3138. Memory hotplug event notifier
314--------------------------------
315Memory hotplug has event notifier. There are 6 types of notification.
316
317MEMORY_GOING_ONLINE
318 Generated before new memory becomes available in order to be able to
319 prepare subsystems to handle memory. The page allocator is still unable
320 to allocate from the new memory.
321
322MEMORY_CANCEL_ONLINE
323 Generated if MEMORY_GOING_ONLINE fails.
324
325MEMORY_ONLINE
326 Generated when memory has been successfully brought online. The callback may
327 allocate pages from the new memory.
328
329MEMORY_GOING_OFFLINE
330 Generated to begin the process of offlining memory. Allocations are no
331 longer possible from the memory but some of the memory to be offlined
332 is still in use. The callback can be used to free memory known to a
333 subsystem from the indicated memory section.
334
335MEMORY_CANCEL_OFFLINE
336 Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
337 the section that we attempted to offline.
338
339MEMORY_OFFLINE
340 Generated after offlining memory is complete.
341
342A callback routine can be registered by
343 hotplug_memory_notifier(callback_func, priority)
344
345The second argument of the callback function (action) is one of the event types above.
346The third argument is a pointer to a struct memory_notify.
347
348struct memory_notify {
349 unsigned long start_pfn;
350 unsigned long nr_pages;
351 int status_change_nid;
352}
353
354start_pfn is start_pfn of online/offline memory.
355nr_pages is # of pages of online/offline memory.
356status_change_nid is set to the node id when N_HIGH_MEMORY of the nodemask is (or
357will be) set/cleared. This means a new (memoryless) node gets new memory by onlining,
358or a node loses all of its memory. If this is -1, then nodemask status is not changed.
359If status_change_nid >= 0, the callback should create/discard structures for the
360node if necessary.
361
310-------------- 362--------------
3118. Future Work 3639. Future Work
312-------------- 364--------------
313 - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like 365 - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
314 sysctl or new control file. 366 sysctl or new control file.
diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX
index 9df8a2eac7b4..3f13bf8043d2 100644
--- a/Documentation/mips/00-INDEX
+++ b/Documentation/mips/00-INDEX
@@ -4,5 +4,3 @@ AU1xxx_IDE.README
4 - README for MIPS AU1XXX IDE driver. 4 - README for MIPS AU1XXX IDE driver.
5GT64120.README 5GT64120.README
6 - README for dir with info on MIPS boards using GT-64120 or GT-64120A. 6 - README for dir with info on MIPS boards using GT-64120 or GT-64120A.
7time.README
8 - README for MIPS time services.
diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README
index afb31c141d9d..5c8334123f4f 100644
--- a/Documentation/mips/AU1xxx_IDE.README
+++ b/Documentation/mips/AU1xxx_IDE.README
@@ -59,7 +59,7 @@ Four configs variables are introduced:
59 CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode 59 CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode
60 CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode 60 CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode
61 CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA 61 CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA
62 controler 62 controller
63 CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size 63 CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size
64 per descriptor 64 per descriptor
65 65
diff --git a/Documentation/mips/time.README b/Documentation/mips/time.README
deleted file mode 100644
index a4ce603ed3b3..000000000000
--- a/Documentation/mips/time.README
+++ /dev/null
@@ -1,173 +0,0 @@
1README for MIPS time services
2
3Jun Sun
4jsun@mvista.com or jsun@junsun.net
5
6
7ABOUT
8-----
9This file describes the new arch/mips/kernel/time.c, related files and the
10services they provide.
11
12If you are short in patience and just want to know how to use time.c for a
13new board or convert an existing board, go to the last section.
14
15
16FILES, COMPATABILITY AND CONFIGS
17---------------------------------
18
19The old arch/mips/kernel/time.c is renamed to old-time.c.
20
21A new time.c is put there, together with include/asm-mips/time.h.
22
23Two configs variables are introduced, CONFIG_OLD_TIME_C and CONFIG_NEW_TIME_C.
24So we allow boards using
25
26 1) old time.c (CONFIG_OLD_TIME_C)
27 2) new time.c (CONFIG_NEW_TIME_C)
28 3) neither (their own private time.c)
29
30However, it is expected every board will move to the new time.c in the near
31future.
32
33
34WHAT THE NEW CODE PROVIDES?
35---------------------------
36
37The new time code provide the following services:
38
39 a) Implements functions required by Linux common code:
40 time_init
41
42 b) provides an abstraction of RTC and null RTC implementation as default.
43 extern unsigned long (*rtc_get_time)(void);
44 extern int (*rtc_set_time)(unsigned long);
45
46 c) high-level and low-level timer interrupt routines where the timer
47 interrupt source may or may not be the CPU timer. The high-level
48 routine is dispatched through do_IRQ() while the low-level is
49 dispatched in assemably code (usually int-handler.S)
50
51
52WHAT THE NEW CODE REQUIRES?
53---------------------------
54
55For the new code to work properly, each board implementation needs to supply
56the following functions or values:
57
58 a) board_time_init - a function pointer. Invoked at the beginnig of
59 time_init(). It is optional.
60 1. (optional) set up RTC routines
61 2. (optional) calibrate and set the mips_hpt_frequency
62
63 b) plat_timer_setup - a function pointer. Invoked at the end of time_init()
64 1. (optional) over-ride any decisions made in time_init()
65 2. set up the irqaction for timer interrupt.
66 3. enable the timer interrupt
67
68 c) (optional) board-specific RTC routines.
69
70 d) (optional) mips_hpt_frequency - It must be definied if the board
71 is using CPU counter for timer interrupt.
72
73
74PORTING GUIDE
75-------------
76
77Step 1: decide how you like to implement the time services.
78
79 a) does this board have a RTC? If yes, implement the two RTC funcs.
80
81 b) does the CPU have counter/compare registers?
82
83 If the answer is no, you need a timer to provide the timer interrupt
84 at 100 HZ speed.
85
86 c) The following sub steps assume your CPU has counter register.
87 Do you plan to use the CPU counter register as the timer interrupt
88 or use an exnternal timer?
89
90 In order to use CPU counter register as the timer interrupt source, you
91 must know the counter speed (mips_hpt_frequency). It is usually the
92 same as the CPU speed or an integral divisor of it.
93
94 d) decide on whether you want to use high-level or low-level timer
95 interrupt routines. The low-level one is presumably faster, but should
96 not make too mcuh difference.
97
98
99Step 2: the machine setup() function
100
101 If you supply board_time_init(), set the function poointer.
102
103
104Step 3: implement rtc routines, board_time_init() and plat_timer_setup()
105 if needed.
106
107 board_time_init() -
108 a) (optional) set up RTC routines,
109 b) (optional) calibrate and set the mips_hpt_frequency
110 (only needed if you intended to use cpu counter as timer interrupt
111 source)
112
113 plat_timer_setup() -
114 a) (optional) over-write any choices made above by time_init().
115 b) machine specific code should setup the timer irqaction.
116 c) enable the timer interrupt
117
118
119 If the RTC chip is a common chip, I suggest the routines are put under
120 arch/mips/libs. For example, for DS1386 chip, one would create
121 rtc-ds1386.c under arch/mips/lib directory. Add the following line to
122 the arch/mips/lib/Makefile:
123
124 obj-$(CONFIG_DDB5476) += rtc-ds1386.o
125
126Step 4: if you are using low-level timer interrupt, change your interrupt
127 dispathcing code to check for timer interrupt and jump to
128 ll_timer_interrupt() directly if one is detected.
129
130Step 5: Modify arch/mips/config.in and add CONFIG_NEW_TIME_C to your machine.
131 Modify the appropriate defconfig if applicable.
132
133Final notes:
134
135For some tricky cases, you may need to add your own wrapper functions
136for some of the functions in time.c.
137
138For example, you may define your own timer interrupt routine, which does
139some of its own processing and then calls timer_interrupt().
140
141You can also over-ride any of the built-in functions (RTC routines
142and/or timer interrupt routine).
143
144
145PORTING NOTES FOR SMP
146----------------------
147
148If you have a SMP box, things are slightly more complicated.
149
150The time service running every jiffy is logically divided into two parts:
151
152 1) the one for the whole system (defined in timer_interrupt())
153 2) the one that should run for each CPU (defined in local_timer_interrupt())
154
155You need to decide on your timer interrupt sources.
156
157 case 1) - whole system has only one timer interrupt delivered to one CPU
158
159 In this case, you set up timer interrupt as in UP systems. In addtion,
160 you need to set emulate_local_timer_interrupt to 1 so that other
161 CPUs get to call local_timer_interrupt().
162
163 THIS IS CURRENTLY NOT IMPLEMNETED. However, it is rather easy to write
164 one should such a need arise. You simply make a IPI call.
165
166 case 2) - each CPU has a separate timer interrupt
167
168 In this case, you need to set up IRQ such that each of them will
169 call local_timer_interrupt(). In addition, you need to arrange
170 one and only one of them to call timer_interrupt().
171
172 You can also do the low-level version of those interrupt routines,
173 following similar dispatching routes described above.
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
index 51f935191ae5..aa60d1f627e5 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/mutex-design.txt
@@ -133,4 +133,6 @@ the APIs of 'struct mutex' have been streamlined:
133 int mutex_trylock(struct mutex *lock); 133 int mutex_trylock(struct mutex *lock);
134 void mutex_unlock(struct mutex *lock); 134 void mutex_unlock(struct mutex *lock);
135 int mutex_is_locked(struct mutex *lock); 135 int mutex_is_locked(struct mutex *lock);
136 136 void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
137 int mutex_lock_interruptible_nested(struct mutex *lock,
138 unsigned int subclass);
diff --git a/Documentation/networking/bcm43xx.txt b/Documentation/networking/bcm43xx.txt
index a136721499bf..d602c8d6ff3e 100644
--- a/Documentation/networking/bcm43xx.txt
+++ b/Documentation/networking/bcm43xx.txt
@@ -37,7 +37,7 @@ all, distributions. There is, however, additional software that is
37required. The firmware used by the chip is the intellectual property 37required. The firmware used by the chip is the intellectual property
38of Broadcom and they have not given the bcm43xx team redistribution 38of Broadcom and they have not given the bcm43xx team redistribution
39rights to this firmware. Since we cannot legally redistribute 39rights to this firmware. Since we cannot legally redistribute
40the firwmare we cannot include it with the driver. Furthermore, it 40the firmware we cannot include it with the driver. Furthermore, it
41cannot be placed in the downloadable archives of any distributing 41cannot be placed in the downloadable archives of any distributing
42organization; therefore, the user is responsible for obtaining the 42organization; therefore, the user is responsible for obtaining the
43firmware and placing it in the appropriate location so that the driver 43firmware and placing it in the appropriate location so that the driver
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 6ae2feff3087..747a5d15d529 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -293,7 +293,7 @@ tcp_no_metrics_save - BOOLEAN
293 when the connection closes, so that connections established in the 293 when the connection closes, so that connections established in the
294 near future can use these to set initial conditions. Usually, this 294 near future can use these to set initial conditions. Usually, this
295 increases overall performance, but may sometimes cause performance 295 increases overall performance, but may sometimes cause performance
296 degredation. If set, TCP will not cache metrics on closing 296 degradation. If set, TCP will not cache metrics on closing
297 connections. 297 connections.
298 298
299tcp_orphan_retries - INTEGER 299tcp_orphan_retries - INTEGER
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
index c36b64b0020f..c3669a3fb4af 100644
--- a/Documentation/networking/rxrpc.txt
+++ b/Documentation/networking/rxrpc.txt
@@ -689,7 +689,7 @@ such as the AFS filesystem. This permits such a utility to:
689 buffers manipulated directly. 689 buffers manipulated directly.
690 690
691To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, 691To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket,
692bind an addess as appropriate and listen if it's to be a server socket, but 692bind an address as appropriate and listen if it's to be a server socket, but
693then it passes this to the kernel interface functions. 693then it passes this to the kernel interface functions.
694 694
695The kernel interface functions are as follows: 695The kernel interface functions are as follows:
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt
index 6be09ba24a36..b6409cab075c 100644
--- a/Documentation/networking/udplite.txt
+++ b/Documentation/networking/udplite.txt
@@ -12,7 +12,7 @@
12 For in-depth information, you can consult: 12 For in-depth information, you can consult:
13 13
14 o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ 14 o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/
15 Fom here you can also download some example application source code. 15 From here you can also download some example application source code.
16 16
17 o The UDP-Lite HOWTO on 17 o The UDP-Lite HOWTO on
18 http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt 18 http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt
@@ -223,7 +223,7 @@
223 While it is important that such cases are dealt with correctly, they 223 While it is important that such cases are dealt with correctly, they
224 are (annoyingly) rare: UDP-Lite is designed for optimising multimedia 224 are (annoyingly) rare: UDP-Lite is designed for optimising multimedia
225 performance over wireless (or generally noisy) links and thus smaller 225 performance over wireless (or generally noisy) links and thus smaller
226 coverage lenghts are likely to be expected. 226 coverage lengths are likely to be expected.
227 227
228 228
229 V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING 229 V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING
@@ -259,7 +259,7 @@
259 VI) IPTABLES 259 VI) IPTABLES
260 260
261 There is packet match support for UDP-Lite as well as support for the LOG target. 261 There is packet match support for UDP-Lite as well as support for the LOG target.
262 If you copy and paste the following line into /etc/protcols, 262 If you copy and paste the following line into /etc/protocols,
263 263
264 udplite 136 UDP-Lite # UDP-Lite [RFC 3828] 264 udplite 136 UDP-Lite # UDP-Lite [RFC 3828]
265 265
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt
index 8f2302415eff..265fcdcb8e5f 100644
--- a/Documentation/parport-lowlevel.txt
+++ b/Documentation/parport-lowlevel.txt
@@ -25,7 +25,6 @@ Global functions:
25 parport_open 25 parport_open
26 parport_close 26 parport_close
27 parport_device_id 27 parport_device_id
28 parport_device_num
29 parport_device_coords 28 parport_device_coords
30 parport_find_class 29 parport_find_class
31 parport_find_device 30 parport_find_device
@@ -735,7 +734,7 @@ NULL is returned.
735 734
736SEE ALSO 735SEE ALSO
737 736
738parport_register_device, parport_device_num 737parport_register_device
739 738
740parport_close - unregister device for particular device number 739parport_close - unregister device for particular device number
741------------- 740-------------
@@ -787,29 +786,7 @@ Many devices have ill-formed IEEE 1284 Device IDs.
787 786
788SEE ALSO 787SEE ALSO
789 788
790parport_find_class, parport_find_device, parport_device_num 789parport_find_class, parport_find_device
791
792parport_device_num - convert device coordinates to device number
793------------------
794
795SYNOPSIS
796
797#include <linux/parport.h>
798
799int parport_device_num (int parport, int mux, int daisy);
800
801DESCRIPTION
802
803Convert between device coordinates (port, multiplexor, daisy chain
804address) and device number (zero-based).
805
806RETURN VALUE
807
808Device number, or -1 if no device at given coordinates.
809
810SEE ALSO
811
812parport_device_coords, parport_open, parport_device_id
813 790
814parport_device_coords - convert device number to device coordinates 791parport_device_coords - convert device number to device coordinates
815------------------ 792------------------
@@ -833,7 +810,7 @@ Zero on success, in which case the coordinates are (*parport, *mux,
833 810
834SEE ALSO 811SEE ALSO
835 812
836parport_device_num, parport_open, parport_device_id 813parport_open, parport_device_id
837 814
838parport_find_class - find a device by its class 815parport_find_class - find a device by its class
839------------------ 816------------------
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
index 1a85e2b964dc..57aef2f6e0de 100644
--- a/Documentation/power/basic-pm-debugging.txt
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -78,8 +78,8 @@ c) Advanced debugging
78In case the STD does not work on your system even in the minimal configuration 78In case the STD does not work on your system even in the minimal configuration
79and compiling more drivers as modules is not practical or some modules cannot 79and compiling more drivers as modules is not practical or some modules cannot
80be unloaded, you can use one of the more advanced debugging techniques to find 80be unloaded, you can use one of the more advanced debugging techniques to find
81the problem. First, if there is a serial port in your box, you can set the 81the problem. First, if there is a serial port in your box, you can boot the
82CONFIG_DISABLE_CONSOLE_SUSPEND kernel configuration option and try to log kernel 82kernel with the 'no_console_suspend' parameter and try to log kernel
83messages using the serial console. This may provide you with some information 83messages using the serial console. This may provide you with some information
84about the reasons of the suspend (resume) failure. Alternatively, it may be 84about the reasons of the suspend (resume) failure. Alternatively, it may be
85possible to use a FireWire port for debugging with firescope 85possible to use a FireWire port for debugging with firescope
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
index 04dc1cf9d215..38b57248fd61 100644
--- a/Documentation/power/freezing-of-tasks.txt
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -19,12 +19,13 @@ we only consider hibernation, but the description also applies to suspend).
19Namely, as the first step of the hibernation procedure the function 19Namely, as the first step of the hibernation procedure the function
20freeze_processes() (defined in kernel/power/process.c) is called. It executes 20freeze_processes() (defined in kernel/power/process.c) is called. It executes
21try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and 21try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
22sends a fake signal to each of them. A task that receives such a signal and has 22either wakes them up, if they are kernel threads, or sends fake signals to them,
23TIF_FREEZE set, should react to it by calling the refrigerator() function 23if they are user space processes. A task that has TIF_FREEZE set, should react
24(defined in kernel/power/process.c), which sets the task's PF_FROZEN flag, 24to it by calling the function called refrigerator() (defined in
25changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is 25kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state
26cleared for it. Then, we say that the task is 'frozen' and therefore the set of 26to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it.
27functions handling this mechanism is called 'the freezer' (these functions are 27Then, we say that the task is 'frozen' and therefore the set of functions
28handling this mechanism is referred to as 'the freezer' (these functions are
28defined in kernel/power/process.c and include/linux/freezer.h). User space 29defined in kernel/power/process.c and include/linux/freezer.h). User space
29processes are generally frozen before kernel threads. 30processes are generally frozen before kernel threads.
30 31
@@ -35,21 +36,27 @@ task enter refrigerator() if the flag is set.
35 36
36For user space processes try_to_freeze() is called automatically from the 37For user space processes try_to_freeze() is called automatically from the
37signal-handling code, but the freezable kernel threads need to call it 38signal-handling code, but the freezable kernel threads need to call it
38explicitly in suitable places. The code to do this may look like the following: 39explicitly in suitable places or use the wait_event_freezable() or
40wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
41that combine interruptible sleep with checking if TIF_FREEZE is set and calling
42try_to_freeze(). The main loop of a freezable kernel thread may look like the
43following one:
39 44
45 set_freezable();
40 do { 46 do {
41 hub_events(); 47 hub_events();
42 wait_event_interruptible(khubd_wait, 48 wait_event_freezable(khubd_wait,
43 !list_empty(&hub_event_list)); 49 !list_empty(&hub_event_list) ||
44 try_to_freeze(); 50 kthread_should_stop());
45 } while (!signal_pending(current)); 51 } while (!kthread_should_stop() || !list_empty(&hub_event_list));
46 52
47(from drivers/usb/core/hub.c::hub_thread()). 53(from drivers/usb/core/hub.c::hub_thread()).
48 54
49If a freezable kernel thread fails to call try_to_freeze() after the freezer has 55If a freezable kernel thread fails to call try_to_freeze() after the freezer has
50set TIF_FREEZE for it, the freezing of tasks will fail and the entire 56set TIF_FREEZE for it, the freezing of tasks will fail and the entire
51hibernation operation will be cancelled. For this reason, freezable kernel 57hibernation operation will be cancelled. For this reason, freezable kernel
52threads must call try_to_freeze() somewhere. 58threads must call try_to_freeze() somewhere or use one of the
59wait_event_freezable() and wait_event_freezable_timeout() macros.
53 60
54After the system memory state has been restored from a hibernation image and 61After the system memory state has been restored from a hibernation image and
55devices have been reinitialized, the function thaw_processes() is called in 62devices have been reinitialized, the function thaw_processes() is called in
@@ -81,7 +88,16 @@ hibernation image has been created and before the system is finally powered off.
81The majority of these are user space processes, but if any of the kernel threads 88The majority of these are user space processes, but if any of the kernel threads
82may cause something like this to happen, they have to be freezable. 89may cause something like this to happen, they have to be freezable.
83 90
842. The second reason is to prevent user space processes and some kernel threads 912. Next, to create the hibernation image we need to free a sufficient amount of
92memory (approximately 50% of available RAM) and we need to do that before
93devices are deactivated, because we generally need them for swapping out. Then,
94after the memory for the image has been freed, we don't want tasks to allocate
95additional memory and we prevent them from doing that by freezing them earlier.
96[Of course, this also means that device drivers should not allocate substantial
97amounts of memory from their .suspend() callbacks before hibernation, but this
98is a separate issue.]
99
1003. The third reason is to prevent user space processes and some kernel threads
85from interfering with the suspending and resuming of devices. A user space 101from interfering with the suspending and resuming of devices. A user space
86process running on a second CPU while we are suspending devices may, for 102process running on a second CPU while we are suspending devices may, for
87example, be troublesome and without the freezing of tasks we would need some 103example, be troublesome and without the freezing of tasks we would need some
@@ -111,7 +127,7 @@ frozen before the driver's .suspend() callback is executed and it will be
111thawed after the driver's .resume() callback has run, so it won't be accessing 127thawed after the driver's .resume() callback has run, so it won't be accessing
112the device while it's suspended. 128the device while it's suspended.
113 129
1143. Another reason for freezing tasks is to prevent user space processes from 1304. Another reason for freezing tasks is to prevent user space processes from
115realizing that hibernation (or suspend) operation takes place. Ideally, user 131realizing that hibernation (or suspend) operation takes place. Ideally, user
116space processes should not notice that such a system-wide operation has occurred 132space processes should not notice that such a system-wide operation has occurred
117and should continue running without any problems after the restore (or resume 133and should continue running without any problems after the restore (or resume
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index fd5192a8fa8a..e67211fe0ee2 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -20,7 +20,7 @@ states.
20/sys/power/disk controls the operating mode of the suspend-to-disk 20/sys/power/disk controls the operating mode of the suspend-to-disk
21mechanism. Suspend-to-disk can be handled in several ways. We have a 21mechanism. Suspend-to-disk can be handled in several ways. We have a
22few options for putting the system to sleep - using the platform driver 22few options for putting the system to sleep - using the platform driver
23(e.g. ACPI or other pm_ops), powering off the system or rebooting the 23(e.g. ACPI or other suspend_ops), powering off the system or rebooting the
24system (for testing). 24system (for testing).
25 25
26Additionally, /sys/power/disk can be used to turn on one of the two testing 26Additionally, /sys/power/disk can be used to turn on one of the two testing
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
index 06f911a5f885..f281886de490 100644
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ b/Documentation/power/swsusp-and-swap-files.txt
@@ -39,7 +39,7 @@ resume=<swap_file_partition> resume_offset=<swap_file_offset>
39where <swap_file_partition> is the partition on which the swap file is located 39where <swap_file_partition> is the partition on which the swap file is located
40and <swap_file_offset> is the offset of the swap header determined by the 40and <swap_file_offset> is the offset of the swap header determined by the
41application in 2) (of course, this step may be carried out automatically 41application in 2) (of course, this step may be carried out automatically
42by the same application that determies the swap file's header offset using the 42by the same application that determines the swap file's header offset using the
43FIBMAP ioctl) 43FIBMAP ioctl)
44 44
45OR 45OR
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
index 4530d1bf0286..df7afe43d462 100644
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -36,8 +36,8 @@ Causes of EEH Errors
36EEH was originally designed to guard against hardware failure, such 36EEH was originally designed to guard against hardware failure, such
37as PCI cards dying from heat, humidity, dust, vibration and bad 37as PCI cards dying from heat, humidity, dust, vibration and bad
38electrical connections. The vast majority of EEH errors seen in 38electrical connections. The vast majority of EEH errors seen in
39"real life" are due to eithr poorly seated PCI cards, or, 39"real life" are due to either poorly seated PCI cards, or,
40unfortunately quite commonly, due device driver bugs, device firmware 40unfortunately quite commonly, due to device driver bugs, device firmware
41bugs, and sometimes PCI card hardware bugs. 41bugs, and sometimes PCI card hardware bugs.
42 42
43The most common software bug, is one that causes the device to 43The most common software bug, is one that causes the device to
diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
index e59fcbbe338c..5e03610e186f 100644
--- a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
+++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
@@ -17,12 +17,12 @@ passed by the boot loader to the kernel at boot time. The device tree
17describes what devices are present on the board and how they are 17describes what devices are present on the board and how they are
18connected. The device tree can either be passed as a binary blob (as 18connected. The device tree can either be passed as a binary blob (as
19described in Documentation/powerpc/booting-without-of.txt), or passed 19described in Documentation/powerpc/booting-without-of.txt), or passed
20by Open Firmare (IEEE 1275) compatible firmware using an OF compatible 20by Open Firmware (IEEE 1275) compatible firmware using an OF compatible
21client interface API. 21client interface API.
22 22
23This document specifies the requirements on the device-tree for mpc5200 23This document specifies the requirements on the device-tree for mpc5200
24based boards. These requirements are above and beyond the details 24based boards. These requirements are above and beyond the details
25specified in either the OpenFirmware spec or booting-without-of.txt 25specified in either the Open Firmware spec or booting-without-of.txt
26 26
27All new mpc5200-based boards are expected to match this document. In 27All new mpc5200-based boards are expected to match this document. In
28cases where this document is not sufficient to support a new board port, 28cases where this document is not sufficient to support a new board port,
@@ -73,8 +73,8 @@ match on the compatible list; the 'most compatible' driver should be
73selected. 73selected.
74 74
75The split between the MPC5200 and the MPC5200B leaves a bit of a 75The split between the MPC5200 and the MPC5200B leaves a bit of a
76connundrum. How should the compatible property be set up to provide 76conundrum. How should the compatible property be set up to provide
77maximum compatability information; but still acurately describe the 77maximum compatibility information; but still accurately describe the
78chip? For the MPC5200; the answer is easy. Most of the SoC devices 78chip? For the MPC5200; the answer is easy. Most of the SoC devices
79originally appeared on the MPC5200. Since they didn't exist anywhere 79originally appeared on the MPC5200. Since they didn't exist anywhere
80else; the 5200 compatible properties will contain only one item; 80else; the 5200 compatible properties will contain only one item;
@@ -84,7 +84,7 @@ The 5200B is almost the same as the 5200, but not quite. It fixes
84silicon bugs and it adds a small number of enhancements. Most of the 84silicon bugs and it adds a small number of enhancements. Most of the
85devices either provide exactly the same interface as on the 5200. A few 85devices either provide exactly the same interface as on the 5200. A few
86devices have extra functions but still have a backwards compatible mode. 86devices have extra functions but still have a backwards compatible mode.
87To express this infomation as completely as possible, 5200B device trees 87To express this information as completely as possible, 5200B device trees
88should have two items in the compatible list; 88should have two items in the compatible list;
89"mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended 89"mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended
90that 5200B device trees follow this convention (instead of only listing 90that 5200B device trees follow this convention (instead of only listing
@@ -185,7 +185,7 @@ bestcomm@<addr> dma-controller mpc5200-bestcomm 5200 pic also requires
185Recommended soc5200 child nodes; populate as needed for your board 185Recommended soc5200 child nodes; populate as needed for your board
186name device_type compatible Description 186name device_type compatible Description
187---- ----------- ---------- ----------- 187---- ----------- ---------- -----------
188gpt@<addr> gpt mpc5200-gpt General purpose timers 188gpt@<addr> gpt fsl,mpc5200-gpt General purpose timers
189rtc@<addr> rtc mpc5200-rtc Real time clock 189rtc@<addr> rtc mpc5200-rtc Real time clock
190mscan@<addr> mscan mpc5200-mscan CAN bus controller 190mscan@<addr> mscan mpc5200-mscan CAN bus controller
191pci@<addr> pci mpc5200-pci PCI bridge 191pci@<addr> pci mpc5200-pci PCI bridge
@@ -199,7 +199,7 @@ ethernet@<addr> network mpc5200-fec MPC5200 ethernet device
199ata@<addr> ata mpc5200-ata IDE ATA interface 199ata@<addr> ata mpc5200-ata IDE ATA interface
200i2c@<addr> i2c mpc5200-i2c I2C controller 200i2c@<addr> i2c mpc5200-i2c I2C controller
201usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller 201usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller
202xlb@<addr> xlb mpc5200-xlb XLB arbritrator 202xlb@<addr> xlb mpc5200-xlb XLB arbitrator
203 203
204Important child node properties 204Important child node properties
205name type description 205name type description
@@ -213,7 +213,7 @@ cell-index int When multiple devices are present, is the
2135) General Purpose Timer nodes (child of soc5200 node) 2135) General Purpose Timer nodes (child of soc5200 node)
214On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board 214On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board
215design supports the internal wdt, then the device node for GPT0 should 215design supports the internal wdt, then the device node for GPT0 should
216include the empty property 'has-wdt'. 216include the empty property 'fsl,has-wdt'.
217 217
2186) PSC nodes (child of soc5200 node) 2186) PSC nodes (child of soc5200 node)
219PSC nodes can define the optional 'port-number' property to force assignment 219PSC nodes can define the optional 'port-number' property to force assignment
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt
index 6aa9a891f3d0..683ccae00ad4 100644
--- a/Documentation/scsi/aic79xx.txt
+++ b/Documentation/scsi/aic79xx.txt
@@ -120,7 +120,7 @@ The following information is available in this file:
120 list size to avoid SCSI malloc pool fragmentation. 120 list size to avoid SCSI malloc pool fragmentation.
121 - Cleanup channel display in our /proc output. 121 - Cleanup channel display in our /proc output.
122 - Workaround duplicate device entries in the mid-layer 122 - Workaround duplicate device entries in the mid-layer
123 devlice list during add-single-device. 123 device list during add-single-device.
124 124
125 1.3.6 (March 28th, 2003) 125 1.3.6 (March 28th, 2003)
126 - Correct a double free in the Domain Validation code. 126 - Correct a double free in the Domain Validation code.
diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt
index 5f34d2ba69b4..b7e238cbb5a7 100644
--- a/Documentation/scsi/aic7xxx.txt
+++ b/Documentation/scsi/aic7xxx.txt
@@ -159,7 +159,7 @@ The following information is available in this file:
159 - Add support for 2.5.X's scsi_report_device_reset(). 159 - Add support for 2.5.X's scsi_report_device_reset().
160 160
161 6.2.34 (May 5th, 2003) 161 6.2.34 (May 5th, 2003)
162 - Fix locking regression instroduced in 6.2.29 that 162 - Fix locking regression introduced in 6.2.29 that
163 could cause a lock order reversal between the io_request_lock 163 could cause a lock order reversal between the io_request_lock
164 and our per-softc lock. This was only possible on RH9, 164 and our per-softc lock. This was only possible on RH9,
165 SuSE, and kernel.org 2.4.X kernels. 165 SuSE, and kernel.org 2.4.X kernels.
@@ -264,7 +264,7 @@ The following information is available in this file:
264 Option: tag_info:{{value[,value...]}[,{value[,value...]}...]} 264 Option: tag_info:{{value[,value...]}[,{value[,value...]}...]}
265 Definition: Set the per-target tagged queue depth on a 265 Definition: Set the per-target tagged queue depth on a
266 per controller basis. Both controllers and targets 266 per controller basis. Both controllers and targets
267 may be ommitted indicating that they should retain 267 may be omitted indicating that they should retain
268 the default tag depth. 268 the default tag depth.
269 Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32} 269 Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32}
270 On Controller 0 270 On Controller 0
@@ -290,7 +290,7 @@ The following information is available in this file:
290 ----------------------------------------------------------------- 290 -----------------------------------------------------------------
291 Option: dv: {value[,value...]} 291 Option: dv: {value[,value...]}
292 Definition: Set Domain Validation Policy on a per-controller basis. 292 Definition: Set Domain Validation Policy on a per-controller basis.
293 Controllers may be ommitted indicating that 293 Controllers may be omitted indicating that
294 they should retain the default read streaming setting. 294 they should retain the default read streaming setting.
295 Example: dv:{-1,0,,1,1,0} 295 Example: dv:{-1,0,,1,1,0}
296 On Controller 0 leave DV at its default setting. 296 On Controller 0 leave DV at its default setting.
diff --git a/Documentation/scsi/arcmsr_spec.txt b/Documentation/scsi/arcmsr_spec.txt
index 5e0042340fd3..45d9482c1517 100644
--- a/Documentation/scsi/arcmsr_spec.txt
+++ b/Documentation/scsi/arcmsr_spec.txt
@@ -3,7 +3,7 @@
3******************************************************************************* 3*******************************************************************************
4** Usage of IOP331 adapter 4** Usage of IOP331 adapter
5** (All In/Out is in IOP331's view) 5** (All In/Out is in IOP331's view)
6** 1. Message 0 --> InitThread message and retrun code 6** 1. Message 0 --> InitThread message and return code
7** 2. Doorbell is used for RS-232 emulation 7** 2. Doorbell is used for RS-232 emulation
8** inDoorBell : bit0 -- data in ready 8** inDoorBell : bit0 -- data in ready
9** (DRIVER DATA WRITE OK) 9** (DRIVER DATA WRITE OK)
diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt
index a08e225653d6..a810421f1fb3 100644
--- a/Documentation/scsi/ibmmca.txt
+++ b/Documentation/scsi/ibmmca.txt
@@ -21,7 +21,7 @@
21 versions older than 4.0 do not work with kernels 2.4.0 or later! If you 21 versions older than 4.0 do not work with kernels 2.4.0 or later! If you
22 try to compile your kernel with the wrong driver source, the 22 try to compile your kernel with the wrong driver source, the
23 compilation is aborted and you get a corresponding error message. This is 23 compilation is aborted and you get a corresponding error message. This is
24 no bug in the driver. It prevents you from using the wrong sourcecode 24 no bug in the driver; it prevents you from using the wrong source code
25 with the wrong kernel version. 25 with the wrong kernel version.
26 26
27 Authors of this Driver 27 Authors of this Driver
@@ -58,7 +58,7 @@
58 5 Users' Manual 58 5 Users' Manual
59 5.1 Commandline Parameters 59 5.1 Commandline Parameters
60 5.2 Troubleshooting 60 5.2 Troubleshooting
61 5.3 Bugreports 61 5.3 Bug reports
62 5.4 Support WWW-page 62 5.4 Support WWW-page
63 6 References 63 6 References
64 7 Credits to 64 7 Credits to
@@ -71,13 +71,13 @@
71 71
72 1 Abstract 72 1 Abstract
73 ---------- 73 ----------
74 This README-file describes the IBM SCSI-subsystem low level driver for 74 This README-file describes the IBM SCSI-subsystem low level driver for
75 Linux. The descriptions which were formerly kept in the source-code have 75 Linux. The descriptions which were formerly kept in the source code have
76 been moved into this file to improve the code's readability. The driver
77 description has been updated, as most of the former description was already 77 description has been updated, as most of the former description was already
78 quite outdated. The history of the driver development is also kept inside 78 quite outdated. The history of the driver development is also kept inside
79 here. Multiple historical developments have been summarized to shorten the 79 here. Multiple historical developments have been summarized to shorten the
80 textsize a bit. At the end of this file you can find a small manual for 80 text size a bit. At the end of this file you can find a small manual for
81 this driver and hints to get it running on your machine. 81 this driver and hints to get it running on your machine.
82 82
83 2 Driver Description 83 2 Driver Description
@@ -186,7 +186,7 @@
186 between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two 186 between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two
187 busses and provides support for 30 logical devices at the same time, where 187 busses and provides support for 30 logical devices at the same time, where
188 in wide-addressing mode you can have 16 puns with 32 luns on each device. 188 in wide-addressing mode you can have 16 puns with 32 luns on each device.
189 This section dexribes you the handling of devices on non-F/W adapters. 189 This section describes the handling of devices on non-F/W adapters.
190 Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter 190 Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter
191 which means a lot of possible devices for such a small machine. 191 which means a lot of possible devices for such a small machine.
192 192
@@ -209,10 +209,10 @@
209 -------------------------------------------------------- 209 --------------------------------------------------------
210 One consequence of information hiding is that the real (pun,lun) 210 One consequence of information hiding is that the real (pun,lun)
211 numbers are also hidden. The two possibilities to get around this problem 211 numbers are also hidden. The two possibilities to get around this problem
212 is to offer fake pun/lun combinations to the operating system or to 212 are to offer fake pun/lun combinations to the operating system or to
213 delete the whole mapping of the adapter and to reassign the ldns, using 213 delete the whole mapping of the adapter and to reassign the ldns, using
214 the immediate assign command of the SCSI-subsystem for probing through 214 the immediate assign command of the SCSI-subsystem for probing through
215 all possible pun/lun combinations. a ldn is a "logical device number" 215 all possible pun/lun combinations. An ldn is a "logical device number"
216 which is used by IBM SCSI-subsystems to access some valid SCSI-device. 216 which is used by IBM SCSI-subsystems to access some valid SCSI-device.
217 At the beginning of the development of this driver, the following approach 217 At the beginning of the development of this driver, the following approach
218 was used: 218 was used:
@@ -251,9 +251,9 @@
251 lun>0 or to non-existing devices, in order to satisfy the subsystem, if 251 lun>0 or to non-existing devices, in order to satisfy the subsystem, if
252 there are less than 15 SCSI-devices connected. In the case of more than 15 252 there are less than 15 SCSI-devices connected. In the case of more than 15
253 devices, the dynamical mapping goes active. If the get_scsi[][] reports a 253 devices, the dynamical mapping goes active. If the get_scsi[][] reports a
254 device to be existant, but it has no ldn assigned, it gets a ldn out of 7 254 device to be existent, but it has no ldn assigned, it gets an ldn out of 7
255 to 14. The numbers are assigned in cyclic order. Therefore it takes 8 255 to 14. The numbers are assigned in cyclic order, therefore it takes 8
256 dynamical reassignments on the SCSI-devices, until a certain device 256 dynamical reassignments on the SCSI-devices until a certain device
257 loses its ldn again. This assures that dynamical remapping is avoided 257 loses its ldn again. This assures that dynamical remapping is avoided
258 during intense I/O between up to 15 SCSI-devices (means pun,lun 258 during intense I/O between up to 15 SCSI-devices (means pun,lun
259 combinations). A further advantage of this method is that people who 259 combinations). A further advantage of this method is that people who
@@ -551,7 +551,7 @@
551 than devices are available, they are assigned to non existing pun,lun 551 than devices are available, they are assigned to non existing pun,lun
552 combinations to satisfy the adapter. With this, the dynamical mapping 552 combinations to satisfy the adapter. With this, the dynamical mapping
553 was possible to implement. (For further info see the text in the 553 was possible to implement. (For further info see the text in the
554 source-code and in the description below. Read the description 554 source code and in the description below. Read the description
555 below BEFORE installing this driver on your system!) 555 below BEFORE installing this driver on your system!)
556 2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION. 556 2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION.
557 3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID 557 3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID
@@ -762,9 +762,9 @@
762 - Michael Lang 762 - Michael Lang
763 763
764 Apr 23, 2000 (v3.2pre1) 764 Apr 23, 2000 (v3.2pre1)
765 1) During a very long time, I collected a huge amount of bugreports from 765 1) During a very long time, I collected a huge amount of bug reports from
766 various people, trying really quite different things on their SCSI- 766 various people, trying really quite different things on their SCSI-
767 PS/2s. Today, all these bugreports are taken into account and should be 767 PS/2s. Today, all these bug reports are taken into account and should be
768 mostly solved. The major topics were: 768 mostly solved. The major topics were:
769 - Driver crashes during boottime by no obvious reason. 769 - Driver crashes during boottime by no obvious reason.
770 - Driver panics while the midlevel-SCSI-driver is trying to inquire 770 - Driver panics while the midlevel-SCSI-driver is trying to inquire
@@ -819,7 +819,7 @@
819 - Michael Lang 819 - Michael Lang
820 820
821 July 17, 2000 (v3.2pre8) 821 July 17, 2000 (v3.2pre8)
822 A long period of collecting bugreports from all corners of the world 822 A long period of collecting bug reports from all corners of the world
823 now led to the following corrections to the code: 823 now led to the following corrections to the code:
824 1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this 824 1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this
825 was that it is possible to disable Fast-SCSI for the external bus. 825 was that it is possible to disable Fast-SCSI for the external bus.
@@ -873,7 +873,7 @@
873 July 26, 2000 (v3.2pre11) 873 July 26, 2000 (v3.2pre11)
874 1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and 874 1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and
875 a model 9595. Asking around in the community, nobody except me has 875 a model 9595. Asking around in the community, nobody except me has
876 seen such errors. Weired, but I am trying to recompile everything on 876 seen such errors. Weird, but I am trying to recompile everything on
877 the model 9595. Maybe, as I use a specially modified gcc, that could 877 the model 9595. Maybe, as I use a specially modified gcc, that could
878 cause problems. But, it was not the reason. The true background was, 878 cause problems. But, it was not the reason. The true background was,
879 that the kernel was compiled for i386 and the 9595 has a 486DX-2. 879 that the kernel was compiled for i386 and the 9595 has a 486DX-2.
@@ -886,7 +886,7 @@
886 alive rotator during boottime. This makes sense, when no monitor is 886 alive rotator during boottime. This makes sense, when no monitor is
887 connected to the system. You can get rid of all display activity, if 887 connected to the system. You can get rid of all display activity, if
888 you do not use any parameter or just ibmmcascsi=activity, for the 888 you do not use any parameter or just ibmmcascsi=activity, for the
889 harddrive activity LED, existant on all PS/2, except models 8595-XXX. 889 harddrive activity LED, existent on all PS/2, except models 8595-XXX.
890 If no monitor is available, please use ibmmcascsi=display, which works 890 If no monitor is available, please use ibmmcascsi=display, which works
891 fine together with the linuxinfo utility for the LED-panel. 891 fine together with the linuxinfo utility for the LED-panel.
892 - Michael Lang 892 - Michael Lang
@@ -1115,7 +1115,7 @@
1115 If this really happens, do also send e-mail to the maintainer, as 1115 If this really happens, do also send e-mail to the maintainer, as
1116 forced detection should be never necessary. Forced detection is in 1116 forced detection should be never necessary. Forced detection is in
1117 principle some flaw of the driver adapter detection and goes into 1117 principle some flaw of the driver adapter detection and goes into
1118 bugreports. 1118 bug reports.
1119 Q: The driver screws up, if it starts to probe SCSI-devices, is there 1119 Q: The driver screws up, if it starts to probe SCSI-devices, is there
1120 some way out of it? 1120 some way out of it?
1121 A: Yes, that was some recognition problem of the correct SCSI-adapter 1121 A: Yes, that was some recognition problem of the correct SCSI-adapter
@@ -1172,7 +1172,7 @@
1172 recommended version is 3.2 or later. Here, the F/W support is in 1172 recommended version is 3.2 or later. Here, the F/W support is in
1173 a stable and reliable condition. Wide-addressing is in addition 1173 a stable and reliable condition. Wide-addressing is in addition
1174 supported. 1174 supported.
1175 Q: I get a Ooops message and something like "killing interrupt". 1175 Q: I get an Oops message and something like "killing interrupt".
1176 A: The reason for this is that the IBM SCSI-subsystem only sends a 1176 A: The reason for this is that the IBM SCSI-subsystem only sends a
1177 termination status back, if some error appeared. In former releases 1177 termination status back, if some error appeared. In former releases
1178 of the driver, it was not checked, if the termination status block 1178 of the driver, it was not checked, if the termination status block
@@ -1213,21 +1213,21 @@
1213 problem. Not yet tried, but guessing that it could work. To get this, 1213 problem. Not yet tried, but guessing that it could work. To get this,
1214 set unchecked_isa_dma argument of ibmmca.h from 0 to 1. 1214 set unchecked_isa_dma argument of ibmmca.h from 0 to 1.
1215 1215
1216 5.3 Bugreports 1216 5.3 Bug reports
1217 -------------- 1217 --------------
1218 If you really find bugs in the sourcecode or the driver will successfully 1218 If you really find bugs in the source code or the driver will successfully
1219 refuse to work on your machine, you should send a bug report to me. The 1219 refuse to work on your machine, you should send a bug report to me. The
1220 best for this is to follow the instructions on the WWW-page for this 1220 best for this is to follow the instructions on the WWW-page for this
1221 driver. Fill out the bug-report form, placed on the WWW-page and ship it, 1221 driver. Fill out the bug-report form, placed on the WWW-page and ship it,
1222 so the bugs can be taken into account with maximum efforts. But, please 1222 so the bugs can be taken into account with maximum efforts. But, please
1223 do not send bug reports about this driver to Linus Torvalds or Leonard 1223 do not send bug reports about this driver to Linus Torvalds or Leonard
1224 Zubkoff, as Linus is burried in E-Mail and Leonard is supervising all 1224 Zubkoff, as Linus is buried in E-Mail and Leonard is supervising all
1225 SCSI-drivers and won't have the time left to look inside every single 1225 SCSI-drivers and won't have the time left to look inside every single
1226 driver to fix a bug and especially DO NOT send modified code to Linus 1226 driver to fix a bug and especially DO NOT send modified code to Linus
1227 Torvalds or Alan J. Cox which has not been checked here!!! They are both 1227 Torvalds or Alan J. Cox which has not been checked here!!! They are both
1228 quite burried in E-mail (as me, sometimes, too) and one should first check 1228 quite buried in E-mail (as me, sometimes, too) and one should first check
1229 for problems on my local teststand. Recently, I got a lot of 1229 for problems on my local teststand. Recently, I got a lot of
1230 bugreports for errors in the ibmmca.c code, which I could not imagine, but 1230 bug reports for errors in the ibmmca.c code, which I could not imagine, but
1231 a look inside some Linux-distribution showed me quite often some modified 1231 a look inside some Linux-distribution showed me quite often some modified
1232 code, which did no longer work on most other machines than the one of the 1232 code, which did no longer work on most other machines than the one of the
1233 modifier. Ok, so now that there is maintenance service available for this 1233 modifier. Ok, so now that there is maintenance service available for this
@@ -1261,7 +1261,7 @@
1261 some e-mail directly, but at least with the same information as required by 1261 some e-mail directly, but at least with the same information as required by
1262 the formular. 1262 the formular.
1263 1263
1264 If you have extensive bugreports, including Ooops messages and 1264 If you have extensive bug reports, including Oops messages and
1265 screen-shots, please feel free to send it directly to the address 1265 screen-shots, please feel free to send it directly to the address
1266 of the maintainer, too. The current address of the maintainer is: 1266 of the maintainer, too. The current address of the maintainer is:
1267 1267
@@ -1318,7 +1318,7 @@
1318 detailed bug reports and ideas for this driver (and his 1318 detailed bug reports and ideas for this driver (and his
1319 patience ;-)). 1319 patience ;-)).
1320 Alan J. Cox 1320 Alan J. Cox
1321 for his bugreports and his bold activities in cross-checking 1321 for his bug reports and his bold activities in cross-checking
1322 the driver-code with his teststand. 1322 the driver-code with his teststand.
1323 1323
1324 7.2 Sponsors & Supporters 1324 7.2 Sponsors & Supporters
diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt
index ccf1cebe744f..736540045dc7 100644
--- a/Documentation/sharedsubtree.txt
+++ b/Documentation/sharedsubtree.txt
@@ -153,6 +153,7 @@ replicas continue to be exactly same.
153 #include <stdio.h> 153 #include <stdio.h>
154 #include <stdlib.h> 154 #include <stdlib.h>
155 #include <unistd.h> 155 #include <unistd.h>
156 #include <string.h>
156 #include <sys/mount.h> 157 #include <sys/mount.h>
157 #include <sys/fsuid.h> 158 #include <sys/fsuid.h>
158 159
diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt
index 58cbfd01ea8f..3feeb9ecdec4 100644
--- a/Documentation/sound/alsa/soc/DAI.txt
+++ b/Documentation/sound/alsa/soc/DAI.txt
@@ -20,12 +20,12 @@ I2S
20=== 20===
21 21
22 I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and 22 I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and
23Rx lines are used for audio transmision, whilst the bit clock (BCLK) and 23Rx lines are used for audio transmission, whilst the bit clock (BCLK) and
24left/right clock (LRC) synchronise the link. I2S is flexible in that either the 24left/right clock (LRC) synchronise the link. I2S is flexible in that either the
25controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock 25controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock
26usually varies depending on the sample rate and the master system clock 26usually varies depending on the sample rate and the master system clock
27(SYSCLK). LRCLK is the same as the sample rate. A few devices support separate 27(SYSCLK). LRCLK is the same as the sample rate. A few devices support separate
28ADC and DAC LRCLK's, this allows for similtanious capture and playback at 28ADC and DAC LRCLK's, this allows for simultaneous capture and playback at
29different sample rates. 29different sample rates.
30 30
31I2S has several different operating modes:- 31I2S has several different operating modes:-
@@ -41,12 +41,12 @@ I2S has several different operating modes:-
41PCM 41PCM
42=== 42===
43 43
44PCM is another 4 wire interface, very similar to I2S, that can support a more 44PCM is another 4 wire interface, very similar to I2S, which can support a more
45flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used 45flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used
46to synchronise the link whilst the Tx and Rx lines are used to transmit and 46to synchronise the link whilst the Tx and Rx lines are used to transmit and
47receive the audio data. Bit clock usually varies depending on sample rate 47receive the audio data. Bit clock usually varies depending on sample rate
48whilst sync runs at the sample rate. PCM also supports Time Division 48whilst sync runs at the sample rate. PCM also supports Time Division
49Multiplexing (TDM) in that several devices can use the bus similtaniuosly (This 49Multiplexing (TDM) in that several devices can use the bus simultaneously (this
50is sometimes referred to as network mode). 50is sometimes referred to as network mode).
51 51
52Common PCM operating modes:- 52Common PCM operating modes:-
diff --git a/Documentation/sound/alsa/soc/clocking.txt b/Documentation/sound/alsa/soc/clocking.txt
index e93960d53a1e..14930887c25f 100644
--- a/Documentation/sound/alsa/soc/clocking.txt
+++ b/Documentation/sound/alsa/soc/clocking.txt
@@ -2,20 +2,20 @@ Audio Clocking
2============== 2==============
3 3
4This text describes the audio clocking terms in ASoC and digital audio in 4This text describes the audio clocking terms in ASoC and digital audio in
5general. Note: Audio clocking can be complex ! 5general. Note: Audio clocking can be complex!
6 6
7 7
8Master Clock 8Master Clock
9------------ 9------------
10 10
11Every audio subsystem is driven by a master clock (sometimes refered to as MCLK 11Every audio subsystem is driven by a master clock (sometimes referred to as MCLK
12or SYSCLK). This audio master clock can be derived from a number of sources 12or SYSCLK). This audio master clock can be derived from a number of sources
13(e.g. crystal, PLL, CPU clock) and is responsible for producing the correct 13(e.g. crystal, PLL, CPU clock) and is responsible for producing the correct
14audio playback and capture sample rates. 14audio playback and capture sample rates.
15 15
16Some master clocks (e.g. PLL's and CPU based clocks) are configuarble in that 16Some master clocks (e.g. PLL's and CPU based clocks) are configurable in that
17their speed can be altered by software (depending on the system use and to save 17their speed can be altered by software (depending on the system use and to save
18power). Other master clocks are fixed at at set frequency (i.e. crystals). 18power). Other master clocks are fixed at a set frequency (i.e. crystals).
19 19
20 20
21DAI Clocks 21DAI Clocks
@@ -44,7 +44,7 @@ This relationship depends on the codec or SoC CPU in particular. In general
44it's best to configure BCLK to the lowest possible speed (depending on your 44it's best to configure BCLK to the lowest possible speed (depending on your
45rate, number of channels and wordsize) to save on power. 45rate, number of channels and wordsize) to save on power.
46 46
47It's also desireable to use the codec (if possible) to drive (or master) the 47It's also desirable to use the codec (if possible) to drive (or master) the
48audio clocks as it usually gives more accurate sample rates than the CPU. 48audio clocks as it usually gives more accurate sample rates than the CPU.
49 49
50 50
diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt
index 48983c75aad9..1e766ad0ebd1 100644
--- a/Documentation/sound/alsa/soc/codec.txt
+++ b/Documentation/sound/alsa/soc/codec.txt
@@ -19,7 +19,7 @@ Optionally, codec drivers can also provide:-
19 6) DAPM event handler. 19 6) DAPM event handler.
20 7) DAC Digital mute control. 20 7) DAC Digital mute control.
21 21
22It's probably best to use this guide in conjuction with the existing codec 22It's probably best to use this guide in conjunction with the existing codec
23driver code in sound/soc/codecs/ 23driver code in sound/soc/codecs/
24 24
25ASoC Codec driver breakdown 25ASoC Codec driver breakdown
@@ -28,7 +28,7 @@ ASoC Codec driver breakdown
281 - Codec DAI and PCM configuration 281 - Codec DAI and PCM configuration
29----------------------------------- 29-----------------------------------
30Each codec driver must have a struct snd_soc_codec_dai to define its DAI and 30Each codec driver must have a struct snd_soc_codec_dai to define its DAI and
31PCM's capablities and operations. This struct is exported so that it can be 31PCM's capabilities and operations. This struct is exported so that it can be
32registered with the core by your machine driver. 32registered with the core by your machine driver.
33 33
34e.g. 34e.g.
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(wm8731_dai);
67 67
682 - Codec control IO 682 - Codec control IO
69-------------------- 69--------------------
70The codec can ususally be controlled via an I2C or SPI style interface (AC97 70The codec can usually be controlled via an I2C or SPI style interface (AC97
71combines control with data in the DAI). The codec drivers will have to provide 71combines control with data in the DAI). The codec drivers will have to provide
72functions to read and write the codec registers along with supplying a register 72functions to read and write the codec registers along with supplying a register
73cache:- 73cache:-
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index c11877f5b4a1..ab0766fd7869 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -11,7 +11,7 @@ other PM systems.
11 11
12DAPM is also completely transparent to all user space applications as all power 12DAPM is also completely transparent to all user space applications as all power
13switching is done within the ASoC core. No code changes or recompiling are 13switching is done within the ASoC core. No code changes or recompiling are
14required for user space applications. DAPM makes power switching descisions based 14required for user space applications. DAPM makes power switching decisions based
15upon any audio stream (capture/playback) activity and audio mixer settings 15upon any audio stream (capture/playback) activity and audio mixer settings
16within the device. 16within the device.
17 17
@@ -38,7 +38,7 @@ There are 4 power domains within DAPM
38 Enabled and disabled when stream playback/capture is started and 38 Enabled and disabled when stream playback/capture is started and
39 stopped respectively. e.g. aplay, arecord. 39 stopped respectively. e.g. aplay, arecord.
40 40
41All DAPM power switching descisons are made automatically by consulting an audio 41All DAPM power switching decisions are made automatically by consulting an audio
42routing map of the whole machine. This map is specific to each machine and 42routing map of the whole machine. This map is specific to each machine and
43consists of the interconnections between every audio component (including 43consists of the interconnections between every audio component (including
44internal codec components). All audio components that affect power are called 44internal codec components). All audio components that affect power are called
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt
index 753c5cc5984a..c47ce9530677 100644
--- a/Documentation/sound/alsa/soc/overview.txt
+++ b/Documentation/sound/alsa/soc/overview.txt
@@ -2,18 +2,19 @@ ALSA SoC Layer
2============== 2==============
3 3
4The overall project goal of the ALSA System on Chip (ASoC) layer is to provide 4The overall project goal of the ALSA System on Chip (ASoC) layer is to provide
5better ALSA support for embedded system on chip procesors (e.g. pxa2xx, au1x00, 5better ALSA support for embedded system-on-chip processors (e.g. pxa2xx, au1x00,
6iMX, etc) and portable audio codecs. Currently there is some support in the 6iMX, etc) and portable audio codecs. Currently there is some support in the
7kernel for SoC audio, however it has some limitations:- 7kernel for SoC audio, however it has some limitations:-
8 8
9 * Currently, codec drivers are often tightly coupled to the underlying SoC 9 * Currently, codec drivers are often tightly coupled to the underlying SoC
10 cpu. This is not ideal and leads to code duplication i.e. Linux now has 4 10 CPU. This is not ideal and leads to code duplication i.e. Linux now has 4
11 different wm8731 drivers for 4 different SoC platforms. 11 different wm8731 drivers for 4 different SoC platforms.
12 12
13 * There is no standard method to signal user initiated audio events. 13 * There is no standard method to signal user initiated audio events (e.g.
14 e.g. Headphone/Mic insertion, Headphone/Mic detection after an insertion 14 Headphone/Mic insertion, Headphone/Mic detection after an insertion
15 event. These are quite common events on portable devices and ofter require 15 event). These are quite common events on portable devices and often require
16 machine specific code to re route audio, enable amps etc after such an event. 16 machine specific code to re-route audio, enable amps, etc., after such an
17 event.
17 18
18 * Current drivers tend to power up the entire codec when playing 19 * Current drivers tend to power up the entire codec when playing
19 (or recording) audio. This is fine for a PC, but tends to waste a lot of 20 (or recording) audio. This is fine for a PC, but tends to waste a lot of
@@ -44,7 +45,7 @@ features :-
44 signals the codec when to change power states. 45 signals the codec when to change power states.
45 46
46 * Machine specific controls: Allow machines to add controls to the sound card 47 * Machine specific controls: Allow machines to add controls to the sound card
47 e.g. volume control for speaker amp. 48 (e.g. volume control for speaker amp).
48 49
49To achieve all this, ASoC basically splits an embedded audio system into 3 50To achieve all this, ASoC basically splits an embedded audio system into 3
50components :- 51components :-
@@ -57,7 +58,7 @@ components :-
57 interface drivers (e.g. I2S, AC97, PCM) for that platform. 58 interface drivers (e.g. I2S, AC97, PCM) for that platform.
58 59
59 * Machine driver: The machine driver handles any machine specific controls and 60 * Machine driver: The machine driver handles any machine specific controls and
60 audio events. i.e. turing on an amp at start of playback. 61 audio events (e.g. turning on an amp at start of playback).
61 62
62 63
63Documentation 64Documentation
diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt
index e95b16d5a53b..d4678b4dc6c6 100644
--- a/Documentation/sound/alsa/soc/platform.txt
+++ b/Documentation/sound/alsa/soc/platform.txt
@@ -20,7 +20,7 @@ struct snd_soc_ops {
20 int (*trigger)(struct snd_pcm_substream *, int); 20 int (*trigger)(struct snd_pcm_substream *, int);
21}; 21};
22 22
23The platform driver exports it's DMA functionailty via struct snd_soc_platform:- 23The platform driver exports its DMA functionality via struct snd_soc_platform:-
24 24
25struct snd_soc_platform { 25struct snd_soc_platform {
26 char *name; 26 char *name;
diff --git a/Documentation/sound/alsa/soc/pops_clicks.txt b/Documentation/sound/alsa/soc/pops_clicks.txt
index 2cf7ee5b3d74..3371bd9d7cfa 100644
--- a/Documentation/sound/alsa/soc/pops_clicks.txt
+++ b/Documentation/sound/alsa/soc/pops_clicks.txt
@@ -2,7 +2,7 @@ Audio Pops and Clicks
2===================== 2=====================
3 3
4Pops and clicks are unwanted audio artifacts caused by the powering up and down 4Pops and clicks are unwanted audio artifacts caused by the powering up and down
5of components within the audio subsystem. This is noticable on PC's when an 5of components within the audio subsystem. This is noticeable on PCs when an
6audio module is either loaded or unloaded (at module load time the sound card is 6audio module is either loaded or unloaded (at module load time the sound card is
7powered up and causes a popping noise on the speakers). 7powered up and causes a popping noise on the speakers).
8 8
@@ -16,7 +16,7 @@ Minimising Playback Pops and Clicks
16=================================== 16===================================
17 17
18Playback pops in portable audio subsystems cannot be completely eliminated atm, 18Playback pops in portable audio subsystems cannot be completely eliminated atm,
19however future audio codec hardware will have better pop and click supression. 19however future audio codec hardware will have better pop and click suppression.
20Pops can be reduced within playback by powering the audio components in a 20Pops can be reduced within playback by powering the audio components in a
21specific order. This order is different for startup and shutdown and follows 21specific order. This order is different for startup and shutdown and follows
22some basic rules:- 22some basic rules:-
@@ -33,7 +33,7 @@ Minimising Capture Pops and Clicks
33================================== 33==================================
34 34
35Capture artifacts are somewhat easier to get rid of as we can delay activating the 35Capture artifacts are somewhat easier to get rid of as we can delay activating the
36ADC until all the pops have occured. This follows similar power rules to 36ADC until all the pops have occurred. This follows similar power rules to
37playback in that components are powered in a sequence depending upon stream 37playback in that components are powered in a sequence depending upon stream
38startup or shutdown. 38startup or shutdown.
39 39
diff --git a/Documentation/sound/oss/es1371 b/Documentation/sound/oss/es1371
deleted file mode 100644
index c3151266771c..000000000000
--- a/Documentation/sound/oss/es1371
+++ /dev/null
@@ -1,64 +0,0 @@
1/proc/sound, /dev/sndstat
2-------------------------
3
4/proc/sound and /dev/sndstat is not supported by the
5driver. To find out whether the driver succeeded loading,
6check the kernel log (dmesg).
7
8
9ALaw/uLaw sample formats
10------------------------
11
12This driver does not support the ALaw/uLaw sample formats.
13ALaw is the default mode when opening a sound device
14using OSS/Free. The reason for the lack of support is
15that the hardware does not support these formats, and adding
16conversion routines to the kernel would lead to very ugly
17code in the presence of the mmap interface to the driver.
18And since xquake uses mmap, mmap is considered important :-)
19and no sane application uses ALaw/uLaw these days anyway.
20In short, playing a Sun .au file as follows:
21
22cat my_file.au > /dev/dsp
23
24does not work. Instead, you may use the play script from
25Chris Bagwell's sox-12.14 package (available from the URL
26below) to play many different audio file formats.
27The script automatically determines the audio format
28and does do audio conversions if necessary.
29http://home.sprynet.com/sprynet/cbagwell/projects.html
30
31
32Blocking vs. nonblocking IO
33---------------------------
34
35Unlike OSS/Free this driver honours the O_NONBLOCK file flag
36not only during open, but also during read and write.
37This is an effort to make the sound driver interface more
38regular. Timidity has problems with this; a patch
39is available from http://www.ife.ee.ethz.ch/~sailer/linux/pciaudio.html.
40(Timidity patched will also run on OSS/Free).
41
42
43MIDI UART
44---------
45
46The driver supports a simple MIDI UART interface, with
47no ioctl's supported.
48
49
50MIDI synthesizer
51----------------
52
53This soundcard does not have any hardware MIDI synthesizer;
54MIDI synthesis has to be done in software. To allow this
55the driver/soundcard supports two PCM (/dev/dsp) interfaces.
56
57There is a freely available software package that allows
58MIDI file playback on this soundcard called Timidity.
59See http://www.cgs.fi/~tt/timidity/.
60
61
62
63Thomas Sailer
64t.sailer@alumni.ethz.ch
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx
index 215e3b8e7266..f3853cc37bde 100644
--- a/Documentation/spi/pxa2xx
+++ b/Documentation/spi/pxa2xx
@@ -1,4 +1,4 @@
1PXA2xx SPI on SSP driver HOWTO 1PXA2xx SPI on SSP driver HOWTO
2=================================================== 2===================================================
3This is a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx 3This is a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx
4synchronous serial port into a SPI master controller 4synchronous serial port into a SPI master controller
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt
index 60953d6c919d..ec499265deca 100644
--- a/Documentation/thinkpad-acpi.txt
+++ b/Documentation/thinkpad-acpi.txt
@@ -105,10 +105,15 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver
105as a driver attribute (see below). 105as a driver attribute (see below).
106 106
107Sysfs driver attributes are on the driver's sysfs attribute space, 107Sysfs driver attributes are on the driver's sysfs attribute space,
108for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/. 108for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and
109/sys/bus/platform/drivers/thinkpad_hwmon/
109 110
110Sysfs device attributes are on the driver's sysfs attribute space, 111Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
111for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/. 112space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/.
113
114Sysfs device attributes for the sensors and fan are on the
115thinkpad_hwmon device's sysfs attribute space, but you should locate it
116by looking for a hwmon device with the name attribute of "thinkpad".
112 117
113Driver version 118Driver version
114-------------- 119--------------
@@ -766,7 +771,7 @@ Temperature sensors
766------------------- 771-------------------
767 772
768procfs: /proc/acpi/ibm/thermal 773procfs: /proc/acpi/ibm/thermal
769sysfs device attributes: (hwmon) temp*_input 774sysfs device attributes: (hwmon "thinkpad") temp*_input
770 775
771Most ThinkPads include six or more separate temperature sensors but only 776Most ThinkPads include six or more separate temperature sensors but only
772expose the CPU temperature through the standard ACPI methods. This 777expose the CPU temperature through the standard ACPI methods. This
@@ -989,7 +994,9 @@ Fan control and monitoring: fan speed, fan enable/disable
989--------------------------------------------------------- 994---------------------------------------------------------
990 995
991procfs: /proc/acpi/ibm/fan 996procfs: /proc/acpi/ibm/fan
992sysfs device attributes: (hwmon) fan_input, pwm1, pwm1_enable 997sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
998 pwm1_enable
999sysfs hwmon driver attributes: fan_watchdog
993 1000
994NOTE NOTE NOTE: fan control operations are disabled by default for 1001NOTE NOTE NOTE: fan control operations are disabled by default for
995safety reasons. To enable them, the module parameter "fan_control=1" 1002safety reasons. To enable them, the module parameter "fan_control=1"
@@ -1028,7 +1035,7 @@ enable it if necessary to avoid overheating.
1028 1035
1029An enabled fan in level "auto" may stop spinning if the EC decides the 1036An enabled fan in level "auto" may stop spinning if the EC decides the
1030ThinkPad is cool enough and doesn't need the extra airflow. This is 1037ThinkPad is cool enough and doesn't need the extra airflow. This is
1031normal, and the EC will spin the fan up if the varios thermal readings 1038normal, and the EC will spin the fan up if the various thermal readings
1032rise too much. 1039rise too much.
1033 1040
1034On the X40, this seems to depend on the CPU and HDD temperatures. 1041On the X40, this seems to depend on the CPU and HDD temperatures.
@@ -1131,7 +1138,7 @@ hwmon device attribute fan1_input:
1131 which can take up to two minutes. May return rubbish on older 1138 which can take up to two minutes. May return rubbish on older
1132 ThinkPads. 1139 ThinkPads.
1133 1140
1134driver attribute fan_watchdog: 1141hwmon driver attribute fan_watchdog:
1135 Fan safety watchdog timer interval, in seconds. Minimum is 1142 Fan safety watchdog timer interval, in seconds. Minimum is
1136 1 second, maximum is 120 seconds. 0 disables the watchdog. 1143 1 second, maximum is 120 seconds. 0 disables the watchdog.
1137 1144
@@ -1196,7 +1203,7 @@ for example:
1196Enabling debugging output 1203Enabling debugging output
1197------------------------- 1204-------------------------
1198 1205
1199The module takes a debug paramater which can be used to selectively 1206The module takes a debug parameter which can be used to selectively
1200enable various classes of debugging output, for example: 1207enable various classes of debugging output, for example:
1201 1208
1202 modprobe ibm_acpi debug=0xffff 1209 modprobe ibm_acpi debug=0xffff
@@ -1233,3 +1240,9 @@ Sysfs interface changelog:
1233 layer, the radio switch generates input event EV_RADIO, 1240 layer, the radio switch generates input event EV_RADIO,
1234 and the driver enables hot key handling by default in 1241 and the driver enables hot key handling by default in
1235 the firmware. 1242 the firmware.
1243
12440x020000: ABI fix: added a separate hwmon platform device and
1245 driver, which must be located by name (thinkpad)
1246 and the hwmon class for libsensors4 (lm-sensors 3)
1247 compatibility. Moved all hwmon attributes to this
1248 new platform device.
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt
index 4e0b62b8566f..8b077e43eee7 100644
--- a/Documentation/usb/usb-serial.txt
+++ b/Documentation/usb/usb-serial.txt
@@ -338,7 +338,7 @@ MCT USB Single Port Serial Adapter U232
338 This driver is for the MCT USB-RS232 Converter (25 pin, Model No. 338 This driver is for the MCT USB-RS232 Converter (25 pin, Model No.
339 U232-P25) from Magic Control Technology Corp. (there is also a 9 pin 339 U232-P25) from Magic Control Technology Corp. (there is also a 9 pin
340 Model No. U232-P9). More information about this device can be found at 340 Model No. U232-P9). More information about this device can be found at
341 the manufacture's web-site: http://www.mct.com.tw. 341 the manufacturer's web-site: http://www.mct.com.tw.
342 342
343 The driver is generally working, though it still needs some more testing. 343 The driver is generally working, though it still needs some more testing.
344 It is derived from the Belkin USB Serial Adapter F5U103 driver and its 344 It is derived from the Belkin USB Serial Adapter F5U103 driver and its
diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c
index 47801bc7e742..4cf72f3fa8e9 100644
--- a/Documentation/watchdog/src/watchdog-simple.c
+++ b/Documentation/watchdog/src/watchdog-simple.c
@@ -3,15 +3,25 @@
3#include <unistd.h> 3#include <unistd.h>
4#include <fcntl.h> 4#include <fcntl.h>
5 5
6int main(int argc, const char *argv[]) { 6int main(void)
7{
7 int fd = open("/dev/watchdog", O_WRONLY); 8 int fd = open("/dev/watchdog", O_WRONLY);
9 int ret = 0;
8 if (fd == -1) { 10 if (fd == -1) {
9 perror("watchdog"); 11 perror("watchdog");
10 exit(1); 12 exit(EXIT_FAILURE);
11 } 13 }
12 while (1) { 14 while (1) {
13 write(fd, "\0", 1); 15 ret = write(fd, "\0", 1);
14 fsync(fd); 16 if (ret != 1) {
17 ret = -1;
18 break;
19 }
20 ret = fsync(fd);
21 if (ret)
22 break;
15 sleep(10); 23 sleep(10);
16 } 24 }
25 close(fd);
26 return ret;
17} 27}