diff options
Diffstat (limited to 'Documentation')
82 files changed, 2515 insertions, 1492 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 1a7f53068ec2..054a7ecf64c6 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile | |||
@@ -165,7 +165,7 @@ quiet_cmd_db2man = MAN $@ | |||
165 | @touch $@ | 165 | @touch $@ |
166 | 166 | ||
167 | ### | 167 | ### |
168 | # Rules to generate postscripts and PNG imgages from .fig format files | 168 | # Rules to generate postscripts and PNG images from .fig format files |
169 | quiet_cmd_fig2eps = FIG2EPS $@ | 169 | quiet_cmd_fig2eps = FIG2EPS $@ |
170 | cmd_fig2eps = fig2dev -Leps $< $@ | 170 | cmd_fig2eps = fig2dev -Leps $< $@ |
171 | 171 | ||
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index d3290c46af51..aa38cc5692a0 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl | |||
@@ -46,7 +46,7 @@ | |||
46 | 46 | ||
47 | <sect1><title>Atomic and pointer manipulation</title> | 47 | <sect1><title>Atomic and pointer manipulation</title> |
48 | !Iinclude/asm-x86/atomic_32.h | 48 | !Iinclude/asm-x86/atomic_32.h |
49 | !Iinclude/asm-x86/unaligned_32.h | 49 | !Iinclude/asm-x86/unaligned.h |
50 | </sect1> | 50 | </sect1> |
51 | 51 | ||
52 | <sect1><title>Delaying, scheduling, and timer routines</title> | 52 | <sect1><title>Delaying, scheduling, and timer routines</title> |
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 6fbc41d98c1e..957cf5c26831 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl | |||
@@ -282,7 +282,7 @@ int __init board_init (void) | |||
282 | goto out; | 282 | goto out; |
283 | } | 283 | } |
284 | 284 | ||
285 | /* map physical adress */ | 285 | /* map physical address */ |
286 | baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024); | 286 | baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024); |
287 | if(!baseaddr){ | 287 | if(!baseaddr){ |
288 | printk("Ioremap to access NAND chip failed\n"); | 288 | printk("Ioremap to access NAND chip failed\n"); |
@@ -306,7 +306,7 @@ int __init board_init (void) | |||
306 | this->dev_ready = board_dev_ready; | 306 | this->dev_ready = board_dev_ready; |
307 | this->eccmode = NAND_ECC_SOFT; | 307 | this->eccmode = NAND_ECC_SOFT; |
308 | 308 | ||
309 | /* Scan to find existance of the device */ | 309 | /* Scan to find existence of the device */ |
310 | if (nand_scan (board_mtd, 1)) { | 310 | if (nand_scan (board_mtd, 1)) { |
311 | err = -ENXIO; | 311 | err = -ENXIO; |
312 | goto out_ior; | 312 | goto out_ior; |
@@ -340,7 +340,7 @@ static void __exit board_cleanup (void) | |||
340 | /* Release resources, unregister device */ | 340 | /* Release resources, unregister device */ |
341 | nand_release (board_mtd); | 341 | nand_release (board_mtd); |
342 | 342 | ||
343 | /* unmap physical adress */ | 343 | /* unmap physical address */ |
344 | iounmap((void *)baseaddr); | 344 | iounmap((void *)baseaddr); |
345 | 345 | ||
346 | /* Free the MTD device structure */ | 346 | /* Free the MTD device structure */ |
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index 24dc3fcf1594..bc38283379f0 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt | |||
@@ -441,17 +441,20 @@ ACPI, and if none of those then a KCS device at the spec-specified | |||
441 | 0xca2. If you want to turn this off, set the "trydefaults" option to | 441 | 0xca2. If you want to turn this off, set the "trydefaults" option to |
442 | false. | 442 | false. |
443 | 443 | ||
444 | If you have high-res timers compiled into the kernel, the driver will | 444 | If your IPMI interface does not support interrupts and is a KCS or |
445 | use them to provide much better performance. Note that if you do not | 445 | SMIC interface, the IPMI driver will start a kernel thread for the |
446 | have high-res timers enabled in the kernel and you don't have | 446 | interface to help speed things up. This is a low-priority kernel |
447 | interrupts enabled, the driver will run VERY slowly. Don't blame me, | 447 | thread that constantly polls the IPMI driver while an IPMI operation |
448 | is in progress. The force_kipmid module parameter will allow the user to | ||
449 | force this thread on or off. If you force it off and don't have | ||
450 | interrupts, the driver will run VERY slowly. Don't blame me, | ||
448 | these interfaces suck. | 451 | these interfaces suck. |
449 | 452 | ||
450 | The driver supports a hot add and remove of interfaces. This way, | 453 | The driver supports a hot add and remove of interfaces. This way, |
451 | interfaces can be added or removed after the kernel is up and running. | 454 | interfaces can be added or removed after the kernel is up and running. |
452 | This is done using /sys/modules/ipmi_si/hotmod, which is a write-only | 455 | This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a |
453 | parameter. You write a string to this interface. The string has the | 456 | write-only parameter. You write a string to this interface. The string |
454 | format: | 457 | has the format: |
455 | <op1>[:op2[:op3...]] | 458 | <op1>[:op2[:op3...]] |
456 | The "op"s are: | 459 | The "op"s are: |
457 | add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]] | 460 | add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]] |
@@ -581,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it | |||
581 | gets a pre-action. During a panic or a reboot, the watchdog will | 584 | gets a pre-action. During a panic or a reboot, the watchdog will |
582 | start a 120 second timer if it is running to make sure the reboot occurs. | 585 | start a 120 second timer if it is running to make sure the reboot occurs. |
583 | 586 | ||
584 | Note that if you use the NMI preaction for the watchdog, you MUST | 587 | Note that if you use the NMI preaction for the watchdog, you MUST NOT |
585 | NOT use nmi watchdog mode 1. If you use the NMI watchdog, you | 588 | use the nmi watchdog. There is no reasonable way to tell if an NMI |
586 | must use mode 2. | 589 | comes from the IPMI controller, so it must assume that if it gets an |
590 | otherwise unhandled NMI, it must be from IPMI and it will panic | ||
591 | immediately. | ||
587 | 592 | ||
588 | Once you open the watchdog timer, you must write a 'V' character to the | 593 | Once you open the watchdog timer, you must write a 'V' character to the |
589 | device to close it, or the timer will not stop. This is a new semantic | 594 | device to close it, or the timer will not stop. This is a new semantic |
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt new file mode 100644 index 000000000000..c2321903aa09 --- /dev/null +++ b/Documentation/Intel-IOMMU.txt | |||
@@ -0,0 +1,115 @@ | |||
1 | Linux IOMMU Support | ||
2 | =================== | ||
3 | |||
4 | The architecture spec can be obtained from the below location. | ||
5 | |||
6 | http://www.intel.com/technology/virtualization/ | ||
7 | |||
8 | This guide gives a quick cheat sheet for some basic understanding. | ||
9 | |||
10 | Some Keywords | ||
11 | |||
12 | DMAR - DMA remapping | ||
13 | DRHD - DMA Engine Reporting Structure | ||
14 | RMRR - Reserved memory Region Reporting Structure | ||
15 | ZLR - Zero length reads from PCI devices | ||
16 | IOVA - IO Virtual address. | ||
17 | |||
18 | Basic stuff | ||
19 | ----------- | ||
20 | |||
21 | ACPI enumerates and lists the different DMA engines in the platform, and | ||
22 | device scope relationships between PCI devices and which DMA engine controls | ||
23 | them. | ||
24 | |||
25 | What is RMRR? | ||
26 | ------------- | ||
27 | |||
28 | There are some devices the BIOS controls, e.g. USB devices to perform | ||
29 | PS2 emulation. The regions of memory used for these devices are marked | ||
30 | reserved in the e820 map. When we turn on DMA translation, DMA to those | ||
31 | regions will fail. Hence BIOS uses RMRR to specify these regions along with | ||
32 | devices that need to access these regions. OS is expected to setup | ||
33 | unity mappings for these regions for these devices to access these regions. | ||
34 | |||
35 | How is IOVA generated? | ||
36 | --------------------- | ||
37 | |||
38 | Well behaved drivers call pci_map_*() calls before sending command to device | ||
39 | that needs to perform DMA. Once DMA is completed and mapping is no longer | ||
40 | required, the driver performs a pci_unmap_*() call to unmap the region. | ||
41 | |||
42 | The Intel IOMMU driver allocates a virtual address per domain. Each PCIE | ||
43 | device has its own domain (hence protection). Devices under p2p bridges | ||
44 | share the virtual address with all devices under the p2p bridge due to | ||
45 | transaction id aliasing for p2p bridges. | ||
46 | |||
47 | IOVA generation is pretty generic. We used the same technique as vmalloc() | ||
48 | but these are not global address spaces, but separate for each domain. | ||
49 | Different DMA engines may support different number of domains. | ||
50 | |||
51 | We also allocate guard pages with each mapping, so we can attempt to catch | ||
52 | any overflow that might happen. | ||
53 | |||
54 | |||
55 | Graphics Problems? | ||
56 | ------------------ | ||
57 | If you encounter issues with graphics devices, you can try adding | ||
58 | option intel_iommu=igfx_off to turn off the integrated graphics engine. | ||
59 | |||
60 | If it happens to be a PCI device included in the INCLUDE_ALL Engine, | ||
61 | then try enabling CONFIG_DMAR_GFX_WA to setup a 1-1 map. We hear | ||
62 | graphics drivers may be in process of using DMA api's in the near | ||
63 | future and at that time this option can be yanked out. | ||
64 | |||
65 | Some exceptions to IOVA | ||
66 | ----------------------- | ||
67 | Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff). | ||
68 | The same is true for peer to peer transactions. Hence we reserve the | ||
69 | address from PCI MMIO ranges so they are not allocated for IOVA addresses. | ||
70 | |||
71 | |||
72 | Fault reporting | ||
73 | --------------- | ||
74 | When errors are reported, the DMA engine signals via an interrupt. The fault | ||
75 | reason and the device that caused it are printed on the console. | ||
76 | |||
77 | See below for sample. | ||
78 | |||
79 | |||
80 | Boot Message Sample | ||
81 | ------------------- | ||
82 | |||
83 | Something like this gets printed indicating presence of DMAR tables | ||
84 | in ACPI. | ||
85 | |||
86 | ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0 | ||
87 | |||
88 | When DMAR is being processed and initialized by ACPI, prints DMAR locations | ||
89 | and any RMRR's processed. | ||
90 | |||
91 | ACPI DMAR:Host address width 36 | ||
92 | ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000 | ||
93 | ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000 | ||
94 | ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000 | ||
95 | ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff | ||
96 | ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff | ||
97 | |||
98 | When DMAR is enabled for use, you will notice.. | ||
99 | |||
100 | PCI-DMA: Using DMAR IOMMU | ||
101 | |||
102 | Fault reporting | ||
103 | --------------- | ||
104 | |||
105 | DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 | ||
106 | DMAR:[fault reason 05] PTE Write access is not set | ||
107 | DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 | ||
108 | DMAR:[fault reason 05] PTE Write access is not set | ||
109 | |||
110 | TBD | ||
111 | ---- | ||
112 | |||
113 | - For compatibility testing, could use unity map domain for all devices, just | ||
114 | provide a 1-1 for all useful memory under a single domain for all devices. | ||
115 | - API for paravirt ops for abstracting functionality for VMM folks. | ||
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index 19e7f65c269f..34e06d2f194f 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist | |||
@@ -67,7 +67,7 @@ kernel patches. | |||
67 | 20: Check that it all passes `make headers_check'. | 67 | 20: Check that it all passes `make headers_check'. |
68 | 68 | ||
69 | 21: Has been checked with injection of at least slab and page-allocation | 69 | 21: Has been checked with injection of at least slab and page-allocation |
70 | fauilures. See Documentation/fault-injection/. | 70 | failures. See Documentation/fault-injection/. |
71 | 71 | ||
72 | If the new code is substantial, addition of subsystem-specific fault | 72 | If the new code is substantial, addition of subsystem-specific fault |
73 | injection might be appropriate. | 73 | injection might be appropriate. |
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers index d7e26427e426..24f2eb40cae5 100644 --- a/Documentation/SubmittingDrivers +++ b/Documentation/SubmittingDrivers | |||
@@ -36,8 +36,7 @@ Linux 2.4: | |||
36 | If the code area has a general maintainer then please submit it to | 36 | If the code area has a general maintainer then please submit it to |
37 | the maintainer listed in MAINTAINERS in the kernel file. If the | 37 | the maintainer listed in MAINTAINERS in the kernel file. If the |
38 | maintainer does not respond or you cannot find the appropriate | 38 | maintainer does not respond or you cannot find the appropriate |
39 | maintainer then please contact Marcelo Tosatti | 39 | maintainer then please contact Willy Tarreau <w@1wt.eu>. |
40 | <marcelo.tosatti@cyclades.com>. | ||
41 | 40 | ||
42 | Linux 2.6: | 41 | Linux 2.6: |
43 | The same rules apply as 2.4 except that you should follow linux-kernel | 42 | The same rules apply as 2.4 except that you should follow linux-kernel |
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt new file mode 100644 index 000000000000..eda40fd39cad --- /dev/null +++ b/Documentation/accounting/cgroupstats.txt | |||
@@ -0,0 +1,27 @@ | |||
1 | Control Groupstats is inspired by the discussion at | ||
2 | http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as | ||
3 | suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. | ||
4 | |||
5 | Per cgroup statistics infrastructure re-uses code from the taskstats | ||
6 | interface. A new set of cgroup operations are registered with commands | ||
7 | and attributes specific to cgroups. It should be very easy to | ||
8 | extend per cgroup statistics, by adding members to the cgroupstats | ||
9 | structure. | ||
10 | |||
11 | The current model for cgroupstats is a pull model; a push model (to post | ||
12 | statistics on interesting events) should be very easy to add. Currently | ||
13 | user space requests statistics by passing the cgroup path. | ||
14 | Statistics about the state of all the tasks in the cgroup are returned to | ||
15 | user space. | ||
16 | |||
17 | NOTE: We currently rely on delay accounting for extracting information | ||
18 | about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this | ||
19 | information will not be available. | ||
20 | |||
21 | To extract cgroup statistics a utility very similar to getdelays.c | ||
22 | has been developed; the sample output of the utility is shown below: | ||
23 | |||
24 | ~/balbir/cgroupstats # ./getdelays -C "/cgroup/a" | ||
25 | sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0 | ||
26 | ~/balbir/cgroupstats # ./getdelays -C "/cgroup" | ||
27 | sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2 | ||
diff --git a/Documentation/arm/Samsung-S3C24XX/DMA.txt b/Documentation/arm/Samsung-S3C24XX/DMA.txt index 37f4edcc5d87..3ed82383efea 100644 --- a/Documentation/arm/Samsung-S3C24XX/DMA.txt +++ b/Documentation/arm/Samsung-S3C24XX/DMA.txt | |||
@@ -5,7 +5,7 @@ Introduction | |||
5 | ------------ | 5 | ------------ |
6 | 6 | ||
7 | The kernel provides an interface to manage DMA transfers | 7 | The kernel provides an interface to manage DMA transfers |
8 | using the DMA channels in the cpu, so that the central | 8 | using the DMA channels in the CPU, so that the central |
9 | duty of managing channel mappings, and programming the | 9 | duty of managing channel mappings, and programming the |
10 | channel generators is in one place. | 10 | channel generators is in one place. |
11 | 11 | ||
@@ -17,24 +17,24 @@ DMA Channel Ordering | |||
17 | channels to all sources, which means that some devices | 17 | channels to all sources, which means that some devices |
18 | have a restricted number of channels that can be used. | 18 | have a restricted number of channels that can be used. |
19 | 19 | ||
20 | To allow flexibilty for each cpu type and board, the | 20 | To allow flexibility for each CPU type and board, the |
21 | dma code can be given an dma ordering structure which | 21 | DMA code can be given a DMA ordering structure which |
22 | allows the order of channel search to be specified, as | 22 | allows the order of channel search to be specified, as |
23 | well as allowing the prohibition of certain claims. | 23 | well as allowing the prohibition of certain claims. |
24 | 24 | ||
25 | struct s3c24xx_dma_order has a list of channels, and | 25 | struct s3c24xx_dma_order has a list of channels, and |
26 | each channel within has a slot for a list of dma | 26 | each channel within has a slot for a list of DMA |
27 | channel numbers. The slots are searched in order, for | 27 | channel numbers. The slots are searched in order for |
28 | the presence of a dma channel number with DMA_CH_VALID | 28 | the presence of a DMA channel number with DMA_CH_VALID |
29 | orred in. | 29 | or-ed in. |
30 | 30 | ||
31 | If the order has the flag DMA_CH_NEVER set, then after | 31 | If the order has the flag DMA_CH_NEVER set, then after |
32 | checking the channel list, the system will return no | 32 | checking the channel list, the system will return no |
33 | found channel, thus denying the request. | 33 | found channel, thus denying the request. |
34 | 34 | ||
35 | A board support file can call s3c24xx_dma_order_set() | 35 | A board support file can call s3c24xx_dma_order_set() |
36 | to register an complete ordering set. The routine will | 36 | to register a complete ordering set. The routine will |
37 | copy the data, so the original can be discared with | 37 | copy the data, so the original can be discarded with |
38 | __initdata. | 38 | __initdata. |
39 | 39 | ||
40 | 40 | ||
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt index d46306fea230..f20c10c2858f 100644 --- a/Documentation/atomic_ops.txt +++ b/Documentation/atomic_ops.txt | |||
@@ -418,6 +418,20 @@ brothers: | |||
418 | */ | 418 | */ |
419 | smp_mb__after_clear_bit(); | 419 | smp_mb__after_clear_bit(); |
420 | 420 | ||
421 | There are two special bitops with lock barrier semantics (acquire/release, | ||
422 | same as spinlocks). These operate in the same way as their non-_lock/unlock | ||
423 | postfixed variants, except that they are to provide acquire/release semantics, | ||
424 | respectively. This means they can be used for bit_spin_trylock and | ||
425 | bit_spin_unlock type operations without specifying any more barriers. | ||
426 | |||
427 | int test_and_set_bit_lock(unsigned long nr, unsigned long *addr); | ||
428 | void clear_bit_unlock(unsigned long nr, unsigned long *addr); | ||
429 | void __clear_bit_unlock(unsigned long nr, unsigned long *addr); | ||
430 | |||
431 | The __clear_bit_unlock version is non-atomic, however it still implements | ||
432 | unlock barrier semantics. This can be useful if the lock itself is protecting | ||
433 | the other bits in the word. | ||
434 | |||
421 | Finally, there are non-atomic versions of the bitmask operations | 435 | Finally, there are non-atomic versions of the bitmask operations |
422 | provided. They are used in contexts where some other higher-level SMP | 436 | provided. They are used in contexts where some other higher-level SMP |
423 | locking scheme is being used to protect the bitmask, and thus less | 437 | locking scheme is being used to protect the bitmask, and thus less |
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 552cabac0608..da42ab414c48 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt | |||
@@ -87,30 +87,7 @@ changes occur: | |||
87 | 87 | ||
88 | This is used primarily during fault processing. | 88 | This is used primarily during fault processing. |
89 | 89 | ||
90 | 5) void flush_tlb_pgtables(struct mm_struct *mm, | 90 | 5) void update_mmu_cache(struct vm_area_struct *vma, |
91 | unsigned long start, unsigned long end) | ||
92 | |||
93 | The software page tables for address space 'mm' for virtual | ||
94 | addresses in the range 'start' to 'end-1' are being torn down. | ||
95 | |||
96 | Some platforms cache the lowest level of the software page tables | ||
97 | in a linear virtually mapped array, to make TLB miss processing | ||
98 | more efficient. On such platforms, since the TLB is caching the | ||
99 | software page table structure, it needs to be flushed when parts | ||
100 | of the software page table tree are unlinked/freed. | ||
101 | |||
102 | Sparc64 is one example of a platform which does this. | ||
103 | |||
104 | Usually, when munmap()'ing an area of user virtual address | ||
105 | space, the kernel leaves the page table parts around and just | ||
106 | marks the individual pte's as invalid. However, if very large | ||
107 | portions of the address space are unmapped, the kernel frees up | ||
108 | those portions of the software page tables to prevent potential | ||
109 | excessive kernel memory usage caused by erratic mmap/mmunmap | ||
110 | sequences. It is at these times that flush_tlb_pgtables will | ||
111 | be invoked. | ||
112 | |||
113 | 6) void update_mmu_cache(struct vm_area_struct *vma, | ||
114 | unsigned long address, pte_t pte) | 91 | unsigned long address, pte_t pte) |
115 | 92 | ||
116 | At the end of every page fault, this routine is invoked to | 93 | At the end of every page fault, this routine is invoked to |
@@ -123,7 +100,7 @@ changes occur: | |||
123 | translations for software managed TLB configurations. | 100 | translations for software managed TLB configurations. |
124 | The sparc64 port currently does this. | 101 | The sparc64 port currently does this. |
125 | 102 | ||
126 | 7) void tlb_migrate_finish(struct mm_struct *mm) | 103 | 6) void tlb_migrate_finish(struct mm_struct *mm) |
127 | 104 | ||
128 | This interface is called at the end of an explicit | 105 | This interface is called at the end of an explicit |
129 | process migration. This interface provides a hook | 106 | process migration. This interface provides a hook |
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex index 92f94e597582..c713aeb020c4 100644 --- a/Documentation/cdrom/cdrom-standard.tex +++ b/Documentation/cdrom/cdrom-standard.tex | |||
@@ -1009,7 +1009,7 @@ taken over the torch in maintaining \cdromc\ and integrating much | |||
1009 | \cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and | 1009 | \cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and |
1010 | Gerd Knorr, who were the first to implement this interface for SCSI | 1010 | Gerd Knorr, who were the first to implement this interface for SCSI |
1011 | and IDE-CD drivers and added many ideas for extension of the data | 1011 | and IDE-CD drivers and added many ideas for extension of the data |
1012 | structures relative to kernel~2.0. Further thanks to Heiko Eissfeldt, | 1012 | structures relative to kernel~2.0. Further thanks to Heiko Ei{\sz}feldt, |
1013 | Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew | 1013 | Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew |
1014 | Kroll, the \linux\ \cdrom\ device driver developers who were kind | 1014 | Kroll, the \linux\ \cdrom\ device driver developers who were kind |
1015 | enough to give suggestions and criticisms during the writing. Finally | 1015 | enough to give suggestions and criticisms during the writing. Finally |
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt new file mode 100644 index 000000000000..98a26f81fa75 --- /dev/null +++ b/Documentation/cgroups.txt | |||
@@ -0,0 +1,545 @@ | |||
1 | CGROUPS | ||
2 | ------- | ||
3 | |||
4 | Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt | ||
5 | |||
6 | Original copyright statements from cpusets.txt: | ||
7 | Portions Copyright (C) 2004 BULL SA. | ||
8 | Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | ||
9 | Modified by Paul Jackson <pj@sgi.com> | ||
10 | Modified by Christoph Lameter <clameter@sgi.com> | ||
11 | |||
12 | CONTENTS: | ||
13 | ========= | ||
14 | |||
15 | 1. Control Groups | ||
16 | 1.1 What are cgroups ? | ||
17 | 1.2 Why are cgroups needed ? | ||
18 | 1.3 How are cgroups implemented ? | ||
19 | 1.4 What does notify_on_release do ? | ||
20 | 1.5 How do I use cgroups ? | ||
21 | 2. Usage Examples and Syntax | ||
22 | 2.1 Basic Usage | ||
23 | 2.2 Attaching processes | ||
24 | 3. Kernel API | ||
25 | 3.1 Overview | ||
26 | 3.2 Synchronization | ||
27 | 3.3 Subsystem API | ||
28 | 4. Questions | ||
29 | |||
30 | 1. Control Groups | ||
31 | ========== | ||
32 | |||
33 | 1.1 What are cgroups ? | ||
34 | ---------------------- | ||
35 | |||
36 | Control Groups provide a mechanism for aggregating/partitioning sets of | ||
37 | tasks, and all their future children, into hierarchical groups with | ||
38 | specialized behaviour. | ||
39 | |||
40 | Definitions: | ||
41 | |||
42 | A *cgroup* associates a set of tasks with a set of parameters for one | ||
43 | or more subsystems. | ||
44 | |||
45 | A *subsystem* is a module that makes use of the task grouping | ||
46 | facilities provided by cgroups to treat groups of tasks in | ||
47 | particular ways. A subsystem is typically a "resource controller" that | ||
48 | schedules a resource or applies per-cgroup limits, but it may be | ||
49 | anything that wants to act on a group of processes, e.g. a | ||
50 | virtualization subsystem. | ||
51 | |||
52 | A *hierarchy* is a set of cgroups arranged in a tree, such that | ||
53 | every task in the system is in exactly one of the cgroups in the | ||
54 | hierarchy, and a set of subsystems; each subsystem has system-specific | ||
55 | state attached to each cgroup in the hierarchy. Each hierarchy has | ||
56 | an instance of the cgroup virtual filesystem associated with it. | ||
57 | |||
58 | At any one time there may be multiple active hierarchies of task | ||
59 | cgroups. Each hierarchy is a partition of all tasks in the system. | ||
60 | |||
61 | User level code may create and destroy cgroups by name in an | ||
62 | instance of the cgroup virtual file system, specify and query to | ||
63 | which cgroup a task is assigned, and list the task pids assigned to | ||
64 | a cgroup. Those creations and assignments only affect the hierarchy | ||
65 | associated with that instance of the cgroup file system. | ||
66 | |||
67 | On their own, the only use for cgroups is for simple job | ||
68 | tracking. The intention is that other subsystems hook into the generic | ||
69 | cgroup support to provide new attributes for cgroups, such as | ||
70 | accounting/limiting the resources which processes in a cgroup can | ||
71 | access. For example, cpusets (see Documentation/cpusets.txt) allows | ||
72 | you to associate a set of CPUs and a set of memory nodes with the | ||
73 | tasks in each cgroup. | ||
74 | |||
75 | 1.2 Why are cgroups needed ? | ||
76 | ---------------------------- | ||
77 | |||
78 | There are multiple efforts to provide process aggregations in the | ||
79 | Linux kernel, mainly for resource tracking purposes. Such efforts | ||
80 | include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server | ||
81 | namespaces. These all require the basic notion of a | ||
82 | grouping/partitioning of processes, with newly forked processes ending | ||
83 | in the same group (cgroup) as their parent process. | ||
84 | |||
85 | The kernel cgroup patch provides the minimum essential kernel | ||
86 | mechanisms required to efficiently implement such groups. It has | ||
87 | minimal impact on the system fast paths, and provides hooks for | ||
88 | specific subsystems such as cpusets to provide additional behaviour as | ||
89 | desired. | ||
90 | |||
91 | Multiple hierarchy support is provided to allow for situations where | ||
92 | the division of tasks into cgroups is distinctly different for | ||
93 | different subsystems - having parallel hierarchies allows each | ||
94 | hierarchy to be a natural division of tasks, without having to handle | ||
95 | complex combinations of tasks that would be present if several | ||
96 | unrelated subsystems needed to be forced into the same tree of | ||
97 | cgroups. | ||
98 | |||
99 | At one extreme, each resource controller or subsystem could be in a | ||
100 | separate hierarchy; at the other extreme, all subsystems | ||
101 | would be attached to the same hierarchy. | ||
102 | |||
103 | As an example of a scenario (originally proposed by vatsa@in.ibm.com) | ||
104 | that can benefit from multiple hierarchies, consider a large | ||
105 | university server with various users - students, professors, system | ||
106 | tasks etc. The resource planning for this server could be along the | ||
107 | following lines: | ||
108 | |||
109 | CPU : Top cpuset | ||
110 | / \ | ||
111 | CPUSet1 CPUSet2 | ||
112 | | | | ||
113 | (Profs) (Students) | ||
114 | |||
115 | In addition (system tasks) are attached to topcpuset (so | ||
116 | that they can run anywhere) with a limit of 20% | ||
117 | |||
118 | Memory : Professors (50%), students (30%), system (20%) | ||
119 | |||
120 | Disk : Prof (50%), students (30%), system (20%) | ||
121 | |||
122 | Network : WWW browsing (20%), Network File System (60%), others (20%) | ||
123 | / \ | ||
124 | Prof (15%) students (5%) | ||
125 | |||
126 | Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go | ||
127 | into NFS network class. | ||
128 | |||
129 | At the same time firefox/lynx will share an appropriate CPU/Memory class | ||
130 | depending on who launched it (prof/student). | ||
131 | |||
132 | With the ability to classify tasks differently for different resources | ||
133 | (by putting those resource subsystems in different hierarchies) then | ||
134 | the admin can easily set up a script which receives exec notifications | ||
135 | and depending on who is launching the browser he can | ||
136 | |||
137 | # echo browser_pid > /mnt/<restype>/<userclass>/tasks | ||
138 | |||
139 | With only a single hierarchy, he now would potentially have to create | ||
140 | a separate cgroup for every browser launched and associate it with | ||
141 | appropriate network and other resource class. This may lead to | ||
142 | proliferation of such cgroups. | ||
143 | |||
144 | Also let's say that the administrator would like to give enhanced network | ||
145 | access temporarily to a student's browser (since it is night and the user | ||
146 | wants to do online gaming :) OR give one of the student's simulation | ||
147 | apps enhanced CPU power. | ||
148 | |||
149 | With the ability to write pids directly to resource classes, it's just a | ||
150 | matter of : | ||
151 | |||
152 | # echo pid > /mnt/network/<new_class>/tasks | ||
153 | (after some time) | ||
154 | # echo pid > /mnt/network/<orig_class>/tasks | ||
155 | |||
156 | Without this ability, he would have to split the cgroup into | ||
157 | multiple separate ones and then associate the new cgroups with the | ||
158 | new resource classes. | ||
159 | |||
160 | |||
161 | |||
162 | 1.3 How are cgroups implemented ? | ||
163 | --------------------------------- | ||
164 | |||
165 | Control Groups extends the kernel as follows: | ||
166 | |||
167 | - Each task in the system has a reference-counted pointer to a | ||
168 | css_set. | ||
169 | |||
170 | - A css_set contains a set of reference-counted pointers to | ||
171 | cgroup_subsys_state objects, one for each cgroup subsystem | ||
172 | registered in the system. There is no direct link from a task to | ||
173 | the cgroup of which it's a member in each hierarchy, but this | ||
174 | can be determined by following pointers through the | ||
175 | cgroup_subsys_state objects. This is because accessing the | ||
176 | subsystem state is something that's expected to happen frequently | ||
177 | and in performance-critical code, whereas operations that require a | ||
178 | task's actual cgroup assignments (in particular, moving between | ||
179 | cgroups) are less common. A linked list runs through the cg_list | ||
180 | field of each task_struct using the css_set, anchored at | ||
181 | css_set->tasks. | ||
182 | |||
183 | - A cgroup hierarchy filesystem can be mounted for browsing and | ||
184 | manipulation from user space. | ||
185 | |||
186 | - You can list all the tasks (by pid) attached to any cgroup. | ||
187 | |||
188 | The implementation of cgroups requires a few, simple hooks | ||
189 | into the rest of the kernel, none in performance critical paths: | ||
190 | |||
191 | - in init/main.c, to initialize the root cgroups and initial | ||
192 | css_set at system boot. | ||
193 | |||
194 | - in fork and exit, to attach and detach a task from its css_set. | ||
195 | |||
196 | In addition a new file system, of type "cgroup" may be mounted, to | ||
197 | enable browsing and modifying the cgroups presently known to the | ||
198 | kernel. When mounting a cgroup hierarchy, you may specify a | ||
199 | comma-separated list of subsystems to mount as the filesystem mount | ||
200 | options. By default, mounting the cgroup filesystem attempts to | ||
201 | mount a hierarchy containing all registered subsystems. | ||
202 | |||
203 | If an active hierarchy with exactly the same set of subsystems already | ||
204 | exists, it will be reused for the new mount. If no existing hierarchy | ||
205 | matches, and any of the requested subsystems are in use in an existing | ||
206 | hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy | ||
207 | is activated, associated with the requested subsystems. | ||
208 | |||
209 | It's not currently possible to bind a new subsystem to an active | ||
210 | cgroup hierarchy, or to unbind a subsystem from an active cgroup | ||
211 | hierarchy. This may be possible in future, but is fraught with nasty | ||
212 | error-recovery issues. | ||
213 | |||
214 | When a cgroup filesystem is unmounted, if there are any | ||
215 | child cgroups created below the top-level cgroup, that hierarchy | ||
216 | will remain active even though unmounted; if there are no | ||
217 | child cgroups then the hierarchy will be deactivated. | ||
218 | |||
219 | No new system calls are added for cgroups - all support for | ||
220 | querying and modifying cgroups is via this cgroup file system. | ||
221 | |||
222 | Each task under /proc has an added file named 'cgroup' displaying, | ||
223 | for each active hierarchy, the subsystem names and the cgroup name | ||
224 | as the path relative to the root of the cgroup file system. | ||
225 | |||
226 | Each cgroup is represented by a directory in the cgroup file system | ||
227 | containing the following files describing that cgroup: | ||
228 | |||
229 | - tasks: list of tasks (by pid) attached to that cgroup | ||
230 | - notify_on_release flag: run the release agent on exit? | ||
231 | |||
232 | Other subsystems such as cpusets may add additional files in each | ||
233 | cgroup dir. | ||
234 | |||
235 | New cgroups are created using the mkdir system call or shell | ||
236 | command. The properties of a cgroup, such as its flags, are | ||
237 | modified by writing to the appropriate file in that cgroup's | ||
238 | directory, as listed above. | ||
239 | |||
240 | The named hierarchical structure of nested cgroups allows partitioning | ||
241 | a large system into nested, dynamically changeable, "soft-partitions". | ||
242 | |||
243 | The attachment of each task, automatically inherited at fork by any | ||
244 | children of that task, to a cgroup allows organizing the work load | ||
245 | on a system into related sets of tasks. A task may be re-attached to | ||
246 | any other cgroup, if allowed by the permissions on the necessary | ||
247 | cgroup file system directories. | ||
248 | |||
249 | When a task is moved from one cgroup to another, it gets a new | ||
250 | css_set pointer - if there's an already existing css_set with the | ||
251 | desired collection of cgroups then that group is reused, else a new | ||
252 | css_set is allocated. Note that the current implementation uses a | ||
253 | linear search to locate an appropriate existing css_set, so isn't | ||
254 | very efficient. A future version will use a hash table for better | ||
255 | performance. | ||
256 | |||
257 | To allow access from a cgroup to the css_sets (and hence tasks) | ||
258 | that comprise it, a set of cg_cgroup_link objects form a lattice; | ||
259 | each cg_cgroup_link is linked into a list of cg_cgroup_links for | ||
260 | a single cgroup on its cont_link_list field, and a list of | ||
261 | cg_cgroup_links for a single css_set on its cg_link_list. | ||
262 | |||
263 | Thus the set of tasks in a cgroup can be listed by iterating over | ||
264 | each css_set that references the cgroup, and sub-iterating over | ||
265 | each css_set's task set. | ||
266 | |||
267 | The use of a Linux virtual file system (vfs) to represent the | ||
268 | cgroup hierarchy provides for a familiar permission and name space | ||
269 | for cgroups, with a minimum of additional kernel code. | ||
270 | |||
271 | 1.4 What does notify_on_release do ? | ||
272 | ------------------------------------ | ||
273 | |||
274 | *** notify_on_release is disabled in the current patch set. It will be | ||
275 | *** reactivated in a future patch in a less-intrusive manner | ||
276 | |||
277 | If the notify_on_release flag is enabled (1) in a cgroup, then | ||
278 | whenever the last task in the cgroup leaves (exits or attaches to | ||
279 | some other cgroup) and the last child cgroup of that cgroup | ||
280 | is removed, then the kernel runs the command specified by the contents | ||
281 | of the "release_agent" file in that hierarchy's root directory, | ||
282 | supplying the pathname (relative to the mount point of the cgroup | ||
283 | file system) of the abandoned cgroup. This enables automatic | ||
284 | removal of abandoned cgroups. The default value of | ||
285 | notify_on_release in the root cgroup at system boot is disabled | ||
286 | (0). The default value of other cgroups at creation is the current | ||
287 | value of their parent's notify_on_release setting. The default value of | ||
288 | a cgroup hierarchy's release_agent path is empty. | ||
289 | |||
290 | 1.5 How do I use cgroups ? | ||
291 | -------------------------- | ||
292 | |||
293 | To start a new job that is to be contained within a cgroup, using | ||
294 | the "cpuset" cgroup subsystem, the steps are something like: | ||
295 | |||
296 | 1) mkdir /dev/cgroup | ||
297 | 2) mount -t cgroup -ocpuset cpuset /dev/cgroup | ||
298 | 3) Create the new cgroup by doing mkdir's and write's (or echo's) in | ||
299 | the /dev/cgroup virtual file system. | ||
300 | 4) Start a task that will be the "founding father" of the new job. | ||
301 | 5) Attach that task to the new cgroup by writing its pid to the | ||
302 | /dev/cgroup tasks file for that cgroup. | ||
303 | 6) fork, exec or clone the job tasks from this founding father task. | ||
304 | |||
305 | For example, the following sequence of commands will set up a cgroup | ||
306 | named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, | ||
307 | and then start a subshell 'sh' in that cgroup: | ||
308 | |||
309 | mount -t cgroup cpuset -ocpuset /dev/cgroup | ||
310 | cd /dev/cgroup | ||
311 | mkdir Charlie | ||
312 | cd Charlie | ||
313 | /bin/echo 2-3 > cpus | ||
314 | /bin/echo 1 > mems | ||
315 | /bin/echo $$ > tasks | ||
316 | sh | ||
317 | # The subshell 'sh' is now running in cgroup Charlie | ||
318 | # The next line should display '/Charlie' | ||
319 | cat /proc/self/cgroup | ||
320 | |||
321 | 2. Usage Examples and Syntax | ||
322 | ============================ | ||
323 | |||
324 | 2.1 Basic Usage | ||
325 | --------------- | ||
326 | |||
327 | Creating, modifying, using the cgroups can be done through the cgroup | ||
328 | virtual filesystem. | ||
329 | |||
330 | To mount a cgroup hierarchy with all available subsystems, type: | ||
331 | # mount -t cgroup xxx /dev/cgroup | ||
332 | |||
333 | The "xxx" is not interpreted by the cgroup code, but will appear in | ||
334 | /proc/mounts so may be any useful identifying string that you like. | ||
335 | |||
336 | To mount a cgroup hierarchy with just the cpuset and numtasks | ||
337 | subsystems, type: | ||
338 | # mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup | ||
339 | |||
340 | To change the set of subsystems bound to a mounted hierarchy, just | ||
341 | remount with different options: | ||
342 | |||
343 | # mount -o remount,cpuset,ns /dev/cgroup | ||
344 | |||
345 | Note that changing the set of subsystems is currently only supported | ||
346 | when the hierarchy consists of a single (root) cgroup. Supporting | ||
347 | the ability to arbitrarily bind/unbind subsystems from an existing | ||
348 | cgroup hierarchy is intended to be implemented in the future. | ||
349 | |||
350 | Then under /dev/cgroup you can find a tree that corresponds to the | ||
351 | tree of the cgroups in the system. For instance, /dev/cgroup | ||
352 | is the cgroup that holds the whole system. | ||
353 | |||
354 | If you want to create a new cgroup under /dev/cgroup: | ||
355 | # cd /dev/cgroup | ||
356 | # mkdir my_cgroup | ||
357 | |||
358 | Now you want to do something with this cgroup. | ||
359 | # cd my_cgroup | ||
360 | |||
361 | In this directory you can find several files: | ||
362 | # ls | ||
363 | notify_on_release release_agent tasks | ||
364 | (plus whatever files are added by the attached subsystems) | ||
365 | |||
366 | Now attach your shell to this cgroup: | ||
367 | # /bin/echo $$ > tasks | ||
368 | |||
369 | You can also create cgroups inside your cgroup by using mkdir in this | ||
370 | directory. | ||
371 | # mkdir my_sub_cs | ||
372 | |||
373 | To remove a cgroup, just use rmdir: | ||
374 | # rmdir my_sub_cs | ||
375 | |||
376 | This will fail if the cgroup is in use (has cgroups inside, or | ||
377 | has processes attached, or is held alive by other subsystem-specific | ||
378 | reference). | ||
379 | |||
380 | 2.2 Attaching processes | ||
381 | ----------------------- | ||
382 | |||
383 | # /bin/echo PID > tasks | ||
384 | |||
385 | Note that it is PID, not PIDs. You can only attach ONE task at a time. | ||
386 | If you have several tasks to attach, you have to do it one after another: | ||
387 | |||
388 | # /bin/echo PID1 > tasks | ||
389 | # /bin/echo PID2 > tasks | ||
390 | ... | ||
391 | # /bin/echo PIDn > tasks | ||
392 | |||
393 | 3. Kernel API | ||
394 | ============= | ||
395 | |||
396 | 3.1 Overview | ||
397 | ------------ | ||
398 | |||
399 | Each kernel subsystem that wants to hook into the generic cgroup | ||
400 | system needs to create a cgroup_subsys object. This contains | ||
401 | various methods, which are callbacks from the cgroup system, along | ||
402 | with a subsystem id which will be assigned by the cgroup system. | ||
403 | |||
404 | Other fields in the cgroup_subsys object include: | ||
405 | |||
406 | - subsys_id: a unique array index for the subsystem, indicating which | ||
407 | entry in cgroup->subsys[] this subsystem should be | ||
408 | managing. Initialized by cgroup_register_subsys(); prior to this | ||
409 | it should be initialized to -1 | ||
410 | |||
411 | - hierarchy: an index indicating which hierarchy, if any, this | ||
412 | subsystem is currently attached to. If this is -1, then the | ||
413 | subsystem is not attached to any hierarchy, and all tasks should be | ||
414 | considered to be members of the subsystem's top_cgroup. It should | ||
415 | be initialized to -1. | ||
416 | |||
417 | - name: should be initialized to a unique subsystem name prior to | ||
418 | calling cgroup_register_subsystem. Should be no longer than | ||
419 | MAX_CGROUP_TYPE_NAMELEN | ||
420 | |||
421 | Each cgroup object created by the system has an array of pointers, | ||
422 | indexed by subsystem id; this pointer is entirely managed by the | ||
423 | subsystem; the generic cgroup code will never touch this pointer. | ||
424 | |||
425 | 3.2 Synchronization | ||
426 | ------------------- | ||
427 | |||
428 | There is a global mutex, cgroup_mutex, used by the cgroup | ||
429 | system. This should be taken by anything that wants to modify a | ||
430 | cgroup. It may also be taken to prevent cgroups from being | ||
431 | modified, but more specific locks may be more appropriate in that | ||
432 | situation. | ||
433 | |||
434 | See kernel/cgroup.c for more details. | ||
435 | |||
436 | Subsystems can take/release the cgroup_mutex via the functions | ||
437 | cgroup_lock()/cgroup_unlock(), and can | ||
438 | take/release the callback_mutex via the functions | ||
439 | cgroup_lock()/cgroup_unlock(). | ||
440 | |||
441 | Accessing a task's cgroup pointer may be done in the following ways: | ||
442 | - while holding cgroup_mutex | ||
443 | - while holding the task's alloc_lock (via task_lock()) | ||
444 | - inside an rcu_read_lock() section via rcu_dereference() | ||
445 | |||
446 | 3.3 Subsystem API | ||
447 | -------------------------- | ||
448 | |||
449 | Each subsystem should: | ||
450 | |||
451 | - add an entry in linux/cgroup_subsys.h | ||
452 | - define a cgroup_subsys object called <name>_subsys | ||
453 | |||
454 | Each subsystem may export the following methods. The only mandatory | ||
455 | methods are create/destroy. Any others that are null are presumed to | ||
456 | be successful no-ops. | ||
457 | |||
458 | struct cgroup_subsys_state *create(struct cgroup *cont) | ||
459 | LL=cgroup_mutex | ||
460 | |||
461 | Called to create a subsystem state object for a cgroup. The | ||
462 | subsystem should allocate its subsystem state object for the passed | ||
463 | cgroup, returning a pointer to the new object on success or a | ||
464 | negative error code. On success, the subsystem pointer should point to | ||
465 | a structure of type cgroup_subsys_state (typically embedded in a | ||
466 | larger subsystem-specific object), which will be initialized by the | ||
467 | cgroup system. Note that this will be called at initialization to | ||
468 | create the root subsystem state for this subsystem; this case can be | ||
469 | identified by the passed cgroup object having a NULL parent (since | ||
470 | it's the root of the hierarchy) and may be an appropriate place for | ||
471 | initialization code. | ||
472 | |||
473 | void destroy(struct cgroup *cont) | ||
474 | LL=cgroup_mutex | ||
475 | |||
476 | The cgroup system is about to destroy the passed cgroup; the | ||
477 | subsystem should do any necessary cleanup | ||
478 | |||
479 | int can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
480 | struct task_struct *task) | ||
481 | LL=cgroup_mutex | ||
482 | |||
483 | Called prior to moving a task into a cgroup; if the subsystem | ||
484 | returns an error, this will abort the attach operation. If a NULL | ||
485 | task is passed, then a successful result indicates that *any* | ||
486 | unspecified task can be moved into the cgroup. Note that this isn't | ||
487 | called on a fork. If this method returns 0 (success) then this should | ||
488 | remain valid while the caller holds cgroup_mutex. | ||
489 | |||
490 | void attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
491 | struct cgroup *old_cont, struct task_struct *task) | ||
492 | LL=cgroup_mutex | ||
493 | |||
494 | |||
495 | Called after the task has been attached to the cgroup, to allow any | ||
496 | post-attachment activity that requires memory allocations or blocking. | ||
497 | |||
498 | void fork(struct cgroup_subsys *ss, struct task_struct *task) | ||
499 | LL=callback_mutex, maybe read_lock(tasklist_lock) | ||
500 | |||
501 | Called when a task is forked into a cgroup. Also called during | ||
502 | registration for all existing tasks. | ||
503 | |||
504 | void exit(struct cgroup_subsys *ss, struct task_struct *task) | ||
505 | LL=callback_mutex | ||
506 | |||
507 | Called during task exit | ||
508 | |||
509 | int populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
510 | LL=none | ||
511 | |||
512 | Called after creation of a cgroup to allow a subsystem to populate | ||
513 | the cgroup directory with file entries. The subsystem should make | ||
514 | calls to cgroup_add_file() with objects of type cftype (see | ||
515 | include/linux/cgroup.h for details). Note that although this | ||
516 | method can return an error code, the error code is currently not | ||
517 | always handled well. | ||
518 | |||
519 | void post_clone(struct cgroup_subsys *ss, struct cgroup *cont) | ||
520 | |||
521 | Called at the end of cgroup_clone() to do any parameter | ||
522 | initialization which might be required before a task could attach. For | ||
523 | example in cpusets, no task may attach before 'cpus' and 'mems' are set | ||
524 | up. | ||
525 | |||
526 | void bind(struct cgroup_subsys *ss, struct cgroup *root) | ||
527 | LL=callback_mutex | ||
528 | |||
529 | Called when a cgroup subsystem is rebound to a different hierarchy | ||
530 | and root cgroup. Currently this will only involve movement between | ||
531 | the default hierarchy (which never has sub-cgroups) and a hierarchy | ||
532 | that is being created/destroyed (and hence has no sub-cgroups). | ||
533 | |||
534 | 4. Questions | ||
535 | ============ | ||
536 | |||
537 | Q: what's up with this '/bin/echo' ? | ||
538 | A: bash's builtin 'echo' command does not check calls to write() against | ||
539 | errors. If you use it in the cgroup file system, you won't be | ||
540 | able to tell whether a command succeeded or failed. | ||
541 | |||
542 | Q: When I attach processes, only the first of the line gets really attached ! | ||
543 | A: We can only return one error code per call to write(). So you should also | ||
544 | put only ONE pid. | ||
545 | |||
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index b6d24c22274b..a741f658a3c9 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt | |||
@@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-) | |||
220 | CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the | 220 | CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the |
221 | CPU is being offlined while tasks are frozen due to a suspend operation in | 221 | CPU is being offlined while tasks are frozen due to a suspend operation in |
222 | progress | 222 | progress |
223 | - All process is migrated away from this outgoing CPU to a new CPU | 223 | - All processes are migrated away from this outgoing CPU to new CPUs. |
224 | The new CPU is chosen from each process' current cpuset, which may be | ||
225 | a subset of all online CPUs. | ||
224 | - All interrupts targeted to this CPU is migrated to a new CPU | 226 | - All interrupts targeted to this CPU is migrated to a new CPU |
225 | - timers/bottom half/task lets are also migrated to a new CPU | 227 | - timers/bottom half/task lets are also migrated to a new CPU |
226 | - Once all services are migrated, kernel calls an arch specific routine | 228 | - Once all services are migrated, kernel calls an arch specific routine |
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index ec9de6917f01..141bef1c8599 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net | |||
7 | Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | 7 | Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. |
8 | Modified by Paul Jackson <pj@sgi.com> | 8 | Modified by Paul Jackson <pj@sgi.com> |
9 | Modified by Christoph Lameter <clameter@sgi.com> | 9 | Modified by Christoph Lameter <clameter@sgi.com> |
10 | Modified by Paul Menage <menage@google.com> | ||
10 | 11 | ||
11 | CONTENTS: | 12 | CONTENTS: |
12 | ========= | 13 | ========= |
@@ -16,9 +17,9 @@ CONTENTS: | |||
16 | 1.2 Why are cpusets needed ? | 17 | 1.2 Why are cpusets needed ? |
17 | 1.3 How are cpusets implemented ? | 18 | 1.3 How are cpusets implemented ? |
18 | 1.4 What are exclusive cpusets ? | 19 | 1.4 What are exclusive cpusets ? |
19 | 1.5 What does notify_on_release do ? | 20 | 1.5 What is memory_pressure ? |
20 | 1.6 What is memory_pressure ? | 21 | 1.6 What is memory spread ? |
21 | 1.7 What is memory spread ? | 22 | 1.7 What is sched_load_balance ? |
22 | 1.8 How do I use cpusets ? | 23 | 1.8 How do I use cpusets ? |
23 | 2. Usage Examples and Syntax | 24 | 2. Usage Examples and Syntax |
24 | 2.1 Basic Usage | 25 | 2.1 Basic Usage |
@@ -44,18 +45,19 @@ hierarchy visible in a virtual file system. These are the essential | |||
44 | hooks, beyond what is already present, required to manage dynamic | 45 | hooks, beyond what is already present, required to manage dynamic |
45 | job placement on large systems. | 46 | job placement on large systems. |
46 | 47 | ||
47 | Each task has a pointer to a cpuset. Multiple tasks may reference | 48 | Cpusets use the generic cgroup subsystem described in |
48 | the same cpuset. Requests by a task, using the sched_setaffinity(2) | 49 | Documentation/cgroup.txt. |
49 | system call to include CPUs in its CPU affinity mask, and using the | 50 | |
50 | mbind(2) and set_mempolicy(2) system calls to include Memory Nodes | 51 | Requests by a task, using the sched_setaffinity(2) system call to |
51 | in its memory policy, are both filtered through that tasks cpuset, | 52 | include CPUs in its CPU affinity mask, and using the mbind(2) and |
52 | filtering out any CPUs or Memory Nodes not in that cpuset. The | 53 | set_mempolicy(2) system calls to include Memory Nodes in its memory |
53 | scheduler will not schedule a task on a CPU that is not allowed in | 54 | policy, are both filtered through that tasks cpuset, filtering out any |
54 | its cpus_allowed vector, and the kernel page allocator will not | 55 | CPUs or Memory Nodes not in that cpuset. The scheduler will not |
55 | allocate a page on a node that is not allowed in the requesting tasks | 56 | schedule a task on a CPU that is not allowed in its cpus_allowed |
56 | mems_allowed vector. | 57 | vector, and the kernel page allocator will not allocate a page on a |
57 | 58 | node that is not allowed in the requesting tasks mems_allowed vector. | |
58 | User level code may create and destroy cpusets by name in the cpuset | 59 | |
60 | User level code may create and destroy cpusets by name in the cgroup | ||
59 | virtual file system, manage the attributes and permissions of these | 61 | virtual file system, manage the attributes and permissions of these |
60 | cpusets and which CPUs and Memory Nodes are assigned to each cpuset, | 62 | cpusets and which CPUs and Memory Nodes are assigned to each cpuset, |
61 | specify and query to which cpuset a task is assigned, and list the | 63 | specify and query to which cpuset a task is assigned, and list the |
@@ -115,7 +117,7 @@ Cpusets extends these two mechanisms as follows: | |||
115 | - Cpusets are sets of allowed CPUs and Memory Nodes, known to the | 117 | - Cpusets are sets of allowed CPUs and Memory Nodes, known to the |
116 | kernel. | 118 | kernel. |
117 | - Each task in the system is attached to a cpuset, via a pointer | 119 | - Each task in the system is attached to a cpuset, via a pointer |
118 | in the task structure to a reference counted cpuset structure. | 120 | in the task structure to a reference counted cgroup structure. |
119 | - Calls to sched_setaffinity are filtered to just those CPUs | 121 | - Calls to sched_setaffinity are filtered to just those CPUs |
120 | allowed in that tasks cpuset. | 122 | allowed in that tasks cpuset. |
121 | - Calls to mbind and set_mempolicy are filtered to just | 123 | - Calls to mbind and set_mempolicy are filtered to just |
@@ -145,15 +147,10 @@ into the rest of the kernel, none in performance critical paths: | |||
145 | - in page_alloc.c, to restrict memory to allowed nodes. | 147 | - in page_alloc.c, to restrict memory to allowed nodes. |
146 | - in vmscan.c, to restrict page recovery to the current cpuset. | 148 | - in vmscan.c, to restrict page recovery to the current cpuset. |
147 | 149 | ||
148 | In addition a new file system, of type "cpuset" may be mounted, | 150 | You should mount the "cgroup" filesystem type in order to enable |
149 | typically at /dev/cpuset, to enable browsing and modifying the cpusets | 151 | browsing and modifying the cpusets presently known to the kernel. No |
150 | presently known to the kernel. No new system calls are added for | 152 | new system calls are added for cpusets - all support for querying and |
151 | cpusets - all support for querying and modifying cpusets is via | 153 | modifying cpusets is via this cpuset file system. |
152 | this cpuset file system. | ||
153 | |||
154 | Each task under /proc has an added file named 'cpuset', displaying | ||
155 | the cpuset name, as the path relative to the root of the cpuset file | ||
156 | system. | ||
157 | 154 | ||
158 | The /proc/<pid>/status file for each task has two added lines, | 155 | The /proc/<pid>/status file for each task has two added lines, |
159 | displaying the tasks cpus_allowed (on which CPUs it may be scheduled) | 156 | displaying the tasks cpus_allowed (on which CPUs it may be scheduled) |
@@ -163,16 +160,15 @@ in the format seen in the following example: | |||
163 | Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff | 160 | Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff |
164 | Mems_allowed: ffffffff,ffffffff | 161 | Mems_allowed: ffffffff,ffffffff |
165 | 162 | ||
166 | Each cpuset is represented by a directory in the cpuset file system | 163 | Each cpuset is represented by a directory in the cgroup file system |
167 | containing the following files describing that cpuset: | 164 | containing (on top of the standard cgroup files) the following |
165 | files describing that cpuset: | ||
168 | 166 | ||
169 | - cpus: list of CPUs in that cpuset | 167 | - cpus: list of CPUs in that cpuset |
170 | - mems: list of Memory Nodes in that cpuset | 168 | - mems: list of Memory Nodes in that cpuset |
171 | - memory_migrate flag: if set, move pages to cpusets nodes | 169 | - memory_migrate flag: if set, move pages to cpusets nodes |
172 | - cpu_exclusive flag: is cpu placement exclusive? | 170 | - cpu_exclusive flag: is cpu placement exclusive? |
173 | - mem_exclusive flag: is memory placement exclusive? | 171 | - mem_exclusive flag: is memory placement exclusive? |
174 | - tasks: list of tasks (by pid) attached to that cpuset | ||
175 | - notify_on_release flag: run /sbin/cpuset_release_agent on exit? | ||
176 | - memory_pressure: measure of how much paging pressure in cpuset | 172 | - memory_pressure: measure of how much paging pressure in cpuset |
177 | 173 | ||
178 | In addition, the root cpuset only has the following file: | 174 | In addition, the root cpuset only has the following file: |
@@ -237,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken | |||
237 | outside even a mem_exclusive cpuset. | 233 | outside even a mem_exclusive cpuset. |
238 | 234 | ||
239 | 235 | ||
240 | 1.5 What does notify_on_release do ? | 236 | 1.5 What is memory_pressure ? |
241 | ------------------------------------ | ||
242 | |||
243 | If the notify_on_release flag is enabled (1) in a cpuset, then whenever | ||
244 | the last task in the cpuset leaves (exits or attaches to some other | ||
245 | cpuset) and the last child cpuset of that cpuset is removed, then | ||
246 | the kernel runs the command /sbin/cpuset_release_agent, supplying the | ||
247 | pathname (relative to the mount point of the cpuset file system) of the | ||
248 | abandoned cpuset. This enables automatic removal of abandoned cpusets. | ||
249 | The default value of notify_on_release in the root cpuset at system | ||
250 | boot is disabled (0). The default value of other cpusets at creation | ||
251 | is the current value of their parents notify_on_release setting. | ||
252 | |||
253 | |||
254 | 1.6 What is memory_pressure ? | ||
255 | ----------------------------- | 237 | ----------------------------- |
256 | The memory_pressure of a cpuset provides a simple per-cpuset metric | 238 | The memory_pressure of a cpuset provides a simple per-cpuset metric |
257 | of the rate that the tasks in a cpuset are attempting to free up in | 239 | of the rate that the tasks in a cpuset are attempting to free up in |
@@ -308,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second, | |||
308 | times 1000. | 290 | times 1000. |
309 | 291 | ||
310 | 292 | ||
311 | 1.7 What is memory spread ? | 293 | 1.6 What is memory spread ? |
312 | --------------------------- | 294 | --------------------------- |
313 | There are two boolean flag files per cpuset that control where the | 295 | There are two boolean flag files per cpuset that control where the |
314 | kernel allocates pages for the file system buffers and related in | 296 | kernel allocates pages for the file system buffers and related in |
@@ -378,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the | |||
378 | data set, the memory allocation across the nodes in the jobs cpuset | 360 | data set, the memory allocation across the nodes in the jobs cpuset |
379 | can become very uneven. | 361 | can become very uneven. |
380 | 362 | ||
363 | 1.7 What is sched_load_balance ? | ||
364 | -------------------------------- | ||
365 | |||
366 | The kernel scheduler (kernel/sched.c) automatically load balances | ||
367 | tasks. If one CPU is underutilized, kernel code running on that | ||
368 | CPU will look for tasks on other more overloaded CPUs and move those | ||
369 | tasks to itself, within the constraints of such placement mechanisms | ||
370 | as cpusets and sched_setaffinity. | ||
371 | |||
372 | The algorithmic cost of load balancing and its impact on key shared | ||
373 | kernel data structures such as the task list increases more than | ||
374 | linearly with the number of CPUs being balanced. So the scheduler | ||
375 | has support to partition the system's CPUs into a number of sched | ||
376 | domains such that it only load balances within each sched domain. | ||
377 | Each sched domain covers some subset of the CPUs in the system; | ||
378 | no two sched domains overlap; some CPUs might not be in any sched | ||
379 | domain and hence won't be load balanced. | ||
380 | |||
381 | Put simply, it costs less to balance between two smaller sched domains | ||
382 | than one big one, but doing so means that overloads in one of the | ||
383 | two domains won't be load balanced to the other one. | ||
384 | |||
385 | By default, there is one sched domain covering all CPUs, except those | ||
386 | marked isolated using the kernel boot time "isolcpus=" argument. | ||
387 | |||
388 | This default load balancing across all CPUs is not well suited for | ||
389 | the following two situations: | ||
390 | 1) On large systems, load balancing across many CPUs is expensive. | ||
391 | If the system is managed using cpusets to place independent jobs | ||
392 | on separate sets of CPUs, full load balancing is unnecessary. | ||
393 | 2) Systems supporting realtime on some CPUs need to minimize | ||
394 | system overhead on those CPUs, including avoiding task load | ||
395 | balancing if that is not needed. | ||
396 | |||
397 | When the per-cpuset flag "sched_load_balance" is enabled (the default | ||
398 | setting), it requests that all the CPUs in that cpuset's allowed 'cpus' | ||
399 | be contained in a single sched domain, ensuring that load balancing | ||
400 | can move a task (not otherwise pinned, as by sched_setaffinity) | ||
401 | from any CPU in that cpuset to any other. | ||
402 | |||
403 | When the per-cpuset flag "sched_load_balance" is disabled, then the | ||
404 | scheduler will avoid load balancing across the CPUs in that cpuset, | ||
405 | --except-- in so far as is necessary because some overlapping cpuset | ||
406 | has "sched_load_balance" enabled. | ||
407 | |||
408 | So, for example, if the top cpuset has the flag "sched_load_balance" | ||
409 | enabled, then the scheduler will have one sched domain covering all | ||
410 | CPUs, and the setting of the "sched_load_balance" flag in any other | ||
411 | cpusets won't matter, as we're already fully load balancing. | ||
412 | |||
413 | Therefore in the above two situations, the top cpuset flag | ||
414 | "sched_load_balance" should be disabled, and only some of the smaller, | ||
415 | child cpusets have this flag enabled. | ||
416 | |||
417 | When doing this, you don't usually want to leave any unpinned tasks in | ||
418 | the top cpuset that might use non-trivial amounts of CPU, as such tasks | ||
419 | may be artificially constrained to some subset of CPUs, depending on | ||
420 | the particulars of this flag setting in descendent cpusets. Even if | ||
421 | such a task could use spare CPU cycles in some other CPUs, the kernel | ||
422 | scheduler might not consider the possibility of load balancing that | ||
423 | task to that underused CPU. | ||
424 | |||
425 | Of course, tasks pinned to a particular CPU can be left in a cpuset | ||
426 | that disables "sched_load_balance" as those tasks aren't going anywhere | ||
427 | else anyway. | ||
428 | |||
429 | There is an impedance mismatch here, between cpusets and sched domains. | ||
430 | Cpusets are hierarchical and nest. Sched domains are flat; they don't | ||
431 | overlap and each CPU is in at most one sched domain. | ||
432 | |||
433 | It is necessary for sched domains to be flat because load balancing | ||
434 | across partially overlapping sets of CPUs would risk unstable dynamics | ||
435 | that would be beyond our understanding. So if each of two partially | ||
436 | overlapping cpusets enables the flag 'sched_load_balance', then we | ||
437 | form a single sched domain that is a superset of both. We won't move | ||
438 | a task to a CPU outside its cpuset, but the scheduler load balancing | ||
439 | code might waste some compute cycles considering that possibility. | ||
440 | |||
441 | This mismatch is why there is not a simple one-to-one relation | ||
442 | between which cpusets have the flag "sched_load_balance" enabled, | ||
443 | and the sched domain configuration. If a cpuset enables the flag, it | ||
444 | will get balancing across all its CPUs, but if it disables the flag, | ||
445 | it will only be assured of no load balancing if no other overlapping | ||
446 | cpuset enables the flag. | ||
447 | |||
448 | If two cpusets have partially overlapping 'cpus' allowed, and only | ||
449 | one of them has this flag enabled, then the other may find its | ||
450 | tasks only partially load balanced, just on the overlapping CPUs. | ||
451 | This is just the general case of the top_cpuset example given a few | ||
452 | paragraphs above. In the general case, as in the top cpuset case, | ||
453 | don't leave tasks that might use non-trivial amounts of CPU in | ||
454 | such partially load balanced cpusets, as they may be artificially | ||
455 | constrained to some subset of the CPUs allowed to them, for lack of | ||
456 | load balancing to the other CPUs. | ||
457 | |||
458 | 1.7.1 sched_load_balance implementation details. | ||
459 | ------------------------------------------------ | ||
460 | |||
461 | The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary | ||
462 | to most cpuset flags.) When enabled for a cpuset, the kernel will | ||
463 | ensure that it can load balance across all the CPUs in that cpuset | ||
464 | (makes sure that all the CPUs in the cpus_allowed of that cpuset are | ||
465 | in the same sched domain.) | ||
466 | |||
467 | If two overlapping cpusets both have 'sched_load_balance' enabled, | ||
468 | then they will be (must be) both in the same sched domain. | ||
469 | |||
470 | If, as is the default, the top cpuset has 'sched_load_balance' enabled, | ||
471 | then by the above that means there is a single sched domain covering | ||
472 | the whole system, regardless of any other cpuset settings. | ||
473 | |||
474 | The kernel commits to user space that it will avoid load balancing | ||
475 | where it can. It will pick as fine a granularity partition of sched | ||
476 | domains as it can while still providing load balancing for any set | ||
477 | of CPUs allowed to a cpuset having 'sched_load_balance' enabled. | ||
478 | |||
479 | The internal kernel cpuset to scheduler interface passes from the | ||
480 | cpuset code to the scheduler code a partition of the load balanced | ||
481 | CPUs in the system. This partition is a set of subsets (represented | ||
482 | as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all | ||
483 | the CPUs that must be load balanced. | ||
484 | |||
485 | Whenever the 'sched_load_balance' flag changes, or CPUs come or go | ||
486 | from a cpuset with this flag enabled, or a cpuset with this flag | ||
487 | enabled is removed, the cpuset code builds a new such partition and | ||
488 | passes it to the scheduler sched domain setup code, to have the sched | ||
489 | domains rebuilt as necessary. | ||
490 | |||
491 | This partition exactly defines what sched domains the scheduler should | ||
492 | set up - one sched domain for each element (cpumask_t) in the partition. | ||
493 | |||
494 | The scheduler remembers the currently active sched domain partitions. | ||
495 | When the scheduler routine partition_sched_domains() is invoked from | ||
496 | the cpuset code to update these sched domains, it compares the new | ||
497 | partition requested with the current, and updates its sched domains, | ||
498 | removing the old and adding the new, for each change. | ||
381 | 499 | ||
382 | 1.8 How do I use cpusets ? | 500 | 1.8 How do I use cpusets ? |
383 | -------------------------- | 501 | -------------------------- |
@@ -469,7 +587,7 @@ than stress the kernel. | |||
469 | To start a new job that is to be contained within a cpuset, the steps are: | 587 | To start a new job that is to be contained within a cpuset, the steps are: |
470 | 588 | ||
471 | 1) mkdir /dev/cpuset | 589 | 1) mkdir /dev/cpuset |
472 | 2) mount -t cpuset none /dev/cpuset | 590 | 2) mount -t cgroup -ocpuset cpuset /dev/cpuset |
473 | 3) Create the new cpuset by doing mkdir's and write's (or echo's) in | 591 | 3) Create the new cpuset by doing mkdir's and write's (or echo's) in |
474 | the /dev/cpuset virtual file system. | 592 | the /dev/cpuset virtual file system. |
475 | 4) Start a task that will be the "founding father" of the new job. | 593 | 4) Start a task that will be the "founding father" of the new job. |
@@ -481,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset | |||
481 | named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, | 599 | named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, |
482 | and then start a subshell 'sh' in that cpuset: | 600 | and then start a subshell 'sh' in that cpuset: |
483 | 601 | ||
484 | mount -t cpuset none /dev/cpuset | 602 | mount -t cgroup -ocpuset cpuset /dev/cpuset |
485 | cd /dev/cpuset | 603 | cd /dev/cpuset |
486 | mkdir Charlie | 604 | mkdir Charlie |
487 | cd Charlie | 605 | cd Charlie |
@@ -513,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset | |||
513 | virtual filesystem. | 631 | virtual filesystem. |
514 | 632 | ||
515 | To mount it, type: | 633 | To mount it, type: |
516 | # mount -t cpuset none /dev/cpuset | 634 | # mount -t cgroup -o cpuset cpuset /dev/cpuset |
517 | 635 | ||
518 | Then under /dev/cpuset you can find a tree that corresponds to the | 636 | Then under /dev/cpuset you can find a tree that corresponds to the |
519 | tree of the cpusets in the system. For instance, /dev/cpuset | 637 | tree of the cpusets in the system. For instance, /dev/cpuset |
@@ -556,6 +674,18 @@ To remove a cpuset, just use rmdir: | |||
556 | This will fail if the cpuset is in use (has cpusets inside, or has | 674 | This will fail if the cpuset is in use (has cpusets inside, or has |
557 | processes attached). | 675 | processes attached). |
558 | 676 | ||
677 | Note that for legacy reasons, the "cpuset" filesystem exists as a | ||
678 | wrapper around the cgroup filesystem. | ||
679 | |||
680 | The command | ||
681 | |||
682 | mount -t cpuset X /dev/cpuset | ||
683 | |||
684 | is equivalent to | ||
685 | |||
686 | mount -t cgroup -ocpuset X /dev/cpuset | ||
687 | echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent | ||
688 | |||
559 | 2.2 Adding/removing cpus | 689 | 2.2 Adding/removing cpus |
560 | ------------------------ | 690 | ------------------------ |
561 | 691 | ||
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt new file mode 100644 index 000000000000..07edbd85c714 --- /dev/null +++ b/Documentation/device-mapper/dm-uevent.txt | |||
@@ -0,0 +1,97 @@ | |||
1 | The device-mapper uevent code adds the capability to device-mapper to create | ||
2 | and send kobject uevents (uevents). Previously device-mapper events were only | ||
3 | available through the ioctl interface. The advantage of the uevents interface | ||
4 | is the event contains environment attributes providing increased context for | ||
5 | the event avoiding the need to query the state of the device-mapper device after | ||
6 | the event is received. | ||
7 | |||
8 | There are two functions currently for device-mapper events. The first function | ||
9 | listed creates the event and the second function sends the event(s). | ||
10 | |||
11 | void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, | ||
12 | const char *path, unsigned nr_valid_paths) | ||
13 | |||
14 | void dm_send_uevents(struct list_head *events, struct kobject *kobj) | ||
15 | |||
16 | |||
17 | The variables added to the uevent environment are: | ||
18 | |||
19 | Variable Name: DM_TARGET | ||
20 | Uevent Action(s): KOBJ_CHANGE | ||
21 | Type: string | ||
22 | Description: | ||
23 | Value: Name of device-mapper target that generated the event. | ||
24 | |||
25 | Variable Name: DM_ACTION | ||
26 | Uevent Action(s): KOBJ_CHANGE | ||
27 | Type: string | ||
28 | Description: | ||
29 | Value: Device-mapper specific action that caused the uevent action. | ||
30 | PATH_FAILED - A path has failed. | ||
31 | PATH_REINSTATED - A path has been reinstated. | ||
32 | |||
33 | Variable Name: DM_SEQNUM | ||
34 | Uevent Action(s): KOBJ_CHANGE | ||
35 | Type: unsigned integer | ||
36 | Description: A sequence number for this specific device-mapper device. | ||
37 | Value: Valid unsigned integer range. | ||
38 | |||
39 | Variable Name: DM_PATH | ||
40 | Uevent Action(s): KOBJ_CHANGE | ||
41 | Type: string | ||
42 | Description: Major and minor number of the path device pertaining to this | ||
43 | event. | ||
44 | Value: Path name in the form of "Major:Minor" | ||
45 | |||
46 | Variable Name: DM_NR_VALID_PATHS | ||
47 | Uevent Action(s): KOBJ_CHANGE | ||
48 | Type: unsigned integer | ||
49 | Description: | ||
50 | Value: Valid unsigned integer range. | ||
51 | |||
52 | Variable Name: DM_NAME | ||
53 | Uevent Action(s): KOBJ_CHANGE | ||
54 | Type: string | ||
55 | Description: Name of the device-mapper device. | ||
56 | Value: Name | ||
57 | |||
58 | Variable Name: DM_UUID | ||
59 | Uevent Action(s): KOBJ_CHANGE | ||
60 | Type: string | ||
61 | Description: UUID of the device-mapper device. | ||
62 | Value: UUID. (Empty string if there isn't one.) | ||
63 | |||
64 | An example of the uevents generated as captured by udevmonitor is shown | ||
65 | below. | ||
66 | |||
67 | 1.) Path failure. | ||
68 | UEVENT[1192521009.711215] change@/block/dm-3 | ||
69 | ACTION=change | ||
70 | DEVPATH=/block/dm-3 | ||
71 | SUBSYSTEM=block | ||
72 | DM_TARGET=multipath | ||
73 | DM_ACTION=PATH_FAILED | ||
74 | DM_SEQNUM=1 | ||
75 | DM_PATH=8:32 | ||
76 | DM_NR_VALID_PATHS=0 | ||
77 | DM_NAME=mpath2 | ||
78 | DM_UUID=mpath-35333333000002328 | ||
79 | MINOR=3 | ||
80 | MAJOR=253 | ||
81 | SEQNUM=1130 | ||
82 | |||
83 | 2.) Path reinstate. | ||
84 | UEVENT[1192521132.989927] change@/block/dm-3 | ||
85 | ACTION=change | ||
86 | DEVPATH=/block/dm-3 | ||
87 | SUBSYSTEM=block | ||
88 | DM_TARGET=multipath | ||
89 | DM_ACTION=PATH_REINSTATED | ||
90 | DM_SEQNUM=2 | ||
91 | DM_PATH=8:32 | ||
92 | DM_NR_VALID_PATHS=1 | ||
93 | DM_NAME=mpath2 | ||
94 | DM_UUID=mpath-35333333000002328 | ||
95 | MINOR=3 | ||
96 | MAJOR=253 | ||
97 | SEQNUM=1131 | ||
diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 6c46730c631a..e6244cde26e9 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt | |||
@@ -2188,7 +2188,7 @@ Your cooperation is appreciated. | |||
2188 | 2188 | ||
2189 | 136-143 char Unix98 PTY slaves | 2189 | 136-143 char Unix98 PTY slaves |
2190 | 0 = /dev/pts/0 First Unix98 pseudo-TTY | 2190 | 0 = /dev/pts/0 First Unix98 pseudo-TTY |
2191 | 1 = /dev/pts/1 Second Unix98 pesudo-TTY | 2191 | 1 = /dev/pts/1 Second Unix98 pseudo-TTY |
2192 | ... | 2192 | ... |
2193 | 2193 | ||
2194 | These device nodes are automatically generated with | 2194 | These device nodes are automatically generated with |
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 8569072fa387..387b8a720f4a 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt | |||
@@ -32,7 +32,7 @@ braindamaged document, if it's finally working, well, it's working. | |||
32 | 32 | ||
33 | For one reason or another, low level drivers don't receive as much | 33 | For one reason or another, low level drivers don't receive as much |
34 | attention or testing as core code, and bugs on driver detach or | 34 | attention or testing as core code, and bugs on driver detach or |
35 | initilaization failure doesn't happen often enough to be noticeable. | 35 | initialization failure don't happen often enough to be noticeable. |
36 | Init failure path is worse because it's much less travelled while | 36 | Init failure path is worse because it's much less travelled while |
37 | needs to handle multiple entry points. | 37 | needs to handle multiple entry points. |
38 | 38 | ||
@@ -160,7 +160,7 @@ resources on failure. For example, | |||
160 | devres_release_group(dev, NULL); | 160 | devres_release_group(dev, NULL); |
161 | return err_code; | 161 | return err_code; |
162 | 162 | ||
163 | As resource acquision failure usually means probe failure, constructs | 163 | As resource acquisition failure usually means probe failure, constructs |
164 | like above are usually useful in midlayer driver (e.g. libata core | 164 | like above are usually useful in midlayer driver (e.g. libata core |
165 | layer) where interface function shouldn't have side effect on failure. | 165 | layer) where interface function shouldn't have side effect on failure. |
166 | For LLDs, just returning error code suffices in most cases. | 166 | For LLDs, just returning error code suffices in most cases. |
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt index 73cf9fb7cf60..63883a892120 100644 --- a/Documentation/fb/deferred_io.txt +++ b/Documentation/fb/deferred_io.txt | |||
@@ -3,7 +3,7 @@ Deferred IO | |||
3 | 3 | ||
4 | Deferred IO is a way to delay and repurpose IO. It uses host memory as a | 4 | Deferred IO is a way to delay and repurpose IO. It uses host memory as a |
5 | buffer and the MMU pagefault as a pretrigger for when to perform the device | 5 | buffer and the MMU pagefault as a pretrigger for when to perform the device |
6 | IO. The following example may be a useful explaination of how one such setup | 6 | IO. The following example may be a useful explanation of how one such setup |
7 | works: | 7 | works: |
8 | 8 | ||
9 | - userspace app like Xfbdev mmaps framebuffer | 9 | - userspace app like Xfbdev mmaps framebuffer |
@@ -28,7 +28,7 @@ a relatively more expensive operation. | |||
28 | 28 | ||
29 | For some types of nonvolatile high latency displays, the desired image is | 29 | For some types of nonvolatile high latency displays, the desired image is |
30 | the final image rather than the intermediate stages which is why it's okay | 30 | the final image rather than the intermediate stages which is why it's okay |
31 | to not update for each write that is occuring. | 31 | to not update for each write that is occurring. |
32 | 32 | ||
33 | It may be the case that this is useful in other scenarios as well. Paul Mundt | 33 | It may be the case that this is useful in other scenarios as well. Paul Mundt |
34 | has mentioned a case where it is beneficial to use the page count to decide | 34 | has mentioned a case where it is beneficial to use the page count to decide |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 280ec06573e6..6bb9be54ab76 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -14,18 +14,6 @@ Who: Jiri Slaby <jirislaby@gmail.com> | |||
14 | 14 | ||
15 | --------------------------- | 15 | --------------------------- |
16 | 16 | ||
17 | What: V4L2 VIDIOC_G_MPEGCOMP and VIDIOC_S_MPEGCOMP | ||
18 | When: October 2007 | ||
19 | Why: Broken attempt to set MPEG compression parameters. These ioctls are | ||
20 | not able to implement the wide variety of parameters that can be set | ||
21 | by hardware MPEG encoders. A new MPEG control mechanism was created | ||
22 | in kernel 2.6.18 that replaces these ioctls. See the V4L2 specification | ||
23 | (section 1.9: Extended controls) for more information on this topic. | ||
24 | Who: Hans Verkuil <hverkuil@xs4all.nl> and | ||
25 | Mauro Carvalho Chehab <mchehab@infradead.org> | ||
26 | |||
27 | --------------------------- | ||
28 | |||
29 | What: dev->power.power_state | 17 | What: dev->power.power_state |
30 | When: July 2007 | 18 | When: July 2007 |
31 | Why: Broken design for runtime control over driver power states, confusing | 19 | Why: Broken design for runtime control over driver power states, confusing |
@@ -49,10 +37,10 @@ Who: David Miller <davem@davemloft.net> | |||
49 | --------------------------- | 37 | --------------------------- |
50 | 38 | ||
51 | What: Video4Linux API 1 ioctls and video_decoder.h from Video devices. | 39 | What: Video4Linux API 1 ioctls and video_decoder.h from Video devices. |
52 | When: December 2006 | 40 | When: December 2008 |
53 | Files: include/linux/video_decoder.h | 41 | Files: include/linux/video_decoder.h include/linux/videodev.h |
54 | Check: include/linux/video_decoder.h | 42 | Check: include/linux/video_decoder.h include/linux/videodev.h |
55 | Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6 | 43 | Why: V4L1 AP1 was replaced by V4L2 API during migration from 2.4 to 2.6 |
56 | series. The old API have lots of drawbacks and don't provide enough | 44 | series. The old API have lots of drawbacks and don't provide enough |
57 | means to work with all video and audio standards. The newer API is | 45 | means to work with all video and audio standards. The newer API is |
58 | already available on the main drivers and should be used instead. | 46 | already available on the main drivers and should be used instead. |
@@ -61,7 +49,9 @@ Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6 | |||
61 | Decoder iocts are using internally to allow video drivers to | 49 | Decoder iocts are using internally to allow video drivers to |
62 | communicate with video decoders. This should also be improved to allow | 50 | communicate with video decoders. This should also be improved to allow |
63 | V4L2 calls being translated into compatible internal ioctls. | 51 | V4L2 calls being translated into compatible internal ioctls. |
64 | Who: Mauro Carvalho Chehab <mchehab@brturbo.com.br> | 52 | Compatibility ioctls will be provided, for a while, via |
53 | v4l1-compat module. | ||
54 | Who: Mauro Carvalho Chehab <mchehab@infradead.org> | ||
65 | 55 | ||
66 | --------------------------- | 56 | --------------------------- |
67 | 57 | ||
@@ -82,6 +72,41 @@ Who: Dominik Brodowski <linux@brodo.de> | |||
82 | 72 | ||
83 | --------------------------- | 73 | --------------------------- |
84 | 74 | ||
75 | What: sys_sysctl | ||
76 | When: September 2010 | ||
77 | Option: CONFIG_SYSCTL_SYSCALL | ||
78 | Why: The same information is available in a more convenient form from | ||
79 | /proc/sys, and none of the sysctl variables appear to be | ||
80 | important performance wise. | ||
81 | |||
82 | Binary sysctls are a long standing source of subtle kernel | ||
83 | bugs and security issues. | ||
84 | |||
85 | When I looked several months ago all I could find after | ||
86 | searching several distributions were 5 user space programs and | ||
87 | glibc (which falls back to /proc/sys) using this syscall. | ||
88 | |||
89 | The man page for sysctl(2) documents it as unusable for user | ||
90 | space programs. | ||
91 | |||
92 | sysctl(2) is not generally ABI compatible to a 32bit user | ||
93 | space application on a 64bit and a 32bit kernel. | ||
94 | |||
95 | For the last several months the policy has been no new binary | ||
96 | sysctls and no one has put forward an argument to use them. | ||
97 | |||
98 | Binary sysctl issues seem to keep appearing so | ||
99 | properly deprecating them (with a warning to user space) and a | ||
100 | 2 year grace warning period will mean eventually we can kill | ||
101 | them and end the pain. | ||
102 | |||
103 | In the mean time individual binary sysctls can be dealt with | ||
104 | in a piecewise fashion. | ||
105 | |||
106 | Who: Eric Biederman <ebiederm@xmission.com> | ||
107 | |||
108 | --------------------------- | ||
109 | |||
85 | What: a.out interpreter support for ELF executables | 110 | What: a.out interpreter support for ELF executables |
86 | When: 2.6.25 | 111 | When: 2.6.25 |
87 | Files: fs/binfmt_elf.c | 112 | Files: fs/binfmt_elf.c |
@@ -184,13 +209,6 @@ Who: Jean Delvare <khali@linux-fr.org>, | |||
184 | 209 | ||
185 | --------------------------- | 210 | --------------------------- |
186 | 211 | ||
187 | What: drivers depending on OBSOLETE_OSS | ||
188 | When: options in 2.6.22, code in 2.6.24 | ||
189 | Why: OSS drivers with ALSA replacements | ||
190 | Who: Adrian Bunk <bunk@stusta.de> | ||
191 | |||
192 | --------------------------- | ||
193 | |||
194 | What: ACPI procfs interface | 212 | What: ACPI procfs interface |
195 | When: July 2008 | 213 | When: July 2008 |
196 | Why: ACPI sysfs conversion should be finished by January 2008. | 214 | Why: ACPI sysfs conversion should be finished by January 2008. |
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index d6fd6c6e4244..bf8080640eba 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt | |||
@@ -42,10 +42,12 @@ OPTIONS | |||
42 | 42 | ||
43 | trans=name select an alternative transport. Valid options are | 43 | trans=name select an alternative transport. Valid options are |
44 | currently: | 44 | currently: |
45 | unix - specifying a named pipe mount point | 45 | unix - specifying a named pipe mount point |
46 | tcp - specifying a normal TCP/IP connection | 46 | tcp - specifying a normal TCP/IP connection |
47 | fd - used passed file descriptors for connection | 47 | fd - used passed file descriptors for connection |
48 | (see rfdno and wfdno) | 48 | (see rfdno and wfdno) |
49 | virtio - connect to the next virtio channel available | ||
50 | (from lguest or KVM with trans_virtio module) | ||
49 | 51 | ||
50 | uname=name user name to attempt mount as on the remote server. The | 52 | uname=name user name to attempt mount as on the remote server. The |
51 | server may override or ignore this value. Certain user | 53 | server may override or ignore this value. Certain user |
@@ -54,7 +56,7 @@ OPTIONS | |||
54 | aname=name aname specifies the file tree to access when the server is | 56 | aname=name aname specifies the file tree to access when the server is |
55 | offering several exported file systems. | 57 | offering several exported file systems. |
56 | 58 | ||
57 | cache=mode specifies a cacheing policy. By default, no caches are used. | 59 | cache=mode specifies a caching policy. By default, no caches are used. |
58 | loose = no attempts are made at consistency, | 60 | loose = no attempts are made at consistency, |
59 | intended for exclusive, read-only mounts | 61 | intended for exclusive, read-only mounts |
60 | 62 | ||
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting index 31047e0fe14b..87019d2b5981 100644 --- a/Documentation/filesystems/Exporting +++ b/Documentation/filesystems/Exporting | |||
@@ -2,9 +2,12 @@ | |||
2 | Making Filesystems Exportable | 2 | Making Filesystems Exportable |
3 | ============================= | 3 | ============================= |
4 | 4 | ||
5 | Most filesystem operations require a dentry (or two) as a starting | 5 | Overview |
6 | -------- | ||
7 | |||
8 | All filesystem operations require a dentry (or two) as a starting | ||
6 | point. Local applications have a reference-counted hold on suitable | 9 | point. Local applications have a reference-counted hold on suitable |
7 | dentrys via open file descriptors or cwd/root. However remote | 10 | dentries via open file descriptors or cwd/root. However remote |
8 | applications that access a filesystem via a remote filesystem protocol | 11 | applications that access a filesystem via a remote filesystem protocol |
9 | such as NFS may not be able to hold such a reference, and so need a | 12 | such as NFS may not be able to hold such a reference, and so need a |
10 | different way to refer to a particular dentry. As the alternative | 13 | different way to refer to a particular dentry. As the alternative |
@@ -13,14 +16,14 @@ server-reboot (among other things, though these tend to be the most | |||
13 | problematic), there is no simple answer like 'filename'. | 16 | problematic), there is no simple answer like 'filename'. |
14 | 17 | ||
15 | The mechanism discussed here allows each filesystem implementation to | 18 | The mechanism discussed here allows each filesystem implementation to |
16 | specify how to generate an opaque (out side of the filesystem) byte | 19 | specify how to generate an opaque (outside of the filesystem) byte |
17 | string for any dentry, and how to find an appropriate dentry for any | 20 | string for any dentry, and how to find an appropriate dentry for any |
18 | given opaque byte string. | 21 | given opaque byte string. |
19 | This byte string will be called a "filehandle fragment" as it | 22 | This byte string will be called a "filehandle fragment" as it |
20 | corresponds to part of an NFS filehandle. | 23 | corresponds to part of an NFS filehandle. |
21 | 24 | ||
22 | A filesystem which supports the mapping between filehandle fragments | 25 | A filesystem which supports the mapping between filehandle fragments |
23 | and dentrys will be termed "exportable". | 26 | and dentries will be termed "exportable". |
24 | 27 | ||
25 | 28 | ||
26 | 29 | ||
@@ -89,11 +92,9 @@ For a filesystem to be exportable it must: | |||
89 | 1/ provide the filehandle fragment routines described below. | 92 | 1/ provide the filehandle fragment routines described below. |
90 | 2/ make sure that d_splice_alias is used rather than d_add | 93 | 2/ make sure that d_splice_alias is used rather than d_add |
91 | when ->lookup finds an inode for a given parent and name. | 94 | when ->lookup finds an inode for a given parent and name. |
92 | Typically the ->lookup routine will end: | 95 | Typically the ->lookup routine will end with a: |
93 | if (inode) | 96 | |
94 | return d_splice(inode, dentry); | 97 | return d_splice_alias(inode, dentry); |
95 | d_add(dentry, inode); | ||
96 | return NULL; | ||
97 | } | 98 | } |
98 | 99 | ||
99 | 100 | ||
@@ -101,67 +102,39 @@ For a filesystem to be exportable it must: | |||
101 | A file system implementation declares that instances of the filesystem | 102 | A file system implementation declares that instances of the filesystem |
102 | are exportable by setting the s_export_op field in the struct | 103 | are exportable by setting the s_export_op field in the struct |
103 | super_block. This field must point to a "struct export_operations" | 104 | super_block. This field must point to a "struct export_operations" |
104 | struct which could potentially be full of NULLs, though normally at | 105 | struct which has the following members: |
105 | least get_parent will be set. | 106 | |
106 | 107 | encode_fh (optional) | |
107 | The primary operations are decode_fh and encode_fh. | 108 | Takes a dentry and creates a filehandle fragment which can later be used |
108 | decode_fh takes a filehandle fragment and tries to find or create a | 109 | to find or create a dentry for the same object. The default |
109 | dentry for the object referred to by the filehandle. | 110 | implementation creates a filehandle fragment that encodes a 32bit inode |
110 | encode_fh takes a dentry and creates a filehandle fragment which can | 111 | and generation number for the inode encoded, and if necessary the |
111 | later be used to find/create a dentry for the same object. | 112 | same information for the parent. |
112 | 113 | ||
113 | decode_fh will probably make use of "find_exported_dentry". | 114 | fh_to_dentry (mandatory) |
114 | This function lives in the "exportfs" module which a filesystem does | 115 | Given a filehandle fragment, this should find the implied object and |
115 | not need unless it is being exported. So rather that calling | 116 | create a dentry for it (possibly with d_alloc_anon). |
116 | find_exported_dentry directly, each filesystem should call it through | 117 | |
117 | the find_exported_dentry pointer in it's export_operations table. | 118 | fh_to_parent (optional but strongly recommended) |
118 | This field is set correctly by the exporting agent (e.g. nfsd) when a | 119 | Given a filehandle fragment, this should find the parent of the |
119 | filesystem is exported, and before any export operations are called. | 120 | implied object and create a dentry for it (possibly with d_alloc_anon). |
120 | 121 | May fail if the filehandle fragment is too small. | |
121 | find_exported_dentry needs three support functions from the | 122 | |
122 | filesystem: | 123 | get_parent (optional but strongly recommended) |
123 | get_name. When given a parent dentry and a child dentry, this | 124 | When given a dentry for a directory, this should return a dentry for |
124 | should find a name in the directory identified by the parent | 125 | the parent. Quite possibly the parent dentry will have been allocated |
125 | dentry, which leads to the object identified by the child dentry. | 126 | by d_alloc_anon. The default get_parent function just returns an error |
126 | If no get_name function is supplied, a default implementation is | 127 | so any filehandle lookup that requires finding a parent will fail. |
127 | provided which uses vfs_readdir to find potential names, and | 128 | ->lookup("..") is *not* used as a default as it can leave ".." entries |
128 | matches inode numbers to find the correct match. | 129 | in the dcache which are too messy to work with. |
129 | 130 | ||
130 | get_parent. When given a dentry for a directory, this should return | 131 | get_name (optional) |
131 | a dentry for the parent. Quite possibly the parent dentry will | 132 | When given a parent dentry and a child dentry, this should find a name |
132 | have been allocated by d_alloc_anon. | 133 | in the directory identified by the parent dentry, which leads to the |
133 | The default get_parent function just returns an error so any | 134 | object identified by the child dentry. If no get_name function is |
134 | filehandle lookup that requires finding a parent will fail. | 135 | supplied, a default implementation is provided which uses vfs_readdir |
135 | ->lookup("..") is *not* used as a default as it can leave ".." | 136 | to find potential names, and matches inode numbers to find the correct |
136 | entries in the dcache which are too messy to work with. | 137 | match. |
137 | |||
138 | get_dentry. When given an opaque datum, this should find the | ||
139 | implied object and create a dentry for it (possibly with | ||
140 | d_alloc_anon). | ||
141 | The opaque datum is whatever is passed down by the decode_fh | ||
142 | function, and is often simply a fragment of the filehandle | ||
143 | fragment. | ||
144 | decode_fh passes two datums through find_exported_dentry. One that | ||
145 | should be used to identify the target object, and one that can be | ||
146 | used to identify the object's parent, should that be necessary. | ||
147 | The default get_dentry function assumes that the datum contains an | ||
148 | inode number and a generation number, and it attempts to get the | ||
149 | inode using "iget" and check it's validity by matching the | ||
150 | generation number. A filesystem should only depend on the default | ||
151 | if iget can safely be used this way. | ||
152 | |||
153 | If decode_fh and/or encode_fh are left as NULL, then default | ||
154 | implementations are used. These defaults are suitable for ext2 and | ||
155 | extremely similar filesystems (like ext3). | ||
156 | |||
157 | The default encode_fh creates a filehandle fragment from the inode | ||
158 | number and generation number of the target together with the inode | ||
159 | number and generation number of the parent (if the parent is | ||
160 | required). | ||
161 | |||
162 | The default decode_fh extract the target and parent datums from the | ||
163 | filehandle assuming the format used by the default encode_fh and | ||
164 | passed them to find_exported_dentry. | ||
165 | 138 | ||
166 | 139 | ||
167 | A filehandle fragment consists of an array of 1 or more 4byte words, | 140 | A filehandle fragment consists of an array of 1 or more 4byte words, |
@@ -172,5 +145,3 @@ generated by encode_fh, in which case it will have been padded with | |||
172 | nuls. Rather, the encode_fh routine should choose a "type" which | 145 | nuls. Rather, the encode_fh routine should choose a "type" which |
173 | indicates the decode_fh how much of the filehandle is valid, and how | 146 | indicates the decode_fh how much of the filehandle is valid, and how |
174 | it should be interpreted. | 147 | it should be interpreted. |
175 | |||
176 | |||
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fe26cc978523..37c10cba7177 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -224,7 +224,7 @@ against the page the filesystem should redirty the page with | |||
224 | redirty_page_for_writepage(), then unlock the page and return zero. | 224 | redirty_page_for_writepage(), then unlock the page and return zero. |
225 | This may also be done to avoid internal deadlocks, but rarely. | 225 | This may also be done to avoid internal deadlocks, but rarely. |
226 | 226 | ||
227 | If the filesytem is called for sync then it must wait on any | 227 | If the filesystem is called for sync then it must wait on any |
228 | in-progress I/O and then start new I/O. | 228 | in-progress I/O and then start new I/O. |
229 | 229 | ||
230 | The filesystem should unlock the page synchronously, before returning to the | 230 | The filesystem should unlock the page synchronously, before returning to the |
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 4aecc9bdb273..b45f3c1b8b43 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt | |||
@@ -130,12 +130,12 @@ Device layer. | |||
130 | 130 | ||
131 | Journaling Block Device layer | 131 | Journaling Block Device layer |
132 | ----------------------------- | 132 | ----------------------------- |
133 | The Journaling Block Device layer (JBD) isn't ext3 specific. It was design to | 133 | The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed |
134 | add journaling capabilities on a block device. The ext3 filesystem code will | 134 | to add journaling capabilities to a block device. The ext3 filesystem code |
135 | inform the JBD of modifications it is performing (called a transaction). The | 135 | will inform the JBD of modifications it is performing (called a transaction). |
136 | journal supports the transactions start and stop, and in case of crash, the | 136 | The journal supports the transactions start and stop, and in case of a crash, |
137 | journal can replayed the transactions to put the partition back in a | 137 | the journal can replay the transactions to quickly put the partition back into |
138 | consistent state fast. | 138 | a consistent state. |
139 | 139 | ||
140 | Handles represent a single atomic update to a filesystem. JBD can handle an | 140 | Handles represent a single atomic update to a filesystem. JBD can handle an |
141 | external journal on a block device. | 141 | external journal on a block device. |
@@ -164,7 +164,7 @@ written to the journal first, and then to its final location. | |||
164 | In the event of a crash, the journal can be replayed, bringing both data and | 164 | In the event of a crash, the journal can be replayed, bringing both data and |
165 | metadata into a consistent state. This mode is the slowest except when data | 165 | metadata into a consistent state. This mode is the slowest except when data |
166 | needs to be read from and written to disk at the same time where it | 166 | needs to be read from and written to disk at the same time where it |
167 | outperforms all others modes. | 167 | outperforms all other modes. |
168 | 168 | ||
169 | Compatibility | 169 | Compatibility |
170 | ------------- | 170 | ------------- |
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt index 133e213ebb72..bb0142f61084 100644 --- a/Documentation/filesystems/files.txt +++ b/Documentation/filesystems/files.txt | |||
@@ -76,13 +76,13 @@ the fdtable structure - | |||
76 | 5. Handling of the file structures is special. Since the look-up | 76 | 5. Handling of the file structures is special. Since the look-up |
77 | of the fd (fget()/fget_light()) are lock-free, it is possible | 77 | of the fd (fget()/fget_light()) are lock-free, it is possible |
78 | that look-up may race with the last put() operation on the | 78 | that look-up may race with the last put() operation on the |
79 | file structure. This is avoided using the rcuref APIs | 79 | file structure. This is avoided using atomic_inc_not_zero() |
80 | on ->f_count : | 80 | on ->f_count : |
81 | 81 | ||
82 | rcu_read_lock(); | 82 | rcu_read_lock(); |
83 | file = fcheck_files(files, fd); | 83 | file = fcheck_files(files, fd); |
84 | if (file) { | 84 | if (file) { |
85 | if (rcuref_inc_lf(&file->f_count)) | 85 | if (atomic_inc_not_zero(&file->f_count)) |
86 | *fput_needed = 1; | 86 | *fput_needed = 1; |
87 | else | 87 | else |
88 | /* Didn't get the reference, someone's freed */ | 88 | /* Didn't get the reference, someone's freed */ |
@@ -92,7 +92,7 @@ the fdtable structure - | |||
92 | .... | 92 | .... |
93 | return file; | 93 | return file; |
94 | 94 | ||
95 | rcuref_inc_lf() detects if refcounts is already zero or | 95 | atomic_inc_not_zero() detects if the refcount is already zero or |
96 | goes to zero during increment. If it does, we fail | 96 | goes to zero during increment. If it does, we fail |
97 | fget()/fget_light(). | 97 | fget()/fget_light(). |
98 | 98 | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e5c1df52a876..dec99455321f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -813,9 +813,9 @@ Various pieces of information about kernel activity are available in the | |||
813 | since the system first booted. For a quick look, simply cat the file: | 813 | since the system first booted. For a quick look, simply cat the file: |
814 | 814 | ||
815 | > cat /proc/stat | 815 | > cat /proc/stat |
816 | cpu 2255 34 2290 22625563 6290 127 456 | 816 | cpu 2255 34 2290 22625563 6290 127 456 0 |
817 | cpu0 1132 34 1441 11311718 3675 127 438 | 817 | cpu0 1132 34 1441 11311718 3675 127 438 0 |
818 | cpu1 1123 0 849 11313845 2614 0 18 | 818 | cpu1 1123 0 849 11313845 2614 0 18 0 |
819 | intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] | 819 | intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] |
820 | ctxt 1990473 | 820 | ctxt 1990473 |
821 | btime 1062191376 | 821 | btime 1062191376 |
@@ -835,6 +835,7 @@ second). The meanings of the columns are as follows, from left to right: | |||
835 | - iowait: waiting for I/O to complete | 835 | - iowait: waiting for I/O to complete |
836 | - irq: servicing interrupts | 836 | - irq: servicing interrupts |
837 | - softirq: servicing softirqs | 837 | - softirq: servicing softirqs |
838 | - steal: involuntary wait | ||
838 | 839 | ||
839 | The "intr" line gives counts of interrupts serviced since boot time, for each | 840 | The "intr" line gives counts of interrupts serviced since boot time, for each |
840 | of the possible system interrupts. The first column is the total of all | 841 | of the possible system interrupts. The first column is the total of all |
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 4b5ca26e5048..4598ef7b622b 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt | |||
@@ -51,7 +51,7 @@ for the attributes, providing a means to read and write kernel | |||
51 | attributes. | 51 | attributes. |
52 | 52 | ||
53 | Attributes should be ASCII text files, preferably with only one value | 53 | Attributes should be ASCII text files, preferably with only one value |
54 | per file. It is noted that it may not be efficient to contain only | 54 | per file. It is noted that it may not be efficient to contain only one |
55 | value per file, so it is socially acceptable to express an array of | 55 | value per file, so it is socially acceptable to express an array of |
56 | values of the same type. | 56 | values of the same type. |
57 | 57 | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 6f8e16e3d6c0..9d019d35728f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -706,7 +706,7 @@ struct address_space_operations { | |||
706 | wants to make it a free page. If ->releasepage succeeds, the | 706 | wants to make it a free page. If ->releasepage succeeds, the |
707 | page will be removed from the address_space and become free. | 707 | page will be removed from the address_space and become free. |
708 | 708 | ||
709 | The second case if when a request has been made to invalidate | 709 | The second case is when a request has been made to invalidate |
710 | some or all pages in an address_space. This can happen | 710 | some or all pages in an address_space. This can happen |
711 | through the fadvise(POSIX_FADV_DONTNEED) system call or by the | 711 | through the fadvise(POSIX_FADV_DONTNEED) system call or by the |
712 | filesystem explicitly requesting it as nfs and 9fs do (when | 712 | filesystem explicitly requesting it as nfs and 9fs do (when |
diff --git a/Documentation/i2c/i2c-protocol b/Documentation/i2c/i2c-protocol index 579b92d5f3a3..10518dd58814 100644 --- a/Documentation/i2c/i2c-protocol +++ b/Documentation/i2c/i2c-protocol | |||
@@ -68,7 +68,7 @@ We have found some I2C devices that needs the following modifications: | |||
68 | 68 | ||
69 | Flags I2C_M_IGNORE_NAK | 69 | Flags I2C_M_IGNORE_NAK |
70 | Normally message is interrupted immediately if there is [NA] from the | 70 | Normally message is interrupted immediately if there is [NA] from the |
71 | client. Setting this flag treats any [NA] as [A], and all of | 71 | client. Setting this flag treats any [NA] as [A], and all of |
72 | message is sent. | 72 | message is sent. |
73 | These messages may still fail to SCL lo->hi timeout. | 73 | These messages may still fail to SCL lo->hi timeout. |
74 | 74 | ||
diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt index 35985b34d5a6..2f75e750e4f5 100644 --- a/Documentation/i386/boot.txt +++ b/Documentation/i386/boot.txt | |||
@@ -168,6 +168,8 @@ Offset Proto Name Meaning | |||
168 | 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not | 168 | 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not |
169 | 0235/3 N/A pad2 Unused | 169 | 0235/3 N/A pad2 Unused |
170 | 0238/4 2.06+ cmdline_size Maximum size of the kernel command line | 170 | 0238/4 2.06+ cmdline_size Maximum size of the kernel command line |
171 | 023C/4 2.07+ hardware_subarch Hardware subarchitecture | ||
172 | 0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data | ||
171 | 173 | ||
172 | (1) For backwards compatibility, if the setup_sects field contains 0, the | 174 | (1) For backwards compatibility, if the setup_sects field contains 0, the |
173 | real value is 4. | 175 | real value is 4. |
@@ -204,7 +206,7 @@ boot loaders can ignore those fields. | |||
204 | 206 | ||
205 | The byte order of all fields is little-endian (this is x86, after all.) | 207 | The byte order of all fields is little-endian (this is x86, after all.) |
206 | 208 | ||
207 | Field name: setup_secs | 209 | Field name: setup_sects |
208 | Type: read | 210 | Type: read |
209 | Offset/size: 0x1f1/1 | 211 | Offset/size: 0x1f1/1 |
210 | Protocol: ALL | 212 | Protocol: ALL |
@@ -356,6 +358,13 @@ Protocol: 2.00+ | |||
356 | - If 0, the protected-mode code is loaded at 0x10000. | 358 | - If 0, the protected-mode code is loaded at 0x10000. |
357 | - If 1, the protected-mode code is loaded at 0x100000. | 359 | - If 1, the protected-mode code is loaded at 0x100000. |
358 | 360 | ||
361 | Bit 6 (write): KEEP_SEGMENTS | ||
362 | Protocol: 2.07+ | ||
363 | - if 0, reload the segment registers in the 32bit entry point. | ||
364 | - if 1, do not reload the segment registers in the 32bit entry point. | ||
365 | Assume that %cs %ds %ss %es are all set to flat segments with | ||
366 | a base of 0 (or the equivalent for their environment). | ||
367 | |||
359 | Bit 7 (write): CAN_USE_HEAP | 368 | Bit 7 (write): CAN_USE_HEAP |
360 | Set this bit to 1 to indicate that the value entered in the | 369 | Set this bit to 1 to indicate that the value entered in the |
361 | heap_end_ptr is valid. If this field is clear, some setup code | 370 | heap_end_ptr is valid. If this field is clear, some setup code |
@@ -480,6 +489,29 @@ Protocol: 2.06+ | |||
480 | cmdline_size characters. With protocol version 2.05 and earlier, the | 489 | cmdline_size characters. With protocol version 2.05 and earlier, the |
481 | maximum size was 255. | 490 | maximum size was 255. |
482 | 491 | ||
492 | Field name: hardware_subarch | ||
493 | Type: write | ||
494 | Offset/size: 0x23c/4 | ||
495 | Protocol: 2.07+ | ||
496 | |||
497 | In a paravirtualized environment the hardware low level architectural | ||
498 | pieces such as interrupt handling, page table handling, and | ||
499 | accessing process control registers need to be done differently. | ||
500 | |||
501 | This field allows the bootloader to inform the kernel we are in one | ||
502 | of those environments. | ||
503 | |||
504 | 0x00000000 The default x86/PC environment | ||
505 | 0x00000001 lguest | ||
506 | 0x00000002 Xen | ||
507 | |||
508 | Field name: hardware_subarch_data | ||
509 | Type: write | ||
510 | Offset/size: 0x240/8 | ||
511 | Protocol: 2.07+ | ||
512 | |||
513 | A pointer to data that is specific to the hardware subarch. | ||
514 | |||
483 | 515 | ||
484 | **** THE KERNEL COMMAND LINE | 516 | **** THE KERNEL COMMAND LINE |
485 | 517 | ||
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt index 6449a7090dbb..223e4f0582d0 100644 --- a/Documentation/ia64/err_inject.txt +++ b/Documentation/ia64/err_inject.txt | |||
@@ -21,10 +21,10 @@ software test suits to do stressful testing on IPF. | |||
21 | 21 | ||
22 | Below is a sample application as part of the whole tool. The sample | 22 | Below is a sample application as part of the whole tool. The sample |
23 | can be used as a working test tool. Or it can be expanded to include | 23 | can be used as a working test tool. Or it can be expanded to include |
24 | more features. It also can be a integrated into a libary or other user | 24 | more features. It also can be integrated into a library or other user |
25 | application to have more thorough test. | 25 | application to have more thorough test. |
26 | 26 | ||
27 | The sample application takes err.conf as error configuation input. Gcc | 27 | The sample application takes err.conf as error configuration input. GCC |
28 | compiles the code. After you install err_inject driver, you can run | 28 | compiles the code. After you install err_inject driver, you can run |
29 | this sample application to inject errors. | 29 | this sample application to inject errors. |
30 | 30 | ||
@@ -809,7 +809,7 @@ int err_inj() | |||
809 | } | 809 | } |
810 | 810 | ||
811 | /* Create semaphore: If one_lock, one semaphore for all processors. | 811 | /* Create semaphore: If one_lock, one semaphore for all processors. |
812 | Otherwise, one sempaphore for each processor. */ | 812 | Otherwise, one semaphore for each processor. */ |
813 | if (one_lock) { | 813 | if (one_lock) { |
814 | if (create_sem(0)) { | 814 | if (create_sem(0)) { |
815 | printf("Can not create semaphore...exit\n"); | 815 | printf("Can not create semaphore...exit\n"); |
diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt index ab050621e20f..f3a3ba8847ba 100644 --- a/Documentation/input/atarikbd.txt +++ b/Documentation/input/atarikbd.txt | |||
@@ -170,7 +170,7 @@ major controller faults (ROM checksum and RAM test) and such things as stuck | |||
170 | keys. Any keys down at power-up are presumed to be stuck, and their BREAK | 170 | keys. Any keys down at power-up are presumed to be stuck, and their BREAK |
171 | (sic) code is returned (which without the preceding MAKE code is a flag for a | 171 | (sic) code is returned (which without the preceding MAKE code is a flag for a |
172 | keyboard error). If the controller self-test completes without error, the code | 172 | keyboard error). If the controller self-test completes without error, the code |
173 | 0xF0 is returned. (This code will be used to indicate the version/rlease of | 173 | 0xF0 is returned. (This code will be used to indicate the version/release of |
174 | the ikbd controller. The first release of the ikbd is version 0xF0, should | 174 | the ikbd controller. The first release of the ikbd is version 0xF0, should |
175 | there be a second release it will be 0xF1, and so on.) | 175 | there be a second release it will be 0xF1, and so on.) |
176 | The ikbd defaults to a mouse position reporting with threshold of 1 unit in | 176 | The ikbd defaults to a mouse position reporting with threshold of 1 unit in |
@@ -413,7 +413,7 @@ INTERROGATION MODE. | |||
413 | %nnnnmmmm ; where m is JOYSTICK1 state | 413 | %nnnnmmmm ; where m is JOYSTICK1 state |
414 | ; and n is JOYSTICK0 state | 414 | ; and n is JOYSTICK0 state |
415 | 415 | ||
416 | Sets the ikbd to do nothing but monitor the serial command lne, maintain the | 416 | Sets the ikbd to do nothing but monitor the serial command line, maintain the |
417 | time-of-day clock, and monitor the joystick. The rate sets the interval | 417 | time-of-day clock, and monitor the joystick. The rate sets the interval |
418 | between joystick samples. | 418 | between joystick samples. |
419 | N.B. The user should not set the rate higher than the serial communications | 419 | N.B. The user should not set the rate higher than the serial communications |
@@ -446,10 +446,10 @@ The sample interval should be as constant as possible. | |||
446 | ; until vertical cursor key is generated before RY | 446 | ; until vertical cursor key is generated before RY |
447 | ; has elapsed | 447 | ; has elapsed |
448 | VX ; length (in tenths of seconds) of joystick closure | 448 | VX ; length (in tenths of seconds) of joystick closure |
449 | ; until horizontal cursor keystokes are generated | 449 | ; until horizontal cursor keystrokes are generated |
450 | ; after RX has elapsed | 450 | ; after RX has elapsed |
451 | VY ; length (in tenths of seconds) of joystick closure | 451 | VY ; length (in tenths of seconds) of joystick closure |
452 | ; until vertical cursor keystokes are generated | 452 | ; until vertical cursor keystrokes are generated |
453 | ; after RY has elapsed | 453 | ; after RY has elapsed |
454 | 454 | ||
455 | In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes. | 455 | In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes. |
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt index 085eb15b45b7..ded4d5f53109 100644 --- a/Documentation/input/ff.txt +++ b/Documentation/input/ff.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | Force feedback for Linux. | 1 | Force feedback for Linux. |
2 | By Johann Deneux <deneux@ifrance.com> on 2001/04/22. | 2 | By Johann Deneux <johann.deneux@gmail.com> on 2001/04/22. |
3 | Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09. | 3 | Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09. |
4 | You may redistribute this file. Please remember to include shape.fig and | 4 | You may redistribute this file. Please remember to include shape.fig and |
5 | interactive.fig as well. | 5 | interactive.fig as well. |
diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt index 8777d2d321e3..3ac92413c874 100644 --- a/Documentation/input/iforce-protocol.txt +++ b/Documentation/input/iforce-protocol.txt | |||
@@ -4,10 +4,10 @@ specify force effects to I-Force 2.0 devices. None of this information comes | |||
4 | from Immersion. That's why you should not trust what is written in this | 4 | from Immersion. That's why you should not trust what is written in this |
5 | document. This document is intended to help in understanding the protocol. | 5 | document. This document is intended to help in understanding the protocol. |
6 | This is not a reference. Comments and corrections are welcome. To contact me, | 6 | This is not a reference. Comments and corrections are welcome. To contact me, |
7 | send an email to: deneux@ifrance.com | 7 | send an email to: johann.deneux@gmail.com |
8 | 8 | ||
9 | ** WARNING ** | 9 | ** WARNING ** |
10 | I may not be held responsible for any dammage or harm caused if you try to | 10 | I shall not be held responsible for any damage or harm caused if you try to |
11 | send data to your I-Force device based on what you read in this document. | 11 | send data to your I-Force device based on what you read in this document. |
12 | 12 | ||
13 | ** Preliminary Notes: | 13 | ** Preliminary Notes: |
@@ -151,13 +151,13 @@ OP= ff | |||
151 | Query command. Length varies according to the query type. | 151 | Query command. Length varies according to the query type. |
152 | The general format of this packet is: | 152 | The general format of this packet is: |
153 | ff 01 QUERY [INDEX] CHECKSUM | 153 | ff 01 QUERY [INDEX] CHECKSUM |
154 | reponses are of the same form: | 154 | responses are of the same form: |
155 | FF LEN QUERY VALUE_QUERIED CHECKSUM2 | 155 | FF LEN QUERY VALUE_QUERIED CHECKSUM2 |
156 | where LEN = 1 + length(VALUE_QUERIED) | 156 | where LEN = 1 + length(VALUE_QUERIED) |
157 | 157 | ||
158 | **** Query ram size **** | 158 | **** Query ram size **** |
159 | QUERY = 42 ('B'uffer size) | 159 | QUERY = 42 ('B'uffer size) |
160 | The device should reply with the same packet plus two additionnal bytes | 160 | The device should reply with the same packet plus two additional bytes |
161 | containing the size of the memory: | 161 | containing the size of the memory: |
162 | ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available. | 162 | ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available. |
163 | 163 | ||
@@ -234,19 +234,23 @@ is the amount of memory apparently needed for every set of parameters: | |||
234 | 234 | ||
235 | ** Appendix: How to study the protocol ? ** | 235 | ** Appendix: How to study the protocol ? ** |
236 | 236 | ||
237 | 1. Generate effects using the force editor provided with the DirectX SDK, or use Immersion Studio (freely available at their web site in the developer section: www.immersion.com) | 237 | 1. Generate effects using the force editor provided with the DirectX SDK, or |
238 | 2. Start a soft spying RS232 or USB (depending on where you connected your joystick/wheel). I used ComPortSpy from fCoder (alpha version!) | 238 | use Immersion Studio (freely available at their web site in the developer section: |
239 | www.immersion.com) | ||
240 | 2. Start a soft spying RS232 or USB (depending on where you connected your | ||
241 | joystick/wheel). I used ComPortSpy from fCoder (alpha version!) | ||
239 | 3. Play the effect, and watch what happens on the spy screen. | 242 | 3. Play the effect, and watch what happens on the spy screen. |
240 | 243 | ||
241 | A few words about ComPortSpy: | 244 | A few words about ComPortSpy: |
242 | At first glance, this soft seems, hum, well... buggy. In fact, data appear with a few seconds latency. Personnaly, I restart it every time I play an effect. | 245 | At first glance, this software seems, hum, well... buggy. In fact, data appear with a |
246 | few seconds latency. Personally, I restart it every time I play an effect. | ||
243 | Remember it's free (as in free beer) and alpha! | 247 | Remember it's free (as in free beer) and alpha! |
244 | 248 | ||
245 | ** URLS ** | 249 | ** URLS ** |
246 | Check www.immersion.com for Immersion Studio, and www.fcoder.com for ComPortSpy. | 250 | Check www.immersion.com for Immersion Studio, and www.fcoder.com for ComPortSpy. |
247 | 251 | ||
248 | ** Author of this document ** | 252 | ** Author of this document ** |
249 | Johann Deneux <deneux@ifrance.com> | 253 | Johann Deneux <johann.deneux@gmail.com> |
250 | Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/ | 254 | Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/ |
251 | 255 | ||
252 | Additions by Vojtech Pavlik. | 256 | Additions by Vojtech Pavlik. |
diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt index d9d523099bb7..47fc86830cd7 100644 --- a/Documentation/input/input-programming.txt +++ b/Documentation/input/input-programming.txt | |||
@@ -42,8 +42,8 @@ static int __init button_init(void) | |||
42 | goto err_free_irq; | 42 | goto err_free_irq; |
43 | } | 43 | } |
44 | 44 | ||
45 | button_dev->evbit[0] = BIT(EV_KEY); | 45 | button_dev->evbit[0] = BIT_MASK(EV_KEY); |
46 | button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); | 46 | button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); |
47 | 47 | ||
48 | error = input_register_device(button_dev); | 48 | error = input_register_device(button_dev); |
49 | if (error) { | 49 | if (error) { |
@@ -79,7 +79,7 @@ In the _init function, which is called either upon module load or when | |||
79 | booting the kernel, it grabs the required resources (it should also check | 79 | booting the kernel, it grabs the required resources (it should also check |
80 | for the presence of the device). | 80 | for the presence of the device). |
81 | 81 | ||
82 | Then it allocates a new input device structure with input_aloocate_device() | 82 | Then it allocates a new input device structure with input_allocate_device() |
83 | and sets up input bitfields. This way the device driver tells the other | 83 | and sets up input bitfields. This way the device driver tells the other |
84 | parts of the input systems what it is - what events can be generated or | 84 | parts of the input systems what it is - what events can be generated or |
85 | accepted by this input device. Our example device can only generate EV_KEY | 85 | accepted by this input device. Our example device can only generate EV_KEY |
@@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean | |||
217 | that the thing is precise and always returns to exactly the center position | 217 | that the thing is precise and always returns to exactly the center position |
218 | (if it has any). | 218 | (if it has any). |
219 | 219 | ||
220 | 1.4 NBITS(), LONG(), BIT() | 220 | 1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK() |
221 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ | 221 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ |
222 | 222 | ||
223 | These three macros from input.h help some bitfield computations: | 223 | These three macros from bitops.h help some bitfield computations: |
224 | 224 | ||
225 | NBITS(x) - returns the length of a bitfield array in longs for x bits | 225 | BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for |
226 | LONG(x) - returns the index in the array in longs for bit x | 226 | x bits |
227 | BIT(x) - returns the index in a long for bit x | 227 | BIT_WORD(x) - returns the index in the array in longs for bit x |
228 | BIT_MASK(x) - returns the bit mask within a long for bit x | ||
228 | 229 | ||
229 | 1.5 The id* and name fields | 230 | 1.5 The id* and name fields |
230 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 231 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
diff --git a/Documentation/isdn/CREDITS b/Documentation/isdn/CREDITS index 7c17c837064f..8cac6c2f23ee 100644 --- a/Documentation/isdn/CREDITS +++ b/Documentation/isdn/CREDITS | |||
@@ -40,7 +40,7 @@ Andreas Kool (akool@Kool.f.EUnet.de) | |||
40 | Pedro Roque Marques (roque@di.fc.ul.pt) | 40 | Pedro Roque Marques (roque@di.fc.ul.pt) |
41 | For lot of new ideas and the pcbit driver. | 41 | For lot of new ideas and the pcbit driver. |
42 | 42 | ||
43 | Eberhard Moenkeberg (emoenke@gwdg.de) | 43 | Eberhard Mönkeberg (emoenke@gwdg.de) |
44 | For testing and help to get into kernel. | 44 | For testing and help to get into kernel. |
45 | 45 | ||
46 | Thomas Neumann (tn@ruhr.de) | 46 | Thomas Neumann (tn@ruhr.de) |
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap index 2f114babe4b6..a76d74845a4c 100644 --- a/Documentation/isdn/README.concap +++ b/Documentation/isdn/README.concap | |||
@@ -111,7 +111,7 @@ struct concap_proto_ops{ | |||
111 | struct concap_proto * (*proto_new) (void); | 111 | struct concap_proto * (*proto_new) (void); |
112 | 112 | ||
113 | /* delete encapsulation protocol instance and free all its resources. | 113 | /* delete encapsulation protocol instance and free all its resources. |
114 | cprot may no loger be referenced after calling this */ | 114 | cprot may no longer be referenced after calling this */ |
115 | void (*proto_del)(struct concap_proto *cprot); | 115 | void (*proto_del)(struct concap_proto *cprot); |
116 | 116 | ||
117 | /* initialize the protocol's data. To be called at interface startup | 117 | /* initialize the protocol's data. To be called at interface startup |
diff --git a/Documentation/java.txt b/Documentation/java.txt index 3cce3fbb6644..e6a723281547 100644 --- a/Documentation/java.txt +++ b/Documentation/java.txt | |||
@@ -37,7 +37,7 @@ other program after you have done the following: | |||
37 | or the following, if you want to be more selective: | 37 | or the following, if you want to be more selective: |
38 | ':Applet:M::<!--applet::/usr/bin/appletviewer:' | 38 | ':Applet:M::<!--applet::/usr/bin/appletviewer:' |
39 | 39 | ||
40 | Of cause you have to fix the path names. Given path/file names in this | 40 | Of course you have to fix the path names. The path/file names given in this |
41 | document match the Debian 2.1 system. (i.e. jdk installed in /usr, | 41 | document match the Debian 2.1 system. (i.e. jdk installed in /usr, |
42 | custom wrappers from this document in /usr/local) | 42 | custom wrappers from this document in /usr/local) |
43 | 43 | ||
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index fe8b0c4892cf..616043a6da99 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt | |||
@@ -77,7 +77,12 @@ applicable everywhere (see syntax). | |||
77 | Optionally, dependencies only for this default value can be added with | 77 | Optionally, dependencies only for this default value can be added with |
78 | "if". | 78 | "if". |
79 | 79 | ||
80 | - dependencies: "depends on"/"requires" <expr> | 80 | - type definition + default value: |
81 | "def_bool"/"def_tristate" <expr> ["if" <expr>] | ||
82 | This is a shorthand notation for a type definition plus a value. | ||
83 | Optionally dependencies for this default value can be added with "if". | ||
84 | |||
85 | - dependencies: "depends on" <expr> | ||
81 | This defines a dependency for this menu entry. If multiple | 86 | This defines a dependency for this menu entry. If multiple |
82 | dependencies are defined, they are connected with '&&'. Dependencies | 87 | dependencies are defined, they are connected with '&&'. Dependencies |
83 | are applied to all other options within this menu entry (which also | 88 | are applied to all other options within this menu entry (which also |
@@ -289,3 +294,10 @@ source: | |||
289 | "source" <prompt> | 294 | "source" <prompt> |
290 | 295 | ||
291 | This reads the specified configuration file. This file is always parsed. | 296 | This reads the specified configuration file. This file is always parsed. |
297 | |||
298 | mainmenu: | ||
299 | |||
300 | "mainmenu" <prompt> | ||
301 | |||
302 | This sets the config program's title bar if the config program chooses | ||
303 | to use it. | ||
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index f099b814d383..7a7753321a26 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt | |||
@@ -518,6 +518,28 @@ more details, with real examples. | |||
518 | In this example for a specific GCC version the build will error out explaining | 518 | In this example for a specific GCC version the build will error out explaining |
519 | to the user why it stops. | 519 | to the user why it stops. |
520 | 520 | ||
521 | cc-cross-prefix | ||
522 | cc-cross-prefix is used to check if there exists a $(CC) in path with | ||
523 | one of the listed prefixes. The first prefix where there exist a | ||
524 | prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found | ||
525 | then nothing is returned. | ||
526 | Additional prefixes are separated by a single space in the | ||
527 | call of cc-cross-prefix. | ||
528 | This functionality is useful for architecture Makefiles that try | ||
529 | to set CROSS_COMPILE to well-known values but may have several | ||
530 | values to select between. | ||
531 | It is recommended only to try to set CROSS_COMPILE if it is a cross | ||
532 | build (host arch is different from target arch). And if CROSS_COMPILE | ||
533 | is already set then leave it with the old value. | ||
534 | |||
535 | Example: | ||
536 | #arch/m68k/Makefile | ||
537 | ifneq ($(SUBARCH),$(ARCH)) | ||
538 | ifeq ($(CROSS_COMPILE),) | ||
539 | CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-) | ||
540 | endif | ||
541 | endif | ||
542 | |||
521 | === 4 Host Program support | 543 | === 4 Host Program support |
522 | 544 | ||
523 | Kbuild supports building executables on the host for use during the | 545 | Kbuild supports building executables on the host for use during the |
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 1b37b28cc234..d0ac72cc19ff 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt | |||
@@ -231,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64) | |||
231 | any space below the alignment point will be wasted. | 231 | any space below the alignment point will be wasted. |
232 | 232 | ||
233 | 233 | ||
234 | Extended crashkernel syntax | ||
235 | =========================== | ||
236 | |||
237 | While the "crashkernel=size[@offset]" syntax is sufficient for most | ||
238 | configurations, sometimes it's handy to have the reserved memory dependent | ||
239 | on the value of System RAM -- that's mostly for distributors that pre-setup | ||
240 | the kernel command line to avoid an unbootable system after some memory has | ||
241 | been removed from the machine. | ||
242 | |||
243 | The syntax is: | ||
244 | |||
245 | crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset] | ||
246 | range=start-[end] | ||
247 | |||
248 | For example: | ||
249 | |||
250 | crashkernel=512M-2G:64M,2G-:128M | ||
251 | |||
252 | This would mean: | ||
253 | |||
254 | 1) if the RAM is smaller than 512M, then don't reserve anything | ||
255 | (this is the "rescue" case) | ||
256 | 2) if the RAM size is between 512M and 2G, then reserve 64M | ||
257 | 3) if the RAM size is larger than 2G, then reserve 128M | ||
258 | |||
259 | |||
234 | Boot into System Kernel | 260 | Boot into System Kernel |
235 | ======================= | 261 | ======================= |
236 | 262 | ||
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index d9e3b199929b..5a4ef48224ae 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt | |||
@@ -76,9 +76,9 @@ | |||
76 | * Title: "Conceptual Architecture of the Linux Kernel" | 76 | * Title: "Conceptual Architecture of the Linux Kernel" |
77 | Author: Ivan T. Bowman. | 77 | Author: Ivan T. Bowman. |
78 | URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html | 78 | URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html |
79 | Keywords: conceptual software arquitecture, extracted design, | 79 | Keywords: conceptual software architecture, extracted design, |
80 | reverse engineering, system structure. | 80 | reverse engineering, system structure. |
81 | Description: Conceptual software arquitecture of the Linux kernel, | 81 | Description: Conceptual software architecture of the Linux kernel, |
82 | automatically extracted from the source code. Very detailed. Good | 82 | automatically extracted from the source code. Very detailed. Good |
83 | figures. Gives good overall kernel understanding. | 83 | figures. Gives good overall kernel understanding. |
84 | 84 | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 98cf90f2631d..b2361667839f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -222,9 +222,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
222 | Warning: Many of these options can produce a lot of | 222 | Warning: Many of these options can produce a lot of |
223 | output and make your system unusable. Be very careful. | 223 | output and make your system unusable. Be very careful. |
224 | 224 | ||
225 | |||
226 | acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT | ||
227 | |||
228 | acpi_pm_good [X86-32,X86-64] | 225 | acpi_pm_good [X86-32,X86-64] |
229 | Override the pmtimer bug detection: force the kernel | 226 | Override the pmtimer bug detection: force the kernel |
230 | to assume that this machine's pmtimer latches its value | 227 | to assume that this machine's pmtimer latches its value |
@@ -297,9 +294,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
297 | apm= [APM] Advanced Power Management | 294 | apm= [APM] Advanced Power Management |
298 | See header of arch/i386/kernel/apm.c. | 295 | See header of arch/i386/kernel/apm.c. |
299 | 296 | ||
300 | applicom= [HW] | ||
301 | Format: <mem>,<irq> | ||
302 | |||
303 | arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards | 297 | arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards |
304 | Format: <io>,<irq>,<nodeID> | 298 | Format: <io>,<irq>,<nodeID> |
305 | 299 | ||
@@ -345,12 +339,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
345 | Format: <io>,<irq>,<mode> | 339 | Format: <io>,<irq>,<mode> |
346 | See header of drivers/net/hamradio/baycom_ser_hdx.c. | 340 | See header of drivers/net/hamradio/baycom_ser_hdx.c. |
347 | 341 | ||
348 | blkmtd_device= [HW,MTD] | ||
349 | blkmtd_erasesz= | ||
350 | blkmtd_ro= | ||
351 | blkmtd_bs= | ||
352 | blkmtd_count= | ||
353 | |||
354 | boot_delay= Milliseconds to delay each printk during boot. | 342 | boot_delay= Milliseconds to delay each printk during boot. |
355 | Values larger than 10 seconds (10000) are changed to | 343 | Values larger than 10 seconds (10000) are changed to |
356 | no delay (0). | 344 | no delay (0). |
@@ -431,8 +419,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
431 | over the 8254 in addition to over the IO-APIC. The | 419 | over the 8254 in addition to over the IO-APIC. The |
432 | kernel tries to set a sensible default. | 420 | kernel tries to set a sensible default. |
433 | 421 | ||
434 | hpet= [X86-32,HPET] option to disable HPET and use PIT. | 422 | hpet= [X86-32,HPET] option to control HPET usage |
435 | Format: disable | 423 | Format: { enable (default) | disable | force } |
424 | disable: disable HPET and use PIT instead | ||
425 | force: allow forced enabling of undocumented chips (ICH4, VIA) | ||
436 | 426 | ||
437 | com20020= [HW,NET] ARCnet - COM20020 chipset | 427 | com20020= [HW,NET] ARCnet - COM20020 chipset |
438 | Format: | 428 | Format: |
@@ -479,6 +469,16 @@ and is between 256 and 4096 characters. It is defined in the file | |||
479 | UART at the specified I/O port or MMIO address. | 469 | UART at the specified I/O port or MMIO address. |
480 | The options are the same as for ttyS, above. | 470 | The options are the same as for ttyS, above. |
481 | 471 | ||
472 | no_console_suspend | ||
473 | [HW] Never suspend the console | ||
474 | Disable suspending of consoles during suspend and | ||
475 | hibernate operations. Once disabled, debugging | ||
476 | messages can reach various consoles while the rest | ||
477 | of the system is being put to sleep (ie, while | ||
478 | debugging driver suspend/resume hooks). This may | ||
479 | not work reliably with all consoles, but is known | ||
480 | to work with serial and VGA consoles. | ||
481 | |||
482 | cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver | 482 | cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver |
483 | Format: | 483 | Format: |
484 | <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>] | 484 | <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>] |
@@ -487,6 +487,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
487 | [KNL] Reserve a chunk of physical memory to | 487 | [KNL] Reserve a chunk of physical memory to |
488 | hold a kernel to switch to with kexec on panic. | 488 | hold a kernel to switch to with kexec on panic. |
489 | 489 | ||
490 | crashkernel=range1:size1[,range2:size2,...][@offset] | ||
491 | [KNL] Same as above, but depends on the memory | ||
492 | in the running system. The syntax of range is | ||
493 | start-[end] where start and end are both | ||
494 | a memory unit (amount[KMG]). See also | ||
495 | Documentation/kdump/kdump.txt for an example. | ||
496 | |||
490 | cs4232= [HW,OSS] | 497 | cs4232= [HW,OSS] |
491 | Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq> | 498 | Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq> |
492 | 499 | ||
@@ -496,8 +503,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
496 | cs89x0_media= [HW,NET] | 503 | cs89x0_media= [HW,NET] |
497 | Format: { rj45 | aui | bnc } | 504 | Format: { rj45 | aui | bnc } |
498 | 505 | ||
499 | cyclades= [HW,SERIAL] Cyclades multi-serial port adapter. | ||
500 | |||
501 | dasd= [HW,NET] | 506 | dasd= [HW,NET] |
502 | See header of drivers/s390/block/dasd_devmap.c. | 507 | See header of drivers/s390/block/dasd_devmap.c. |
503 | 508 | ||
@@ -555,10 +560,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
555 | See drivers/char/README.epca and | 560 | See drivers/char/README.epca and |
556 | Documentation/digiepca.txt. | 561 | Documentation/digiepca.txt. |
557 | 562 | ||
558 | dmascc= [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA | ||
559 | support available. | ||
560 | Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]] | ||
561 | |||
562 | dmasound= [HW,OSS] Sound subsystem buffers | 563 | dmasound= [HW,OSS] Sound subsystem buffers |
563 | 564 | ||
564 | dscc4.setup= [NET] | 565 | dscc4.setup= [NET] |
@@ -589,17 +590,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
589 | 0: polling mode | 590 | 0: polling mode |
590 | non-0: interrupt mode (default) | 591 | non-0: interrupt mode (default) |
591 | 592 | ||
592 | eda= [HW,PS2] | ||
593 | |||
594 | edb= [HW,PS2] | ||
595 | |||
596 | edd= [EDD] | 593 | edd= [EDD] |
597 | Format: {"of[f]" | "sk[ipmbr]"} | 594 | Format: {"of[f]" | "sk[ipmbr]"} |
598 | See comment in arch/i386/boot/edd.S | 595 | See comment in arch/i386/boot/edd.S |
599 | 596 | ||
600 | eicon= [HW,ISDN] | ||
601 | Format: <id>,<membase>,<irq> | ||
602 | |||
603 | eisa_irq_edge= [PARISC,HW] | 597 | eisa_irq_edge= [PARISC,HW] |
604 | See header of drivers/parisc/eisa.c. | 598 | See header of drivers/parisc/eisa.c. |
605 | 599 | ||
@@ -778,6 +772,23 @@ and is between 256 and 4096 characters. It is defined in the file | |||
778 | 772 | ||
779 | inttest= [IA64] | 773 | inttest= [IA64] |
780 | 774 | ||
775 | intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option | ||
776 | off | ||
777 | Disable intel iommu driver. | ||
778 | igfx_off [Default Off] | ||
779 | By default, gfx is mapped as normal device. If a gfx | ||
780 | device has a dedicated DMAR unit, the DMAR unit is | ||
781 | bypassed by not enabling DMAR with this option. In | ||
782 | this case, gfx device will use physical address for | ||
783 | DMA. | ||
784 | forcedac [x86_64] | ||
785 | With this option iommu will not optimize to look | ||
786 | for io virtual address below 32 bit forcing dual | ||
787 | address cycle on pci bus for cards supporting greater | ||
788 | than 32 bit addressing. The default is to look | ||
789 | for translation below 32 bit and if not available | ||
790 | then look in the higher range. | ||
791 | |||
781 | io7= [HW] IO7 for Marvel based alpha systems | 792 | io7= [HW] IO7 for Marvel based alpha systems |
782 | See comment before marvel_specify_io7 in | 793 | See comment before marvel_specify_io7 in |
783 | arch/alpha/kernel/core_marvel.c. | 794 | arch/alpha/kernel/core_marvel.c. |
@@ -875,9 +886,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
875 | lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in | 886 | lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in |
876 | C2 power state. | 887 | C2 power state. |
877 | 888 | ||
878 | lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip | ||
879 | Format: addr:<io>,irq:<irq> | ||
880 | |||
881 | libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume | 889 | libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume |
882 | when set. | 890 | when set. |
883 | Format: <int> | 891 | Format: <int> |
@@ -1125,9 +1133,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1125 | noapic [SMP,APIC] Tells the kernel to not make use of any | 1133 | noapic [SMP,APIC] Tells the kernel to not make use of any |
1126 | IOAPICs that may be present in the system. | 1134 | IOAPICs that may be present in the system. |
1127 | 1135 | ||
1128 | noasync [HW,M68K] Disables async and sync negotiation for | ||
1129 | all devices. | ||
1130 | |||
1131 | nobats [PPC] Do not use BATs for mapping kernel lowmem | 1136 | nobats [PPC] Do not use BATs for mapping kernel lowmem |
1132 | on "Classic" PPC cores. | 1137 | on "Classic" PPC cores. |
1133 | 1138 | ||
@@ -1439,6 +1444,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1439 | Param: <number> - step/bucket size as a power of 2 for | 1444 | Param: <number> - step/bucket size as a power of 2 for |
1440 | statistical time based profiling. | 1445 | statistical time based profiling. |
1441 | Param: "sleep" - profile D-state sleeping (millisecs) | 1446 | Param: "sleep" - profile D-state sleeping (millisecs) |
1447 | Param: "kvm" - profile VM exits. | ||
1442 | 1448 | ||
1443 | processor.max_cstate= [HW,ACPI] | 1449 | processor.max_cstate= [HW,ACPI] |
1444 | Limit processor to maximum C-state | 1450 | Limit processor to maximum C-state |
@@ -1565,9 +1571,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1565 | sa1100ir [NET] | 1571 | sa1100ir [NET] |
1566 | See drivers/net/irda/sa1100_ir.c. | 1572 | See drivers/net/irda/sa1100_ir.c. |
1567 | 1573 | ||
1568 | sb= [HW,OSS] | ||
1569 | Format: <io>,<irq>,<dma>,<dma2> | ||
1570 | |||
1571 | sbni= [NET] Granch SBNI12 leased line adapter | 1574 | sbni= [NET] Granch SBNI12 leased line adapter |
1572 | 1575 | ||
1573 | sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver | 1576 | sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver |
@@ -1611,8 +1614,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1611 | 1614 | ||
1612 | serialnumber [BUGS=X86-32] | 1615 | serialnumber [BUGS=X86-32] |
1613 | 1616 | ||
1614 | sg_def_reserved_size= [SCSI] | ||
1615 | |||
1616 | shapers= [NET] | 1617 | shapers= [NET] |
1617 | Maximal number of shapers. | 1618 | Maximal number of shapers. |
1618 | 1619 | ||
@@ -2003,10 +2004,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2003 | norandmaps Don't use address space randomization | 2004 | norandmaps Don't use address space randomization |
2004 | Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space | 2005 | Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space |
2005 | 2006 | ||
2006 | unwind_debug=N N > 0 will enable dwarf2 unwinder debugging | ||
2007 | This is useful to get more information why | ||
2008 | you got a "dwarf2 unwinder stuck" | ||
2009 | |||
2010 | ______________________________________________________________________ | 2007 | ______________________________________________________________________ |
2011 | 2008 | ||
2012 | TODO: | 2009 | TODO: |
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index c0b7a4556390..bac037eb1cda 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile | |||
@@ -1,28 +1,8 @@ | |||
1 | # This creates the demonstration utility "lguest" which runs a Linux guest. | 1 | # This creates the demonstration utility "lguest" which runs a Linux guest. |
2 | 2 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include | |
3 | # For those people that have a separate object dir, look there for .config | ||
4 | KBUILD_OUTPUT := ../.. | ||
5 | ifdef O | ||
6 | ifeq ("$(origin O)", "command line") | ||
7 | KBUILD_OUTPUT := $(O) | ||
8 | endif | ||
9 | endif | ||
10 | # We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary. | ||
11 | include $(KBUILD_OUTPUT)/.config | ||
12 | LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) | ||
13 | |||
14 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds | ||
15 | LDLIBS:=-lz | 3 | LDLIBS:=-lz |
16 | # Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and | ||
17 | # not others (eg. FC7). | ||
18 | LDFLAGS+=-static | ||
19 | all: lguest.lds lguest | ||
20 | 4 | ||
21 | # The linker script on x86 is so complex the only way of creating one | 5 | all: lguest |
22 | # which will link our binary in the right place is to mangle the | ||
23 | # default one. | ||
24 | lguest.lds: | ||
25 | $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@ | ||
26 | 6 | ||
27 | clean: | 7 | clean: |
28 | rm -f lguest.lds lguest | 8 | rm -f lguest |
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 103e346c8b6a..5bdc37f81842 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -1,10 +1,7 @@ | |||
1 | /*P:100 This is the Launcher code, a simple program which lays out the | 1 | /*P:100 This is the Launcher code, a simple program which lays out the |
2 | * "physical" memory for the new Guest by mapping the kernel image and the | 2 | * "physical" memory for the new Guest by mapping the kernel image and the |
3 | * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. | 3 | * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. |
4 | * | 4 | :*/ |
5 | * The only trick: the Makefile links it at a high address so it will be clear | ||
6 | * of the guest memory region. It means that each Guest cannot have more than | ||
7 | * about 2.5G of memory on a normally configured Host. :*/ | ||
8 | #define _LARGEFILE64_SOURCE | 5 | #define _LARGEFILE64_SOURCE |
9 | #define _GNU_SOURCE | 6 | #define _GNU_SOURCE |
10 | #include <stdio.h> | 7 | #include <stdio.h> |
@@ -15,6 +12,7 @@ | |||
15 | #include <stdlib.h> | 12 | #include <stdlib.h> |
16 | #include <elf.h> | 13 | #include <elf.h> |
17 | #include <sys/mman.h> | 14 | #include <sys/mman.h> |
15 | #include <sys/param.h> | ||
18 | #include <sys/types.h> | 16 | #include <sys/types.h> |
19 | #include <sys/stat.h> | 17 | #include <sys/stat.h> |
20 | #include <sys/wait.h> | 18 | #include <sys/wait.h> |
@@ -34,7 +32,9 @@ | |||
34 | #include <termios.h> | 32 | #include <termios.h> |
35 | #include <getopt.h> | 33 | #include <getopt.h> |
36 | #include <zlib.h> | 34 | #include <zlib.h> |
37 | /*L:110 We can ignore the 28 include files we need for this program, but I do | 35 | #include <assert.h> |
36 | #include <sched.h> | ||
37 | /*L:110 We can ignore the 30 include files we need for this program, but I do | ||
38 | * want to draw attention to the use of kernel-style types. | 38 | * want to draw attention to the use of kernel-style types. |
39 | * | 39 | * |
40 | * As Linus said, "C is a Spartan language, and so should your naming be." I | 40 | * As Linus said, "C is a Spartan language, and so should your naming be." I |
@@ -45,8 +45,14 @@ typedef unsigned long long u64; | |||
45 | typedef uint32_t u32; | 45 | typedef uint32_t u32; |
46 | typedef uint16_t u16; | 46 | typedef uint16_t u16; |
47 | typedef uint8_t u8; | 47 | typedef uint8_t u8; |
48 | #include "../../include/linux/lguest_launcher.h" | 48 | #include "linux/lguest_launcher.h" |
49 | #include "../../include/asm-x86/e820_32.h" | 49 | #include "linux/pci_ids.h" |
50 | #include "linux/virtio_config.h" | ||
51 | #include "linux/virtio_net.h" | ||
52 | #include "linux/virtio_blk.h" | ||
53 | #include "linux/virtio_console.h" | ||
54 | #include "linux/virtio_ring.h" | ||
55 | #include "asm-x86/bootparam.h" | ||
50 | /*:*/ | 56 | /*:*/ |
51 | 57 | ||
52 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ | 58 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ |
@@ -55,6 +61,10 @@ typedef uint8_t u8; | |||
55 | #ifndef SIOCBRADDIF | 61 | #ifndef SIOCBRADDIF |
56 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ | 62 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ |
57 | #endif | 63 | #endif |
64 | /* We can have up to 256 pages for devices. */ | ||
65 | #define DEVICE_PAGES 256 | ||
66 | /* This fits nicely in a single 4096-byte page. */ | ||
67 | #define VIRTQUEUE_NUM 127 | ||
58 | 68 | ||
59 | /*L:120 verbose is both a global flag and a macro. The C preprocessor allows | 69 | /*L:120 verbose is both a global flag and a macro. The C preprocessor allows |
60 | * this, and although I wouldn't recommend it, it works quite nicely here. */ | 70 | * this, and although I wouldn't recommend it, it works quite nicely here. */ |
@@ -65,8 +75,10 @@ static bool verbose; | |||
65 | 75 | ||
66 | /* The pipe to send commands to the waker process */ | 76 | /* The pipe to send commands to the waker process */ |
67 | static int waker_fd; | 77 | static int waker_fd; |
68 | /* The top of guest physical memory. */ | 78 | /* The pointer to the start of guest memory. */ |
69 | static u32 top; | 79 | static void *guest_base; |
80 | /* The maximum guest physical address allowed, and maximum possible. */ | ||
81 | static unsigned long guest_limit, guest_max; | ||
70 | 82 | ||
71 | /* This is our list of devices. */ | 83 | /* This is our list of devices. */ |
72 | struct device_list | 84 | struct device_list |
@@ -76,8 +88,17 @@ struct device_list | |||
76 | fd_set infds; | 88 | fd_set infds; |
77 | int max_infd; | 89 | int max_infd; |
78 | 90 | ||
91 | /* Counter to assign interrupt numbers. */ | ||
92 | unsigned int next_irq; | ||
93 | |||
94 | /* Counter to print out convenient device numbers. */ | ||
95 | unsigned int device_num; | ||
96 | |||
79 | /* The descriptor page for the devices. */ | 97 | /* The descriptor page for the devices. */ |
80 | struct lguest_device_desc *descs; | 98 | u8 *descpage; |
99 | |||
100 | /* The tail of the last descriptor. */ | ||
101 | unsigned int desc_used; | ||
81 | 102 | ||
82 | /* A single linked list of devices. */ | 103 | /* A single linked list of devices. */ |
83 | struct device *dev; | 104 | struct device *dev; |
@@ -85,31 +106,111 @@ struct device_list | |||
85 | struct device **lastdev; | 106 | struct device **lastdev; |
86 | }; | 107 | }; |
87 | 108 | ||
109 | /* The list of Guest devices, based on command line arguments. */ | ||
110 | static struct device_list devices; | ||
111 | |||
88 | /* The device structure describes a single device. */ | 112 | /* The device structure describes a single device. */ |
89 | struct device | 113 | struct device |
90 | { | 114 | { |
91 | /* The linked-list pointer. */ | 115 | /* The linked-list pointer. */ |
92 | struct device *next; | 116 | struct device *next; |
93 | /* The descriptor for this device, as mapped into the Guest. */ | 117 | |
118 | /* This device's descriptor, as mapped into the Guest. */ | ||
94 | struct lguest_device_desc *desc; | 119 | struct lguest_device_desc *desc; |
95 | /* The memory page(s) of this device, if any. Also mapped in Guest. */ | 120 | |
96 | void *mem; | 121 | /* The name of this device, for --verbose. */ |
122 | const char *name; | ||
97 | 123 | ||
98 | /* If handle_input is set, it wants to be called when this file | 124 | /* If handle_input is set, it wants to be called when this file |
99 | * descriptor is ready. */ | 125 | * descriptor is ready. */ |
100 | int fd; | 126 | int fd; |
101 | bool (*handle_input)(int fd, struct device *me); | 127 | bool (*handle_input)(int fd, struct device *me); |
102 | 128 | ||
103 | /* If handle_output is set, it wants to be called when the Guest sends | 129 | /* Any queues attached to this device */ |
104 | * DMA to this key. */ | 130 | struct virtqueue *vq; |
105 | unsigned long watch_key; | ||
106 | u32 (*handle_output)(int fd, const struct iovec *iov, | ||
107 | unsigned int num, struct device *me); | ||
108 | 131 | ||
109 | /* Device-specific data. */ | 132 | /* Device-specific data. */ |
110 | void *priv; | 133 | void *priv; |
111 | }; | 134 | }; |
112 | 135 | ||
136 | /* The virtqueue structure describes a queue attached to a device. */ | ||
137 | struct virtqueue | ||
138 | { | ||
139 | struct virtqueue *next; | ||
140 | |||
141 | /* Which device owns me. */ | ||
142 | struct device *dev; | ||
143 | |||
144 | /* The configuration for this queue. */ | ||
145 | struct lguest_vqconfig config; | ||
146 | |||
147 | /* The actual ring of buffers. */ | ||
148 | struct vring vring; | ||
149 | |||
150 | /* Last available index we saw. */ | ||
151 | u16 last_avail_idx; | ||
152 | |||
153 | /* The routine to call when the Guest pings us. */ | ||
154 | void (*handle_output)(int fd, struct virtqueue *me); | ||
155 | }; | ||
156 | |||
157 | /* Since guest is UP and we don't run at the same time, we don't need barriers. | ||
158 | * But I include them in the code in case others copy it. */ | ||
159 | #define wmb() | ||
160 | |||
161 | /* Convert an iovec element to the given type. | ||
162 | * | ||
163 | * This is a fairly ugly trick: we need to know the size of the type and | ||
164 | * alignment requirement to check the pointer is kosher. It's also nice to | ||
165 | * have the name of the type in case we report failure. | ||
166 | * | ||
167 | * Typing those three things all the time is cumbersome and error prone, so we | ||
168 | * have a macro which sets them all up and passes to the real function. */ | ||
169 | #define convert(iov, type) \ | ||
170 | ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) | ||
171 | |||
172 | static void *_convert(struct iovec *iov, size_t size, size_t align, | ||
173 | const char *name) | ||
174 | { | ||
175 | if (iov->iov_len != size) | ||
176 | errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); | ||
177 | if ((unsigned long)iov->iov_base % align != 0) | ||
178 | errx(1, "Bad alignment %p for %s", iov->iov_base, name); | ||
179 | return iov->iov_base; | ||
180 | } | ||
181 | |||
182 | /* The virtio configuration space is defined to be little-endian. x86 is | ||
183 | * little-endian too, but it's nice to be explicit so we have these helpers. */ | ||
184 | #define cpu_to_le16(v16) (v16) | ||
185 | #define cpu_to_le32(v32) (v32) | ||
186 | #define cpu_to_le64(v64) (v64) | ||
187 | #define le16_to_cpu(v16) (v16) | ||
188 | #define le32_to_cpu(v32) (v32) | ||
189 | #define le64_to_cpu(v32) (v64) | ||
190 | |||
191 | /*L:100 The Launcher code itself takes us out into userspace, that scary place | ||
192 | * where pointers run wild and free! Unfortunately, like most userspace | ||
193 | * programs, it's quite boring (which is why everyone likes to hack on the | ||
194 | * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it | ||
195 | * will get you through this section. Or, maybe not. | ||
196 | * | ||
197 | * The Launcher sets up a big chunk of memory to be the Guest's "physical" | ||
198 | * memory and stores it in "guest_base". In other words, Guest physical == | ||
199 | * Launcher virtual with an offset. | ||
200 | * | ||
201 | * This can be tough to get your head around, but usually it just means that we | ||
202 | * use these trivial conversion functions when the Guest gives us its | ||
203 | * "physical" addresses: */ | ||
204 | static void *from_guest_phys(unsigned long addr) | ||
205 | { | ||
206 | return guest_base + addr; | ||
207 | } | ||
208 | |||
209 | static unsigned long to_guest_phys(const void *addr) | ||
210 | { | ||
211 | return (addr - guest_base); | ||
212 | } | ||
213 | |||
113 | /*L:130 | 214 | /*L:130 |
114 | * Loading the Kernel. | 215 | * Loading the Kernel. |
115 | * | 216 | * |
@@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags) | |||
123 | return fd; | 224 | return fd; |
124 | } | 225 | } |
125 | 226 | ||
126 | /* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ | 227 | /* map_zeroed_pages() takes a number of pages. */ |
127 | static void *map_zeroed_pages(unsigned long addr, unsigned int num) | 228 | static void *map_zeroed_pages(unsigned int num) |
128 | { | 229 | { |
129 | /* We cache the /dev/zero file-descriptor so we only open it once. */ | 230 | int fd = open_or_die("/dev/zero", O_RDONLY); |
130 | static int fd = -1; | 231 | void *addr; |
131 | |||
132 | if (fd == -1) | ||
133 | fd = open_or_die("/dev/zero", O_RDONLY); | ||
134 | 232 | ||
135 | /* We use a private mapping (ie. if we write to the page, it will be | 233 | /* We use a private mapping (ie. if we write to the page, it will be |
136 | * copied), and obviously we insist that it be mapped where we ask. */ | 234 | * copied). */ |
137 | if (mmap((void *)addr, getpagesize() * num, | 235 | addr = mmap(NULL, getpagesize() * num, |
138 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) | 236 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); |
139 | != (void *)addr) | 237 | if (addr == MAP_FAILED) |
140 | err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); | 238 | err(1, "Mmaping %u pages of /dev/zero", num); |
141 | 239 | ||
142 | /* Returning the address is just a courtesy: can simplify callers. */ | 240 | return addr; |
143 | return (void *)addr; | ||
144 | } | 241 | } |
145 | 242 | ||
146 | /* To find out where to start we look for the magic Guest string, which marks | 243 | /* Get some more pages for a device. */ |
147 | * the code we see in lguest_asm.S. This is a hack which we are currently | 244 | static void *get_pages(unsigned int num) |
148 | * plotting to replace with the normal Linux entry point. */ | ||
149 | static unsigned long entry_point(void *start, void *end, | ||
150 | unsigned long page_offset) | ||
151 | { | 245 | { |
152 | void *p; | 246 | void *addr = from_guest_phys(guest_limit); |
153 | 247 | ||
154 | /* The scan gives us the physical starting address. We want the | 248 | guest_limit += num * getpagesize(); |
155 | * virtual address in this case, and fortunately, we already figured | 249 | if (guest_limit > guest_max) |
156 | * out the physical-virtual difference and passed it here in | 250 | errx(1, "Not enough memory for devices"); |
157 | * "page_offset". */ | 251 | return addr; |
158 | for (p = start; p < end; p++) | 252 | } |
159 | if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) | ||
160 | return (long)p + strlen("GenuineLguest") + page_offset; | ||
161 | 253 | ||
162 | err(1, "Is this image a genuine lguest?"); | 254 | /* This routine is used to load the kernel or initrd. It tries mmap, but if |
255 | * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), | ||
256 | * it falls back to reading the memory in. */ | ||
257 | static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) | ||
258 | { | ||
259 | ssize_t r; | ||
260 | |||
261 | /* We map writable even though some segments are marked read-only. | ||
262 | * The kernel really wants to be writable: it patches its own | ||
263 | * instructions. | ||
264 | * | ||
265 | * MAP_PRIVATE means that the page won't be copied until a write is | ||
266 | * done to it. This allows us to share untouched memory between | ||
267 | * Guests. */ | ||
268 | if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, | ||
269 | MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) | ||
270 | return; | ||
271 | |||
272 | /* pread does a seek and a read in one shot: saves a few lines. */ | ||
273 | r = pread(fd, addr, len, offset); | ||
274 | if (r != len) | ||
275 | err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); | ||
163 | } | 276 | } |
164 | 277 | ||
165 | /* This routine takes an open vmlinux image, which is in ELF, and maps it into | 278 | /* This routine takes an open vmlinux image, which is in ELF, and maps it into |
@@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end, | |||
167 | * by all modern binaries on Linux including the kernel. | 280 | * by all modern binaries on Linux including the kernel. |
168 | * | 281 | * |
169 | * The ELF headers give *two* addresses: a physical address, and a virtual | 282 | * The ELF headers give *two* addresses: a physical address, and a virtual |
170 | * address. The Guest kernel expects to be placed in memory at the physical | 283 | * address. We use the physical address; the Guest will map itself to the |
171 | * address, and the page tables set up so it will correspond to that virtual | 284 | * virtual address. |
172 | * address. We return the difference between the virtual and physical | ||
173 | * addresses in the "page_offset" pointer. | ||
174 | * | 285 | * |
175 | * We return the starting address. */ | 286 | * We return the starting address. */ |
176 | static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | 287 | static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) |
177 | unsigned long *page_offset) | ||
178 | { | 288 | { |
179 | void *addr; | ||
180 | Elf32_Phdr phdr[ehdr->e_phnum]; | 289 | Elf32_Phdr phdr[ehdr->e_phnum]; |
181 | unsigned int i; | 290 | unsigned int i; |
182 | unsigned long start = -1UL, end = 0; | ||
183 | 291 | ||
184 | /* Sanity checks on the main ELF header: an x86 executable with a | 292 | /* Sanity checks on the main ELF header: an x86 executable with a |
185 | * reasonable number of correctly-sized program headers. */ | 293 | * reasonable number of correctly-sized program headers. */ |
@@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | |||
199 | if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) | 307 | if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) |
200 | err(1, "Reading program headers"); | 308 | err(1, "Reading program headers"); |
201 | 309 | ||
202 | /* We don't know page_offset yet. */ | ||
203 | *page_offset = 0; | ||
204 | |||
205 | /* Try all the headers: there are usually only three. A read-only one, | 310 | /* Try all the headers: there are usually only three. A read-only one, |
206 | * a read-write one, and a "note" section which isn't loadable. */ | 311 | * a read-write one, and a "note" section which isn't loadable. */ |
207 | for (i = 0; i < ehdr->e_phnum; i++) { | 312 | for (i = 0; i < ehdr->e_phnum; i++) { |
@@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | |||
212 | verbose("Section %i: size %i addr %p\n", | 317 | verbose("Section %i: size %i addr %p\n", |
213 | i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); | 318 | i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); |
214 | 319 | ||
215 | /* We expect a simple linear address space: every segment must | 320 | /* We map this section of the file at its physical address. */ |
216 | * have the same difference between virtual (p_vaddr) and | 321 | map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), |
217 | * physical (p_paddr) address. */ | 322 | phdr[i].p_offset, phdr[i].p_filesz); |
218 | if (!*page_offset) | ||
219 | *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; | ||
220 | else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) | ||
221 | errx(1, "Page offset of section %i different", i); | ||
222 | |||
223 | /* We track the first and last address we mapped, so we can | ||
224 | * tell entry_point() where to scan. */ | ||
225 | if (phdr[i].p_paddr < start) | ||
226 | start = phdr[i].p_paddr; | ||
227 | if (phdr[i].p_paddr + phdr[i].p_filesz > end) | ||
228 | end = phdr[i].p_paddr + phdr[i].p_filesz; | ||
229 | |||
230 | /* We map this section of the file at its physical address. We | ||
231 | * map it read & write even if the header says this segment is | ||
232 | * read-only. The kernel really wants to be writable: it | ||
233 | * patches its own instructions which would normally be | ||
234 | * read-only. | ||
235 | * | ||
236 | * MAP_PRIVATE means that the page won't be copied until a | ||
237 | * write is done to it. This allows us to share much of the | ||
238 | * kernel memory between Guests. */ | ||
239 | addr = mmap((void *)phdr[i].p_paddr, | ||
240 | phdr[i].p_filesz, | ||
241 | PROT_READ|PROT_WRITE|PROT_EXEC, | ||
242 | MAP_FIXED|MAP_PRIVATE, | ||
243 | elf_fd, phdr[i].p_offset); | ||
244 | if (addr != (void *)phdr[i].p_paddr) | ||
245 | err(1, "Mmaping vmlinux seg %i gave %p not %p", | ||
246 | i, addr, (void *)phdr[i].p_paddr); | ||
247 | } | 323 | } |
248 | 324 | ||
249 | return entry_point((void *)start, (void *)end, *page_offset); | 325 | /* The entry point is given in the ELF header. */ |
326 | return ehdr->e_entry; | ||
250 | } | 327 | } |
251 | 328 | ||
252 | /*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. | 329 | /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're |
253 | * | 330 | * supposed to jump into it and it will unpack itself. We used to have to |
254 | * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects | 331 | * perform some hairy magic because the unpacking code scared me. |
255 | * to be. We don't know what that option was, but we can figure it out | ||
256 | * approximately by looking at the addresses in the code. I chose the common | ||
257 | * case of reading a memory location into the %eax register: | ||
258 | * | ||
259 | * movl <some-address>, %eax | ||
260 | * | ||
261 | * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example, | ||
262 | * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax. | ||
263 | * | ||
264 | * In this example can guess that the kernel was compiled with | ||
265 | * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the | ||
266 | * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our | ||
267 | * kernel isn't that bloated yet. | ||
268 | * | ||
269 | * Unfortunately, x86 has variable-length instructions, so finding this | ||
270 | * particular instruction properly involves writing a disassembler. Instead, | ||
271 | * we rely on statistics. We look for "0xA1" and tally the different bytes | ||
272 | * which occur 4 bytes later (the "0xC0" in our example above). When one of | ||
273 | * those bytes appears three times, we can be reasonably confident that it | ||
274 | * forms the start of CONFIG_PAGE_OFFSET. | ||
275 | * | 332 | * |
276 | * This is amazingly reliable. */ | 333 | * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote |
277 | static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) | 334 | * a small patch to jump over the tricky bits in the Guest, so now we just read |
335 | * the funky header so we know where in the file to load, and away we go! */ | ||
336 | static unsigned long load_bzimage(int fd) | ||
278 | { | 337 | { |
279 | unsigned int i, possibilities[256] = { 0 }; | 338 | struct boot_params boot; |
339 | int r; | ||
340 | /* Modern bzImages get loaded at 1M. */ | ||
341 | void *p = from_guest_phys(0x100000); | ||
280 | 342 | ||
281 | for (i = 0; i + 4 < len; i++) { | 343 | /* Go back to the start of the file and read the header. It should be |
282 | /* mov 0xXXXXXXXX,%eax */ | 344 | * a Linux boot header (see Documentation/i386/boot.txt) */ |
283 | if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) | 345 | lseek(fd, 0, SEEK_SET); |
284 | return (unsigned long)img[i+4] << 24; | 346 | read(fd, &boot, sizeof(boot)); |
285 | } | ||
286 | errx(1, "could not determine page offset"); | ||
287 | } | ||
288 | 347 | ||
289 | /*L:160 Unfortunately the entire ELF image isn't compressed: the segments | 348 | /* Inside the setup_hdr, we expect the magic "HdrS" */ |
290 | * which need loading are extracted and compressed raw. This denies us the | 349 | if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) |
291 | * information we need to make a fully-general loader. */ | 350 | errx(1, "This doesn't look like a bzImage to me"); |
292 | static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) | ||
293 | { | ||
294 | gzFile f; | ||
295 | int ret, len = 0; | ||
296 | /* A bzImage always gets loaded at physical address 1M. This is | ||
297 | * actually configurable as CONFIG_PHYSICAL_START, but as the comment | ||
298 | * there says, "Don't change this unless you know what you are doing". | ||
299 | * Indeed. */ | ||
300 | void *img = (void *)0x100000; | ||
301 | |||
302 | /* gzdopen takes our file descriptor (carefully placed at the start of | ||
303 | * the GZIP header we found) and returns a gzFile. */ | ||
304 | f = gzdopen(fd, "rb"); | ||
305 | /* We read it into memory in 64k chunks until we hit the end. */ | ||
306 | while ((ret = gzread(f, img + len, 65536)) > 0) | ||
307 | len += ret; | ||
308 | if (ret < 0) | ||
309 | err(1, "reading image from bzImage"); | ||
310 | |||
311 | verbose("Unpacked size %i addr %p\n", len, img); | ||
312 | |||
313 | /* Without the ELF header, we can't tell virtual-physical gap. This is | ||
314 | * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately, | ||
315 | * I have a clever way of figuring it out from the code itself. */ | ||
316 | *page_offset = intuit_page_offset(img, len); | ||
317 | |||
318 | return entry_point(img, img + len, *page_offset); | ||
319 | } | ||
320 | 351 | ||
321 | /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're | 352 | /* Skip over the extra sectors of the header. */ |
322 | * supposed to jump into it and it will unpack itself. We can't do that | 353 | lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); |
323 | * because the Guest can't run the unpacking code, and adding features to | 354 | |
324 | * lguest kills puppies, so we don't want to. | 355 | /* Now read everything into memory. in nice big chunks. */ |
325 | * | 356 | while ((r = read(fd, p, 65536)) > 0) |
326 | * The bzImage is formed by putting the decompressing code in front of the | 357 | p += r; |
327 | * compressed kernel code. So we can simple scan through it looking for the | 358 | |
328 | * first "gzip" header, and start decompressing from there. */ | 359 | /* Finally, code32_start tells us where to enter the kernel. */ |
329 | static unsigned long load_bzimage(int fd, unsigned long *page_offset) | 360 | return boot.hdr.code32_start; |
330 | { | ||
331 | unsigned char c; | ||
332 | int state = 0; | ||
333 | |||
334 | /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */ | ||
335 | while (read(fd, &c, 1) == 1) { | ||
336 | switch (state) { | ||
337 | case 0: | ||
338 | if (c == 0x1F) | ||
339 | state++; | ||
340 | break; | ||
341 | case 1: | ||
342 | if (c == 0x8B) | ||
343 | state++; | ||
344 | else | ||
345 | state = 0; | ||
346 | break; | ||
347 | case 2 ... 8: | ||
348 | state++; | ||
349 | break; | ||
350 | case 9: | ||
351 | /* Seek back to the start of the gzip header. */ | ||
352 | lseek(fd, -10, SEEK_CUR); | ||
353 | /* One final check: "compressed under UNIX". */ | ||
354 | if (c != 0x03) | ||
355 | state = -1; | ||
356 | else | ||
357 | return unpack_bzimage(fd, page_offset); | ||
358 | } | ||
359 | } | ||
360 | errx(1, "Could not find kernel in bzImage"); | ||
361 | } | 361 | } |
362 | 362 | ||
363 | /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels | 363 | /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels |
364 | * come wrapped up in the self-decompressing "bzImage" format. With some funky | 364 | * come wrapped up in the self-decompressing "bzImage" format. With some funky |
365 | * coding, we can load those, too. */ | 365 | * coding, we can load those, too. */ |
366 | static unsigned long load_kernel(int fd, unsigned long *page_offset) | 366 | static unsigned long load_kernel(int fd) |
367 | { | 367 | { |
368 | Elf32_Ehdr hdr; | 368 | Elf32_Ehdr hdr; |
369 | 369 | ||
@@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset) | |||
373 | 373 | ||
374 | /* If it's an ELF file, it starts with "\177ELF" */ | 374 | /* If it's an ELF file, it starts with "\177ELF" */ |
375 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) | 375 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) |
376 | return map_elf(fd, &hdr, page_offset); | 376 | return map_elf(fd, &hdr); |
377 | 377 | ||
378 | /* Otherwise we assume it's a bzImage, and try to unpack it */ | 378 | /* Otherwise we assume it's a bzImage, and try to unpack it */ |
379 | return load_bzimage(fd, page_offset); | 379 | return load_bzimage(fd); |
380 | } | 380 | } |
381 | 381 | ||
382 | /* This is a trivial little helper to align pages. Andi Kleen hated it because | 382 | /* This is a trivial little helper to align pages. Andi Kleen hated it because |
@@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem) | |||
402 | int ifd; | 402 | int ifd; |
403 | struct stat st; | 403 | struct stat st; |
404 | unsigned long len; | 404 | unsigned long len; |
405 | void *iaddr; | ||
406 | 405 | ||
407 | ifd = open_or_die(name, O_RDONLY); | 406 | ifd = open_or_die(name, O_RDONLY); |
408 | /* fstat() is needed to get the file size. */ | 407 | /* fstat() is needed to get the file size. */ |
409 | if (fstat(ifd, &st) < 0) | 408 | if (fstat(ifd, &st) < 0) |
410 | err(1, "fstat() on initrd '%s'", name); | 409 | err(1, "fstat() on initrd '%s'", name); |
411 | 410 | ||
412 | /* The length needs to be rounded up to a page size: mmap needs the | 411 | /* We map the initrd at the top of memory, but mmap wants it to be |
413 | * address to be page aligned. */ | 412 | * page-aligned, so we round the size up for that. */ |
414 | len = page_align(st.st_size); | 413 | len = page_align(st.st_size); |
415 | /* We map the initrd at the top of memory. */ | 414 | map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); |
416 | iaddr = mmap((void *)mem - len, st.st_size, | ||
417 | PROT_READ|PROT_EXEC|PROT_WRITE, | ||
418 | MAP_FIXED|MAP_PRIVATE, ifd, 0); | ||
419 | if (iaddr != (void *)mem - len) | ||
420 | err(1, "Mmaping initrd '%s' returned %p not %p", | ||
421 | name, iaddr, (void *)mem - len); | ||
422 | /* Once a file is mapped, you can close the file descriptor. It's a | 415 | /* Once a file is mapped, you can close the file descriptor. It's a |
423 | * little odd, but quite useful. */ | 416 | * little odd, but quite useful. */ |
424 | close(ifd); | 417 | close(ifd); |
425 | verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); | 418 | verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); |
426 | 419 | ||
427 | /* We return the initrd size. */ | 420 | /* We return the initrd size. */ |
428 | return len; | 421 | return len; |
429 | } | 422 | } |
430 | 423 | ||
431 | /* Once we know how much memory we have, and the address the Guest kernel | 424 | /* Once we know how much memory we have, we can construct simple linear page |
432 | * expects, we can construct simple linear page tables which will get the Guest | 425 | * tables which set virtual == physical which will get the Guest far enough |
433 | * far enough into the boot to create its own. | 426 | * into the boot to create its own. |
434 | * | 427 | * |
435 | * We lay them out of the way, just below the initrd (which is why we need to | 428 | * We lay them out of the way, just below the initrd (which is why we need to |
436 | * know its size). */ | 429 | * know its size). */ |
437 | static unsigned long setup_pagetables(unsigned long mem, | 430 | static unsigned long setup_pagetables(unsigned long mem, |
438 | unsigned long initrd_size, | 431 | unsigned long initrd_size) |
439 | unsigned long page_offset) | ||
440 | { | 432 | { |
441 | u32 *pgdir, *linear; | 433 | unsigned long *pgdir, *linear; |
442 | unsigned int mapped_pages, i, linear_pages; | 434 | unsigned int mapped_pages, i, linear_pages; |
443 | unsigned int ptes_per_page = getpagesize()/sizeof(u32); | 435 | unsigned int ptes_per_page = getpagesize()/sizeof(void *); |
444 | 436 | ||
445 | /* Ideally we map all physical memory starting at page_offset. | 437 | mapped_pages = mem/getpagesize(); |
446 | * However, if page_offset is 0xC0000000 we can only map 1G of physical | ||
447 | * (0xC0000000 + 1G overflows). */ | ||
448 | if (mem <= -page_offset) | ||
449 | mapped_pages = mem/getpagesize(); | ||
450 | else | ||
451 | mapped_pages = -page_offset/getpagesize(); | ||
452 | 438 | ||
453 | /* Each PTE page can map ptes_per_page pages: how many do we need? */ | 439 | /* Each PTE page can map ptes_per_page pages: how many do we need? */ |
454 | linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; | 440 | linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; |
455 | 441 | ||
456 | /* We put the toplevel page directory page at the top of memory. */ | 442 | /* We put the toplevel page directory page at the top of memory. */ |
457 | pgdir = (void *)mem - initrd_size - getpagesize(); | 443 | pgdir = from_guest_phys(mem) - initrd_size - getpagesize(); |
458 | 444 | ||
459 | /* Now we use the next linear_pages pages as pte pages */ | 445 | /* Now we use the next linear_pages pages as pte pages */ |
460 | linear = (void *)pgdir - linear_pages*getpagesize(); | 446 | linear = (void *)pgdir - linear_pages*getpagesize(); |
@@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem, | |||
465 | for (i = 0; i < mapped_pages; i++) | 451 | for (i = 0; i < mapped_pages; i++) |
466 | linear[i] = ((i * getpagesize()) | PAGE_PRESENT); | 452 | linear[i] = ((i * getpagesize()) | PAGE_PRESENT); |
467 | 453 | ||
468 | /* The top level points to the linear page table pages above. The | 454 | /* The top level points to the linear page table pages above. */ |
469 | * entry representing page_offset points to the first one, and they | ||
470 | * continue from there. */ | ||
471 | for (i = 0; i < mapped_pages; i += ptes_per_page) { | 455 | for (i = 0; i < mapped_pages; i += ptes_per_page) { |
472 | pgdir[(i + page_offset/getpagesize())/ptes_per_page] | 456 | pgdir[i/ptes_per_page] |
473 | = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); | 457 | = ((to_guest_phys(linear) + i*sizeof(void *)) |
458 | | PAGE_PRESENT); | ||
474 | } | 459 | } |
475 | 460 | ||
476 | verbose("Linear mapping of %u pages in %u pte pages at %p\n", | 461 | verbose("Linear mapping of %u pages in %u pte pages at %#lx\n", |
477 | mapped_pages, linear_pages, linear); | 462 | mapped_pages, linear_pages, to_guest_phys(linear)); |
478 | 463 | ||
479 | /* We return the top level (guest-physical) address: the kernel needs | 464 | /* We return the top level (guest-physical) address: the kernel needs |
480 | * to know where it is. */ | 465 | * to know where it is. */ |
481 | return (unsigned long)pgdir; | 466 | return to_guest_phys(pgdir); |
482 | } | 467 | } |
483 | 468 | ||
484 | /* Simple routine to roll all the commandline arguments together with spaces | 469 | /* Simple routine to roll all the commandline arguments together with spaces |
@@ -498,14 +483,17 @@ static void concat(char *dst, char *args[]) | |||
498 | 483 | ||
499 | /* This is where we actually tell the kernel to initialize the Guest. We saw | 484 | /* This is where we actually tell the kernel to initialize the Guest. We saw |
500 | * the arguments it expects when we looked at initialize() in lguest_user.c: | 485 | * the arguments it expects when we looked at initialize() in lguest_user.c: |
501 | * the top physical page to allow, the top level pagetable, the entry point and | 486 | * the base of guest "physical" memory, the top physical page to allow, the |
502 | * the page_offset constant for the Guest. */ | 487 | * top level pagetable and the entry point for the Guest. */ |
503 | static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) | 488 | static int tell_kernel(unsigned long pgdir, unsigned long start) |
504 | { | 489 | { |
505 | u32 args[] = { LHREQ_INITIALIZE, | 490 | unsigned long args[] = { LHREQ_INITIALIZE, |
506 | top/getpagesize(), pgdir, start, page_offset }; | 491 | (unsigned long)guest_base, |
492 | guest_limit / getpagesize(), pgdir, start }; | ||
507 | int fd; | 493 | int fd; |
508 | 494 | ||
495 | verbose("Guest: %p - %p (%#lx)\n", | ||
496 | guest_base, guest_base + guest_limit, guest_limit); | ||
509 | fd = open_or_die("/dev/lguest", O_RDWR); | 497 | fd = open_or_die("/dev/lguest", O_RDWR); |
510 | if (write(fd, args, sizeof(args)) < 0) | 498 | if (write(fd, args, sizeof(args)) < 0) |
511 | err(1, "Writing to /dev/lguest"); | 499 | err(1, "Writing to /dev/lguest"); |
@@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) | |||
515 | } | 503 | } |
516 | /*:*/ | 504 | /*:*/ |
517 | 505 | ||
518 | static void set_fd(int fd, struct device_list *devices) | 506 | static void add_device_fd(int fd) |
519 | { | 507 | { |
520 | FD_SET(fd, &devices->infds); | 508 | FD_SET(fd, &devices.infds); |
521 | if (fd > devices->max_infd) | 509 | if (fd > devices.max_infd) |
522 | devices->max_infd = fd; | 510 | devices.max_infd = fd; |
523 | } | 511 | } |
524 | 512 | ||
525 | /*L:200 | 513 | /*L:200 |
@@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices) | |||
537 | * | 525 | * |
538 | * This, of course, is merely a different *kind* of icky. | 526 | * This, of course, is merely a different *kind* of icky. |
539 | */ | 527 | */ |
540 | static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) | 528 | static void wake_parent(int pipefd, int lguest_fd) |
541 | { | 529 | { |
542 | /* Add the pipe from the Launcher to the fdset in the device_list, so | 530 | /* Add the pipe from the Launcher to the fdset in the device_list, so |
543 | * we watch it, too. */ | 531 | * we watch it, too. */ |
544 | set_fd(pipefd, devices); | 532 | add_device_fd(pipefd); |
545 | 533 | ||
546 | for (;;) { | 534 | for (;;) { |
547 | fd_set rfds = devices->infds; | 535 | fd_set rfds = devices.infds; |
548 | u32 args[] = { LHREQ_BREAK, 1 }; | 536 | unsigned long args[] = { LHREQ_BREAK, 1 }; |
549 | 537 | ||
550 | /* Wait until input is ready from one of the devices. */ | 538 | /* Wait until input is ready from one of the devices. */ |
551 | select(devices->max_infd+1, &rfds, NULL, NULL, NULL); | 539 | select(devices.max_infd+1, &rfds, NULL, NULL, NULL); |
552 | /* Is it a message from the Launcher? */ | 540 | /* Is it a message from the Launcher? */ |
553 | if (FD_ISSET(pipefd, &rfds)) { | 541 | if (FD_ISSET(pipefd, &rfds)) { |
554 | int ignorefd; | 542 | int fd; |
555 | /* If read() returns 0, it means the Launcher has | 543 | /* If read() returns 0, it means the Launcher has |
556 | * exited. We silently follow. */ | 544 | * exited. We silently follow. */ |
557 | if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) | 545 | if (read(pipefd, &fd, sizeof(fd)) == 0) |
558 | exit(0); | 546 | exit(0); |
559 | /* Otherwise it's telling us there's a problem with one | 547 | /* Otherwise it's telling us to change what file |
560 | * of the devices, and we should ignore that file | 548 | * descriptors we're to listen to. */ |
561 | * descriptor from now on. */ | 549 | if (fd >= 0) |
562 | FD_CLR(ignorefd, &devices->infds); | 550 | FD_SET(fd, &devices.infds); |
551 | else | ||
552 | FD_CLR(-fd - 1, &devices.infds); | ||
563 | } else /* Send LHREQ_BREAK command. */ | 553 | } else /* Send LHREQ_BREAK command. */ |
564 | write(lguest_fd, args, sizeof(args)); | 554 | write(lguest_fd, args, sizeof(args)); |
565 | } | 555 | } |
566 | } | 556 | } |
567 | 557 | ||
568 | /* This routine just sets up a pipe to the Waker process. */ | 558 | /* This routine just sets up a pipe to the Waker process. */ |
569 | static int setup_waker(int lguest_fd, struct device_list *device_list) | 559 | static int setup_waker(int lguest_fd) |
570 | { | 560 | { |
571 | int pipefd[2], child; | 561 | int pipefd[2], child; |
572 | 562 | ||
@@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list) | |||
580 | if (child == 0) { | 570 | if (child == 0) { |
581 | /* Close the "writing" end of our copy of the pipe */ | 571 | /* Close the "writing" end of our copy of the pipe */ |
582 | close(pipefd[1]); | 572 | close(pipefd[1]); |
583 | wake_parent(pipefd[0], lguest_fd, device_list); | 573 | wake_parent(pipefd[0], lguest_fd); |
584 | } | 574 | } |
585 | /* Close the reading end of our copy of the pipe. */ | 575 | /* Close the reading end of our copy of the pipe. */ |
586 | close(pipefd[0]); | 576 | close(pipefd[0]); |
@@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size, | |||
602 | { | 592 | { |
603 | /* We have to separately check addr and addr+size, because size could | 593 | /* We have to separately check addr and addr+size, because size could |
604 | * be huge and addr + size might wrap around. */ | 594 | * be huge and addr + size might wrap around. */ |
605 | if (addr >= top || addr + size >= top) | 595 | if (addr >= guest_limit || addr + size >= guest_limit) |
606 | errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); | 596 | errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); |
607 | /* We return a pointer for the caller's convenience, now we know it's | 597 | /* We return a pointer for the caller's convenience, now we know it's |
608 | * safe to use. */ | 598 | * safe to use. */ |
609 | return (void *)addr; | 599 | return from_guest_phys(addr); |
610 | } | 600 | } |
611 | /* A macro which transparently hands the line number to the real function. */ | 601 | /* A macro which transparently hands the line number to the real function. */ |
612 | #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) | 602 | #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) |
613 | 603 | ||
614 | /* The Guest has given us the address of a "struct lguest_dma". We check it's | 604 | /* This function returns the next descriptor in the chain, or vq->vring.num. */ |
615 | * OK and convert it to an iovec (which is a simple array of ptr/size | 605 | static unsigned next_desc(struct virtqueue *vq, unsigned int i) |
616 | * pairs). */ | ||
617 | static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) | ||
618 | { | 606 | { |
619 | unsigned int i; | 607 | unsigned int next; |
620 | struct lguest_dma *udma; | ||
621 | |||
622 | /* First we make sure that the array memory itself is valid. */ | ||
623 | udma = check_pointer(dma, sizeof(*udma)); | ||
624 | /* Now we check each element */ | ||
625 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { | ||
626 | /* A zero length ends the array. */ | ||
627 | if (!udma->len[i]) | ||
628 | break; | ||
629 | 608 | ||
630 | iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); | 609 | /* If this descriptor says it doesn't chain, we're done. */ |
631 | iov[i].iov_len = udma->len[i]; | 610 | if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) |
632 | } | 611 | return vq->vring.num; |
633 | *num = i; | 612 | |
613 | /* Check they're not leading us off end of descriptors. */ | ||
614 | next = vq->vring.desc[i].next; | ||
615 | /* Make sure compiler knows to grab that: we don't want it changing! */ | ||
616 | wmb(); | ||
634 | 617 | ||
635 | /* We return the pointer to where the caller should write the amount of | 618 | if (next >= vq->vring.num) |
636 | * the buffer used. */ | 619 | errx(1, "Desc next is %u", next); |
637 | return &udma->used_len; | 620 | |
621 | return next; | ||
622 | } | ||
623 | |||
624 | /* This looks in the virtqueue and for the first available buffer, and converts | ||
625 | * it to an iovec for convenient access. Since descriptors consist of some | ||
626 | * number of output then some number of input descriptors, it's actually two | ||
627 | * iovecs, but we pack them into one and note how many of each there were. | ||
628 | * | ||
629 | * This function returns the descriptor number found, or vq->vring.num (which | ||
630 | * is never a valid descriptor number) if none was found. */ | ||
631 | static unsigned get_vq_desc(struct virtqueue *vq, | ||
632 | struct iovec iov[], | ||
633 | unsigned int *out_num, unsigned int *in_num) | ||
634 | { | ||
635 | unsigned int i, head; | ||
636 | |||
637 | /* Check it isn't doing very strange things with descriptor numbers. */ | ||
638 | if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) | ||
639 | errx(1, "Guest moved used index from %u to %u", | ||
640 | vq->last_avail_idx, vq->vring.avail->idx); | ||
641 | |||
642 | /* If there's nothing new since last we looked, return invalid. */ | ||
643 | if (vq->vring.avail->idx == vq->last_avail_idx) | ||
644 | return vq->vring.num; | ||
645 | |||
646 | /* Grab the next descriptor number they're advertising, and increment | ||
647 | * the index we've seen. */ | ||
648 | head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; | ||
649 | |||
650 | /* If their number is silly, that's a fatal mistake. */ | ||
651 | if (head >= vq->vring.num) | ||
652 | errx(1, "Guest says index %u is available", head); | ||
653 | |||
654 | /* When we start there are none of either input nor output. */ | ||
655 | *out_num = *in_num = 0; | ||
656 | |||
657 | i = head; | ||
658 | do { | ||
659 | /* Grab the first descriptor, and check it's OK. */ | ||
660 | iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; | ||
661 | iov[*out_num + *in_num].iov_base | ||
662 | = check_pointer(vq->vring.desc[i].addr, | ||
663 | vq->vring.desc[i].len); | ||
664 | /* If this is an input descriptor, increment that count. */ | ||
665 | if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) | ||
666 | (*in_num)++; | ||
667 | else { | ||
668 | /* If it's an output descriptor, they're all supposed | ||
669 | * to come before any input descriptors. */ | ||
670 | if (*in_num) | ||
671 | errx(1, "Descriptor has out after in"); | ||
672 | (*out_num)++; | ||
673 | } | ||
674 | |||
675 | /* If we've got too many, that implies a descriptor loop. */ | ||
676 | if (*out_num + *in_num > vq->vring.num) | ||
677 | errx(1, "Looped descriptor"); | ||
678 | } while ((i = next_desc(vq, i)) != vq->vring.num); | ||
679 | |||
680 | return head; | ||
638 | } | 681 | } |
639 | 682 | ||
640 | /* This routine gets a DMA buffer from the Guest for a given key, and converts | 683 | /* Once we've used one of their buffers, we tell them about it. We'll then |
641 | * it to an iovec array. It returns the interrupt the Guest wants when we're | 684 | * want to send them an interrupt, using trigger_irq(). */ |
642 | * finished, and a pointer to the "used_len" field to fill in. */ | 685 | static void add_used(struct virtqueue *vq, unsigned int head, int len) |
643 | static u32 *get_dma_buffer(int fd, void *key, | ||
644 | struct iovec iov[], unsigned int *num, u32 *irq) | ||
645 | { | 686 | { |
646 | u32 buf[] = { LHREQ_GETDMA, (u32)key }; | 687 | struct vring_used_elem *used; |
647 | unsigned long udma; | 688 | |
648 | u32 *res; | 689 | /* Get a pointer to the next entry in the used ring. */ |
649 | 690 | used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; | |
650 | /* Ask the kernel for a DMA buffer corresponding to this key. */ | 691 | used->id = head; |
651 | udma = write(fd, buf, sizeof(buf)); | 692 | used->len = len; |
652 | /* They haven't registered any, or they're all used? */ | 693 | /* Make sure buffer is written before we update index. */ |
653 | if (udma == (unsigned long)-1) | 694 | wmb(); |
654 | return NULL; | 695 | vq->vring.used->idx++; |
655 | |||
656 | /* Convert it into our iovec array */ | ||
657 | res = dma2iov(udma, iov, num); | ||
658 | /* The kernel stashes irq in ->used_len to get it out to us. */ | ||
659 | *irq = *res; | ||
660 | /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */ | ||
661 | return res; | ||
662 | } | 696 | } |
663 | 697 | ||
664 | /* This is a convenient routine to send the Guest an interrupt. */ | 698 | /* This actually sends the interrupt for this virtqueue */ |
665 | static void trigger_irq(int fd, u32 irq) | 699 | static void trigger_irq(int fd, struct virtqueue *vq) |
666 | { | 700 | { |
667 | u32 buf[] = { LHREQ_IRQ, irq }; | 701 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; |
702 | |||
703 | if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) | ||
704 | return; | ||
705 | |||
706 | /* Send the Guest an interrupt tell them we used something up. */ | ||
668 | if (write(fd, buf, sizeof(buf)) != 0) | 707 | if (write(fd, buf, sizeof(buf)) != 0) |
669 | err(1, "Triggering irq %i", irq); | 708 | err(1, "Triggering irq %i", vq->config.irq); |
670 | } | 709 | } |
671 | 710 | ||
672 | /* This simply sets up an iovec array where we can put data to be discarded. | 711 | /* And here's the combo meal deal. Supersize me! */ |
673 | * This happens when the Guest doesn't want or can't handle the input: we have | 712 | static void add_used_and_trigger(int fd, struct virtqueue *vq, |
674 | * to get rid of it somewhere, and if we bury it in the ceiling space it will | 713 | unsigned int head, int len) |
675 | * start to smell after a week. */ | ||
676 | static void discard_iovec(struct iovec *iov, unsigned int *num) | ||
677 | { | 714 | { |
678 | static char discard_buf[1024]; | 715 | add_used(vq, head, len); |
679 | *num = 1; | 716 | trigger_irq(fd, vq); |
680 | iov->iov_base = discard_buf; | ||
681 | iov->iov_len = sizeof(discard_buf); | ||
682 | } | 717 | } |
683 | 718 | ||
684 | /* Here is the input terminal setting we save, and the routine to restore them | 719 | /* Here is the input terminal setting we save, and the routine to restore them |
@@ -701,38 +736,39 @@ struct console_abort | |||
701 | /* This is the routine which handles console input (ie. stdin). */ | 736 | /* This is the routine which handles console input (ie. stdin). */ |
702 | static bool handle_console_input(int fd, struct device *dev) | 737 | static bool handle_console_input(int fd, struct device *dev) |
703 | { | 738 | { |
704 | u32 irq = 0, *lenp; | ||
705 | int len; | 739 | int len; |
706 | unsigned int num; | 740 | unsigned int head, in_num, out_num; |
707 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | 741 | struct iovec iov[dev->vq->vring.num]; |
708 | struct console_abort *abort = dev->priv; | 742 | struct console_abort *abort = dev->priv; |
709 | 743 | ||
710 | /* First we get the console buffer from the Guest. The key is dev->mem | 744 | /* First we need a console buffer from the Guests's input virtqueue. */ |
711 | * which was set to 0 in setup_console(). */ | 745 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); |
712 | lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); | 746 | |
713 | if (!lenp) { | 747 | /* If they're not ready for input, stop listening to this file |
714 | /* If it's not ready for input, warn and set up to discard. */ | 748 | * descriptor. We'll start again once they add an input buffer. */ |
715 | warn("console: no dma buffer!"); | 749 | if (head == dev->vq->vring.num) |
716 | discard_iovec(iov, &num); | 750 | return false; |
717 | } | 751 | |
752 | if (out_num) | ||
753 | errx(1, "Output buffers in console in queue?"); | ||
718 | 754 | ||
719 | /* This is why we convert to iovecs: the readv() call uses them, and so | 755 | /* This is why we convert to iovecs: the readv() call uses them, and so |
720 | * it reads straight into the Guest's buffer. */ | 756 | * it reads straight into the Guest's buffer. */ |
721 | len = readv(dev->fd, iov, num); | 757 | len = readv(dev->fd, iov, in_num); |
722 | if (len <= 0) { | 758 | if (len <= 0) { |
723 | /* This implies that the console is closed, is /dev/null, or | 759 | /* This implies that the console is closed, is /dev/null, or |
724 | * something went terribly wrong. We still go through the rest | 760 | * something went terribly wrong. */ |
725 | * of the logic, though, especially the exit handling below. */ | ||
726 | warnx("Failed to get console input, ignoring console."); | 761 | warnx("Failed to get console input, ignoring console."); |
727 | len = 0; | 762 | /* Put the input terminal back. */ |
763 | restore_term(); | ||
764 | /* Remove callback from input vq, so it doesn't restart us. */ | ||
765 | dev->vq->handle_output = NULL; | ||
766 | /* Stop listening to this fd: don't call us again. */ | ||
767 | return false; | ||
728 | } | 768 | } |
729 | 769 | ||
730 | /* If we read the data into the Guest, fill in the length and send the | 770 | /* Tell the Guest about the new input. */ |
731 | * interrupt. */ | 771 | add_used_and_trigger(fd, dev->vq, head, len); |
732 | if (lenp) { | ||
733 | *lenp = len; | ||
734 | trigger_irq(fd, irq); | ||
735 | } | ||
736 | 772 | ||
737 | /* Three ^C within one second? Exit. | 773 | /* Three ^C within one second? Exit. |
738 | * | 774 | * |
@@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev) | |||
746 | struct timeval now; | 782 | struct timeval now; |
747 | gettimeofday(&now, NULL); | 783 | gettimeofday(&now, NULL); |
748 | if (now.tv_sec <= abort->start.tv_sec+1) { | 784 | if (now.tv_sec <= abort->start.tv_sec+1) { |
749 | u32 args[] = { LHREQ_BREAK, 0 }; | 785 | unsigned long args[] = { LHREQ_BREAK, 0 }; |
750 | /* Close the fd so Waker will know it has to | 786 | /* Close the fd so Waker will know it has to |
751 | * exit. */ | 787 | * exit. */ |
752 | close(waker_fd); | 788 | close(waker_fd); |
@@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev) | |||
761 | /* Any other key resets the abort counter. */ | 797 | /* Any other key resets the abort counter. */ |
762 | abort->count = 0; | 798 | abort->count = 0; |
763 | 799 | ||
764 | /* Now, if we didn't read anything, put the input terminal back and | ||
765 | * return failure (meaning, don't call us again). */ | ||
766 | if (!len) { | ||
767 | restore_term(); | ||
768 | return false; | ||
769 | } | ||
770 | /* Everything went OK! */ | 800 | /* Everything went OK! */ |
771 | return true; | 801 | return true; |
772 | } | 802 | } |
773 | 803 | ||
774 | /* Handling console output is much simpler than input. */ | 804 | /* Handling output for console is simple: we just get all the output buffers |
775 | static u32 handle_console_output(int fd, const struct iovec *iov, | 805 | * and write them to stdout. */ |
776 | unsigned num, struct device*dev) | 806 | static void handle_console_output(int fd, struct virtqueue *vq) |
777 | { | 807 | { |
778 | /* Whatever the Guest sends, write it to standard output. Return the | 808 | unsigned int head, out, in; |
779 | * number of bytes written. */ | 809 | int len; |
780 | return writev(STDOUT_FILENO, iov, num); | 810 | struct iovec iov[vq->vring.num]; |
781 | } | 811 | |
782 | 812 | /* Keep getting output buffers from the Guest until we run out. */ | |
783 | /* Guest->Host network output is also pretty easy. */ | 813 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { |
784 | static u32 handle_tun_output(int fd, const struct iovec *iov, | 814 | if (in) |
785 | unsigned num, struct device *dev) | 815 | errx(1, "Input buffers in output queue?"); |
786 | { | 816 | len = writev(STDOUT_FILENO, iov, out); |
787 | /* We put a flag in the "priv" pointer of the network device, and set | 817 | add_used_and_trigger(fd, vq, head, len); |
788 | * it as soon as we see output. We'll see why in handle_tun_input() */ | 818 | } |
789 | *(bool *)dev->priv = true; | ||
790 | /* Whatever packet the Guest sent us, write it out to the tun | ||
791 | * device. */ | ||
792 | return writev(dev->fd, iov, num); | ||
793 | } | 819 | } |
794 | 820 | ||
795 | /* This matches the peer_key() in lguest_net.c. The key for any given slot | 821 | /* Handling output for network is also simple: we get all the output buffers |
796 | * is the address of the network device's page plus 4 * the slot number. */ | 822 | * and write them (ignoring the first element) to this device's file descriptor |
797 | static unsigned long peer_offset(unsigned int peernum) | 823 | * (stdout). */ |
824 | static void handle_net_output(int fd, struct virtqueue *vq) | ||
798 | { | 825 | { |
799 | return 4 * peernum; | 826 | unsigned int head, out, in; |
827 | int len; | ||
828 | struct iovec iov[vq->vring.num]; | ||
829 | |||
830 | /* Keep getting output buffers from the Guest until we run out. */ | ||
831 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | ||
832 | if (in) | ||
833 | errx(1, "Input buffers in output queue?"); | ||
834 | /* Check header, but otherwise ignore it (we said we supported | ||
835 | * no features). */ | ||
836 | (void)convert(&iov[0], struct virtio_net_hdr); | ||
837 | len = writev(vq->dev->fd, iov+1, out-1); | ||
838 | add_used_and_trigger(fd, vq, head, len); | ||
839 | } | ||
800 | } | 840 | } |
801 | 841 | ||
802 | /* This is where we handle a packet coming in from the tun device */ | 842 | /* This is where we handle a packet coming in from the tun device to our |
843 | * Guest. */ | ||
803 | static bool handle_tun_input(int fd, struct device *dev) | 844 | static bool handle_tun_input(int fd, struct device *dev) |
804 | { | 845 | { |
805 | u32 irq = 0, *lenp; | 846 | unsigned int head, in_num, out_num; |
806 | int len; | 847 | int len; |
807 | unsigned num; | 848 | struct iovec iov[dev->vq->vring.num]; |
808 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | 849 | struct virtio_net_hdr *hdr; |
809 | 850 | ||
810 | /* First we get a buffer the Guest has bound to its key. */ | 851 | /* First we need a network buffer from the Guests's recv virtqueue. */ |
811 | lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, | 852 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); |
812 | &irq); | 853 | if (head == dev->vq->vring.num) { |
813 | if (!lenp) { | ||
814 | /* Now, it's expected that if we try to send a packet too | 854 | /* Now, it's expected that if we try to send a packet too |
815 | * early, the Guest won't be ready yet. This is why we set a | 855 | * early, the Guest won't be ready yet. Wait until the device |
816 | * flag when the Guest sends its first packet. If it's sent a | 856 | * status says it's ready. */ |
817 | * packet we assume it should be ready to receive them. | 857 | /* FIXME: Actually want DRIVER_ACTIVE here. */ |
818 | * | 858 | if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) |
819 | * Actually, this is what the status bits in the descriptor are | ||
820 | * for: we should *use* them. FIXME! */ | ||
821 | if (*(bool *)dev->priv) | ||
822 | warn("network: no dma buffer!"); | 859 | warn("network: no dma buffer!"); |
823 | discard_iovec(iov, &num); | 860 | /* We'll turn this back on if input buffers are registered. */ |
824 | } | 861 | return false; |
862 | } else if (out_num) | ||
863 | errx(1, "Output buffers in network recv queue?"); | ||
864 | |||
865 | /* First element is the header: we set it to 0 (no features). */ | ||
866 | hdr = convert(&iov[0], struct virtio_net_hdr); | ||
867 | hdr->flags = 0; | ||
868 | hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; | ||
825 | 869 | ||
826 | /* Read the packet from the device directly into the Guest's buffer. */ | 870 | /* Read the packet from the device directly into the Guest's buffer. */ |
827 | len = readv(dev->fd, iov, num); | 871 | len = readv(dev->fd, iov+1, in_num-1); |
828 | if (len <= 0) | 872 | if (len <= 0) |
829 | err(1, "reading network"); | 873 | err(1, "reading network"); |
830 | 874 | ||
831 | /* Write the used_len, and trigger the interrupt for the Guest */ | 875 | /* Tell the Guest about the new packet. */ |
832 | if (lenp) { | 876 | add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len); |
833 | *lenp = len; | 877 | |
834 | trigger_irq(fd, irq); | ||
835 | } | ||
836 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, | 878 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, |
837 | ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], | 879 | ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], |
838 | lenp ? "sent" : "discarded"); | 880 | head != dev->vq->vring.num ? "sent" : "discarded"); |
881 | |||
839 | /* All good. */ | 882 | /* All good. */ |
840 | return true; | 883 | return true; |
841 | } | 884 | } |
842 | 885 | ||
843 | /* The last device handling routine is block output: the Guest has sent a DMA | 886 | /* This callback ensures we try again, in case we stopped console or net |
844 | * to the block device. It will have placed the command it wants in the | 887 | * delivery because Guest didn't have any buffers. */ |
845 | * "struct lguest_block_page". */ | 888 | static void enable_fd(int fd, struct virtqueue *vq) |
846 | static u32 handle_block_output(int fd, const struct iovec *iov, | ||
847 | unsigned num, struct device *dev) | ||
848 | { | 889 | { |
849 | struct lguest_block_page *p = dev->mem; | 890 | add_device_fd(vq->dev->fd); |
850 | u32 irq, *lenp; | 891 | /* Tell waker to listen to it again */ |
851 | unsigned int len, reply_num; | 892 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); |
852 | struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; | ||
853 | off64_t device_len, off = (off64_t)p->sector * 512; | ||
854 | |||
855 | /* First we extract the device length from the dev->priv pointer. */ | ||
856 | device_len = *(off64_t *)dev->priv; | ||
857 | |||
858 | /* We first check that the read or write is within the length of the | ||
859 | * block file. */ | ||
860 | if (off >= device_len) | ||
861 | err(1, "Bad offset %llu vs %llu", off, device_len); | ||
862 | /* Move to the right location in the block file. This shouldn't fail, | ||
863 | * but best to check. */ | ||
864 | if (lseek64(dev->fd, off, SEEK_SET) != off) | ||
865 | err(1, "Bad seek to sector %i", p->sector); | ||
866 | |||
867 | verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); | ||
868 | |||
869 | /* They were supposed to bind a reply buffer at key equal to the start | ||
870 | * of the block device memory. We need this to tell them when the | ||
871 | * request is finished. */ | ||
872 | lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); | ||
873 | if (!lenp) | ||
874 | err(1, "Block request didn't give us a dma buffer"); | ||
875 | |||
876 | if (p->type) { | ||
877 | /* A write request. The DMA they sent contained the data, so | ||
878 | * write it out. */ | ||
879 | len = writev(dev->fd, iov, num); | ||
880 | /* Grr... Now we know how long the "struct lguest_dma" they | ||
881 | * sent was, we make sure they didn't try to write over the end | ||
882 | * of the block file (possibly extending it). */ | ||
883 | if (off + len > device_len) { | ||
884 | /* Trim it back to the correct length */ | ||
885 | ftruncate64(dev->fd, device_len); | ||
886 | /* Die, bad Guest, die. */ | ||
887 | errx(1, "Write past end %llu+%u", off, len); | ||
888 | } | ||
889 | /* The reply length is 0: we just send back an empty DMA to | ||
890 | * interrupt them and tell them the write is finished. */ | ||
891 | *lenp = 0; | ||
892 | } else { | ||
893 | /* A read request. They sent an empty DMA to start the | ||
894 | * request, and we put the read contents into the reply | ||
895 | * buffer. */ | ||
896 | len = readv(dev->fd, reply, reply_num); | ||
897 | *lenp = len; | ||
898 | } | ||
899 | |||
900 | /* The result is 1 (done), 2 if there was an error (short read or | ||
901 | * write). */ | ||
902 | p->result = 1 + (p->bytes != len); | ||
903 | /* Now tell them we've used their reply buffer. */ | ||
904 | trigger_irq(fd, irq); | ||
905 | |||
906 | /* We're supposed to return the number of bytes of the output buffer we | ||
907 | * used. But the block device uses the "result" field instead, so we | ||
908 | * don't bother. */ | ||
909 | return 0; | ||
910 | } | 893 | } |
911 | 894 | ||
912 | /* This is the generic routine we call when the Guest sends some DMA out. */ | 895 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ |
913 | static void handle_output(int fd, unsigned long dma, unsigned long key, | 896 | static void handle_output(int fd, unsigned long addr) |
914 | struct device_list *devices) | ||
915 | { | 897 | { |
916 | struct device *i; | 898 | struct device *i; |
917 | u32 *lenp; | 899 | struct virtqueue *vq; |
918 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | 900 | |
919 | unsigned num = 0; | 901 | /* Check each virtqueue. */ |
920 | 902 | for (i = devices.dev; i; i = i->next) { | |
921 | /* Convert the "struct lguest_dma" they're sending to a "struct | 903 | for (vq = i->vq; vq; vq = vq->next) { |
922 | * iovec". */ | 904 | if (vq->config.pfn == addr/getpagesize() |
923 | lenp = dma2iov(dma, iov, &num); | 905 | && vq->handle_output) { |
924 | 906 | verbose("Output to %s\n", vq->dev->name); | |
925 | /* Check each device: if they expect output to this key, tell them to | 907 | vq->handle_output(fd, vq); |
926 | * handle it. */ | 908 | return; |
927 | for (i = devices->dev; i; i = i->next) { | 909 | } |
928 | if (i->handle_output && key == i->watch_key) { | ||
929 | /* We write the result straight into the used_len field | ||
930 | * for them. */ | ||
931 | *lenp = i->handle_output(fd, iov, num, i); | ||
932 | return; | ||
933 | } | 910 | } |
934 | } | 911 | } |
935 | 912 | ||
936 | /* This can happen: the kernel sends any SEND_DMA which doesn't match | 913 | /* Early console write is done using notify on a nul-terminated string |
937 | * another Guest to us. It could be that another Guest just left a | 914 | * in Guest memory. */ |
938 | * network, for example. But it's unusual. */ | 915 | if (addr >= guest_limit) |
939 | warnx("Pending dma %p, key %p", (void *)dma, (void *)key); | 916 | errx(1, "Bad NOTIFY %#lx", addr); |
917 | |||
918 | write(STDOUT_FILENO, from_guest_phys(addr), | ||
919 | strnlen(from_guest_phys(addr), guest_limit - addr)); | ||
940 | } | 920 | } |
941 | 921 | ||
942 | /* This is called when the waker wakes us up: check for incoming file | 922 | /* This is called when the waker wakes us up: check for incoming file |
943 | * descriptors. */ | 923 | * descriptors. */ |
944 | static void handle_input(int fd, struct device_list *devices) | 924 | static void handle_input(int fd) |
945 | { | 925 | { |
946 | /* select() wants a zeroed timeval to mean "don't wait". */ | 926 | /* select() wants a zeroed timeval to mean "don't wait". */ |
947 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; | 927 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; |
948 | 928 | ||
949 | for (;;) { | 929 | for (;;) { |
950 | struct device *i; | 930 | struct device *i; |
951 | fd_set fds = devices->infds; | 931 | fd_set fds = devices.infds; |
952 | 932 | ||
953 | /* If nothing is ready, we're done. */ | 933 | /* If nothing is ready, we're done. */ |
954 | if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) | 934 | if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) |
955 | break; | 935 | break; |
956 | 936 | ||
957 | /* Otherwise, call the device(s) which have readable | 937 | /* Otherwise, call the device(s) which have readable |
958 | * file descriptors and a method of handling them. */ | 938 | * file descriptors and a method of handling them. */ |
959 | for (i = devices->dev; i; i = i->next) { | 939 | for (i = devices.dev; i; i = i->next) { |
960 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | 940 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { |
941 | int dev_fd; | ||
942 | if (i->handle_input(fd, i)) | ||
943 | continue; | ||
944 | |||
961 | /* If handle_input() returns false, it means we | 945 | /* If handle_input() returns false, it means we |
962 | * should no longer service it. | 946 | * should no longer service it. Networking and |
963 | * handle_console_input() does this. */ | 947 | * console do this when there's no input |
964 | if (!i->handle_input(fd, i)) { | 948 | * buffers to deliver into. Console also uses |
965 | /* Clear it from the set of input file | 949 | * it when it discovers that stdin is |
966 | * descriptors kept at the head of the | 950 | * closed. */ |
967 | * device list. */ | 951 | FD_CLR(i->fd, &devices.infds); |
968 | FD_CLR(i->fd, &devices->infds); | 952 | /* Tell waker to ignore it too, by sending a |
969 | /* Tell waker to ignore it too... */ | 953 | * negative fd number (-1, since 0 is a valid |
970 | write(waker_fd, &i->fd, sizeof(i->fd)); | 954 | * FD number). */ |
971 | } | 955 | dev_fd = -i->fd - 1; |
956 | write(waker_fd, &dev_fd, sizeof(dev_fd)); | ||
972 | } | 957 | } |
973 | } | 958 | } |
974 | } | 959 | } |
@@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices) | |||
982 | * routines to allocate them. | 967 | * routines to allocate them. |
983 | * | 968 | * |
984 | * This routine allocates a new "struct lguest_device_desc" from descriptor | 969 | * This routine allocates a new "struct lguest_device_desc" from descriptor |
985 | * table in the devices array just above the Guest's normal memory. */ | 970 | * table just above the Guest's normal memory. It returns a pointer to that |
986 | static struct lguest_device_desc * | 971 | * descriptor. */ |
987 | new_dev_desc(struct lguest_device_desc *descs, | 972 | static struct lguest_device_desc *new_dev_desc(u16 type) |
988 | u16 type, u16 features, u16 num_pages) | ||
989 | { | 973 | { |
990 | unsigned int i; | 974 | struct lguest_device_desc *d; |
991 | 975 | ||
992 | for (i = 0; i < LGUEST_MAX_DEVICES; i++) { | 976 | /* We only have one page for all the descriptors. */ |
993 | if (!descs[i].type) { | 977 | if (devices.desc_used + sizeof(*d) > getpagesize()) |
994 | descs[i].type = type; | 978 | errx(1, "Too many devices"); |
995 | descs[i].features = features; | 979 | |
996 | descs[i].num_pages = num_pages; | 980 | /* We don't need to set config_len or status: page is 0 already. */ |
997 | /* If they said the device needs memory, we allocate | 981 | d = (void *)devices.descpage + devices.desc_used; |
998 | * that now, bumping up the top of Guest memory. */ | 982 | d->type = type; |
999 | if (num_pages) { | 983 | devices.desc_used += sizeof(*d); |
1000 | map_zeroed_pages(top, num_pages); | 984 | |
1001 | descs[i].pfn = top/getpagesize(); | 985 | return d; |
1002 | top += num_pages*getpagesize(); | ||
1003 | } | ||
1004 | return &descs[i]; | ||
1005 | } | ||
1006 | } | ||
1007 | errx(1, "too many devices"); | ||
1008 | } | 986 | } |
1009 | 987 | ||
1010 | /* This monster routine does all the creation and setup of a new device, | 988 | /* Each device descriptor is followed by some configuration information. |
1011 | * including caling new_dev_desc() to allocate the descriptor and device | 989 | * The first byte is a "status" byte for the Guest to report what's happening. |
1012 | * memory. */ | 990 | * After that are fields: u8 type, u8 len, [... len bytes...]. |
1013 | static struct device *new_device(struct device_list *devices, | 991 | * |
1014 | u16 type, u16 num_pages, u16 features, | 992 | * This routine adds a new field to an existing device's descriptor. It only |
1015 | int fd, | 993 | * works for the last device, but that's OK because that's how we use it. */ |
1016 | bool (*handle_input)(int, struct device *), | 994 | static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) |
1017 | unsigned long watch_off, | 995 | { |
1018 | u32 (*handle_output)(int, | 996 | /* This is the last descriptor, right? */ |
1019 | const struct iovec *, | 997 | assert(devices.descpage + devices.desc_used |
1020 | unsigned, | 998 | == (u8 *)(dev->desc + 1) + dev->desc->config_len); |
1021 | struct device *)) | 999 | |
1000 | /* We only have one page of device descriptions. */ | ||
1001 | if (devices.desc_used + 2 + len > getpagesize()) | ||
1002 | errx(1, "Too many devices"); | ||
1003 | |||
1004 | /* Copy in the new config header: type then length. */ | ||
1005 | devices.descpage[devices.desc_used++] = type; | ||
1006 | devices.descpage[devices.desc_used++] = len; | ||
1007 | memcpy(devices.descpage + devices.desc_used, c, len); | ||
1008 | devices.desc_used += len; | ||
1009 | |||
1010 | /* Update the device descriptor length: two byte head then data. */ | ||
1011 | dev->desc->config_len += 2 + len; | ||
1012 | } | ||
1013 | |||
1014 | /* This routine adds a virtqueue to a device. We specify how many descriptors | ||
1015 | * the virtqueue is to have. */ | ||
1016 | static void add_virtqueue(struct device *dev, unsigned int num_descs, | ||
1017 | void (*handle_output)(int fd, struct virtqueue *me)) | ||
1018 | { | ||
1019 | unsigned int pages; | ||
1020 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); | ||
1021 | void *p; | ||
1022 | |||
1023 | /* First we need some pages for this virtqueue. */ | ||
1024 | pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize(); | ||
1025 | p = get_pages(pages); | ||
1026 | |||
1027 | /* Initialize the configuration. */ | ||
1028 | vq->config.num = num_descs; | ||
1029 | vq->config.irq = devices.next_irq++; | ||
1030 | vq->config.pfn = to_guest_phys(p) / getpagesize(); | ||
1031 | |||
1032 | /* Initialize the vring. */ | ||
1033 | vring_init(&vq->vring, num_descs, p); | ||
1034 | |||
1035 | /* Add the configuration information to this device's descriptor. */ | ||
1036 | add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, | ||
1037 | sizeof(vq->config), &vq->config); | ||
1038 | |||
1039 | /* Add to tail of list, so dev->vq is first vq, dev->vq->next is | ||
1040 | * second. */ | ||
1041 | for (i = &dev->vq; *i; i = &(*i)->next); | ||
1042 | *i = vq; | ||
1043 | |||
1044 | /* Link virtqueue back to device. */ | ||
1045 | vq->dev = dev; | ||
1046 | |||
1047 | /* Set up handler. */ | ||
1048 | vq->handle_output = handle_output; | ||
1049 | if (!handle_output) | ||
1050 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; | ||
1051 | } | ||
1052 | |||
1053 | /* This routine does all the creation and setup of a new device, including | ||
1054 | * caling new_dev_desc() to allocate the descriptor and device memory. */ | ||
1055 | static struct device *new_device(const char *name, u16 type, int fd, | ||
1056 | bool (*handle_input)(int, struct device *)) | ||
1022 | { | 1057 | { |
1023 | struct device *dev = malloc(sizeof(*dev)); | 1058 | struct device *dev = malloc(sizeof(*dev)); |
1024 | 1059 | ||
@@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices, | |||
1026 | * easier, but the user expects the devices to be arranged on the bus | 1061 | * easier, but the user expects the devices to be arranged on the bus |
1027 | * in command-line order. The first network device on the command line | 1062 | * in command-line order. The first network device on the command line |
1028 | * is eth0, the first block device /dev/lgba, etc. */ | 1063 | * is eth0, the first block device /dev/lgba, etc. */ |
1029 | *devices->lastdev = dev; | 1064 | *devices.lastdev = dev; |
1030 | dev->next = NULL; | 1065 | dev->next = NULL; |
1031 | devices->lastdev = &dev->next; | 1066 | devices.lastdev = &dev->next; |
1032 | 1067 | ||
1033 | /* Now we populate the fields one at a time. */ | 1068 | /* Now we populate the fields one at a time. */ |
1034 | dev->fd = fd; | 1069 | dev->fd = fd; |
1035 | /* If we have an input handler for this file descriptor, then we add it | 1070 | /* If we have an input handler for this file descriptor, then we add it |
1036 | * to the device_list's fdset and maxfd. */ | 1071 | * to the device_list's fdset and maxfd. */ |
1037 | if (handle_input) | 1072 | if (handle_input) |
1038 | set_fd(dev->fd, devices); | 1073 | add_device_fd(dev->fd); |
1039 | dev->desc = new_dev_desc(devices->descs, type, features, num_pages); | 1074 | dev->desc = new_dev_desc(type); |
1040 | dev->mem = (void *)(dev->desc->pfn * getpagesize()); | ||
1041 | dev->handle_input = handle_input; | 1075 | dev->handle_input = handle_input; |
1042 | dev->watch_key = (unsigned long)dev->mem + watch_off; | 1076 | dev->name = name; |
1043 | dev->handle_output = handle_output; | ||
1044 | return dev; | 1077 | return dev; |
1045 | } | 1078 | } |
1046 | 1079 | ||
1047 | /* Our first setup routine is the console. It's a fairly simple device, but | 1080 | /* Our first setup routine is the console. It's a fairly simple device, but |
1048 | * UNIX tty handling makes it uglier than it could be. */ | 1081 | * UNIX tty handling makes it uglier than it could be. */ |
1049 | static void setup_console(struct device_list *devices) | 1082 | static void setup_console(void) |
1050 | { | 1083 | { |
1051 | struct device *dev; | 1084 | struct device *dev; |
1052 | 1085 | ||
@@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices) | |||
1062 | atexit(restore_term); | 1095 | atexit(restore_term); |
1063 | } | 1096 | } |
1064 | 1097 | ||
1065 | /* We don't currently require any memory for the console, so we ask for | 1098 | dev = new_device("console", VIRTIO_ID_CONSOLE, |
1066 | * 0 pages. */ | 1099 | STDIN_FILENO, handle_console_input); |
1067 | dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, | ||
1068 | STDIN_FILENO, handle_console_input, | ||
1069 | LGUEST_CONSOLE_DMA_KEY, handle_console_output); | ||
1070 | /* We store the console state in dev->priv, and initialize it. */ | 1100 | /* We store the console state in dev->priv, and initialize it. */ |
1071 | dev->priv = malloc(sizeof(struct console_abort)); | 1101 | dev->priv = malloc(sizeof(struct console_abort)); |
1072 | ((struct console_abort *)dev->priv)->count = 0; | 1102 | ((struct console_abort *)dev->priv)->count = 0; |
1073 | verbose("device %p: console\n", | ||
1074 | (void *)(dev->desc->pfn * getpagesize())); | ||
1075 | } | ||
1076 | 1103 | ||
1077 | /* Setting up a block file is also fairly straightforward. */ | 1104 | /* The console needs two virtqueues: the input then the output. When |
1078 | static void setup_block_file(const char *filename, struct device_list *devices) | 1105 | * they put something in the input queue, we make sure we're listening to |
1079 | { | 1106 | * stdin. When they put something in the output queue, we write it to |
1080 | int fd; | 1107 | * stdout. */ |
1081 | struct device *dev; | 1108 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); |
1082 | off64_t *device_len; | 1109 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); |
1083 | struct lguest_block_page *p; | 1110 | |
1084 | 1111 | verbose("device %u: console\n", devices.device_num++); | |
1085 | /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We | ||
1086 | * open with O_DIRECT because otherwise our benchmarks go much too | ||
1087 | * fast. */ | ||
1088 | fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); | ||
1089 | |||
1090 | /* We want one page, and have no input handler (the block file never | ||
1091 | * has anything interesting to say to us). Our timing will be quite | ||
1092 | * random, so it should be a reasonable randomness source. */ | ||
1093 | dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, | ||
1094 | LGUEST_DEVICE_F_RANDOMNESS, | ||
1095 | fd, NULL, 0, handle_block_output); | ||
1096 | |||
1097 | /* We store the device size in the private area */ | ||
1098 | device_len = dev->priv = malloc(sizeof(*device_len)); | ||
1099 | /* This is the safe way of establishing the size of our device: it | ||
1100 | * might be a normal file or an actual block device like /dev/hdb. */ | ||
1101 | *device_len = lseek64(fd, 0, SEEK_END); | ||
1102 | |||
1103 | /* The device memory is a "struct lguest_block_page". It's zeroed | ||
1104 | * already, we just need to put in the device size. Block devices | ||
1105 | * think in sectors (ie. 512 byte chunks), so we translate here. */ | ||
1106 | p = dev->mem; | ||
1107 | p->num_sectors = *device_len/512; | ||
1108 | verbose("device %p: block %i sectors\n", | ||
1109 | (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); | ||
1110 | } | 1112 | } |
1113 | /*:*/ | ||
1111 | 1114 | ||
1112 | /* | 1115 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a |
1113 | * Network Devices. | 1116 | * --sharenet=<name> option which opens or creates a named pipe. This can be |
1117 | * used to send packets to another guest in a 1:1 manner. | ||
1114 | * | 1118 | * |
1115 | * Setting up network devices is quite a pain, because we have three types. | 1119 | * More sophisticated is to use one of the tools developed for projects like UML |
1116 | * First, we have the inter-Guest network. This is a file which is mapped into | 1120 | * to do networking. |
1117 | * the address space of the Guests who are on the network. Because it is a | ||
1118 | * shared mapping, the same page underlies all the devices, and they can send | ||
1119 | * DMA to each other. | ||
1120 | * | 1121 | * |
1121 | * Remember from our network driver, the Guest is told what slot in the page it | 1122 | * Faster is to do virtio bonding in kernel. Doing this 1:1 would be |
1122 | * is to use. We use exclusive fnctl locks to reserve a slot. If another | 1123 | * completely generic ("here's my vring, attach to your vring") and would work |
1123 | * Guest is using a slot, the lock will fail and we try another. Because fnctl | 1124 | * for any traffic. Of course, namespace and permissions issues need to be |
1124 | * locks are cleaned up automatically when we die, this cleverly means that our | 1125 | * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide |
1125 | * reservation on the slot will vanish if we crash. */ | 1126 | * multiple inter-guest channels behind one interface, although it would |
1126 | static unsigned int find_slot(int netfd, const char *filename) | 1127 | * require some manner of hotplugging new virtio channels. |
1127 | { | 1128 | * |
1128 | struct flock fl; | 1129 | * Finally, we could implement a virtio network switch in the kernel. :*/ |
1129 | |||
1130 | fl.l_type = F_WRLCK; | ||
1131 | fl.l_whence = SEEK_SET; | ||
1132 | fl.l_len = 1; | ||
1133 | /* Try a 1 byte lock in each possible position number */ | ||
1134 | for (fl.l_start = 0; | ||
1135 | fl.l_start < getpagesize()/sizeof(struct lguest_net); | ||
1136 | fl.l_start++) { | ||
1137 | /* If we succeed, return the slot number. */ | ||
1138 | if (fcntl(netfd, F_SETLK, &fl) == 0) | ||
1139 | return fl.l_start; | ||
1140 | } | ||
1141 | errx(1, "No free slots in network file %s", filename); | ||
1142 | } | ||
1143 | |||
1144 | /* This function sets up the network file */ | ||
1145 | static void setup_net_file(const char *filename, | ||
1146 | struct device_list *devices) | ||
1147 | { | ||
1148 | int netfd; | ||
1149 | struct device *dev; | ||
1150 | |||
1151 | /* We don't use open_or_die() here: for friendliness we create the file | ||
1152 | * if it doesn't already exist. */ | ||
1153 | netfd = open(filename, O_RDWR, 0); | ||
1154 | if (netfd < 0) { | ||
1155 | if (errno == ENOENT) { | ||
1156 | netfd = open(filename, O_RDWR|O_CREAT, 0600); | ||
1157 | if (netfd >= 0) { | ||
1158 | /* If we succeeded, initialize the file with a | ||
1159 | * blank page. */ | ||
1160 | char page[getpagesize()]; | ||
1161 | memset(page, 0, sizeof(page)); | ||
1162 | write(netfd, page, sizeof(page)); | ||
1163 | } | ||
1164 | } | ||
1165 | if (netfd < 0) | ||
1166 | err(1, "cannot open net file '%s'", filename); | ||
1167 | } | ||
1168 | |||
1169 | /* We need 1 page, and the features indicate the slot to use and that | ||
1170 | * no checksum is needed. We never touch this device again; it's | ||
1171 | * between the Guests on the network, so we don't register input or | ||
1172 | * output handlers. */ | ||
1173 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, | ||
1174 | find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, | ||
1175 | -1, NULL, 0, NULL); | ||
1176 | |||
1177 | /* Map the shared file. */ | ||
1178 | if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, | ||
1179 | MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) | ||
1180 | err(1, "could not mmap '%s'", filename); | ||
1181 | verbose("device %p: shared net %s, peer %i\n", | ||
1182 | (void *)(dev->desc->pfn * getpagesize()), filename, | ||
1183 | dev->desc->features & ~LGUEST_NET_F_NOCSUM); | ||
1184 | } | ||
1185 | /*:*/ | ||
1186 | 1130 | ||
1187 | static u32 str2ip(const char *ipaddr) | 1131 | static u32 str2ip(const char *ipaddr) |
1188 | { | 1132 | { |
@@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) | |||
1217 | 1161 | ||
1218 | /* This sets up the Host end of the network device with an IP address, brings | 1162 | /* This sets up the Host end of the network device with an IP address, brings |
1219 | * it up so packets will flow, then copies the MAC address into the hwaddr | 1163 | * it up so packets will flow, then copies the MAC address into the hwaddr |
1220 | * pointer (in practice, the Host's slot in the network device's memory). */ | 1164 | * pointer. */ |
1221 | static void configure_device(int fd, const char *devname, u32 ipaddr, | 1165 | static void configure_device(int fd, const char *devname, u32 ipaddr, |
1222 | unsigned char hwaddr[6]) | 1166 | unsigned char hwaddr[6]) |
1223 | { | 1167 | { |
@@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr, | |||
1243 | memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); | 1187 | memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); |
1244 | } | 1188 | } |
1245 | 1189 | ||
1246 | /*L:195 The other kind of network is a Host<->Guest network. This can either | 1190 | /*L:195 Our network is a Host<->Guest network. This can either use bridging or |
1247 | * use briding or routing, but the principle is the same: it uses the "tun" | 1191 | * routing, but the principle is the same: it uses the "tun" device to inject |
1248 | * device to inject packets into the Host as if they came in from a normal | 1192 | * packets into the Host as if they came in from a normal network card. We |
1249 | * network card. We just shunt packets between the Guest and the tun | 1193 | * just shunt packets between the Guest and the tun device. */ |
1250 | * device. */ | 1194 | static void setup_tun_net(const char *arg) |
1251 | static void setup_tun_net(const char *arg, struct device_list *devices) | ||
1252 | { | 1195 | { |
1253 | struct device *dev; | 1196 | struct device *dev; |
1254 | struct ifreq ifr; | 1197 | struct ifreq ifr; |
1255 | int netfd, ipfd; | 1198 | int netfd, ipfd; |
1256 | u32 ip; | 1199 | u32 ip; |
1257 | const char *br_name = NULL; | 1200 | const char *br_name = NULL; |
1201 | u8 hwaddr[6]; | ||
1258 | 1202 | ||
1259 | /* We open the /dev/net/tun device and tell it we want a tap device. A | 1203 | /* We open the /dev/net/tun device and tell it we want a tap device. A |
1260 | * tap device is like a tun device, only somehow different. To tell | 1204 | * tap device is like a tun device, only somehow different. To tell |
@@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices) | |||
1270 | * device: trust us! */ | 1214 | * device: trust us! */ |
1271 | ioctl(netfd, TUNSETNOCSUM, 1); | 1215 | ioctl(netfd, TUNSETNOCSUM, 1); |
1272 | 1216 | ||
1273 | /* We create the net device with 1 page, using the features field of | 1217 | /* First we create a new network device. */ |
1274 | * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and | 1218 | dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); |
1275 | * that the device has fairly random timing. We do *not* specify | ||
1276 | * LGUEST_NET_F_NOCSUM: these packets can reach the real world. | ||
1277 | * | ||
1278 | * We will put our MAC address is slot 0 for the Guest to see, so | ||
1279 | * it will send packets to us using the key "peer_offset(0)": */ | ||
1280 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, | ||
1281 | NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, | ||
1282 | handle_tun_input, peer_offset(0), handle_tun_output); | ||
1283 | 1219 | ||
1284 | /* We keep a flag which says whether we've seen packets come out from | 1220 | /* Network devices need a receive and a send queue, just like |
1285 | * this network device. */ | 1221 | * console. */ |
1286 | dev->priv = malloc(sizeof(bool)); | 1222 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); |
1287 | *(bool *)dev->priv = false; | 1223 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); |
1288 | 1224 | ||
1289 | /* We need a socket to perform the magic network ioctls to bring up the | 1225 | /* We need a socket to perform the magic network ioctls to bring up the |
1290 | * tap interface, connect to the bridge etc. Any socket will do! */ | 1226 | * tap interface, connect to the bridge etc. Any socket will do! */ |
@@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices) | |||
1300 | } else /* It is an IP address to set up the device with */ | 1236 | } else /* It is an IP address to set up the device with */ |
1301 | ip = str2ip(arg); | 1237 | ip = str2ip(arg); |
1302 | 1238 | ||
1303 | /* We are peer 0, ie. first slot, so we hand dev->mem to this routine | 1239 | /* Set up the tun device, and get the mac address for the interface. */ |
1304 | * to write the MAC address at the start of the device memory. */ | 1240 | configure_device(ipfd, ifr.ifr_name, ip, hwaddr); |
1305 | configure_device(ipfd, ifr.ifr_name, ip, dev->mem); | ||
1306 | 1241 | ||
1307 | /* Set "promisc" bit: we want every single packet if we're going to | 1242 | /* Tell Guest what MAC address to use. */ |
1308 | * bridge to other machines (and otherwise it doesn't matter). */ | 1243 | add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); |
1309 | *((u8 *)dev->mem) |= 0x1; | ||
1310 | 1244 | ||
1245 | /* We don't need the socket any more; setup is done. */ | ||
1311 | close(ipfd); | 1246 | close(ipfd); |
1312 | 1247 | ||
1313 | verbose("device %p: tun net %u.%u.%u.%u\n", | 1248 | verbose("device %u: tun net %u.%u.%u.%u\n", |
1314 | (void *)(dev->desc->pfn * getpagesize()), | 1249 | devices.device_num++, |
1315 | (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); | 1250 | (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip); |
1316 | if (br_name) | 1251 | if (br_name) |
1317 | verbose("attached to bridge: %s\n", br_name); | 1252 | verbose("attached to bridge: %s\n", br_name); |
1318 | } | 1253 | } |
1254 | |||
1255 | |||
1256 | /* | ||
1257 | * Block device. | ||
1258 | * | ||
1259 | * Serving a block device is really easy: the Guest asks for a block number and | ||
1260 | * we read or write that position in the file. | ||
1261 | * | ||
1262 | * Unfortunately, this is amazingly slow: the Guest waits until the read is | ||
1263 | * finished before running anything else, even if it could be doing useful | ||
1264 | * work. We could use async I/O, except it's reputed to suck so hard that | ||
1265 | * characters actually go missing from your code when you try to use it. | ||
1266 | * | ||
1267 | * So we farm the I/O out to a thread, and communicate with it via a pipe. */ | ||
1268 | |||
1269 | /* This hangs off device->priv, with the data. */ | ||
1270 | struct vblk_info | ||
1271 | { | ||
1272 | /* The size of the file. */ | ||
1273 | off64_t len; | ||
1274 | |||
1275 | /* The file descriptor for the file. */ | ||
1276 | int fd; | ||
1277 | |||
1278 | /* IO thread listens on this file descriptor [0]. */ | ||
1279 | int workpipe[2]; | ||
1280 | |||
1281 | /* IO thread writes to this file descriptor to mark it done, then | ||
1282 | * Launcher triggers interrupt to Guest. */ | ||
1283 | int done_fd; | ||
1284 | }; | ||
1285 | |||
1286 | /* This is the core of the I/O thread. It returns true if it did something. */ | ||
1287 | static bool service_io(struct device *dev) | ||
1288 | { | ||
1289 | struct vblk_info *vblk = dev->priv; | ||
1290 | unsigned int head, out_num, in_num, wlen; | ||
1291 | int ret; | ||
1292 | struct virtio_blk_inhdr *in; | ||
1293 | struct virtio_blk_outhdr *out; | ||
1294 | struct iovec iov[dev->vq->vring.num]; | ||
1295 | off64_t off; | ||
1296 | |||
1297 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | ||
1298 | if (head == dev->vq->vring.num) | ||
1299 | return false; | ||
1300 | |||
1301 | if (out_num == 0 || in_num == 0) | ||
1302 | errx(1, "Bad virtblk cmd %u out=%u in=%u", | ||
1303 | head, out_num, in_num); | ||
1304 | |||
1305 | out = convert(&iov[0], struct virtio_blk_outhdr); | ||
1306 | in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); | ||
1307 | off = out->sector * 512; | ||
1308 | |||
1309 | /* This is how we implement barriers. Pretty poor, no? */ | ||
1310 | if (out->type & VIRTIO_BLK_T_BARRIER) | ||
1311 | fdatasync(vblk->fd); | ||
1312 | |||
1313 | if (out->type & VIRTIO_BLK_T_SCSI_CMD) { | ||
1314 | fprintf(stderr, "Scsi commands unsupported\n"); | ||
1315 | in->status = VIRTIO_BLK_S_UNSUPP; | ||
1316 | wlen = sizeof(in); | ||
1317 | } else if (out->type & VIRTIO_BLK_T_OUT) { | ||
1318 | /* Write */ | ||
1319 | |||
1320 | /* Move to the right location in the block file. This can fail | ||
1321 | * if they try to write past end. */ | ||
1322 | if (lseek64(vblk->fd, off, SEEK_SET) != off) | ||
1323 | err(1, "Bad seek to sector %llu", out->sector); | ||
1324 | |||
1325 | ret = writev(vblk->fd, iov+1, out_num-1); | ||
1326 | verbose("WRITE to sector %llu: %i\n", out->sector, ret); | ||
1327 | |||
1328 | /* Grr... Now we know how long the descriptor they sent was, we | ||
1329 | * make sure they didn't try to write over the end of the block | ||
1330 | * file (possibly extending it). */ | ||
1331 | if (ret > 0 && off + ret > vblk->len) { | ||
1332 | /* Trim it back to the correct length */ | ||
1333 | ftruncate64(vblk->fd, vblk->len); | ||
1334 | /* Die, bad Guest, die. */ | ||
1335 | errx(1, "Write past end %llu+%u", off, ret); | ||
1336 | } | ||
1337 | wlen = sizeof(in); | ||
1338 | in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); | ||
1339 | } else { | ||
1340 | /* Read */ | ||
1341 | |||
1342 | /* Move to the right location in the block file. This can fail | ||
1343 | * if they try to read past end. */ | ||
1344 | if (lseek64(vblk->fd, off, SEEK_SET) != off) | ||
1345 | err(1, "Bad seek to sector %llu", out->sector); | ||
1346 | |||
1347 | ret = readv(vblk->fd, iov+1, in_num-1); | ||
1348 | verbose("READ from sector %llu: %i\n", out->sector, ret); | ||
1349 | if (ret >= 0) { | ||
1350 | wlen = sizeof(in) + ret; | ||
1351 | in->status = VIRTIO_BLK_S_OK; | ||
1352 | } else { | ||
1353 | wlen = sizeof(in); | ||
1354 | in->status = VIRTIO_BLK_S_IOERR; | ||
1355 | } | ||
1356 | } | ||
1357 | |||
1358 | /* We can't trigger an IRQ, because we're not the Launcher. It does | ||
1359 | * that when we tell it we're done. */ | ||
1360 | add_used(dev->vq, head, wlen); | ||
1361 | return true; | ||
1362 | } | ||
1363 | |||
1364 | /* This is the thread which actually services the I/O. */ | ||
1365 | static int io_thread(void *_dev) | ||
1366 | { | ||
1367 | struct device *dev = _dev; | ||
1368 | struct vblk_info *vblk = dev->priv; | ||
1369 | char c; | ||
1370 | |||
1371 | /* Close other side of workpipe so we get 0 read when main dies. */ | ||
1372 | close(vblk->workpipe[1]); | ||
1373 | /* Close the other side of the done_fd pipe. */ | ||
1374 | close(dev->fd); | ||
1375 | |||
1376 | /* When this read fails, it means Launcher died, so we follow. */ | ||
1377 | while (read(vblk->workpipe[0], &c, 1) == 1) { | ||
1378 | /* We acknowledge each request immediately, to reduce latency, | ||
1379 | * rather than waiting until we've done them all. I haven't | ||
1380 | * measured to see if it makes any difference. */ | ||
1381 | while (service_io(dev)) | ||
1382 | write(vblk->done_fd, &c, 1); | ||
1383 | } | ||
1384 | return 0; | ||
1385 | } | ||
1386 | |||
1387 | /* When the thread says some I/O is done, we interrupt the Guest. */ | ||
1388 | static bool handle_io_finish(int fd, struct device *dev) | ||
1389 | { | ||
1390 | char c; | ||
1391 | |||
1392 | /* If child died, presumably it printed message. */ | ||
1393 | if (read(dev->fd, &c, 1) != 1) | ||
1394 | exit(1); | ||
1395 | |||
1396 | /* It did some work, so trigger the irq. */ | ||
1397 | trigger_irq(fd, dev->vq); | ||
1398 | return true; | ||
1399 | } | ||
1400 | |||
1401 | /* When the Guest submits some I/O, we wake the I/O thread. */ | ||
1402 | static void handle_virtblk_output(int fd, struct virtqueue *vq) | ||
1403 | { | ||
1404 | struct vblk_info *vblk = vq->dev->priv; | ||
1405 | char c = 0; | ||
1406 | |||
1407 | /* Wake up I/O thread and tell it to go to work! */ | ||
1408 | if (write(vblk->workpipe[1], &c, 1) != 1) | ||
1409 | /* Presumably it indicated why it died. */ | ||
1410 | exit(1); | ||
1411 | } | ||
1412 | |||
1413 | /* This creates a virtual block device. */ | ||
1414 | static void setup_block_file(const char *filename) | ||
1415 | { | ||
1416 | int p[2]; | ||
1417 | struct device *dev; | ||
1418 | struct vblk_info *vblk; | ||
1419 | void *stack; | ||
1420 | u64 cap; | ||
1421 | unsigned int val; | ||
1422 | |||
1423 | /* This is the pipe the I/O thread will use to tell us I/O is done. */ | ||
1424 | pipe(p); | ||
1425 | |||
1426 | /* The device responds to return from I/O thread. */ | ||
1427 | dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); | ||
1428 | |||
1429 | /* The device has a virtqueue. */ | ||
1430 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); | ||
1431 | |||
1432 | /* Allocate the room for our own bookkeeping */ | ||
1433 | vblk = dev->priv = malloc(sizeof(*vblk)); | ||
1434 | |||
1435 | /* First we open the file and store the length. */ | ||
1436 | vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); | ||
1437 | vblk->len = lseek64(vblk->fd, 0, SEEK_END); | ||
1438 | |||
1439 | /* Tell Guest how many sectors this device has. */ | ||
1440 | cap = cpu_to_le64(vblk->len / 512); | ||
1441 | add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); | ||
1442 | |||
1443 | /* Tell Guest not to put in too many descriptors at once: two are used | ||
1444 | * for the in and out elements. */ | ||
1445 | val = cpu_to_le32(VIRTQUEUE_NUM - 2); | ||
1446 | add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); | ||
1447 | |||
1448 | /* The I/O thread writes to this end of the pipe when done. */ | ||
1449 | vblk->done_fd = p[1]; | ||
1450 | |||
1451 | /* This is how we tell the I/O thread about more work. */ | ||
1452 | pipe(vblk->workpipe); | ||
1453 | |||
1454 | /* Create stack for thread and run it */ | ||
1455 | stack = malloc(32768); | ||
1456 | if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) | ||
1457 | err(1, "Creating clone"); | ||
1458 | |||
1459 | /* We don't need to keep the I/O thread's end of the pipes open. */ | ||
1460 | close(vblk->done_fd); | ||
1461 | close(vblk->workpipe[0]); | ||
1462 | |||
1463 | verbose("device %u: virtblock %llu sectors\n", | ||
1464 | devices.device_num, cap); | ||
1465 | } | ||
1319 | /* That's the end of device setup. */ | 1466 | /* That's the end of device setup. */ |
1320 | 1467 | ||
1321 | /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves | 1468 | /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves |
1322 | * its input and output, and finally, lays it to rest. */ | 1469 | * its input and output, and finally, lays it to rest. */ |
1323 | static void __attribute__((noreturn)) | 1470 | static void __attribute__((noreturn)) run_guest(int lguest_fd) |
1324 | run_guest(int lguest_fd, struct device_list *device_list) | ||
1325 | { | 1471 | { |
1326 | for (;;) { | 1472 | for (;;) { |
1327 | u32 args[] = { LHREQ_BREAK, 0 }; | 1473 | unsigned long args[] = { LHREQ_BREAK, 0 }; |
1328 | unsigned long arr[2]; | 1474 | unsigned long notify_addr; |
1329 | int readval; | 1475 | int readval; |
1330 | 1476 | ||
1331 | /* We read from the /dev/lguest device to run the Guest. */ | 1477 | /* We read from the /dev/lguest device to run the Guest. */ |
1332 | readval = read(lguest_fd, arr, sizeof(arr)); | 1478 | readval = read(lguest_fd, ¬ify_addr, sizeof(notify_addr)); |
1333 | |||
1334 | /* The read can only really return sizeof(arr) (the Guest did a | ||
1335 | * SEND_DMA to us), or an error. */ | ||
1336 | 1479 | ||
1337 | /* For a successful read, arr[0] is the address of the "struct | 1480 | /* One unsigned long means the Guest did HCALL_NOTIFY */ |
1338 | * lguest_dma", and arr[1] is the key the Guest sent to. */ | 1481 | if (readval == sizeof(notify_addr)) { |
1339 | if (readval == sizeof(arr)) { | 1482 | verbose("Notify on address %#lx\n", notify_addr); |
1340 | handle_output(lguest_fd, arr[0], arr[1], device_list); | 1483 | handle_output(lguest_fd, notify_addr); |
1341 | continue; | 1484 | continue; |
1342 | /* ENOENT means the Guest died. Reading tells us why. */ | 1485 | /* ENOENT means the Guest died. Reading tells us why. */ |
1343 | } else if (errno == ENOENT) { | 1486 | } else if (errno == ENOENT) { |
@@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list) | |||
1351 | 1494 | ||
1352 | /* Service input, then unset the BREAK which releases | 1495 | /* Service input, then unset the BREAK which releases |
1353 | * the Waker. */ | 1496 | * the Waker. */ |
1354 | handle_input(lguest_fd, device_list); | 1497 | handle_input(lguest_fd); |
1355 | if (write(lguest_fd, args, sizeof(args)) < 0) | 1498 | if (write(lguest_fd, args, sizeof(args)) < 0) |
1356 | err(1, "Resetting break"); | 1499 | err(1, "Resetting break"); |
1357 | } | 1500 | } |
@@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list) | |||
1365 | 1508 | ||
1366 | static struct option opts[] = { | 1509 | static struct option opts[] = { |
1367 | { "verbose", 0, NULL, 'v' }, | 1510 | { "verbose", 0, NULL, 'v' }, |
1368 | { "sharenet", 1, NULL, 's' }, | ||
1369 | { "tunnet", 1, NULL, 't' }, | 1511 | { "tunnet", 1, NULL, 't' }, |
1370 | { "block", 1, NULL, 'b' }, | 1512 | { "block", 1, NULL, 'b' }, |
1371 | { "initrd", 1, NULL, 'i' }, | 1513 | { "initrd", 1, NULL, 'i' }, |
@@ -1374,37 +1516,21 @@ static struct option opts[] = { | |||
1374 | static void usage(void) | 1516 | static void usage(void) |
1375 | { | 1517 | { |
1376 | errx(1, "Usage: lguest [--verbose] " | 1518 | errx(1, "Usage: lguest [--verbose] " |
1377 | "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" | 1519 | "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" |
1378 | "|--block=<filename>|--initrd=<filename>]...\n" | 1520 | "|--block=<filename>|--initrd=<filename>]...\n" |
1379 | "<mem-in-mb> vmlinux [args...]"); | 1521 | "<mem-in-mb> vmlinux [args...]"); |
1380 | } | 1522 | } |
1381 | 1523 | ||
1382 | /*L:100 The Launcher code itself takes us out into userspace, that scary place | 1524 | /*L:105 The main routine is where the real work begins: */ |
1383 | * where pointers run wild and free! Unfortunately, like most userspace | ||
1384 | * programs, it's quite boring (which is why everyone like to hack on the | ||
1385 | * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it | ||
1386 | * will get you through this section. Or, maybe not. | ||
1387 | * | ||
1388 | * The Launcher binary sits up high, usually starting at address 0xB8000000. | ||
1389 | * Everything below this is the "physical" memory for the Guest. For example, | ||
1390 | * if the Guest were to write a "1" at physical address 0, we would see a "1" | ||
1391 | * in the Launcher at "(int *)0". Guest physical == Launcher virtual. | ||
1392 | * | ||
1393 | * This can be tough to get your head around, but usually it just means that we | ||
1394 | * don't need to do any conversion when the Guest gives us it's "physical" | ||
1395 | * addresses. | ||
1396 | */ | ||
1397 | int main(int argc, char *argv[]) | 1525 | int main(int argc, char *argv[]) |
1398 | { | 1526 | { |
1399 | /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size | 1527 | /* Memory, top-level pagetable, code startpoint and size of the |
1400 | * of the (optional) initrd. */ | 1528 | * (optional) initrd. */ |
1401 | unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; | 1529 | unsigned long mem = 0, pgdir, start, initrd_size = 0; |
1402 | /* A temporary and the /dev/lguest file descriptor. */ | 1530 | /* A temporary and the /dev/lguest file descriptor. */ |
1403 | int i, c, lguest_fd; | 1531 | int i, c, lguest_fd; |
1404 | /* The list of Guest devices, based on command line arguments. */ | 1532 | /* The boot information for the Guest. */ |
1405 | struct device_list device_list; | 1533 | struct boot_params *boot; |
1406 | /* The boot information for the Guest: at guest-physical address 0. */ | ||
1407 | void *boot = (void *)0; | ||
1408 | /* If they specify an initrd file to load. */ | 1534 | /* If they specify an initrd file to load. */ |
1409 | const char *initrd_name = NULL; | 1535 | const char *initrd_name = NULL; |
1410 | 1536 | ||
@@ -1412,11 +1538,12 @@ int main(int argc, char *argv[]) | |||
1412 | * device receive input from a file descriptor, we keep an fdset | 1538 | * device receive input from a file descriptor, we keep an fdset |
1413 | * (infds) and the maximum fd number (max_infd) with the head of the | 1539 | * (infds) and the maximum fd number (max_infd) with the head of the |
1414 | * list. We also keep a pointer to the last device, for easy appending | 1540 | * list. We also keep a pointer to the last device, for easy appending |
1415 | * to the list. */ | 1541 | * to the list. Finally, we keep the next interrupt number to hand out |
1416 | device_list.max_infd = -1; | 1542 | * (1: remember that 0 is used by the timer). */ |
1417 | device_list.dev = NULL; | 1543 | FD_ZERO(&devices.infds); |
1418 | device_list.lastdev = &device_list.dev; | 1544 | devices.max_infd = -1; |
1419 | FD_ZERO(&device_list.infds); | 1545 | devices.lastdev = &devices.dev; |
1546 | devices.next_irq = 1; | ||
1420 | 1547 | ||
1421 | /* We need to know how much memory so we can set up the device | 1548 | /* We need to know how much memory so we can set up the device |
1422 | * descriptor and memory pages for the devices as we parse the command | 1549 | * descriptor and memory pages for the devices as we parse the command |
@@ -1424,9 +1551,16 @@ int main(int argc, char *argv[]) | |||
1424 | * of memory now. */ | 1551 | * of memory now. */ |
1425 | for (i = 1; i < argc; i++) { | 1552 | for (i = 1; i < argc; i++) { |
1426 | if (argv[i][0] != '-') { | 1553 | if (argv[i][0] != '-') { |
1427 | mem = top = atoi(argv[i]) * 1024 * 1024; | 1554 | mem = atoi(argv[i]) * 1024 * 1024; |
1428 | device_list.descs = map_zeroed_pages(top, 1); | 1555 | /* We start by mapping anonymous pages over all of |
1429 | top += getpagesize(); | 1556 | * guest-physical memory range. This fills it with 0, |
1557 | * and ensures that the Guest won't be killed when it | ||
1558 | * tries to access it. */ | ||
1559 | guest_base = map_zeroed_pages(mem / getpagesize() | ||
1560 | + DEVICE_PAGES); | ||
1561 | guest_limit = mem; | ||
1562 | guest_max = mem + DEVICE_PAGES*getpagesize(); | ||
1563 | devices.descpage = get_pages(1); | ||
1430 | break; | 1564 | break; |
1431 | } | 1565 | } |
1432 | } | 1566 | } |
@@ -1437,14 +1571,11 @@ int main(int argc, char *argv[]) | |||
1437 | case 'v': | 1571 | case 'v': |
1438 | verbose = true; | 1572 | verbose = true; |
1439 | break; | 1573 | break; |
1440 | case 's': | ||
1441 | setup_net_file(optarg, &device_list); | ||
1442 | break; | ||
1443 | case 't': | 1574 | case 't': |
1444 | setup_tun_net(optarg, &device_list); | 1575 | setup_tun_net(optarg); |
1445 | break; | 1576 | break; |
1446 | case 'b': | 1577 | case 'b': |
1447 | setup_block_file(optarg, &device_list); | 1578 | setup_block_file(optarg); |
1448 | break; | 1579 | break; |
1449 | case 'i': | 1580 | case 'i': |
1450 | initrd_name = optarg; | 1581 | initrd_name = optarg; |
@@ -1459,56 +1590,60 @@ int main(int argc, char *argv[]) | |||
1459 | if (optind + 2 > argc) | 1590 | if (optind + 2 > argc) |
1460 | usage(); | 1591 | usage(); |
1461 | 1592 | ||
1462 | /* We always have a console device */ | 1593 | verbose("Guest base is at %p\n", guest_base); |
1463 | setup_console(&device_list); | ||
1464 | 1594 | ||
1465 | /* We start by mapping anonymous pages over all of guest-physical | 1595 | /* We always have a console device */ |
1466 | * memory range. This fills it with 0, and ensures that the Guest | 1596 | setup_console(); |
1467 | * won't be killed when it tries to access it. */ | ||
1468 | map_zeroed_pages(0, mem / getpagesize()); | ||
1469 | 1597 | ||
1470 | /* Now we load the kernel */ | 1598 | /* Now we load the kernel */ |
1471 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), | 1599 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
1472 | &page_offset); | 1600 | |
1601 | /* Boot information is stashed at physical address 0 */ | ||
1602 | boot = from_guest_phys(0); | ||
1473 | 1603 | ||
1474 | /* Map the initrd image if requested (at top of physical memory) */ | 1604 | /* Map the initrd image if requested (at top of physical memory) */ |
1475 | if (initrd_name) { | 1605 | if (initrd_name) { |
1476 | initrd_size = load_initrd(initrd_name, mem); | 1606 | initrd_size = load_initrd(initrd_name, mem); |
1477 | /* These are the location in the Linux boot header where the | 1607 | /* These are the location in the Linux boot header where the |
1478 | * start and size of the initrd are expected to be found. */ | 1608 | * start and size of the initrd are expected to be found. */ |
1479 | *(unsigned long *)(boot+0x218) = mem - initrd_size; | 1609 | boot->hdr.ramdisk_image = mem - initrd_size; |
1480 | *(unsigned long *)(boot+0x21c) = initrd_size; | 1610 | boot->hdr.ramdisk_size = initrd_size; |
1481 | /* The bootloader type 0xFF means "unknown"; that's OK. */ | 1611 | /* The bootloader type 0xFF means "unknown"; that's OK. */ |
1482 | *(unsigned char *)(boot+0x210) = 0xFF; | 1612 | boot->hdr.type_of_loader = 0xFF; |
1483 | } | 1613 | } |
1484 | 1614 | ||
1485 | /* Set up the initial linear pagetables, starting below the initrd. */ | 1615 | /* Set up the initial linear pagetables, starting below the initrd. */ |
1486 | pgdir = setup_pagetables(mem, initrd_size, page_offset); | 1616 | pgdir = setup_pagetables(mem, initrd_size); |
1487 | 1617 | ||
1488 | /* The Linux boot header contains an "E820" memory map: ours is a | 1618 | /* The Linux boot header contains an "E820" memory map: ours is a |
1489 | * simple, single region. */ | 1619 | * simple, single region. */ |
1490 | *(char*)(boot+E820NR) = 1; | 1620 | boot->e820_entries = 1; |
1491 | *((struct e820entry *)(boot+E820MAP)) | 1621 | boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); |
1492 | = ((struct e820entry) { 0, mem, E820_RAM }); | ||
1493 | /* The boot header contains a command line pointer: we put the command | 1622 | /* The boot header contains a command line pointer: we put the command |
1494 | * line after the boot header (at address 4096) */ | 1623 | * line after the boot header. */ |
1495 | *(void **)(boot + 0x228) = boot + 4096; | 1624 | boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); |
1496 | concat(boot + 4096, argv+optind+2); | 1625 | concat((char *)(boot + 1), argv+optind+2); |
1626 | |||
1627 | /* Boot protocol version: 2.07 supports the fields for lguest. */ | ||
1628 | boot->hdr.version = 0x207; | ||
1629 | |||
1630 | /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ | ||
1631 | boot->hdr.hardware_subarch = 1; | ||
1497 | 1632 | ||
1498 | /* The guest type value of "1" tells the Guest it's under lguest. */ | 1633 | /* Tell the entry path not to try to reload segment registers. */ |
1499 | *(int *)(boot + 0x23c) = 1; | 1634 | boot->hdr.loadflags |= KEEP_SEGMENTS; |
1500 | 1635 | ||
1501 | /* We tell the kernel to initialize the Guest: this returns the open | 1636 | /* We tell the kernel to initialize the Guest: this returns the open |
1502 | * /dev/lguest file descriptor. */ | 1637 | * /dev/lguest file descriptor. */ |
1503 | lguest_fd = tell_kernel(pgdir, start, page_offset); | 1638 | lguest_fd = tell_kernel(pgdir, start); |
1504 | 1639 | ||
1505 | /* We fork off a child process, which wakes the Launcher whenever one | 1640 | /* We fork off a child process, which wakes the Launcher whenever one |
1506 | * of the input file descriptors needs attention. Otherwise we would | 1641 | * of the input file descriptors needs attention. Otherwise we would |
1507 | * run the Guest until it tries to output something. */ | 1642 | * run the Guest until it tries to output something. */ |
1508 | waker_fd = setup_waker(lguest_fd, &device_list); | 1643 | waker_fd = setup_waker(lguest_fd); |
1509 | 1644 | ||
1510 | /* Finally, run the Guest. This doesn't return. */ | 1645 | /* Finally, run the Guest. This doesn't return. */ |
1511 | run_guest(lguest_fd, &device_list); | 1646 | run_guest(lguest_fd); |
1512 | } | 1647 | } |
1513 | /*:*/ | 1648 | /*:*/ |
1514 | 1649 | ||
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 821617bd6c04..7885ab2d5f53 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt | |||
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for | |||
6 | Linux developers and users to experiment with virtualization with the | 6 | Linux developers and users to experiment with virtualization with the |
7 | minimum of complexity. Nonetheless, it should have sufficient | 7 | minimum of complexity. Nonetheless, it should have sufficient |
8 | features to make it useful for specific tasks, and, of course, you are | 8 | features to make it useful for specific tasks, and, of course, you are |
9 | encouraged to fork and enhance it. | 9 | encouraged to fork and enhance it (see drivers/lguest/README). |
10 | 10 | ||
11 | Features: | 11 | Features: |
12 | 12 | ||
@@ -23,19 +23,30 @@ Developer features: | |||
23 | 23 | ||
24 | Running Lguest: | 24 | Running Lguest: |
25 | 25 | ||
26 | - Lguest runs the same kernel as guest and host. You can configure | 26 | - The easiest way to run lguest is to use the same kernel as guest and host. |
27 | them differently, but usually it's easiest not to. | 27 | You can configure them differently, but usually it's easiest not to. |
28 | 28 | ||
29 | You will need to configure your kernel with the following options: | 29 | You will need to configure your kernel with the following options: |
30 | 30 | ||
31 | CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] | 31 | "General setup": |
32 | CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") | 32 | "Prompt for development and/or incomplete code/drivers" = Y |
33 | CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") | 33 | (CONFIG_EXPERIMENTAL=y) |
34 | CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") | 34 | |
35 | CONFIG_LGUEST=y/m ("Linux hypervisor example code") | 35 | "Processor type and features": |
36 | 36 | "Paravirtualized guest support" = Y | |
37 | and I recommend: | 37 | "Lguest guest support" = Y |
38 | CONFIG_HZ=100 ("Timer frequency")[2] | 38 | "High Memory Support" = off/4GB |
39 | "Alignment value to which kernel should be aligned" = 0x100000 | ||
40 | (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and | ||
41 | CONFIG_PHYSICAL_ALIGN=0x100000) | ||
42 | |||
43 | "Device Drivers": | ||
44 | "Network device support" | ||
45 | "Universal TUN/TAP device driver support" = M/Y | ||
46 | (CONFIG_TUN=m) | ||
47 | "Virtualization" | ||
48 | "Linux hypervisor example code" = M/Y | ||
49 | (CONFIG_LGUEST=m) | ||
39 | 50 | ||
40 | - A tool called "lguest" is available in this directory: type "make" | 51 | - A tool called "lguest" is available in this directory: type "make" |
41 | to build it. If you didn't build your kernel in-tree, use "make | 52 | to build it. If you didn't build your kernel in-tree, use "make |
@@ -51,14 +62,17 @@ Running Lguest: | |||
51 | dd if=/dev/zero of=rootfile bs=1M count=2048 | 62 | dd if=/dev/zero of=rootfile bs=1M count=2048 |
52 | qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d | 63 | qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d |
53 | 64 | ||
65 | Make sure that you install a getty on /dev/hvc0 if you want to log in on the | ||
66 | console! | ||
67 | |||
54 | - "modprobe lg" if you built it as a module. | 68 | - "modprobe lg" if you built it as a module. |
55 | 69 | ||
56 | - Run an lguest as root: | 70 | - Run an lguest as root: |
57 | 71 | ||
58 | Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba | 72 | Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda |
59 | 73 | ||
60 | Explanation: | 74 | Explanation: |
61 | 64m: the amount of memory to use. | 75 | 64: the amount of memory to use, in MB. |
62 | 76 | ||
63 | vmlinux: the kernel image found in the top of your build directory. You | 77 | vmlinux: the kernel image found in the top of your build directory. You |
64 | can also use a standard bzImage. | 78 | can also use a standard bzImage. |
@@ -66,10 +80,10 @@ Running Lguest: | |||
66 | --tunnet=192.168.19.1: configures a "tap" device for networking with this | 80 | --tunnet=192.168.19.1: configures a "tap" device for networking with this |
67 | IP address. | 81 | IP address. |
68 | 82 | ||
69 | --block=rootfile: a file or block device which becomes /dev/lgba | 83 | --block=rootfile: a file or block device which becomes /dev/vda |
70 | inside the guest. | 84 | inside the guest. |
71 | 85 | ||
72 | root=/dev/lgba: this (and anything else on the command line) are | 86 | root=/dev/vda: this (and anything else on the command line) are |
73 | kernel boot parameters. | 87 | kernel boot parameters. |
74 | 88 | ||
75 | - Configuring networking. I usually have the host masquerade, using | 89 | - Configuring networking. I usually have the host masquerade, using |
@@ -99,31 +113,7 @@ Running Lguest: | |||
99 | "--sharenet=<filename>": any two guests using the same file are on | 113 | "--sharenet=<filename>": any two guests using the same file are on |
100 | the same network. This file is created if it does not exist. | 114 | the same network. This file is created if it does not exist. |
101 | 115 | ||
102 | Lguest I/O model: | 116 | There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest |
103 | |||
104 | Lguest uses a simplified DMA model plus shared memory for I/O. Guests | ||
105 | can communicate with each other if they share underlying memory | ||
106 | (usually by the lguest program mmaping the same file), but they can | ||
107 | use any non-shared memory to communicate with the lguest process. | ||
108 | |||
109 | Guests can register DMA buffers at any key (must be a valid physical | ||
110 | address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq) | ||
111 | hypercall. "dmabufs" is the physical address of an array of "num" | ||
112 | "struct lguest_dma": each contains a used_len, and an array of | ||
113 | physical addresses and lengths. When a transfer occurs, the | ||
114 | "used_len" field of one of the buffers which has used_len 0 will be | ||
115 | set to the length transferred and the irq will fire. | ||
116 | 117 | ||
117 | Using an irq value of 0 unbinds the dma buffers. | 118 | Good luck! |
118 | |||
119 | To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used, | ||
120 | and the bytes used is written to the used_len field. This can be 0 if | ||
121 | noone else has bound a DMA buffer to that key or some other error. | ||
122 | DMA buffers bound by the same guest are ignored. | ||
123 | |||
124 | Cheers! | ||
125 | Rusty Russell rusty@rustcorp.com.au. | 119 | Rusty Russell rusty@rustcorp.com.au. |
126 | |||
127 | [1] These are on various places on the TODO list, waiting for you to | ||
128 | get annoyed enough at the limitation to fix it. | ||
129 | [2] Lguest is not yet tickless when idle. See [1]. | ||
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt index 8a523f6af48a..248589e8bcf5 100644 --- a/Documentation/m68k/kernel-options.txt +++ b/Documentation/m68k/kernel-options.txt | |||
@@ -890,10 +890,7 @@ Syntax: nosync:0 | |||
890 | 5.5.2) noasync | 890 | 5.5.2) noasync |
891 | -------------- | 891 | -------------- |
892 | 892 | ||
893 | Syntax: noasync:0 | 893 | [OBSOLETE, REMOVED] |
894 | |||
895 | Disables async and sync negotiation for all devices. Any value | ||
896 | after the colon is acceptable (and has the same effect). | ||
897 | 894 | ||
898 | 5.5.3) nodisconnect | 895 | 5.5.3) nodisconnect |
899 | ------------------- | 896 | ------------------- |
diff --git a/Documentation/markers.txt b/Documentation/markers.txt new file mode 100644 index 000000000000..295a71bc301e --- /dev/null +++ b/Documentation/markers.txt | |||
@@ -0,0 +1,81 @@ | |||
1 | Using the Linux Kernel Markers | ||
2 | |||
3 | Mathieu Desnoyers | ||
4 | |||
5 | |||
6 | This document introduces Linux Kernel Markers and their use. It provides | ||
7 | examples of how to insert markers in the kernel and connect probe functions to | ||
8 | them and provides some examples of probe functions. | ||
9 | |||
10 | |||
11 | * Purpose of markers | ||
12 | |||
13 | A marker placed in code provides a hook to call a function (probe) that you can | ||
14 | provide at runtime. A marker can be "on" (a probe is connected to it) or "off" | ||
15 | (no probe is attached). When a marker is "off" it has no effect, except for | ||
16 | adding a tiny time penalty (checking a condition for a branch) and space | ||
17 | penalty (adding a few bytes for the function call at the end of the | ||
18 | instrumented function and a data structure in a separate section). When a | ||
19 | marker is "on", the function you provide is called each time the marker is | ||
20 | executed, in the execution context of the caller. When the function provided | ||
21 | ends its execution, it returns to the caller (continuing from the marker site). | ||
22 | |||
23 | You can put markers at important locations in the code. Markers are | ||
24 | lightweight hooks that can pass an arbitrary number of parameters, | ||
25 | described in a printk-like format string, to the attached probe function. | ||
26 | |||
27 | They can be used for tracing and performance accounting. | ||
28 | |||
29 | |||
30 | * Usage | ||
31 | |||
32 | In order to use the macro trace_mark, you should include linux/marker.h. | ||
33 | |||
34 | #include <linux/marker.h> | ||
35 | |||
36 | And, | ||
37 | |||
38 | trace_mark(subsystem_event, "%d %s", someint, somestring); | ||
39 | Where : | ||
40 | - subsystem_event is an identifier unique to your event | ||
41 | - subsystem is the name of your subsystem. | ||
42 | - event is the name of the event to mark. | ||
43 | - "%d %s" is the formatted string for the serializer. | ||
44 | - someint is an integer. | ||
45 | - somestring is a char pointer. | ||
46 | |||
47 | Connecting a function (probe) to a marker is done by providing a probe (function | ||
48 | to call) for the specific marker through marker_probe_register() and can be | ||
49 | activated by calling marker_arm(). Marker deactivation can be done by calling | ||
50 | marker_disarm() as many times as marker_arm() has been called. Removing a probe | ||
51 | is done through marker_probe_unregister(); it will disarm the probe and make | ||
52 | sure there is no caller left using the probe when it returns. Probe removal is | ||
53 | preempt-safe because preemption is disabled around the probe call. See the | ||
54 | "Probe example" section below for a sample probe module. | ||
55 | |||
56 | The marker mechanism supports inserting multiple instances of the same marker. | ||
57 | Markers can be put in inline functions, inlined static functions, and | ||
58 | unrolled loops as well as regular functions. | ||
59 | |||
60 | The naming scheme "subsystem_event" is suggested here as a convention intended | ||
61 | to limit collisions. Marker names are global to the kernel: they are considered | ||
62 | as being the same whether they are in the core kernel image or in modules. | ||
63 | If markers with the same name have conflicting format strings, the markers | ||
64 | detected to have a different format string will not be armed, and a printk | ||
65 | warning which identifies the inconsistency will be output: | ||
66 | |||
67 | "Format mismatch for probe probe_name (format), marker (format)" | ||
68 | |||
69 | |||
70 | * Probe / marker example | ||
71 | |||
72 | See the example provided in samples/markers/src | ||
73 | |||
74 | Compile them with your kernel. | ||
75 | |||
76 | Run, as root : | ||
77 | modprobe marker-example (insmod order is not important) | ||
78 | modprobe probe-example | ||
79 | cat /proc/marker-example (returns an expected error) | ||
80 | rmmod marker-example probe-example | ||
81 | dmesg | ||
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 650657c54733..4e17beba2379 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
@@ -1479,7 +1479,8 @@ kernel. | |||
1479 | 1479 | ||
1480 | Any atomic operation that modifies some state in memory and returns information | 1480 | Any atomic operation that modifies some state in memory and returns information |
1481 | about the state (old or new) implies an SMP-conditional general memory barrier | 1481 | about the state (old or new) implies an SMP-conditional general memory barrier |
1482 | (smp_mb()) on each side of the actual operation. These include: | 1482 | (smp_mb()) on each side of the actual operation (with the exception of |
1483 | explicit lock operations, described later). These include: | ||
1483 | 1484 | ||
1484 | xchg(); | 1485 | xchg(); |
1485 | cmpxchg(); | 1486 | cmpxchg(); |
@@ -1536,10 +1537,19 @@ If they're used for constructing a lock of some description, then they probably | |||
1536 | do need memory barriers as a lock primitive generally has to do things in a | 1537 | do need memory barriers as a lock primitive generally has to do things in a |
1537 | specific order. | 1538 | specific order. |
1538 | 1539 | ||
1539 | |||
1540 | Basically, each usage case has to be carefully considered as to whether memory | 1540 | Basically, each usage case has to be carefully considered as to whether memory |
1541 | barriers are needed or not. | 1541 | barriers are needed or not. |
1542 | 1542 | ||
1543 | The following operations are special locking primitives: | ||
1544 | |||
1545 | test_and_set_bit_lock(); | ||
1546 | clear_bit_unlock(); | ||
1547 | __clear_bit_unlock(); | ||
1548 | |||
1549 | These implement LOCK-class and UNLOCK-class operations. These should be used in | ||
1550 | preference to other operations when implementing locking primitives, because | ||
1551 | their implementations can be optimised on many architectures. | ||
1552 | |||
1543 | [!] Note that special memory barrier primitives are available for these | 1553 | [!] Note that special memory barrier primitives are available for these |
1544 | situations because on some CPUs the atomic instructions used imply full memory | 1554 | situations because on some CPUs the atomic instructions used imply full memory |
1545 | barriers, and so barrier instructions are superfluous in conjunction with them, | 1555 | barriers, and so barrier instructions are superfluous in conjunction with them, |
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 5fbcc22c98e9..168117bd6ee8 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt | |||
@@ -2,7 +2,8 @@ | |||
2 | Memory Hotplug | 2 | Memory Hotplug |
3 | ============== | 3 | ============== |
4 | 4 | ||
5 | Last Updated: Jul 28 2007 | 5 | Created: Jul 28 2007 |
6 | Add description of notifier of memory hotplug Oct 11 2007 | ||
6 | 7 | ||
7 | This document is about memory hotplug including how-to-use and current status. | 8 | This document is about memory hotplug including how-to-use and current status. |
8 | Because Memory Hotplug is still under development, contents of this text will | 9 | Because Memory Hotplug is still under development, contents of this text will |
@@ -24,7 +25,8 @@ be changed often. | |||
24 | 6.1 Memory offline and ZONE_MOVABLE | 25 | 6.1 Memory offline and ZONE_MOVABLE |
25 | 6.2. How to offline memory | 26 | 6.2. How to offline memory |
26 | 7. Physical memory remove | 27 | 7. Physical memory remove |
27 | 8. Future Work List | 28 | 8. Memory hotplug event notifier |
29 | 9. Future Work List | ||
28 | 30 | ||
29 | Note(1): x86_64's has special implementation for memory hotplug. | 31 | Note(1): x86_64's has special implementation for memory hotplug. |
30 | This text does not describe it. | 32 | This text does not describe it. |
@@ -307,8 +309,58 @@ Need more implementation yet.... | |||
307 | - Notification completion of remove works by OS to firmware. | 309 | - Notification completion of remove works by OS to firmware. |
308 | - Guard from remove if not yet. | 310 | - Guard from remove if not yet. |
309 | 311 | ||
312 | -------------------------------- | ||
313 | 8. Memory hotplug event notifier | ||
314 | -------------------------------- | ||
315 | Memory hotplug has an event notifier. There are 6 types of notification. | ||
316 | |||
317 | MEMORY_GOING_ONLINE | ||
318 | Generated before new memory becomes available in order to be able to | ||
319 | prepare subsystems to handle memory. The page allocator is still unable | ||
320 | to allocate from the new memory. | ||
321 | |||
322 | MEMORY_CANCEL_ONLINE | ||
323 | Generated if MEMORY_GOING_ONLINE fails. | ||
324 | |||
325 | MEMORY_ONLINE | ||
326 | Generated when memory has successfully been brought online. The callback may | ||
327 | allocate pages from the new memory. | ||
328 | |||
329 | MEMORY_GOING_OFFLINE | ||
330 | Generated to begin the process of offlining memory. Allocations are no | ||
331 | longer possible from the memory but some of the memory to be offlined | ||
332 | is still in use. The callback can be used to free memory known to a | ||
333 | subsystem from the indicated memory section. | ||
334 | |||
335 | MEMORY_CANCEL_OFFLINE | ||
336 | Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from | ||
337 | the section that we attempted to offline. | ||
338 | |||
339 | MEMORY_OFFLINE | ||
340 | Generated after offlining memory is complete. | ||
341 | |||
342 | A callback routine can be registered by | ||
343 | hotplug_memory_notifier(callback_func, priority) | ||
344 | |||
345 | The second argument of the callback function (action) is one of the event types above. | ||
346 | The third argument is a pointer to a struct memory_notify. | ||
347 | |||
348 | struct memory_notify { | ||
349 | unsigned long start_pfn; | ||
350 | unsigned long nr_pages; | ||
351 | int status_change_nid; | ||
352 | } | ||
353 | |||
354 | start_pfn is start_pfn of online/offline memory. | ||
355 | nr_pages is # of pages of online/offline memory. | ||
356 | status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) | ||
357 | set/clear. It means a new(memoryless) node gets new memory by online and a | ||
358 | node loses all memory. If this is -1, then nodemask status is not changed. | ||
359 | If status_change_nid >= 0, callback should create/discard structures for the | ||
360 | node if necessary. | ||
361 | |||
310 | -------------- | 362 | -------------- |
311 | 8. Future Work | 363 | 9. Future Work |
312 | -------------- | 364 | -------------- |
313 | - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like | 365 | - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like |
314 | sysctl or new control file. | 366 | sysctl or new control file. |
diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX index 9df8a2eac7b4..3f13bf8043d2 100644 --- a/Documentation/mips/00-INDEX +++ b/Documentation/mips/00-INDEX | |||
@@ -4,5 +4,3 @@ AU1xxx_IDE.README | |||
4 | - README for MIPS AU1XXX IDE driver. | 4 | - README for MIPS AU1XXX IDE driver. |
5 | GT64120.README | 5 | GT64120.README |
6 | - README for dir with info on MIPS boards using GT-64120 or GT-64120A. | 6 | - README for dir with info on MIPS boards using GT-64120 or GT-64120A. |
7 | time.README | ||
8 | - README for MIPS time services. | ||
diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README index afb31c141d9d..5c8334123f4f 100644 --- a/Documentation/mips/AU1xxx_IDE.README +++ b/Documentation/mips/AU1xxx_IDE.README | |||
@@ -59,7 +59,7 @@ Four configs variables are introduced: | |||
59 | CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode | 59 | CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode |
60 | CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode | 60 | CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode |
61 | CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA | 61 | CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA |
62 | controler | 62 | controller |
63 | CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size | 63 | CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size |
64 | per descriptor | 64 | per descriptor |
65 | 65 | ||
diff --git a/Documentation/mips/time.README b/Documentation/mips/time.README deleted file mode 100644 index a4ce603ed3b3..000000000000 --- a/Documentation/mips/time.README +++ /dev/null | |||
@@ -1,173 +0,0 @@ | |||
1 | README for MIPS time services | ||
2 | |||
3 | Jun Sun | ||
4 | jsun@mvista.com or jsun@junsun.net | ||
5 | |||
6 | |||
7 | ABOUT | ||
8 | ----- | ||
9 | This file describes the new arch/mips/kernel/time.c, related files and the | ||
10 | services they provide. | ||
11 | |||
12 | If you are short in patience and just want to know how to use time.c for a | ||
13 | new board or convert an existing board, go to the last section. | ||
14 | |||
15 | |||
16 | FILES, COMPATABILITY AND CONFIGS | ||
17 | --------------------------------- | ||
18 | |||
19 | The old arch/mips/kernel/time.c is renamed to old-time.c. | ||
20 | |||
21 | A new time.c is put there, together with include/asm-mips/time.h. | ||
22 | |||
23 | Two configs variables are introduced, CONFIG_OLD_TIME_C and CONFIG_NEW_TIME_C. | ||
24 | So we allow boards using | ||
25 | |||
26 | 1) old time.c (CONFIG_OLD_TIME_C) | ||
27 | 2) new time.c (CONFIG_NEW_TIME_C) | ||
28 | 3) neither (their own private time.c) | ||
29 | |||
30 | However, it is expected every board will move to the new time.c in the near | ||
31 | future. | ||
32 | |||
33 | |||
34 | WHAT THE NEW CODE PROVIDES? | ||
35 | --------------------------- | ||
36 | |||
37 | The new time code provide the following services: | ||
38 | |||
39 | a) Implements functions required by Linux common code: | ||
40 | time_init | ||
41 | |||
42 | b) provides an abstraction of RTC and null RTC implementation as default. | ||
43 | extern unsigned long (*rtc_get_time)(void); | ||
44 | extern int (*rtc_set_time)(unsigned long); | ||
45 | |||
46 | c) high-level and low-level timer interrupt routines where the timer | ||
47 | interrupt source may or may not be the CPU timer. The high-level | ||
48 | routine is dispatched through do_IRQ() while the low-level is | ||
49 | dispatched in assemably code (usually int-handler.S) | ||
50 | |||
51 | |||
52 | WHAT THE NEW CODE REQUIRES? | ||
53 | --------------------------- | ||
54 | |||
55 | For the new code to work properly, each board implementation needs to supply | ||
56 | the following functions or values: | ||
57 | |||
58 | a) board_time_init - a function pointer. Invoked at the beginnig of | ||
59 | time_init(). It is optional. | ||
60 | 1. (optional) set up RTC routines | ||
61 | 2. (optional) calibrate and set the mips_hpt_frequency | ||
62 | |||
63 | b) plat_timer_setup - a function pointer. Invoked at the end of time_init() | ||
64 | 1. (optional) over-ride any decisions made in time_init() | ||
65 | 2. set up the irqaction for timer interrupt. | ||
66 | 3. enable the timer interrupt | ||
67 | |||
68 | c) (optional) board-specific RTC routines. | ||
69 | |||
70 | d) (optional) mips_hpt_frequency - It must be definied if the board | ||
71 | is using CPU counter for timer interrupt. | ||
72 | |||
73 | |||
74 | PORTING GUIDE | ||
75 | ------------- | ||
76 | |||
77 | Step 1: decide how you like to implement the time services. | ||
78 | |||
79 | a) does this board have a RTC? If yes, implement the two RTC funcs. | ||
80 | |||
81 | b) does the CPU have counter/compare registers? | ||
82 | |||
83 | If the answer is no, you need a timer to provide the timer interrupt | ||
84 | at 100 HZ speed. | ||
85 | |||
86 | c) The following sub steps assume your CPU has counter register. | ||
87 | Do you plan to use the CPU counter register as the timer interrupt | ||
88 | or use an exnternal timer? | ||
89 | |||
90 | In order to use CPU counter register as the timer interrupt source, you | ||
91 | must know the counter speed (mips_hpt_frequency). It is usually the | ||
92 | same as the CPU speed or an integral divisor of it. | ||
93 | |||
94 | d) decide on whether you want to use high-level or low-level timer | ||
95 | interrupt routines. The low-level one is presumably faster, but should | ||
96 | not make too mcuh difference. | ||
97 | |||
98 | |||
99 | Step 2: the machine setup() function | ||
100 | |||
101 | If you supply board_time_init(), set the function poointer. | ||
102 | |||
103 | |||
104 | Step 3: implement rtc routines, board_time_init() and plat_timer_setup() | ||
105 | if needed. | ||
106 | |||
107 | board_time_init() - | ||
108 | a) (optional) set up RTC routines, | ||
109 | b) (optional) calibrate and set the mips_hpt_frequency | ||
110 | (only needed if you intended to use cpu counter as timer interrupt | ||
111 | source) | ||
112 | |||
113 | plat_timer_setup() - | ||
114 | a) (optional) over-write any choices made above by time_init(). | ||
115 | b) machine specific code should setup the timer irqaction. | ||
116 | c) enable the timer interrupt | ||
117 | |||
118 | |||
119 | If the RTC chip is a common chip, I suggest the routines are put under | ||
120 | arch/mips/libs. For example, for DS1386 chip, one would create | ||
121 | rtc-ds1386.c under arch/mips/lib directory. Add the following line to | ||
122 | the arch/mips/lib/Makefile: | ||
123 | |||
124 | obj-$(CONFIG_DDB5476) += rtc-ds1386.o | ||
125 | |||
126 | Step 4: if you are using low-level timer interrupt, change your interrupt | ||
127 | dispathcing code to check for timer interrupt and jump to | ||
128 | ll_timer_interrupt() directly if one is detected. | ||
129 | |||
130 | Step 5: Modify arch/mips/config.in and add CONFIG_NEW_TIME_C to your machine. | ||
131 | Modify the appropriate defconfig if applicable. | ||
132 | |||
133 | Final notes: | ||
134 | |||
135 | For some tricky cases, you may need to add your own wrapper functions | ||
136 | for some of the functions in time.c. | ||
137 | |||
138 | For example, you may define your own timer interrupt routine, which does | ||
139 | some of its own processing and then calls timer_interrupt(). | ||
140 | |||
141 | You can also over-ride any of the built-in functions (RTC routines | ||
142 | and/or timer interrupt routine). | ||
143 | |||
144 | |||
145 | PORTING NOTES FOR SMP | ||
146 | ---------------------- | ||
147 | |||
148 | If you have a SMP box, things are slightly more complicated. | ||
149 | |||
150 | The time service running every jiffy is logically divided into two parts: | ||
151 | |||
152 | 1) the one for the whole system (defined in timer_interrupt()) | ||
153 | 2) the one that should run for each CPU (defined in local_timer_interrupt()) | ||
154 | |||
155 | You need to decide on your timer interrupt sources. | ||
156 | |||
157 | case 1) - whole system has only one timer interrupt delivered to one CPU | ||
158 | |||
159 | In this case, you set up timer interrupt as in UP systems. In addtion, | ||
160 | you need to set emulate_local_timer_interrupt to 1 so that other | ||
161 | CPUs get to call local_timer_interrupt(). | ||
162 | |||
163 | THIS IS CURRENTLY NOT IMPLEMNETED. However, it is rather easy to write | ||
164 | one should such a need arise. You simply make a IPI call. | ||
165 | |||
166 | case 2) - each CPU has a separate timer interrupt | ||
167 | |||
168 | In this case, you need to set up IRQ such that each of them will | ||
169 | call local_timer_interrupt(). In addition, you need to arrange | ||
170 | one and only one of them to call timer_interrupt(). | ||
171 | |||
172 | You can also do the low-level version of those interrupt routines, | ||
173 | following similar dispatching routes described above. | ||
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt index 51f935191ae5..aa60d1f627e5 100644 --- a/Documentation/mutex-design.txt +++ b/Documentation/mutex-design.txt | |||
@@ -133,4 +133,6 @@ the APIs of 'struct mutex' have been streamlined: | |||
133 | int mutex_trylock(struct mutex *lock); | 133 | int mutex_trylock(struct mutex *lock); |
134 | void mutex_unlock(struct mutex *lock); | 134 | void mutex_unlock(struct mutex *lock); |
135 | int mutex_is_locked(struct mutex *lock); | 135 | int mutex_is_locked(struct mutex *lock); |
136 | 136 | void mutex_lock_nested(struct mutex *lock, unsigned int subclass); | |
137 | int mutex_lock_interruptible_nested(struct mutex *lock, | ||
138 | unsigned int subclass); | ||
diff --git a/Documentation/networking/bcm43xx.txt b/Documentation/networking/bcm43xx.txt index a136721499bf..d602c8d6ff3e 100644 --- a/Documentation/networking/bcm43xx.txt +++ b/Documentation/networking/bcm43xx.txt | |||
@@ -37,7 +37,7 @@ all, distributions. There is, however, additional software that is | |||
37 | required. The firmware used by the chip is the intellectual property | 37 | required. The firmware used by the chip is the intellectual property |
38 | of Broadcom and they have not given the bcm43xx team redistribution | 38 | of Broadcom and they have not given the bcm43xx team redistribution |
39 | rights to this firmware. Since we cannot legally redistribute | 39 | rights to this firmware. Since we cannot legally redistribute |
40 | the firwmare we cannot include it with the driver. Furthermore, it | 40 | the firmware we cannot include it with the driver. Furthermore, it |
41 | cannot be placed in the downloadable archives of any distributing | 41 | cannot be placed in the downloadable archives of any distributing |
42 | organization; therefore, the user is responsible for obtaining the | 42 | organization; therefore, the user is responsible for obtaining the |
43 | firmware and placing it in the appropriate location so that the driver | 43 | firmware and placing it in the appropriate location so that the driver |
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6ae2feff3087..747a5d15d529 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -293,7 +293,7 @@ tcp_no_metrics_save - BOOLEAN | |||
293 | when the connection closes, so that connections established in the | 293 | when the connection closes, so that connections established in the |
294 | near future can use these to set initial conditions. Usually, this | 294 | near future can use these to set initial conditions. Usually, this |
295 | increases overall performance, but may sometimes cause performance | 295 | increases overall performance, but may sometimes cause performance |
296 | degredation. If set, TCP will not cache metrics on closing | 296 | degradation. If set, TCP will not cache metrics on closing |
297 | connections. | 297 | connections. |
298 | 298 | ||
299 | tcp_orphan_retries - INTEGER | 299 | tcp_orphan_retries - INTEGER |
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index c36b64b0020f..c3669a3fb4af 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt | |||
@@ -689,7 +689,7 @@ such as the AFS filesystem. This permits such a utility to: | |||
689 | buffers manipulated directly. | 689 | buffers manipulated directly. |
690 | 690 | ||
691 | To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, | 691 | To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, |
692 | bind an addess as appropriate and listen if it's to be a server socket, but | 692 | bind an address as appropriate and listen if it's to be a server socket, but |
693 | then it passes this to the kernel interface functions. | 693 | then it passes this to the kernel interface functions. |
694 | 694 | ||
695 | The kernel interface functions are as follows: | 695 | The kernel interface functions are as follows: |
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt index 6be09ba24a36..b6409cab075c 100644 --- a/Documentation/networking/udplite.txt +++ b/Documentation/networking/udplite.txt | |||
@@ -12,7 +12,7 @@ | |||
12 | For in-depth information, you can consult: | 12 | For in-depth information, you can consult: |
13 | 13 | ||
14 | o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ | 14 | o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ |
15 | Fom here you can also download some example application source code. | 15 | From here you can also download some example application source code. |
16 | 16 | ||
17 | o The UDP-Lite HOWTO on | 17 | o The UDP-Lite HOWTO on |
18 | http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt | 18 | http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt |
@@ -223,7 +223,7 @@ | |||
223 | While it is important that such cases are dealt with correctly, they | 223 | While it is important that such cases are dealt with correctly, they |
224 | are (annoyingly) rare: UDP-Lite is designed for optimising multimedia | 224 | are (annoyingly) rare: UDP-Lite is designed for optimising multimedia |
225 | performance over wireless (or generally noisy) links and thus smaller | 225 | performance over wireless (or generally noisy) links and thus smaller |
226 | coverage lenghts are likely to be expected. | 226 | coverage lengths are likely to be expected. |
227 | 227 | ||
228 | 228 | ||
229 | V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING | 229 | V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING |
@@ -259,7 +259,7 @@ | |||
259 | VI) IPTABLES | 259 | VI) IPTABLES |
260 | 260 | ||
261 | There is packet match support for UDP-Lite as well as support for the LOG target. | 261 | There is packet match support for UDP-Lite as well as support for the LOG target. |
262 | If you copy and paste the following line into /etc/protcols, | 262 | If you copy and paste the following line into /etc/protocols, |
263 | 263 | ||
264 | udplite 136 UDP-Lite # UDP-Lite [RFC 3828] | 264 | udplite 136 UDP-Lite # UDP-Lite [RFC 3828] |
265 | 265 | ||
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt index 8f2302415eff..265fcdcb8e5f 100644 --- a/Documentation/parport-lowlevel.txt +++ b/Documentation/parport-lowlevel.txt | |||
@@ -25,7 +25,6 @@ Global functions: | |||
25 | parport_open | 25 | parport_open |
26 | parport_close | 26 | parport_close |
27 | parport_device_id | 27 | parport_device_id |
28 | parport_device_num | ||
29 | parport_device_coords | 28 | parport_device_coords |
30 | parport_find_class | 29 | parport_find_class |
31 | parport_find_device | 30 | parport_find_device |
@@ -735,7 +734,7 @@ NULL is returned. | |||
735 | 734 | ||
736 | SEE ALSO | 735 | SEE ALSO |
737 | 736 | ||
738 | parport_register_device, parport_device_num | 737 | parport_register_device |
739 | 738 | ||
740 | parport_close - unregister device for particular device number | 739 | parport_close - unregister device for particular device number |
741 | ------------- | 740 | ------------- |
@@ -787,29 +786,7 @@ Many devices have ill-formed IEEE 1284 Device IDs. | |||
787 | 786 | ||
788 | SEE ALSO | 787 | SEE ALSO |
789 | 788 | ||
790 | parport_find_class, parport_find_device, parport_device_num | 789 | parport_find_class, parport_find_device |
791 | |||
792 | parport_device_num - convert device coordinates to device number | ||
793 | ------------------ | ||
794 | |||
795 | SYNOPSIS | ||
796 | |||
797 | #include <linux/parport.h> | ||
798 | |||
799 | int parport_device_num (int parport, int mux, int daisy); | ||
800 | |||
801 | DESCRIPTION | ||
802 | |||
803 | Convert between device coordinates (port, multiplexor, daisy chain | ||
804 | address) and device number (zero-based). | ||
805 | |||
806 | RETURN VALUE | ||
807 | |||
808 | Device number, or -1 if no device at given coordinates. | ||
809 | |||
810 | SEE ALSO | ||
811 | |||
812 | parport_device_coords, parport_open, parport_device_id | ||
813 | 790 | ||
814 | parport_device_coords - convert device number to device coordinates | 791 | parport_device_coords - convert device number to device coordinates |
815 | ------------------ | 792 | ------------------ |
@@ -833,7 +810,7 @@ Zero on success, in which case the coordinates are (*parport, *mux, | |||
833 | 810 | ||
834 | SEE ALSO | 811 | SEE ALSO |
835 | 812 | ||
836 | parport_device_num, parport_open, parport_device_id | 813 | parport_open, parport_device_id |
837 | 814 | ||
838 | parport_find_class - find a device by its class | 815 | parport_find_class - find a device by its class |
839 | ------------------ | 816 | ------------------ |
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt index 1a85e2b964dc..57aef2f6e0de 100644 --- a/Documentation/power/basic-pm-debugging.txt +++ b/Documentation/power/basic-pm-debugging.txt | |||
@@ -78,8 +78,8 @@ c) Advanced debugging | |||
78 | In case the STD does not work on your system even in the minimal configuration | 78 | In case the STD does not work on your system even in the minimal configuration |
79 | and compiling more drivers as modules is not practical or some modules cannot | 79 | and compiling more drivers as modules is not practical or some modules cannot |
80 | be unloaded, you can use one of the more advanced debugging techniques to find | 80 | be unloaded, you can use one of the more advanced debugging techniques to find |
81 | the problem. First, if there is a serial port in your box, you can set the | 81 | the problem. First, if there is a serial port in your box, you can boot the |
82 | CONFIG_DISABLE_CONSOLE_SUSPEND kernel configuration option and try to log kernel | 82 | kernel with the 'no_console_suspend' parameter and try to log kernel |
83 | messages using the serial console. This may provide you with some information | 83 | messages using the serial console. This may provide you with some information |
84 | about the reasons of the suspend (resume) failure. Alternatively, it may be | 84 | about the reasons of the suspend (resume) failure. Alternatively, it may be |
85 | possible to use a FireWire port for debugging with firescope | 85 | possible to use a FireWire port for debugging with firescope |
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt index 04dc1cf9d215..38b57248fd61 100644 --- a/Documentation/power/freezing-of-tasks.txt +++ b/Documentation/power/freezing-of-tasks.txt | |||
@@ -19,12 +19,13 @@ we only consider hibernation, but the description also applies to suspend). | |||
19 | Namely, as the first step of the hibernation procedure the function | 19 | Namely, as the first step of the hibernation procedure the function |
20 | freeze_processes() (defined in kernel/power/process.c) is called. It executes | 20 | freeze_processes() (defined in kernel/power/process.c) is called. It executes |
21 | try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and | 21 | try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and |
22 | sends a fake signal to each of them. A task that receives such a signal and has | 22 | either wakes them up, if they are kernel threads, or sends fake signals to them, |
23 | TIF_FREEZE set, should react to it by calling the refrigerator() function | 23 | if they are user space processes. A task that has TIF_FREEZE set, should react |
24 | (defined in kernel/power/process.c), which sets the task's PF_FROZEN flag, | 24 | to it by calling the function called refrigerator() (defined in |
25 | changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is | 25 | kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state |
26 | cleared for it. Then, we say that the task is 'frozen' and therefore the set of | 26 | to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it. |
27 | functions handling this mechanism is called 'the freezer' (these functions are | 27 | Then, we say that the task is 'frozen' and therefore the set of functions |
28 | handling this mechanism is referred to as 'the freezer' (these functions are | ||
28 | defined in kernel/power/process.c and include/linux/freezer.h). User space | 29 | defined in kernel/power/process.c and include/linux/freezer.h). User space |
29 | processes are generally frozen before kernel threads. | 30 | processes are generally frozen before kernel threads. |
30 | 31 | ||
@@ -35,21 +36,27 @@ task enter refrigerator() if the flag is set. | |||
35 | 36 | ||
36 | For user space processes try_to_freeze() is called automatically from the | 37 | For user space processes try_to_freeze() is called automatically from the |
37 | signal-handling code, but the freezable kernel threads need to call it | 38 | signal-handling code, but the freezable kernel threads need to call it |
38 | explicitly in suitable places. The code to do this may look like the following: | 39 | explicitly in suitable places or use the wait_event_freezable() or |
40 | wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) | ||
41 | that combine interruptible sleep with checking if TIF_FREEZE is set and calling | ||
42 | try_to_freeze(). The main loop of a freezable kernel thread may look like the | ||
43 | following one: | ||
39 | 44 | ||
45 | set_freezable(); | ||
40 | do { | 46 | do { |
41 | hub_events(); | 47 | hub_events(); |
42 | wait_event_interruptible(khubd_wait, | 48 | wait_event_freezable(khubd_wait, |
43 | !list_empty(&hub_event_list)); | 49 | !list_empty(&hub_event_list) || |
44 | try_to_freeze(); | 50 | kthread_should_stop()); |
45 | } while (!signal_pending(current)); | 51 | } while (!kthread_should_stop() || !list_empty(&hub_event_list)); |
46 | 52 | ||
47 | (from drivers/usb/core/hub.c::hub_thread()). | 53 | (from drivers/usb/core/hub.c::hub_thread()). |
48 | 54 | ||
49 | If a freezable kernel thread fails to call try_to_freeze() after the freezer has | 55 | If a freezable kernel thread fails to call try_to_freeze() after the freezer has |
50 | set TIF_FREEZE for it, the freezing of tasks will fail and the entire | 56 | set TIF_FREEZE for it, the freezing of tasks will fail and the entire |
51 | hibernation operation will be cancelled. For this reason, freezable kernel | 57 | hibernation operation will be cancelled. For this reason, freezable kernel |
52 | threads must call try_to_freeze() somewhere. | 58 | threads must call try_to_freeze() somewhere or use one of the |
59 | wait_event_freezable() and wait_event_freezable_timeout() macros. | ||
53 | 60 | ||
54 | After the system memory state has been restored from a hibernation image and | 61 | After the system memory state has been restored from a hibernation image and |
55 | devices have been reinitialized, the function thaw_processes() is called in | 62 | devices have been reinitialized, the function thaw_processes() is called in |
@@ -81,7 +88,16 @@ hibernation image has been created and before the system is finally powered off. | |||
81 | The majority of these are user space processes, but if any of the kernel threads | 88 | The majority of these are user space processes, but if any of the kernel threads |
82 | may cause something like this to happen, they have to be freezable. | 89 | may cause something like this to happen, they have to be freezable. |
83 | 90 | ||
84 | 2. The second reason is to prevent user space processes and some kernel threads | 91 | 2. Next, to create the hibernation image we need to free a sufficient amount of |
92 | memory (approximately 50% of available RAM) and we need to do that before | ||
93 | devices are deactivated, because we generally need them for swapping out. Then, | ||
94 | after the memory for the image has been freed, we don't want tasks to allocate | ||
95 | additional memory and we prevent them from doing that by freezing them earlier. | ||
96 | [Of course, this also means that device drivers should not allocate substantial | ||
97 | amounts of memory from their .suspend() callbacks before hibernation, but this | ||
98 | is a separate issue.] | ||
99 | |||
100 | 3. The third reason is to prevent user space processes and some kernel threads | ||
85 | from interfering with the suspending and resuming of devices. A user space | 101 | from interfering with the suspending and resuming of devices. A user space |
86 | process running on a second CPU while we are suspending devices may, for | 102 | process running on a second CPU while we are suspending devices may, for |
87 | example, be troublesome and without the freezing of tasks we would need some | 103 | example, be troublesome and without the freezing of tasks we would need some |
@@ -111,7 +127,7 @@ frozen before the driver's .suspend() callback is executed and it will be | |||
111 | thawed after the driver's .resume() callback has run, so it won't be accessing | 127 | thawed after the driver's .resume() callback has run, so it won't be accessing |
112 | the device while it's suspended. | 128 | the device while it's suspended. |
113 | 129 | ||
114 | 3. Another reason for freezing tasks is to prevent user space processes from | 130 | 4. Another reason for freezing tasks is to prevent user space processes from |
115 | realizing that hibernation (or suspend) operation takes place. Ideally, user | 131 | realizing that hibernation (or suspend) operation takes place. Ideally, user |
116 | space processes should not notice that such a system-wide operation has occurred | 132 | space processes should not notice that such a system-wide operation has occurred |
117 | and should continue running without any problems after the restore (or resume | 133 | and should continue running without any problems after the restore (or resume |
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index fd5192a8fa8a..e67211fe0ee2 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt | |||
@@ -20,7 +20,7 @@ states. | |||
20 | /sys/power/disk controls the operating mode of the suspend-to-disk | 20 | /sys/power/disk controls the operating mode of the suspend-to-disk |
21 | mechanism. Suspend-to-disk can be handled in several ways. We have a | 21 | mechanism. Suspend-to-disk can be handled in several ways. We have a |
22 | few options for putting the system to sleep - using the platform driver | 22 | few options for putting the system to sleep - using the platform driver |
23 | (e.g. ACPI or other pm_ops), powering off the system or rebooting the | 23 | (e.g. ACPI or other suspend_ops), powering off the system or rebooting the |
24 | system (for testing). | 24 | system (for testing). |
25 | 25 | ||
26 | Additionally, /sys/power/disk can be used to turn on one of the two testing | 26 | Additionally, /sys/power/disk can be used to turn on one of the two testing |
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt index 06f911a5f885..f281886de490 100644 --- a/Documentation/power/swsusp-and-swap-files.txt +++ b/Documentation/power/swsusp-and-swap-files.txt | |||
@@ -39,7 +39,7 @@ resume=<swap_file_partition> resume_offset=<swap_file_offset> | |||
39 | where <swap_file_partition> is the partition on which the swap file is located | 39 | where <swap_file_partition> is the partition on which the swap file is located |
40 | and <swap_file_offset> is the offset of the swap header determined by the | 40 | and <swap_file_offset> is the offset of the swap header determined by the |
41 | application in 2) (of course, this step may be carried out automatically | 41 | application in 2) (of course, this step may be carried out automatically |
42 | by the same application that determies the swap file's header offset using the | 42 | by the same application that determines the swap file's header offset using the |
43 | FIBMAP ioctl) | 43 | FIBMAP ioctl) |
44 | 44 | ||
45 | OR | 45 | OR |
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt index 4530d1bf0286..df7afe43d462 100644 --- a/Documentation/powerpc/eeh-pci-error-recovery.txt +++ b/Documentation/powerpc/eeh-pci-error-recovery.txt | |||
@@ -36,8 +36,8 @@ Causes of EEH Errors | |||
36 | EEH was originally designed to guard against hardware failure, such | 36 | EEH was originally designed to guard against hardware failure, such |
37 | as PCI cards dying from heat, humidity, dust, vibration and bad | 37 | as PCI cards dying from heat, humidity, dust, vibration and bad |
38 | electrical connections. The vast majority of EEH errors seen in | 38 | electrical connections. The vast majority of EEH errors seen in |
39 | "real life" are due to eithr poorly seated PCI cards, or, | 39 | "real life" are due to either poorly seated PCI cards, or, |
40 | unfortunately quite commonly, due device driver bugs, device firmware | 40 | unfortunately quite commonly, due to device driver bugs, device firmware |
41 | bugs, and sometimes PCI card hardware bugs. | 41 | bugs, and sometimes PCI card hardware bugs. |
42 | 42 | ||
43 | The most common software bug is one that causes the device to | 43 | The most common software bug is one that causes the device to |
diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt index e59fcbbe338c..5e03610e186f 100644 --- a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt +++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt | |||
@@ -17,12 +17,12 @@ passed by the boot loader to the kernel at boot time. The device tree | |||
17 | describes what devices are present on the board and how they are | 17 | describes what devices are present on the board and how they are |
18 | connected. The device tree can either be passed as a binary blob (as | 18 | connected. The device tree can either be passed as a binary blob (as |
19 | described in Documentation/powerpc/booting-without-of.txt), or passed | 19 | described in Documentation/powerpc/booting-without-of.txt), or passed |
20 | by Open Firmare (IEEE 1275) compatible firmware using an OF compatible | 20 | by Open Firmware (IEEE 1275) compatible firmware using an OF compatible |
21 | client interface API. | 21 | client interface API. |
22 | 22 | ||
23 | This document specifies the requirements on the device-tree for mpc5200 | 23 | This document specifies the requirements on the device-tree for mpc5200 |
24 | based boards. These requirements are above and beyond the details | 24 | based boards. These requirements are above and beyond the details |
25 | specified in either the OpenFirmware spec or booting-without-of.txt | 25 | specified in either the Open Firmware spec or booting-without-of.txt |
26 | 26 | ||
27 | All new mpc5200-based boards are expected to match this document. In | 27 | All new mpc5200-based boards are expected to match this document. In |
28 | cases where this document is not sufficient to support a new board port, | 28 | cases where this document is not sufficient to support a new board port, |
@@ -73,8 +73,8 @@ match on the compatible list; the 'most compatible' driver should be | |||
73 | selected. | 73 | selected. |
74 | 74 | ||
75 | The split between the MPC5200 and the MPC5200B leaves a bit of a | 75 | The split between the MPC5200 and the MPC5200B leaves a bit of a |
76 | connundrum. How should the compatible property be set up to provide | 76 | conundrum. How should the compatible property be set up to provide |
77 | maximum compatability information; but still acurately describe the | 77 | maximum compatibility information; but still accurately describe the |
78 | chip? For the MPC5200; the answer is easy. Most of the SoC devices | 78 | chip? For the MPC5200; the answer is easy. Most of the SoC devices |
79 | originally appeared on the MPC5200. Since they didn't exist anywhere | 79 | originally appeared on the MPC5200. Since they didn't exist anywhere |
80 | else; the 5200 compatible properties will contain only one item; | 80 | else; the 5200 compatible properties will contain only one item; |
@@ -84,7 +84,7 @@ The 5200B is almost the same as the 5200, but not quite. It fixes | |||
84 | silicon bugs and it adds a small number of enhancements. Most of the | 84 | silicon bugs and it adds a small number of enhancements. Most of the |
85 | devices either provide exactly the same interface as on the 5200. A few | 85 | devices either provide exactly the same interface as on the 5200. A few |
86 | devices have extra functions but still have a backwards compatible mode. | 86 | devices have extra functions but still have a backwards compatible mode. |
87 | To express this infomation as completely as possible, 5200B device trees | 87 | To express this information as completely as possible, 5200B device trees |
88 | should have two items in the compatible list; | 88 | should have two items in the compatible list; |
89 | "mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended | 89 | "mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended |
90 | that 5200B device trees follow this convention (instead of only listing | 90 | that 5200B device trees follow this convention (instead of only listing |
@@ -185,7 +185,7 @@ bestcomm@<addr> dma-controller mpc5200-bestcomm 5200 pic also requires | |||
185 | Recommended soc5200 child nodes; populate as needed for your board | 185 | Recommended soc5200 child nodes; populate as needed for your board |
186 | name device_type compatible Description | 186 | name device_type compatible Description |
187 | ---- ----------- ---------- ----------- | 187 | ---- ----------- ---------- ----------- |
188 | gpt@<addr> gpt mpc5200-gpt General purpose timers | 188 | gpt@<addr> gpt fsl,mpc5200-gpt General purpose timers |
189 | rtc@<addr> rtc mpc5200-rtc Real time clock | 189 | rtc@<addr> rtc mpc5200-rtc Real time clock |
190 | mscan@<addr> mscan mpc5200-mscan CAN bus controller | 190 | mscan@<addr> mscan mpc5200-mscan CAN bus controller |
191 | pci@<addr> pci mpc5200-pci PCI bridge | 191 | pci@<addr> pci mpc5200-pci PCI bridge |
@@ -199,7 +199,7 @@ ethernet@<addr> network mpc5200-fec MPC5200 ethernet device | |||
199 | ata@<addr> ata mpc5200-ata IDE ATA interface | 199 | ata@<addr> ata mpc5200-ata IDE ATA interface |
200 | i2c@<addr> i2c mpc5200-i2c I2C controller | 200 | i2c@<addr> i2c mpc5200-i2c I2C controller |
201 | usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller | 201 | usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller |
202 | xlb@<addr> xlb mpc5200-xlb XLB arbritrator | 202 | xlb@<addr> xlb mpc5200-xlb XLB arbitrator |
203 | 203 | ||
204 | Important child node properties | 204 | Important child node properties |
205 | name type description | 205 | name type description |
@@ -213,7 +213,7 @@ cell-index int When multiple devices are present, is the | |||
213 | 5) General Purpose Timer nodes (child of soc5200 node) | 213 | 5) General Purpose Timer nodes (child of soc5200 node) |
214 | On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board | 214 | On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board |
215 | design supports the internal wdt, then the device node for GPT0 should | 215 | design supports the internal wdt, then the device node for GPT0 should |
216 | include the empty property 'has-wdt'. | 216 | include the empty property 'fsl,has-wdt'. |
217 | 217 | ||
218 | 6) PSC nodes (child of soc5200 node) | 218 | 6) PSC nodes (child of soc5200 node) |
219 | PSC nodes can define the optional 'port-number' property to force assignment | 219 | PSC nodes can define the optional 'port-number' property to force assignment |
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt index 6aa9a891f3d0..683ccae00ad4 100644 --- a/Documentation/scsi/aic79xx.txt +++ b/Documentation/scsi/aic79xx.txt | |||
@@ -120,7 +120,7 @@ The following information is available in this file: | |||
120 | list size to avoid SCSI malloc pool fragmentation. | 120 | list size to avoid SCSI malloc pool fragmentation. |
121 | - Cleanup channel display in our /proc output. | 121 | - Cleanup channel display in our /proc output. |
122 | - Workaround duplicate device entries in the mid-layer | 122 | - Workaround duplicate device entries in the mid-layer |
123 | devlice list during add-single-device. | 123 | device list during add-single-device. |
124 | 124 | ||
125 | 1.3.6 (March 28th, 2003) | 125 | 1.3.6 (March 28th, 2003) |
126 | - Correct a double free in the Domain Validation code. | 126 | - Correct a double free in the Domain Validation code. |
diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt index 5f34d2ba69b4..b7e238cbb5a7 100644 --- a/Documentation/scsi/aic7xxx.txt +++ b/Documentation/scsi/aic7xxx.txt | |||
@@ -159,7 +159,7 @@ The following information is available in this file: | |||
159 | - Add support for 2.5.X's scsi_report_device_reset(). | 159 | - Add support for 2.5.X's scsi_report_device_reset(). |
160 | 160 | ||
161 | 6.2.34 (May 5th, 2003) | 161 | 6.2.34 (May 5th, 2003) |
162 | - Fix locking regression instroduced in 6.2.29 that | 162 | - Fix locking regression introduced in 6.2.29 that |
163 | could cause a lock order reversal between the io_request_lock | 163 | could cause a lock order reversal between the io_request_lock |
164 | and our per-softc lock. This was only possible on RH9, | 164 | and our per-softc lock. This was only possible on RH9, |
165 | SuSE, and kernel.org 2.4.X kernels. | 165 | SuSE, and kernel.org 2.4.X kernels. |
@@ -264,7 +264,7 @@ The following information is available in this file: | |||
264 | Option: tag_info:{{value[,value...]}[,{value[,value...]}...]} | 264 | Option: tag_info:{{value[,value...]}[,{value[,value...]}...]} |
265 | Definition: Set the per-target tagged queue depth on a | 265 | Definition: Set the per-target tagged queue depth on a |
266 | per controller basis. Both controllers and targets | 266 | per controller basis. Both controllers and targets |
267 | may be ommitted indicating that they should retain | 267 | may be omitted indicating that they should retain |
268 | the default tag depth. | 268 | the default tag depth. |
269 | Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32} | 269 | Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32} |
270 | On Controller 0 | 270 | On Controller 0 |
@@ -290,7 +290,7 @@ The following information is available in this file: | |||
290 | ----------------------------------------------------------------- | 290 | ----------------------------------------------------------------- |
291 | Option: dv: {value[,value...]} | 291 | Option: dv: {value[,value...]} |
292 | Definition: Set Domain Validation Policy on a per-controller basis. | 292 | Definition: Set Domain Validation Policy on a per-controller basis. |
293 | Controllers may be ommitted indicating that | 293 | Controllers may be omitted indicating that |
294 | they should retain the default read streaming setting. | 294 | they should retain the default read streaming setting. |
295 | Example: dv:{-1,0,,1,1,0} | 295 | Example: dv:{-1,0,,1,1,0} |
296 | On Controller 0 leave DV at its default setting. | 296 | On Controller 0 leave DV at its default setting. |
diff --git a/Documentation/scsi/arcmsr_spec.txt b/Documentation/scsi/arcmsr_spec.txt index 5e0042340fd3..45d9482c1517 100644 --- a/Documentation/scsi/arcmsr_spec.txt +++ b/Documentation/scsi/arcmsr_spec.txt | |||
@@ -3,7 +3,7 @@ | |||
3 | ******************************************************************************* | 3 | ******************************************************************************* |
4 | ** Usage of IOP331 adapter | 4 | ** Usage of IOP331 adapter |
5 | ** (All In/Out is in IOP331's view) | 5 | ** (All In/Out is in IOP331's view) |
6 | ** 1. Message 0 --> InitThread message and retrun code | 6 | ** 1. Message 0 --> InitThread message and return code |
7 | ** 2. Doorbell is used for RS-232 emulation | 7 | ** 2. Doorbell is used for RS-232 emulation |
8 | ** inDoorBell : bit0 -- data in ready | 8 | ** inDoorBell : bit0 -- data in ready |
9 | ** (DRIVER DATA WRITE OK) | 9 | ** (DRIVER DATA WRITE OK) |
diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt index a08e225653d6..a810421f1fb3 100644 --- a/Documentation/scsi/ibmmca.txt +++ b/Documentation/scsi/ibmmca.txt | |||
@@ -21,7 +21,7 @@ | |||
21 | versions older than 4.0 do not work with kernels 2.4.0 or later! If you | 21 | versions older than 4.0 do not work with kernels 2.4.0 or later! If you |
22 | try to compile your kernel with the wrong driver source, the | 22 | try to compile your kernel with the wrong driver source, the |
23 | compilation is aborted and you get a corresponding error message. This is | 23 | compilation is aborted and you get a corresponding error message. This is |
24 | no bug in the driver. It prevents you from using the wrong sourcecode | 24 | no bug in the driver; it prevents you from using the wrong source code |
25 | with the wrong kernel version. | 25 | with the wrong kernel version. |
26 | 26 | ||
27 | Authors of this Driver | 27 | Authors of this Driver |
@@ -58,7 +58,7 @@ | |||
58 | 5 Users' Manual | 58 | 5 Users' Manual |
59 | 5.1 Commandline Parameters | 59 | 5.1 Commandline Parameters |
60 | 5.2 Troubleshooting | 60 | 5.2 Troubleshooting |
61 | 5.3 Bugreports | 61 | 5.3 Bug reports |
62 | 5.4 Support WWW-page | 62 | 5.4 Support WWW-page |
63 | 6 References | 63 | 6 References |
64 | 7 Credits to | 64 | 7 Credits to |
@@ -71,13 +71,13 @@ | |||
71 | 71 | ||
72 | 1 Abstract | 72 | 1 Abstract |
73 | ---------- | 73 | ---------- |
74 | This README-file describes the IBM SCSI-subsystem low level driver for | 74 | This README-file describes the IBM SCSI-subsystem low level driver for |
75 | Linux. The descriptions which were formerly kept in the source-code have | 75 | Linux. The descriptions which were formerly kept in the source code have |
76 | been taken out to this file to easify the codes' readability. The driver | 76 | been moved out to this file to simplify the code's readability. The driver |
77 | description has been updated, as most of the former description was already | 77 | description has been updated, as most of the former description was already |
78 | quite outdated. The history of the driver development is also kept inside | 78 | quite outdated. The history of the driver development is also kept inside |
79 | here. Multiple historical developments have been summarized to shorten the | 79 | here. Multiple historical developments have been summarized to shorten the |
80 | textsize a bit. At the end of this file you can find a small manual for | 80 | text size a bit. At the end of this file you can find a small manual for |
81 | this driver and hints to get it running on your machine. | 81 | this driver and hints to get it running on your machine. |
82 | 82 | ||
83 | 2 Driver Description | 83 | 2 Driver Description |
@@ -186,7 +186,7 @@ | |||
186 | between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two | 186 | between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two |
187 | busses and provides support for 30 logical devices at the same time, where | 187 | busses and provides support for 30 logical devices at the same time, where |
188 | in wide-addressing mode you can have 16 puns with 32 luns on each device. | 188 | in wide-addressing mode you can have 16 puns with 32 luns on each device. |
189 | This section dexribes you the handling of devices on non-F/W adapters. | 189 | This section describes the handling of devices on non-F/W adapters. |
190 | Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter | 190 | Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter |
191 | which means a lot of possible devices for such a small machine. | 191 | which means a lot of possible devices for such a small machine. |
192 | 192 | ||
@@ -209,10 +209,10 @@ | |||
209 | -------------------------------------------------------- | 209 | -------------------------------------------------------- |
210 | One consequence of information hiding is that the real (pun,lun) | 210 | One consequence of information hiding is that the real (pun,lun) |
211 | numbers are also hidden. The two possibilities to get around this problem | 211 | numbers are also hidden. The two possibilities to get around this problem |
212 | is to offer fake pun/lun combinations to the operating system or to | 212 | are to offer fake pun/lun combinations to the operating system or to |
213 | delete the whole mapping of the adapter and to reassign the ldns, using | 213 | delete the whole mapping of the adapter and to reassign the ldns, using |
214 | the immediate assign command of the SCSI-subsystem for probing through | 214 | the immediate assign command of the SCSI-subsystem for probing through |
215 | all possible pun/lun combinations. a ldn is a "logical device number" | 215 | all possible pun/lun combinations. An ldn is a "logical device number" |
216 | which is used by IBM SCSI-subsystems to access some valid SCSI-device. | 216 | which is used by IBM SCSI-subsystems to access some valid SCSI-device. |
217 | At the beginning of the development of this driver, the following approach | 217 | At the beginning of the development of this driver, the following approach |
218 | was used: | 218 | was used: |
@@ -251,9 +251,9 @@ | |||
251 | lun>0 or to non-existing devices, in order to satisfy the subsystem, if | 251 | lun>0 or to non-existing devices, in order to satisfy the subsystem, if |
252 | there are less than 15 SCSI-devices connected. In the case of more than 15 | 252 | there are less than 15 SCSI-devices connected. In the case of more than 15 |
253 | devices, the dynamical mapping goes active. If the get_scsi[][] reports a | 253 | devices, the dynamical mapping goes active. If the get_scsi[][] reports a |
254 | device to be existant, but it has no ldn assigned, it gets a ldn out of 7 | 254 | device to be existent, but it has no ldn assigned, it gets an ldn out of 7 |
255 | to 14. The numbers are assigned in cyclic order. Therefore it takes 8 | 255 | to 14. The numbers are assigned in cyclic order, therefore it takes 8 |
256 | dynamical reassignments on the SCSI-devices, until a certain device | 256 | dynamical reassignments on the SCSI-devices until a certain device |
257 | loses its ldn again. This assures that dynamical remapping is avoided | 257 | loses its ldn again. This assures that dynamical remapping is avoided |
258 | during intense I/O between up to 15 SCSI-devices (means pun,lun | 258 | during intense I/O between up to 15 SCSI-devices (means pun,lun |
259 | combinations). A further advantage of this method is that people who | 259 | combinations). A further advantage of this method is that people who |
@@ -551,7 +551,7 @@ | |||
551 | than devices are available, they are assigned to non existing pun,lun | 551 | than devices are available, they are assigned to non existing pun,lun |
552 | combinations to satisfy the adapter. With this, the dynamical mapping | 552 | combinations to satisfy the adapter. With this, the dynamical mapping |
553 | was possible to implement. (For further info see the text in the | 553 | was possible to implement. (For further info see the text in the |
554 | source-code and in the description below. Read the description | 554 | source code and in the description below. Read the description |
555 | below BEFORE installing this driver on your system!) | 555 | below BEFORE installing this driver on your system!) |
556 | 2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION. | 556 | 2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION. |
557 | 3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID | 557 | 3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID |
@@ -762,9 +762,9 @@ | |||
762 | - Michael Lang | 762 | - Michael Lang |
763 | 763 | ||
764 | Apr 23, 2000 (v3.2pre1) | 764 | Apr 23, 2000 (v3.2pre1) |
765 | 1) During a very long time, I collected a huge amount of bugreports from | 765 | 1) During a very long time, I collected a huge amount of bug reports from |
766 | various people, trying really quite different things on their SCSI- | 766 | various people, trying really quite different things on their SCSI- |
767 | PS/2s. Today, all these bugreports are taken into account and should be | 767 | PS/2s. Today, all these bug reports are taken into account and should be |
768 | mostly solved. The major topics were: | 768 | mostly solved. The major topics were: |
769 | - Driver crashes during boottime by no obvious reason. | 769 | - Driver crashes during boottime by no obvious reason. |
770 | - Driver panics while the midlevel-SCSI-driver is trying to inquire | 770 | - Driver panics while the midlevel-SCSI-driver is trying to inquire |
@@ -819,7 +819,7 @@ | |||
819 | - Michael Lang | 819 | - Michael Lang |
820 | 820 | ||
821 | July 17, 2000 (v3.2pre8) | 821 | July 17, 2000 (v3.2pre8) |
822 | A long period of collecting bugreports from all corners of the world | 822 | A long period of collecting bug reports from all corners of the world |
823 | now led to the following corrections to the code: | 823 | now led to the following corrections to the code: |
824 | 1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this | 824 | 1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this |
825 | was that it is possible to disable Fast-SCSI for the external bus. | 825 | was that it is possible to disable Fast-SCSI for the external bus. |
@@ -873,7 +873,7 @@ | |||
873 | July 26, 2000 (v3.2pre11) | 873 | July 26, 2000 (v3.2pre11) |
874 | 1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and | 874 | 1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and |
875 | a model 9595. Asking around in the community, nobody except me has | 875 | a model 9595. Asking around in the community, nobody except me has |
876 | seen such errors. Weired, but I am trying to recompile everything on | 876 | seen such errors. Weird, but I am trying to recompile everything on |
877 | the model 9595. Maybe, as I use a specially modified gcc, that could | 877 | the model 9595. Maybe, as I use a specially modified gcc, that could |
878 | cause problems. But, it was not the reason. The true background was, | 878 | cause problems. But, it was not the reason. The true background was, |
879 | that the kernel was compiled for i386 and the 9595 has a 486DX-2. | 879 | that the kernel was compiled for i386 and the 9595 has a 486DX-2. |
@@ -886,7 +886,7 @@ | |||
886 | alive rotator during boottime. This makes sense, when no monitor is | 886 | alive rotator during boottime. This makes sense, when no monitor is |
887 | connected to the system. You can get rid of all display activity, if | 887 | connected to the system. You can get rid of all display activity, if |
888 | you do not use any parameter or just ibmmcascsi=activity, for the | 888 | you do not use any parameter or just ibmmcascsi=activity, for the |
889 | harddrive activity LED, existant on all PS/2, except models 8595-XXX. | 889 | harddrive activity LED, existent on all PS/2, except models 8595-XXX. |
890 | If no monitor is available, please use ibmmcascsi=display, which works | 890 | If no monitor is available, please use ibmmcascsi=display, which works |
891 | fine together with the linuxinfo utility for the LED-panel. | 891 | fine together with the linuxinfo utility for the LED-panel. |
892 | - Michael Lang | 892 | - Michael Lang |
@@ -1115,7 +1115,7 @@ | |||
1115 | If this really happens, do also send e-mail to the maintainer, as | 1115 | If this really happens, do also send e-mail to the maintainer, as |
1116 | forced detection should be never necessary. Forced detection is in | 1116 | forced detection should be never necessary. Forced detection is in |
1117 | principle some flaw of the driver adapter detection and goes into | 1117 | principle some flaw of the driver adapter detection and goes into |
1118 | bugreports. | 1118 | bug reports. |
1119 | Q: The driver screws up, if it starts to probe SCSI-devices, is there | 1119 | Q: The driver screws up, if it starts to probe SCSI-devices, is there |
1120 | some way out of it? | 1120 | some way out of it? |
1121 | A: Yes, that was some recognition problem of the correct SCSI-adapter | 1121 | A: Yes, that was some recognition problem of the correct SCSI-adapter |
@@ -1172,7 +1172,7 @@ | |||
1172 | recommended version is 3.2 or later. Here, the F/W support is in | 1172 | recommended version is 3.2 or later. Here, the F/W support is in |
1173 | a stable and reliable condition. Wide-addressing is in addition | 1173 | a stable and reliable condition. Wide-addressing is in addition |
1174 | supported. | 1174 | supported. |
1175 | Q: I get a Ooops message and something like "killing interrupt". | 1175 | Q: I get an Oops message and something like "killing interrupt". |
1176 | A: The reason for this is that the IBM SCSI-subsystem only sends a | 1176 | A: The reason for this is that the IBM SCSI-subsystem only sends a |
1177 | termination status back, if some error appeared. In former releases | 1177 | termination status back, if some error appeared. In former releases |
1178 | of the driver, it was not checked, if the termination status block | 1178 | of the driver, it was not checked, if the termination status block |
@@ -1213,21 +1213,21 @@ | |||
1213 | problem. Not yet tried, but guessing that it could work. To get this, | 1213 | problem. Not yet tried, but guessing that it could work. To get this, |
1214 | set unchecked_isa_dma argument of ibmmca.h from 0 to 1. | 1214 | set unchecked_isa_dma argument of ibmmca.h from 0 to 1. |
1215 | 1215 | ||
1216 | 5.3 Bugreports | 1216 | 5.3 Bug reports |
1217 | -------------- | 1217 | -------------- |
1218 | If you really find bugs in the sourcecode or the driver will successfully | 1218 | If you really find bugs in the source code or the driver will successfully |
1219 | refuse to work on your machine, you should send a bug report to me. The | 1219 | refuse to work on your machine, you should send a bug report to me. The |
1220 | best for this is to follow the instructions on the WWW-page for this | 1220 | best for this is to follow the instructions on the WWW-page for this |
1221 | driver. Fill out the bug-report form, placed on the WWW-page and ship it, | 1221 | driver. Fill out the bug-report form, placed on the WWW-page and ship it, |
1222 | so the bugs can be taken into account with maximum efforts. But, please | 1222 | so the bugs can be taken into account with maximum efforts. But, please |
1223 | do not send bug reports about this driver to Linus Torvalds or Leonard | 1223 | do not send bug reports about this driver to Linus Torvalds or Leonard |
1224 | Zubkoff, as Linus is burried in E-Mail and Leonard is supervising all | 1224 | Zubkoff, as Linus is buried in E-Mail and Leonard is supervising all |
1225 | SCSI-drivers and won't have the time left to look inside every single | 1225 | SCSI-drivers and won't have the time left to look inside every single |
1226 | driver to fix a bug and especially DO NOT send modified code to Linus | 1226 | driver to fix a bug and especially DO NOT send modified code to Linus |
1227 | Torvalds or Alan J. Cox which has not been checked here!!! They are both | 1227 | Torvalds or Alan J. Cox which has not been checked here!!! They are both |
1228 | quite burried in E-mail (as me, sometimes, too) and one should first check | 1228 | quite buried in E-mail (as me, sometimes, too) and one should first check |
1229 | for problems on my local teststand. Recently, I got a lot of | 1229 | for problems on my local teststand. Recently, I got a lot of |
1230 | bugreports for errors in the ibmmca.c code, which I could not imagine, but | 1230 | bug reports for errors in the ibmmca.c code, which I could not imagine, but |
1231 | a look inside some Linux-distribution showed me quite often some modified | 1231 | a look inside some Linux-distribution showed me quite often some modified |
1232 | code, which did no longer work on most other machines than the one of the | 1232 | code, which did no longer work on most other machines than the one of the |
1233 | modifier. Ok, so now that there is maintenance service available for this | 1233 | modifier. Ok, so now that there is maintenance service available for this |
@@ -1261,7 +1261,7 @@ | |||
1261 | some e-mail directly, but at least with the same information as required by | 1261 | some e-mail directly, but at least with the same information as required by |
1262 | the form. | 1262 | the form. |
1263 | 1263 | ||
1264 | If you have extensive bugreports, including Ooops messages and | 1264 | If you have extensive bug reports, including Oops messages and |
1265 | screen-shots, please feel free to send it directly to the address | 1265 | screen-shots, please feel free to send it directly to the address |
1266 | of the maintainer, too. The current address of the maintainer is: | 1266 | of the maintainer, too. The current address of the maintainer is: |
1267 | 1267 | ||
@@ -1318,7 +1318,7 @@ | |||
1318 | detailed bug reports and ideas for this driver (and his | 1318 | detailed bug reports and ideas for this driver (and his |
1319 | patience ;-)). | 1319 | patience ;-)). |
1320 | Alan J. Cox | 1320 | Alan J. Cox |
1321 | for his bugreports and his bold activities in cross-checking | 1321 | for his bug reports and his bold activities in cross-checking |
1322 | the driver-code with his teststand. | 1322 | the driver-code with his teststand. |
1323 | 1323 | ||
1324 | 7.2 Sponsors & Supporters | 1324 | 7.2 Sponsors & Supporters |
diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt index ccf1cebe744f..736540045dc7 100644 --- a/Documentation/sharedsubtree.txt +++ b/Documentation/sharedsubtree.txt | |||
@@ -153,6 +153,7 @@ replicas continue to be exactly same. | |||
153 | #include <stdio.h> | 153 | #include <stdio.h> |
154 | #include <stdlib.h> | 154 | #include <stdlib.h> |
155 | #include <unistd.h> | 155 | #include <unistd.h> |
156 | #include <string.h> | ||
156 | #include <sys/mount.h> | 157 | #include <sys/mount.h> |
157 | #include <sys/fsuid.h> | 158 | #include <sys/fsuid.h> |
158 | 159 | ||
diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt index 58cbfd01ea8f..3feeb9ecdec4 100644 --- a/Documentation/sound/alsa/soc/DAI.txt +++ b/Documentation/sound/alsa/soc/DAI.txt | |||
@@ -20,12 +20,12 @@ I2S | |||
20 | === | 20 | === |
21 | 21 | ||
22 | I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and | 22 | I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and |
23 | Rx lines are used for audio transmision, whilst the bit clock (BCLK) and | 23 | Rx lines are used for audio transmission, whilst the bit clock (BCLK) and |
24 | left/right clock (LRC) synchronise the link. I2S is flexible in that either the | 24 | left/right clock (LRC) synchronise the link. I2S is flexible in that either the |
25 | controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock | 25 | controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock |
26 | usually varies depending on the sample rate and the master system clock | 26 | usually varies depending on the sample rate and the master system clock |
27 | (SYSCLK). LRCLK is the same as the sample rate. A few devices support separate | 27 | (SYSCLK). LRCLK is the same as the sample rate. A few devices support separate |
28 | ADC and DAC LRCLK's, this allows for similtanious capture and playback at | 28 | ADC and DAC LRCLK's, this allows for simultaneous capture and playback at |
29 | different sample rates. | 29 | different sample rates. |
30 | 30 | ||
31 | I2S has several different operating modes:- | 31 | I2S has several different operating modes:- |
@@ -41,12 +41,12 @@ I2S has several different operating modes:- | |||
41 | PCM | 41 | PCM |
42 | === | 42 | === |
43 | 43 | ||
44 | PCM is another 4 wire interface, very similar to I2S, that can support a more | 44 | PCM is another 4 wire interface, very similar to I2S, which can support a more |
45 | flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used | 45 | flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used |
46 | to synchronise the link whilst the Tx and Rx lines are used to transmit and | 46 | to synchronise the link whilst the Tx and Rx lines are used to transmit and |
47 | receive the audio data. Bit clock usually varies depending on sample rate | 47 | receive the audio data. Bit clock usually varies depending on sample rate |
48 | whilst sync runs at the sample rate. PCM also supports Time Division | 48 | whilst sync runs at the sample rate. PCM also supports Time Division |
49 | Multiplexing (TDM) in that several devices can use the bus similtaniuosly (This | 49 | Multiplexing (TDM) in that several devices can use the bus simultaneously (this |
50 | is sometimes referred to as network mode). | 50 | is sometimes referred to as network mode). |
51 | 51 | ||
52 | Common PCM operating modes:- | 52 | Common PCM operating modes:- |
diff --git a/Documentation/sound/alsa/soc/clocking.txt b/Documentation/sound/alsa/soc/clocking.txt index e93960d53a1e..14930887c25f 100644 --- a/Documentation/sound/alsa/soc/clocking.txt +++ b/Documentation/sound/alsa/soc/clocking.txt | |||
@@ -2,20 +2,20 @@ Audio Clocking | |||
2 | ============== | 2 | ============== |
3 | 3 | ||
4 | This text describes the audio clocking terms in ASoC and digital audio in | 4 | This text describes the audio clocking terms in ASoC and digital audio in |
5 | general. Note: Audio clocking can be complex ! | 5 | general. Note: Audio clocking can be complex! |
6 | 6 | ||
7 | 7 | ||
8 | Master Clock | 8 | Master Clock |
9 | ------------ | 9 | ------------ |
10 | 10 | ||
11 | Every audio subsystem is driven by a master clock (sometimes refered to as MCLK | 11 | Every audio subsystem is driven by a master clock (sometimes referred to as MCLK |
12 | or SYSCLK). This audio master clock can be derived from a number of sources | 12 | or SYSCLK). This audio master clock can be derived from a number of sources |
13 | (e.g. crystal, PLL, CPU clock) and is responsible for producing the correct | 13 | (e.g. crystal, PLL, CPU clock) and is responsible for producing the correct |
14 | audio playback and capture sample rates. | 14 | audio playback and capture sample rates. |
15 | 15 | ||
16 | Some master clocks (e.g. PLL's and CPU based clocks) are configuarble in that | 16 | Some master clocks (e.g. PLL's and CPU based clocks) are configurable in that |
17 | their speed can be altered by software (depending on the system use and to save | 17 | their speed can be altered by software (depending on the system use and to save |
18 | power). Other master clocks are fixed at at set frequency (i.e. crystals). | 18 | power). Other master clocks are fixed at a set frequency (i.e. crystals). |
19 | 19 | ||
20 | 20 | ||
21 | DAI Clocks | 21 | DAI Clocks |
@@ -44,7 +44,7 @@ This relationship depends on the codec or SoC CPU in particular. In general | |||
44 | it's best to configure BCLK to the lowest possible speed (depending on your | 44 | it's best to configure BCLK to the lowest possible speed (depending on your |
45 | rate, number of channels and wordsize) to save on power. | 45 | rate, number of channels and wordsize) to save on power. |
46 | 46 | ||
47 | It's also desireable to use the codec (if possible) to drive (or master) the | 47 | It's also desirable to use the codec (if possible) to drive (or master) the |
48 | audio clocks as it usually gives more accurate sample rates than the CPU. | 48 | audio clocks as it usually gives more accurate sample rates than the CPU. |
49 | 49 | ||
50 | 50 | ||
diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt index 48983c75aad9..1e766ad0ebd1 100644 --- a/Documentation/sound/alsa/soc/codec.txt +++ b/Documentation/sound/alsa/soc/codec.txt | |||
@@ -19,7 +19,7 @@ Optionally, codec drivers can also provide:- | |||
19 | 6) DAPM event handler. | 19 | 6) DAPM event handler. |
20 | 7) DAC Digital mute control. | 20 | 7) DAC Digital mute control. |
21 | 21 | ||
22 | It's probably best to use this guide in conjuction with the existing codec | 22 | It's probably best to use this guide in conjunction with the existing codec |
23 | driver code in sound/soc/codecs/ | 23 | driver code in sound/soc/codecs/ |
24 | 24 | ||
25 | ASoC Codec driver breakdown | 25 | ASoC Codec driver breakdown |
@@ -28,7 +28,7 @@ ASoC Codec driver breakdown | |||
28 | 1 - Codec DAI and PCM configuration | 28 | 1 - Codec DAI and PCM configuration |
29 | ----------------------------------- | 29 | ----------------------------------- |
30 | Each codec driver must have a struct snd_soc_codec_dai to define its DAI and | 30 | Each codec driver must have a struct snd_soc_codec_dai to define its DAI and |
31 | PCM's capablities and operations. This struct is exported so that it can be | 31 | PCM's capabilities and operations. This struct is exported so that it can be |
32 | registered with the core by your machine driver. | 32 | registered with the core by your machine driver. |
33 | 33 | ||
34 | e.g. | 34 | e.g. |
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(wm8731_dai); | |||
67 | 67 | ||
68 | 2 - Codec control IO | 68 | 2 - Codec control IO |
69 | -------------------- | 69 | -------------------- |
70 | The codec can ususally be controlled via an I2C or SPI style interface (AC97 | 70 | The codec can usually be controlled via an I2C or SPI style interface (AC97 |
71 | combines control with data in the DAI). The codec drivers will have to provide | 71 | combines control with data in the DAI). The codec drivers will have to provide |
72 | functions to read and write the codec registers along with supplying a register | 72 | functions to read and write the codec registers along with supplying a register |
73 | cache:- | 73 | cache:- |
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt index c11877f5b4a1..ab0766fd7869 100644 --- a/Documentation/sound/alsa/soc/dapm.txt +++ b/Documentation/sound/alsa/soc/dapm.txt | |||
@@ -11,7 +11,7 @@ other PM systems. | |||
11 | 11 | ||
12 | DAPM is also completely transparent to all user space applications as all power | 12 | DAPM is also completely transparent to all user space applications as all power |
13 | switching is done within the ASoC core. No code changes or recompiling are | 13 | switching is done within the ASoC core. No code changes or recompiling are |
14 | required for user space applications. DAPM makes power switching descisions based | 14 | required for user space applications. DAPM makes power switching decisions based |
15 | upon any audio stream (capture/playback) activity and audio mixer settings | 15 | upon any audio stream (capture/playback) activity and audio mixer settings |
16 | within the device. | 16 | within the device. |
17 | 17 | ||
@@ -38,7 +38,7 @@ There are 4 power domains within DAPM | |||
38 | Enabled and disabled when stream playback/capture is started and | 38 | Enabled and disabled when stream playback/capture is started and |
39 | stopped respectively. e.g. aplay, arecord. | 39 | stopped respectively. e.g. aplay, arecord. |
40 | 40 | ||
41 | All DAPM power switching descisons are made automatically by consulting an audio | 41 | All DAPM power switching decisions are made automatically by consulting an audio |
42 | routing map of the whole machine. This map is specific to each machine and | 42 | routing map of the whole machine. This map is specific to each machine and |
43 | consists of the interconnections between every audio component (including | 43 | consists of the interconnections between every audio component (including |
44 | internal codec components). All audio components that affect power are called | 44 | internal codec components). All audio components that affect power are called |
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt index 753c5cc5984a..c47ce9530677 100644 --- a/Documentation/sound/alsa/soc/overview.txt +++ b/Documentation/sound/alsa/soc/overview.txt | |||
@@ -2,18 +2,19 @@ ALSA SoC Layer | |||
2 | ============== | 2 | ============== |
3 | 3 | ||
4 | The overall project goal of the ALSA System on Chip (ASoC) layer is to provide | 4 | The overall project goal of the ALSA System on Chip (ASoC) layer is to provide |
5 | better ALSA support for embedded system on chip procesors (e.g. pxa2xx, au1x00, | 5 | better ALSA support for embedded system-on-chip processors (e.g. pxa2xx, au1x00, |
6 | iMX, etc) and portable audio codecs. Currently there is some support in the | 6 | iMX, etc) and portable audio codecs. Currently there is some support in the |
7 | kernel for SoC audio, however it has some limitations:- | 7 | kernel for SoC audio, however it has some limitations:- |
8 | 8 | ||
9 | * Currently, codec drivers are often tightly coupled to the underlying SoC | 9 | * Currently, codec drivers are often tightly coupled to the underlying SoC |
10 | cpu. This is not ideal and leads to code duplication i.e. Linux now has 4 | 10 | CPU. This is not ideal and leads to code duplication i.e. Linux now has 4 |
11 | different wm8731 drivers for 4 different SoC platforms. | 11 | different wm8731 drivers for 4 different SoC platforms. |
12 | 12 | ||
13 | * There is no standard method to signal user initiated audio events. | 13 | * There is no standard method to signal user initiated audio events (e.g. |
14 | e.g. Headphone/Mic insertion, Headphone/Mic detection after an insertion | 14 | Headphone/Mic insertion, Headphone/Mic detection after an insertion |
15 | event. These are quite common events on portable devices and ofter require | 15 | event). These are quite common events on portable devices and often require |
16 | machine specific code to re route audio, enable amps etc after such an event. | 16 | machine specific code to re-route audio, enable amps, etc., after such an |
17 | event. | ||
17 | 18 | ||
18 | * Current drivers tend to power up the entire codec when playing | 19 | * Current drivers tend to power up the entire codec when playing |
19 | (or recording) audio. This is fine for a PC, but tends to waste a lot of | 20 | (or recording) audio. This is fine for a PC, but tends to waste a lot of |
@@ -44,7 +45,7 @@ features :- | |||
44 | signals the codec when to change power states. | 45 | signals the codec when to change power states. |
45 | 46 | ||
46 | * Machine specific controls: Allow machines to add controls to the sound card | 47 | * Machine specific controls: Allow machines to add controls to the sound card |
47 | e.g. volume control for speaker amp. | 48 | (e.g. volume control for speaker amp). |
48 | 49 | ||
49 | To achieve all this, ASoC basically splits an embedded audio system into 3 | 50 | To achieve all this, ASoC basically splits an embedded audio system into 3 |
50 | components :- | 51 | components :- |
@@ -57,7 +58,7 @@ components :- | |||
57 | interface drivers (e.g. I2S, AC97, PCM) for that platform. | 58 | interface drivers (e.g. I2S, AC97, PCM) for that platform. |
58 | 59 | ||
59 | * Machine driver: The machine driver handles any machine specific controls and | 60 | * Machine driver: The machine driver handles any machine specific controls and |
60 | audio events. i.e. turing on an amp at start of playback. | 61 | audio events (e.g. turning on an amp at start of playback). |
61 | 62 | ||
62 | 63 | ||
63 | Documentation | 64 | Documentation |
diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt index e95b16d5a53b..d4678b4dc6c6 100644 --- a/Documentation/sound/alsa/soc/platform.txt +++ b/Documentation/sound/alsa/soc/platform.txt | |||
@@ -20,7 +20,7 @@ struct snd_soc_ops { | |||
20 | int (*trigger)(struct snd_pcm_substream *, int); | 20 | int (*trigger)(struct snd_pcm_substream *, int); |
21 | }; | 21 | }; |
22 | 22 | ||
23 | The platform driver exports it's DMA functionailty via struct snd_soc_platform:- | 23 | The platform driver exports its DMA functionality via struct snd_soc_platform:- |
24 | 24 | ||
25 | struct snd_soc_platform { | 25 | struct snd_soc_platform { |
26 | char *name; | 26 | char *name; |
diff --git a/Documentation/sound/alsa/soc/pops_clicks.txt b/Documentation/sound/alsa/soc/pops_clicks.txt index 2cf7ee5b3d74..3371bd9d7cfa 100644 --- a/Documentation/sound/alsa/soc/pops_clicks.txt +++ b/Documentation/sound/alsa/soc/pops_clicks.txt | |||
@@ -2,7 +2,7 @@ Audio Pops and Clicks | |||
2 | ===================== | 2 | ===================== |
3 | 3 | ||
4 | Pops and clicks are unwanted audio artifacts caused by the powering up and down | 4 | Pops and clicks are unwanted audio artifacts caused by the powering up and down |
5 | of components within the audio subsystem. This is noticable on PC's when an | 5 | of components within the audio subsystem. This is noticeable on PCs when an |
6 | audio module is either loaded or unloaded (at module load time the sound card is | 6 | audio module is either loaded or unloaded (at module load time the sound card is |
7 | powered up and causes a popping noise on the speakers). | 7 | powered up and causes a popping noise on the speakers). |
8 | 8 | ||
@@ -16,7 +16,7 @@ Minimising Playback Pops and Clicks | |||
16 | =================================== | 16 | =================================== |
17 | 17 | ||
18 | Playback pops in portable audio subsystems cannot be completely eliminated atm, | 18 | Playback pops in portable audio subsystems cannot be completely eliminated atm, |
19 | however future audio codec hardware will have better pop and click supression. | 19 | however future audio codec hardware will have better pop and click suppression. |
20 | Pops can be reduced within playback by powering the audio components in a | 20 | Pops can be reduced within playback by powering the audio components in a |
21 | specific order. This order is different for startup and shutdown and follows | 21 | specific order. This order is different for startup and shutdown and follows |
22 | some basic rules:- | 22 | some basic rules:- |
@@ -33,7 +33,7 @@ Minimising Capture Pops and Clicks | |||
33 | ================================== | 33 | ================================== |
34 | 34 | ||
35 | Capture artifacts are somewhat easier to get rid of as we can delay activating the | 35 | Capture artifacts are somewhat easier to get rid of as we can delay activating the |
36 | ADC until all the pops have occured. This follows similar power rules to | 36 | ADC until all the pops have occurred. This follows similar power rules to |
37 | playback in that components are powered in a sequence depending upon stream | 37 | playback in that components are powered in a sequence depending upon stream |
38 | startup or shutdown. | 38 | startup or shutdown. |
39 | 39 | ||
diff --git a/Documentation/sound/oss/es1371 b/Documentation/sound/oss/es1371 deleted file mode 100644 index c3151266771c..000000000000 --- a/Documentation/sound/oss/es1371 +++ /dev/null | |||
@@ -1,64 +0,0 @@ | |||
1 | /proc/sound, /dev/sndstat | ||
2 | ------------------------- | ||
3 | |||
4 | /proc/sound and /dev/sndstat is not supported by the | ||
5 | driver. To find out whether the driver succeeded loading, | ||
6 | check the kernel log (dmesg). | ||
7 | |||
8 | |||
9 | ALaw/uLaw sample formats | ||
10 | ------------------------ | ||
11 | |||
12 | This driver does not support the ALaw/uLaw sample formats. | ||
13 | ALaw is the default mode when opening a sound device | ||
14 | using OSS/Free. The reason for the lack of support is | ||
15 | that the hardware does not support these formats, and adding | ||
16 | conversion routines to the kernel would lead to very ugly | ||
17 | code in the presence of the mmap interface to the driver. | ||
18 | And since xquake uses mmap, mmap is considered important :-) | ||
19 | and no sane application uses ALaw/uLaw these days anyway. | ||
20 | In short, playing a Sun .au file as follows: | ||
21 | |||
22 | cat my_file.au > /dev/dsp | ||
23 | |||
24 | does not work. Instead, you may use the play script from | ||
25 | Chris Bagwell's sox-12.14 package (available from the URL | ||
26 | below) to play many different audio file formats. | ||
27 | The script automatically determines the audio format | ||
28 | and does do audio conversions if necessary. | ||
29 | http://home.sprynet.com/sprynet/cbagwell/projects.html | ||
30 | |||
31 | |||
32 | Blocking vs. nonblocking IO | ||
33 | --------------------------- | ||
34 | |||
35 | Unlike OSS/Free this driver honours the O_NONBLOCK file flag | ||
36 | not only during open, but also during read and write. | ||
37 | This is an effort to make the sound driver interface more | ||
38 | regular. Timidity has problems with this; a patch | ||
39 | is available from http://www.ife.ee.ethz.ch/~sailer/linux/pciaudio.html. | ||
40 | (Timidity patched will also run on OSS/Free). | ||
41 | |||
42 | |||
43 | MIDI UART | ||
44 | --------- | ||
45 | |||
46 | The driver supports a simple MIDI UART interface, with | ||
47 | no ioctl's supported. | ||
48 | |||
49 | |||
50 | MIDI synthesizer | ||
51 | ---------------- | ||
52 | |||
53 | This soundcard does not have any hardware MIDI synthesizer; | ||
54 | MIDI synthesis has to be done in software. To allow this | ||
55 | the driver/soundcard supports two PCM (/dev/dsp) interfaces. | ||
56 | |||
57 | There is a freely available software package that allows | ||
58 | MIDI file playback on this soundcard called Timidity. | ||
59 | See http://www.cgs.fi/~tt/timidity/. | ||
60 | |||
61 | |||
62 | |||
63 | Thomas Sailer | ||
64 | t.sailer@alumni.ethz.ch | ||
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx index 215e3b8e7266..f3853cc37bde 100644 --- a/Documentation/spi/pxa2xx +++ b/Documentation/spi/pxa2xx | |||
@@ -1,4 +1,4 @@ | |||
1 | PXA2xx SPI on SSP driver HOWTO | 1 | PXA2xx SPI on SSP driver HOWTO |
2 | =================================================== | 2 | =================================================== |
3 | This is a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx | 3 | This is a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx |
4 | synchronous serial port into a SPI master controller | 4 | synchronous serial port into a SPI master controller |
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt index 60953d6c919d..ec499265deca 100644 --- a/Documentation/thinkpad-acpi.txt +++ b/Documentation/thinkpad-acpi.txt | |||
@@ -105,10 +105,15 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver | |||
105 | as a driver attribute (see below). | 105 | as a driver attribute (see below). |
106 | 106 | ||
107 | Sysfs driver attributes are on the driver's sysfs attribute space, | 107 | Sysfs driver attributes are on the driver's sysfs attribute space, |
108 | for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/. | 108 | for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and |
109 | /sys/bus/platform/drivers/thinkpad_hwmon/ | ||
109 | 110 | ||
110 | Sysfs device attributes are on the driver's sysfs attribute space, | 111 | Sysfs device attributes are on the thinkpad_acpi device sysfs attribute |
111 | for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/. | 112 | space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/. |
113 | |||
114 | Sysfs device attributes for the sensors and fan are on the | ||
115 | thinkpad_hwmon device's sysfs attribute space, but you should locate it | ||
116 | by looking for a hwmon device with the name attribute of "thinkpad". | ||
112 | 117 | ||
113 | Driver version | 118 | Driver version |
114 | -------------- | 119 | -------------- |
@@ -766,7 +771,7 @@ Temperature sensors | |||
766 | ------------------- | 771 | ------------------- |
767 | 772 | ||
768 | procfs: /proc/acpi/ibm/thermal | 773 | procfs: /proc/acpi/ibm/thermal |
769 | sysfs device attributes: (hwmon) temp*_input | 774 | sysfs device attributes: (hwmon "thinkpad") temp*_input |
770 | 775 | ||
771 | Most ThinkPads include six or more separate temperature sensors but only | 776 | Most ThinkPads include six or more separate temperature sensors but only |
772 | expose the CPU temperature through the standard ACPI methods. This | 777 | expose the CPU temperature through the standard ACPI methods. This |
@@ -989,7 +994,9 @@ Fan control and monitoring: fan speed, fan enable/disable | |||
989 | --------------------------------------------------------- | 994 | --------------------------------------------------------- |
990 | 995 | ||
991 | procfs: /proc/acpi/ibm/fan | 996 | procfs: /proc/acpi/ibm/fan |
992 | sysfs device attributes: (hwmon) fan_input, pwm1, pwm1_enable | 997 | sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, |
998 | pwm1_enable | ||
999 | sysfs hwmon driver attributes: fan_watchdog | ||
993 | 1000 | ||
994 | NOTE NOTE NOTE: fan control operations are disabled by default for | 1001 | NOTE NOTE NOTE: fan control operations are disabled by default for |
995 | safety reasons. To enable them, the module parameter "fan_control=1" | 1002 | safety reasons. To enable them, the module parameter "fan_control=1" |
@@ -1028,7 +1035,7 @@ enable it if necessary to avoid overheating. | |||
1028 | 1035 | ||
1029 | An enabled fan in level "auto" may stop spinning if the EC decides the | 1036 | An enabled fan in level "auto" may stop spinning if the EC decides the |
1030 | ThinkPad is cool enough and doesn't need the extra airflow. This is | 1037 | ThinkPad is cool enough and doesn't need the extra airflow. This is |
1031 | normal, and the EC will spin the fan up if the varios thermal readings | 1038 | normal, and the EC will spin the fan up if the various thermal readings |
1032 | rise too much. | 1039 | rise too much. |
1033 | 1040 | ||
1034 | On the X40, this seems to depend on the CPU and HDD temperatures. | 1041 | On the X40, this seems to depend on the CPU and HDD temperatures. |
@@ -1131,7 +1138,7 @@ hwmon device attribute fan1_input: | |||
1131 | which can take up to two minutes. May return rubbish on older | 1138 | which can take up to two minutes. May return rubbish on older |
1132 | ThinkPads. | 1139 | ThinkPads. |
1133 | 1140 | ||
1134 | driver attribute fan_watchdog: | 1141 | hwmon driver attribute fan_watchdog: |
1135 | Fan safety watchdog timer interval, in seconds. Minimum is | 1142 | Fan safety watchdog timer interval, in seconds. Minimum is |
1136 | 1 second, maximum is 120 seconds. 0 disables the watchdog. | 1143 | 1 second, maximum is 120 seconds. 0 disables the watchdog. |
1137 | 1144 | ||
@@ -1196,7 +1203,7 @@ for example: | |||
1196 | Enabling debugging output | 1203 | Enabling debugging output |
1197 | ------------------------- | 1204 | ------------------------- |
1198 | 1205 | ||
1199 | The module takes a debug paramater which can be used to selectively | 1206 | The module takes a debug parameter which can be used to selectively |
1200 | enable various classes of debugging output, for example: | 1207 | enable various classes of debugging output, for example: |
1201 | 1208 | ||
1202 | modprobe ibm_acpi debug=0xffff | 1209 | modprobe ibm_acpi debug=0xffff |
@@ -1233,3 +1240,9 @@ Sysfs interface changelog: | |||
1233 | layer, the radio switch generates input event EV_RADIO, | 1240 | layer, the radio switch generates input event EV_RADIO, |
1234 | and the driver enables hot key handling by default in | 1241 | and the driver enables hot key handling by default in |
1235 | the firmware. | 1242 | the firmware. |
1243 | |||
1244 | 0x020000: ABI fix: added a separate hwmon platform device and | ||
1245 | driver, which must be located by name (thinkpad) | ||
1246 | and the hwmon class for libsensors4 (lm-sensors 3) | ||
1247 | compatibility. Moved all hwmon attributes to this | ||
1248 | new platform device. | ||
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt index 4e0b62b8566f..8b077e43eee7 100644 --- a/Documentation/usb/usb-serial.txt +++ b/Documentation/usb/usb-serial.txt | |||
@@ -338,7 +338,7 @@ MCT USB Single Port Serial Adapter U232 | |||
338 | This driver is for the MCT USB-RS232 Converter (25 pin, Model No. | 338 | This driver is for the MCT USB-RS232 Converter (25 pin, Model No. |
339 | U232-P25) from Magic Control Technology Corp. (there is also a 9 pin | 339 | U232-P25) from Magic Control Technology Corp. (there is also a 9 pin |
340 | Model No. U232-P9). More information about this device can be found at | 340 | Model No. U232-P9). More information about this device can be found at |
341 | the manufacture's web-site: http://www.mct.com.tw. | 341 | the manufacturer's web-site: http://www.mct.com.tw. |
342 | 342 | ||
343 | The driver is generally working, though it still needs some more testing. | 343 | The driver is generally working, though it still needs some more testing. |
344 | It is derived from the Belkin USB Serial Adapter F5U103 driver and its | 344 | It is derived from the Belkin USB Serial Adapter F5U103 driver and its |
diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c index 47801bc7e742..4cf72f3fa8e9 100644 --- a/Documentation/watchdog/src/watchdog-simple.c +++ b/Documentation/watchdog/src/watchdog-simple.c | |||
@@ -3,15 +3,25 @@ | |||
3 | #include <unistd.h> | 3 | #include <unistd.h> |
4 | #include <fcntl.h> | 4 | #include <fcntl.h> |
5 | 5 | ||
6 | int main(int argc, const char *argv[]) { | 6 | int main(void) |
7 | { | ||
7 | int fd = open("/dev/watchdog", O_WRONLY); | 8 | int fd = open("/dev/watchdog", O_WRONLY); |
9 | int ret = 0; | ||
8 | if (fd == -1) { | 10 | if (fd == -1) { |
9 | perror("watchdog"); | 11 | perror("watchdog"); |
10 | exit(1); | 12 | exit(EXIT_FAILURE); |
11 | } | 13 | } |
12 | while (1) { | 14 | while (1) { |
13 | write(fd, "\0", 1); | 15 | ret = write(fd, "\0", 1); |
14 | fsync(fd); | 16 | if (ret != 1) { |
17 | ret = -1; | ||
18 | break; | ||
19 | } | ||
20 | ret = fsync(fd); | ||
21 | if (ret) | ||
22 | break; | ||
15 | sleep(10); | 23 | sleep(10); |
16 | } | 24 | } |
25 | close(fd); | ||
26 | return ret; | ||
17 | } | 27 | } |