diff options
author | Jiri Kosina <jkosina@suse.cz> | 2010-06-16 12:08:13 -0400 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2010-06-16 12:08:13 -0400 |
commit | f1bbbb6912662b9f6070c5bfc4ca9eb1f06a9d5b (patch) | |
tree | c2c130a74be25b0b2dff992e1a195e2728bdaadd /Documentation | |
parent | fd0961ff67727482bb20ca7e8ea97b83e9de2ddb (diff) | |
parent | 7e27d6e778cd87b6f2415515d7127eba53fe5d02 (diff) |
Merge branch 'master' into for-next
Diffstat (limited to 'Documentation')
63 files changed, 4213 insertions, 633 deletions
diff --git a/Documentation/.gitignore b/Documentation/.gitignore new file mode 100644 index 000000000000..bcd907b4141f --- /dev/null +++ b/Documentation/.gitignore | |||
@@ -0,0 +1,7 @@ | |||
1 | filesystems/dnotify_test | ||
2 | laptops/dslm | ||
3 | timers/hpet_example | ||
4 | vm/hugepage-mmap | ||
5 | vm/hugepage-shm | ||
6 | vm/map_hugetlb | ||
7 | |||
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power new file mode 100644 index 000000000000..78c7baca3587 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-power | |||
@@ -0,0 +1,20 @@ | |||
1 | What: /sys/class/power/ds2760-battery.*/charge_now | ||
2 | Date: May 2010 | ||
3 | KernelVersion: 2.6.35 | ||
4 | Contact: Daniel Mack <daniel@caiaq.de> | ||
5 | Description: | ||
6 | This file is writeable and can be used to set the current | ||
7 | coulomb counter value inside the battery monitor chip. This | ||
8 | is needed for unavoidable corrections of aging batteries. | ||
9 | A userspace daemon can monitor the battery charging logic | ||
10 | and once the counter drops out of considerable bounds, take | ||
11 | appropriate action. | ||
12 | |||
13 | What: /sys/class/power/ds2760-battery.*/charge_full | ||
14 | Date: May 2010 | ||
15 | KernelVersion: 2.6.35 | ||
16 | Contact: Daniel Mack <daniel@caiaq.de> | ||
17 | Description: | ||
18 | This file is writeable and can be used to set the assumed | ||
19 | battery 'full level'. As batteries age, this value has to be | ||
20 | amended over time. | ||
diff --git a/Documentation/ABI/testing/sysfs-devices-node b/Documentation/ABI/testing/sysfs-devices-node new file mode 100644 index 000000000000..453a210c3ceb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-node | |||
@@ -0,0 +1,7 @@ | |||
1 | What: /sys/devices/system/node/nodeX/compact | ||
2 | Date: February 2010 | ||
3 | Contact: Mel Gorman <mel@csn.ul.ie> | ||
4 | Description: | ||
5 | When this file is written to, all memory within that node | ||
6 | will be compacted. When it completes, memory will be freed | ||
7 | into blocks which have as many contiguous pages as possible. | ||
diff --git a/Documentation/ABI/testing/sysfs-firmware-sfi b/Documentation/ABI/testing/sysfs-firmware-sfi new file mode 100644 index 000000000000..4be7d44aeacf --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-sfi | |||
@@ -0,0 +1,15 @@ | |||
1 | What: /sys/firmware/sfi/tables/ | ||
2 | Date: May 2010 | ||
3 | Contact: Len Brown <lenb@kernel.org> | ||
4 | Description: | ||
5 | SFI defines a number of small static memory tables | ||
6 | so the kernel can get platform information from firmware. | ||
7 | |||
8 | The tables are defined in the latest SFI specification: | ||
9 | http://simplefirmware.org/documentation | ||
10 | |||
11 | While the tables are used by the kernel, user-space | ||
12 | can observe them this way: | ||
13 | |||
14 | # cd /sys/firmware/sfi/tables | ||
15 | # cat $TABLENAME > $TABLENAME.bin | ||
diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt index 2e435adfbd6b..98ce51796f71 100644 --- a/Documentation/DMA-API-HOWTO.txt +++ b/Documentation/DMA-API-HOWTO.txt | |||
@@ -639,6 +639,36 @@ is planned to completely remove virt_to_bus() and bus_to_virt() as | |||
639 | they are entirely deprecated. Some ports already do not provide these | 639 | they are entirely deprecated. Some ports already do not provide these |
640 | as it is impossible to correctly support them. | 640 | as it is impossible to correctly support them. |
641 | 641 | ||
642 | Handling Errors | ||
643 | |||
644 | DMA address space is limited on some architectures and an allocation | ||
645 | failure can be determined by: | ||
646 | |||
647 | - checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0 | ||
648 | |||
649 | - checking the returned dma_addr_t of dma_map_single and dma_map_page | ||
650 | by using dma_mapping_error(): | ||
651 | |||
652 | dma_addr_t dma_handle; | ||
653 | |||
654 | dma_handle = dma_map_single(dev, addr, size, direction); | ||
655 | if (dma_mapping_error(dev, dma_handle)) { | ||
656 | /* | ||
657 | * reduce current DMA mapping usage, | ||
658 | * delay and try again later or | ||
659 | * reset driver. | ||
660 | */ | ||
661 | } | ||
662 | |||
663 | Networking drivers must call dev_kfree_skb to free the socket buffer | ||
664 | and return NETDEV_TX_OK if the DMA mapping fails on the transmit hook | ||
665 | (ndo_start_xmit). This means that the socket buffer is just dropped in | ||
666 | the failure case. | ||
667 | |||
668 | SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping | ||
669 | fails in the queuecommand hook. This means that the SCSI subsystem | ||
670 | passes the command to the driver again later. | ||
671 | |||
642 | Optimizing Unmap State Space Consumption | 672 | Optimizing Unmap State Space Consumption |
643 | 673 | ||
644 | On many platforms, dma_unmap_{single,page}() is simply a nop. | 674 | On many platforms, dma_unmap_{single,page}() is simply a nop. |
@@ -703,42 +733,25 @@ to "Closing". | |||
703 | 733 | ||
704 | 1) Struct scatterlist requirements. | 734 | 1) Struct scatterlist requirements. |
705 | 735 | ||
706 | Struct scatterlist must contain, at a minimum, the following | 736 | Don't invent the architecture specific struct scatterlist; just use |
707 | members: | 737 | <asm-generic/scatterlist.h>. You need to enable |
708 | 738 | CONFIG_NEED_SG_DMA_LENGTH if the architecture supports IOMMUs | |
709 | struct page *page; | 739 | (including software IOMMU). |
710 | unsigned int offset; | 740 | |
711 | unsigned int length; | 741 | 2) ARCH_KMALLOC_MINALIGN |
712 | 742 | ||
713 | The base address is specified by a "page+offset" pair. | 743 | Architectures must ensure that kmalloc'ed buffer is |
714 | 744 | DMA-safe. Drivers and subsystems depend on it. If an architecture | |
715 | Previous versions of struct scatterlist contained a "void *address" | 745 | isn't fully DMA-coherent (i.e. hardware doesn't ensure that data in |
716 | field that was sometimes used instead of page+offset. As of Linux | 746 | the CPU cache is identical to data in main memory), |
717 | 2.5., page+offset is always used, and the "address" field has been | 747 | ARCH_KMALLOC_MINALIGN must be set so that the memory allocator |
718 | deleted. | 748 | makes sure that kmalloc'ed buffer doesn't share a cache line with |
719 | 749 | the others. See arch/arm/include/asm/cache.h as an example. | |
720 | 2) More to come... | 750 | |
721 | 751 | Note that ARCH_KMALLOC_MINALIGN is about DMA memory alignment | |
722 | Handling Errors | 752 | constraints. You don't need to worry about the architecture data |
723 | 753 | alignment constraints (e.g. the alignment constraints about 64-bit | |
724 | DMA address space is limited on some architectures and an allocation | 754 | objects). |
725 | failure can be determined by: | ||
726 | |||
727 | - checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0 | ||
728 | |||
729 | - checking the returned dma_addr_t of dma_map_single and dma_map_page | ||
730 | by using dma_mapping_error(): | ||
731 | |||
732 | dma_addr_t dma_handle; | ||
733 | |||
734 | dma_handle = dma_map_single(dev, addr, size, direction); | ||
735 | if (dma_mapping_error(dev, dma_handle)) { | ||
736 | /* | ||
737 | * reduce current DMA mapping usage, | ||
738 | * delay and try again later or | ||
739 | * reset driver. | ||
740 | */ | ||
741 | } | ||
742 | 755 | ||
743 | Closing | 756 | Closing |
744 | 757 | ||
diff --git a/Documentation/DocBook/drm.tmpl b/Documentation/DocBook/drm.tmpl index 7583dc7cf64d..910c923a9b86 100644 --- a/Documentation/DocBook/drm.tmpl +++ b/Documentation/DocBook/drm.tmpl | |||
@@ -389,7 +389,7 @@ | |||
389 | </para> | 389 | </para> |
390 | <para> | 390 | <para> |
391 | If your driver supports memory management (it should!), you'll | 391 | If your driver supports memory management (it should!), you'll |
392 | need to set that up at load time as well. How you intialize | 392 | need to set that up at load time as well. How you initialize |
393 | it depends on which memory manager you're using, TTM or GEM. | 393 | it depends on which memory manager you're using, TTM or GEM. |
394 | </para> | 394 | </para> |
395 | <sect3> | 395 | <sect3> |
@@ -399,7 +399,7 @@ | |||
399 | aperture space for graphics devices. TTM supports both UMA devices | 399 | aperture space for graphics devices. TTM supports both UMA devices |
400 | and devices with dedicated video RAM (VRAM), i.e. most discrete | 400 | and devices with dedicated video RAM (VRAM), i.e. most discrete |
401 | graphics devices. If your device has dedicated RAM, supporting | 401 | graphics devices. If your device has dedicated RAM, supporting |
402 | TTM is desireable. TTM also integrates tightly with your | 402 | TTM is desirable. TTM also integrates tightly with your |
403 | driver specific buffer execution function. See the radeon | 403 | driver specific buffer execution function. See the radeon |
404 | driver for examples. | 404 | driver for examples. |
405 | </para> | 405 | </para> |
@@ -443,7 +443,7 @@ | |||
443 | likely eventually calling ttm_bo_global_init and | 443 | likely eventually calling ttm_bo_global_init and |
444 | ttm_bo_global_release, respectively. Also like the previous | 444 | ttm_bo_global_release, respectively. Also like the previous |
445 | object, ttm_global_item_ref is used to create an initial reference | 445 | object, ttm_global_item_ref is used to create an initial reference |
446 | count for the TTM, which will call your initalization function. | 446 | count for the TTM, which will call your initialization function. |
447 | </para> | 447 | </para> |
448 | </sect3> | 448 | </sect3> |
449 | <sect3> | 449 | <sect3> |
@@ -557,7 +557,7 @@ void intel_crt_init(struct drm_device *dev) | |||
557 | CRT connector and encoder combination is created. A device | 557 | CRT connector and encoder combination is created. A device |
558 | specific i2c bus is also created, for fetching EDID data and | 558 | specific i2c bus is also created, for fetching EDID data and |
559 | performing monitor detection. Once the process is complete, | 559 | performing monitor detection. Once the process is complete, |
560 | the new connector is regsitered with sysfs, to make its | 560 | the new connector is registered with sysfs, to make its |
561 | properties available to applications. | 561 | properties available to applications. |
562 | </para> | 562 | </para> |
563 | <sect4> | 563 | <sect4> |
@@ -581,12 +581,12 @@ void intel_crt_init(struct drm_device *dev) | |||
581 | <para> | 581 | <para> |
582 | For each encoder, CRTC and connector, several functions must | 582 | For each encoder, CRTC and connector, several functions must |
583 | be provided, depending on the object type. Encoder objects | 583 | be provided, depending on the object type. Encoder objects |
584 | need should provide a DPMS (basically on/off) function, mode fixup | 584 | need to provide a DPMS (basically on/off) function, mode fixup |
585 | (for converting requested modes into native hardware timings), | 585 | (for converting requested modes into native hardware timings), |
586 | and prepare, set and commit functions for use by the core DRM | 586 | and prepare, set and commit functions for use by the core DRM |
587 | helper functions. Connector helpers need to provide mode fetch and | 587 | helper functions. Connector helpers need to provide mode fetch and |
588 | validity functions as well as an encoder matching function for | 588 | validity functions as well as an encoder matching function for |
589 | returing an ideal encoder for a given connector. The core | 589 | returning an ideal encoder for a given connector. The core |
590 | connector functions include a DPMS callback, (deprecated) | 590 | connector functions include a DPMS callback, (deprecated) |
591 | save/restore routines, detection, mode probing, property handling, | 591 | save/restore routines, detection, mode probing, property handling, |
592 | and cleanup functions. | 592 | and cleanup functions. |
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 133cd6c3f3c1..020ac80d4682 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl | |||
@@ -269,7 +269,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd) | |||
269 | information about the device. | 269 | information about the device. |
270 | </para> | 270 | </para> |
271 | <programlisting> | 271 | <programlisting> |
272 | int __init board_init (void) | 272 | static int __init board_init (void) |
273 | { | 273 | { |
274 | struct nand_chip *this; | 274 | struct nand_chip *this; |
275 | int err = 0; | 275 | int err = 0; |
diff --git a/Documentation/DocBook/v4l/v4l2.xml b/Documentation/DocBook/v4l/v4l2.xml index 9737243377a3..7c3c098d5d08 100644 --- a/Documentation/DocBook/v4l/v4l2.xml +++ b/Documentation/DocBook/v4l/v4l2.xml | |||
@@ -58,7 +58,7 @@ MPEG stream embedded, sliced VBI data format in this specification. | |||
58 | </contrib> | 58 | </contrib> |
59 | <affiliation> | 59 | <affiliation> |
60 | <address> | 60 | <address> |
61 | <email>awalls@radix.net</email> | 61 | <email>awalls@md.metrocast.net</email> |
62 | </address> | 62 | </address> |
63 | </affiliation> | 63 | </affiliation> |
64 | </author> | 64 | </author> |
diff --git a/Documentation/DocBook/v4l/vidioc-query-dv-preset.xml b/Documentation/DocBook/v4l/vidioc-query-dv-preset.xml index 87e4f0f6151c..402229ee06f6 100644 --- a/Documentation/DocBook/v4l/vidioc-query-dv-preset.xml +++ b/Documentation/DocBook/v4l/vidioc-query-dv-preset.xml | |||
@@ -53,8 +53,10 @@ input</refpurpose> | |||
53 | automatically, similar to sensing the video standard. To do so, applications | 53 | automatically, similar to sensing the video standard. To do so, applications |
54 | call <constant> VIDIOC_QUERY_DV_PRESET</constant> with a pointer to a | 54 | call <constant> VIDIOC_QUERY_DV_PRESET</constant> with a pointer to a |
55 | &v4l2-dv-preset; type. Once the hardware detects a preset, that preset is | 55 | &v4l2-dv-preset; type. Once the hardware detects a preset, that preset is |
56 | returned in the preset field of &v4l2-dv-preset;. When detection is not | 56 | returned in the preset field of &v4l2-dv-preset;. If the preset could not be |
57 | possible or fails, the value V4L2_DV_INVALID is returned.</para> | 57 | detected because there was no signal, or the signal was unreliable, or the |
58 | signal did not map to a supported preset, then the value V4L2_DV_INVALID is | ||
59 | returned.</para> | ||
58 | </refsect1> | 60 | </refsect1> |
59 | 61 | ||
60 | <refsect1> | 62 | <refsect1> |
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt index be21001ab144..26d3d945c3c2 100644 --- a/Documentation/PCI/pcieaer-howto.txt +++ b/Documentation/PCI/pcieaer-howto.txt | |||
@@ -13,7 +13,7 @@ Reporting (AER) driver and provides information on how to use it, as | |||
13 | well as how to enable the drivers of endpoint devices to conform with | 13 | well as how to enable the drivers of endpoint devices to conform with |
14 | PCI Express AER driver. | 14 | PCI Express AER driver. |
15 | 15 | ||
16 | 1.2 Copyright © Intel Corporation 2006. | 16 | 1.2 Copyright (C) Intel Corporation 2006. |
17 | 17 | ||
18 | 1.3 What is the PCI Express AER Driver? | 18 | 1.3 What is the PCI Express AER Driver? |
19 | 19 | ||
@@ -71,15 +71,11 @@ console. If it's a correctable error, it is outputed as a warning. | |||
71 | Otherwise, it is printed as an error. So users could choose different | 71 | Otherwise, it is printed as an error. So users could choose different |
72 | log level to filter out correctable error messages. | 72 | log level to filter out correctable error messages. |
73 | 73 | ||
74 | Below shows an example. | 74 | Below shows an example: |
75 | +------ PCI-Express Device Error -----+ | 75 | 0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID) |
76 | Error Severity : Uncorrected (Fatal) | 76 | 0000:50:00.0: device [8086:0329] error status/mask=00100000/00000000 |
77 | PCIE Bus Error type : Transaction Layer | 77 | 0000:50:00.0: [20] Unsupported Request (First) |
78 | Unsupported Request : First | 78 | 0000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100 |
79 | Requester ID : 0500 | ||
80 | VendorID=8086h, DeviceID=0329h, Bus=05h, Device=00h, Function=00h | ||
81 | TLB Header: | ||
82 | 04000001 00200a03 05010000 00050100 | ||
83 | 79 | ||
84 | In the example, 'Requester ID' means the ID of the device who sends | 80 | In the example, 'Requester ID' means the ID of the device who sends |
85 | the error message to root port. Pls. refer to pci express specs for | 81 | the error message to root port. Pls. refer to pci express specs for |
@@ -112,7 +108,7 @@ but the PCI Express link itself is fully functional. Fatal errors, on | |||
112 | the other hand, cause the link to be unreliable. | 108 | the other hand, cause the link to be unreliable. |
113 | 109 | ||
114 | When AER is enabled, a PCI Express device will automatically send an | 110 | When AER is enabled, a PCI Express device will automatically send an |
115 | error message to the PCIE root port above it when the device captures | 111 | error message to the PCIe root port above it when the device captures |
116 | an error. The Root Port, upon receiving an error reporting message, | 112 | an error. The Root Port, upon receiving an error reporting message, |
117 | internally processes and logs the error message in its PCI Express | 113 | internally processes and logs the error message in its PCI Express |
118 | capability structure. Error information being logged includes storing | 114 | capability structure. Error information being logged includes storing |
@@ -198,8 +194,9 @@ to reset link, AER port service driver is required to provide the | |||
198 | function to reset link. Firstly, kernel looks for if the upstream | 194 | function to reset link. Firstly, kernel looks for if the upstream |
199 | component has an aer driver. If it has, kernel uses the reset_link | 195 | component has an aer driver. If it has, kernel uses the reset_link |
200 | callback of the aer driver. If the upstream component has no aer driver | 196 | callback of the aer driver. If the upstream component has no aer driver |
201 | and the port is downstream port, we will use the aer driver of the | 197 | and the port is downstream port, we will perform a hot reset as the |
202 | root port who reports the AER error. As for upstream ports, | 198 | default by setting the Secondary Bus Reset bit of the Bridge Control |
199 | register associated with the downstream port. As for upstream ports, | ||
203 | they should provide their own aer service drivers with reset_link | 200 | they should provide their own aer service drivers with reset_link |
204 | function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and | 201 | function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and |
205 | reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes | 202 | reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes |
@@ -253,11 +250,11 @@ cleanup uncorrectable status register. Pls. refer to section 3.3. | |||
253 | 250 | ||
254 | 4. Software error injection | 251 | 4. Software error injection |
255 | 252 | ||
256 | Debugging PCIE AER error recovery code is quite difficult because it | 253 | Debugging PCIe AER error recovery code is quite difficult because it |
257 | is hard to trigger real hardware errors. Software based error | 254 | is hard to trigger real hardware errors. Software based error |
258 | injection can be used to fake various kinds of PCIE errors. | 255 | injection can be used to fake various kinds of PCIe errors. |
259 | 256 | ||
260 | First you should enable PCIE AER software error injection in kernel | 257 | First you should enable PCIe AER software error injection in kernel |
261 | configuration, that is, following item should be in your .config. | 258 | configuration, that is, following item should be in your .config. |
262 | 259 | ||
263 | CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m | 260 | CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m |
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index 8916ca48bc95..da0382daa395 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist | |||
@@ -18,6 +18,8 @@ kernel patches. | |||
18 | 18 | ||
19 | 2b: Passes allnoconfig, allmodconfig | 19 | 2b: Passes allnoconfig, allmodconfig |
20 | 20 | ||
21 | 2c: Builds successfully when using O=builddir | ||
22 | |||
21 | 3: Builds on multiple CPU architectures by using local cross-compile tools | 23 | 3: Builds on multiple CPU architectures by using local cross-compile tools |
22 | or some other build farm. | 24 | or some other build farm. |
23 | 25 | ||
@@ -95,3 +97,13 @@ kernel patches. | |||
95 | 97 | ||
96 | 25: If any ioctl's are added by the patch, then also update | 98 | 25: If any ioctl's are added by the patch, then also update |
97 | Documentation/ioctl/ioctl-number.txt. | 99 | Documentation/ioctl/ioctl-number.txt. |
100 | |||
101 | 26: If your modified source code depends on or uses any of the kernel | ||
102 | APIs or features that are related to the following kconfig symbols, | ||
103 | then test multiple builds with the related kconfig symbols disabled | ||
104 | and/or =m (if that option is available) [not all of these at the | ||
105 | same time, just various/random combinations of them]: | ||
106 | |||
107 | CONFIG_SMP, CONFIG_SYSFS, CONFIG_PROC_FS, CONFIG_INPUT, CONFIG_PCI, | ||
108 | CONFIG_BLOCK, CONFIG_PM, CONFIG_HOTPLUG, CONFIG_MAGIC_SYSRQ, | ||
109 | CONFIG_NET, CONFIG_INET=n (but latter with CONFIG_NET=y) | ||
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers index 99e72a81fa2f..4947fd8fb182 100644 --- a/Documentation/SubmittingDrivers +++ b/Documentation/SubmittingDrivers | |||
@@ -130,6 +130,8 @@ Linux kernel master tree: | |||
130 | ftp.??.kernel.org:/pub/linux/kernel/... | 130 | ftp.??.kernel.org:/pub/linux/kernel/... |
131 | ?? == your country code, such as "us", "uk", "fr", etc. | 131 | ?? == your country code, such as "us", "uk", "fr", etc. |
132 | 132 | ||
133 | http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git | ||
134 | |||
133 | Linux kernel mailing list: | 135 | Linux kernel mailing list: |
134 | linux-kernel@vger.kernel.org | 136 | linux-kernel@vger.kernel.org |
135 | [mail majordomo@vger.kernel.org to subscribe] | 137 | [mail majordomo@vger.kernel.org to subscribe] |
@@ -160,3 +162,6 @@ How to NOT write kernel driver by Arjan van de Ven: | |||
160 | 162 | ||
161 | Kernel Janitor: | 163 | Kernel Janitor: |
162 | http://janitor.kernelnewbies.org/ | 164 | http://janitor.kernelnewbies.org/ |
165 | |||
166 | GIT, Fast Version Control System: | ||
167 | http://git-scm.com/ | ||
diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt new file mode 100644 index 000000000000..dfab71848dc8 --- /dev/null +++ b/Documentation/acpi/apei/einj.txt | |||
@@ -0,0 +1,59 @@ | |||
1 | APEI Error INJection | ||
2 | ~~~~~~~~~~~~~~~~~~~~ | ||
3 | |||
4 | EINJ provides a hardware error injection mechanism. | ||
5 | It is very useful for debugging and testing of other APEI and RAS features. | ||
6 | |||
7 | To use EINJ, make sure the following are enabled in your kernel | ||
8 | configuration: | ||
9 | |||
10 | CONFIG_DEBUG_FS | ||
11 | CONFIG_ACPI_APEI | ||
12 | CONFIG_ACPI_APEI_EINJ | ||
13 | |||
14 | The user interface of EINJ is the debug file system, under the | ||
15 | directory apei/einj. The following files are provided. | ||
16 | |||
17 | - available_error_type | ||
18 | Reading this file returns the error injection capability of the | ||
19 | platform, that is, which error types are supported. The error type | ||
20 | definition is as follows: the left field is the error type value, the | ||
21 | right field is error description. | ||
22 | |||
23 | 0x00000001 Processor Correctable | ||
24 | 0x00000002 Processor Uncorrectable non-fatal | ||
25 | 0x00000004 Processor Uncorrectable fatal | ||
26 | 0x00000008 Memory Correctable | ||
27 | 0x00000010 Memory Uncorrectable non-fatal | ||
28 | 0x00000020 Memory Uncorrectable fatal | ||
29 | 0x00000040 PCI Express Correctable | ||
30 | 0x00000080 PCI Express Uncorrectable fatal | ||
31 | 0x00000100 PCI Express Uncorrectable non-fatal | ||
32 | 0x00000200 Platform Correctable | ||
33 | 0x00000400 Platform Uncorrectable non-fatal | ||
34 | 0x00000800 Platform Uncorrectable fatal | ||
35 | |||
36 | The format of the file contents is as above, except there are only the | ||
37 | available error type lines. | ||
38 | |||
39 | - error_type | ||
40 | This file is used to set the error type value. The error type value | ||
41 | is defined in "available_error_type" description. | ||
42 | |||
43 | - error_inject | ||
44 | Write any integer to this file to trigger the error | ||
45 | injection. Before this, please specify all necessary error | ||
46 | parameters. | ||
47 | |||
48 | - param1 | ||
49 | This file is used to set the first error parameter value. Effect of | ||
50 | parameter depends on error_type specified. For memory error, this is | ||
51 | physical memory address. | ||
52 | |||
53 | - param2 | ||
54 | This file is used to set the second error parameter value. Effect of | ||
55 | parameter depends on error_type specified. For memory error, this is | ||
56 | physical memory address mask. | ||
57 | |||
58 | For more information about EINJ, please refer to ACPI specification | ||
59 | version 4.0, section 17.5. | ||
diff --git a/Documentation/arm/Samsung-S3C24XX/GPIO.txt b/Documentation/arm/Samsung-S3C24XX/GPIO.txt index 2af2cf39915f..816d6071669e 100644 --- a/Documentation/arm/Samsung-S3C24XX/GPIO.txt +++ b/Documentation/arm/Samsung-S3C24XX/GPIO.txt | |||
@@ -12,6 +12,8 @@ Introduction | |||
12 | of the s3c2410 GPIO system, please read the Samsung provided | 12 | of the s3c2410 GPIO system, please read the Samsung provided |
13 | data-sheet/users manual to find out the complete list. | 13 | data-sheet/users manual to find out the complete list. |
14 | 14 | ||
15 | See Documentation/arm/Samsung/GPIO.txt for the core implementation. | ||
16 | |||
15 | 17 | ||
16 | GPIOLIB | 18 | GPIOLIB |
17 | ------- | 19 | ------- |
@@ -24,8 +26,60 @@ GPIOLIB | |||
24 | listed below will be removed (they may be marked as __deprecated | 26 | listed below will be removed (they may be marked as __deprecated |
25 | in the near future). | 27 | in the near future). |
26 | 28 | ||
27 | - s3c2410_gpio_getpin | 29 | The following functions now either have a s3c_ specific variant |
28 | - s3c2410_gpio_setpin | 30 | or are merged into gpiolib. See the definitions in |
31 | arch/arm/plat-samsung/include/plat/gpio-cfg.h: | ||
32 | |||
33 | s3c2410_gpio_setpin() gpio_set_value() or gpio_direction_output() | ||
34 | s3c2410_gpio_getpin() gpio_get_value() or gpio_direction_input() | ||
35 | s3c2410_gpio_getirq() gpio_to_irq() | ||
36 | s3c2410_gpio_cfgpin() s3c_gpio_cfgpin() | ||
37 | s3c2410_gpio_getcfg() s3c_gpio_getcfg() | ||
38 | s3c2410_gpio_pullup() s3c_gpio_setpull() | ||
39 | |||
40 | |||
41 | GPIOLIB conversion | ||
42 | ------------------ | ||
43 | |||
44 | If you need to convert your board or driver to use gpiolib from the existing | ||
45 | s3c2410 api, then here are some notes on the process. | ||
46 | |||
47 | 1) If your board is exclusively using a GPIO, say to control peripheral | ||
48 | power, then it will require to claim the gpio with gpio_request() before | ||
49 | it can use it. | ||
50 | |||
51 | It is recommended to check the return value, with at least WARN_ON() | ||
52 | during initialisation. | ||
53 | |||
54 | 2) The s3c2410_gpio_cfgpin() can be directly replaced with s3c_gpio_cfgpin() | ||
55 | as they have the same arguments, and can either take the pin specific | ||
56 | values, or the more generic special-function-number arguments. | ||
57 | |||
58 | 3) s3c2410_gpio_pullup() changes have the problem that whilst the | ||
59 | s3c2410_gpio_pullup(x, 1) can be easily translated to the | ||
60 | s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0) | ||
61 | are not so easy. | ||
62 | |||
63 | The s3c2410_gpio_pullup(x, 0) case enables the pull-up (or in the case | ||
64 | of some of the devices, a pull-down) and as such the new API distinguishes | ||
65 | between the UP and DOWN case. There is currently no 'just turn on' setting | ||
66 | which may be required if this becomes a problem. | ||
67 | |||
68 | 4) s3c2410_gpio_setpin() can be replaced by gpio_set_value(), the old call | ||
69 | does not implicitly configure the relevant gpio to output. The gpio | ||
70 | direction should be changed before using gpio_set_value(). | ||
71 | |||
72 | 5) s3c2410_gpio_getpin() is replaceable by gpio_get_value() if the pin | ||
73 | has been set to input. It is currently unknown what the behaviour is | ||
74 | when using gpio_get_value() on an output pin (s3c2410_gpio_getpin | ||
75 | would return the value the pin is supposed to be outputting). | ||
76 | |||
77 | 6) s3c2410_gpio_getirq() should be directly replaceable with the | ||
78 | gpio_to_irq() call. | ||
79 | |||
80 | The s3c2410_gpio and gpio_ calls have always operated on the same gpio | ||
81 | numberspace, so there is no problem with converting the gpio numbering | ||
82 | between the calls. | ||
29 | 83 | ||
30 | 84 | ||
31 | Headers | 85 | Headers |
@@ -54,6 +108,11 @@ PIN Numbers | |||
54 | eg S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell | 108 | eg S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell |
55 | the GPIO functions which pin is to be used. | 109 | the GPIO functions which pin is to be used. |
56 | 110 | ||
111 | With the conversion to gpiolib, there is no longer a direct conversion | ||
112 | from gpio pin number to register base address as in earlier kernels. This | ||
113 | is due to the number space required for newer SoCs where the later | ||
114 | GPIOs are not contiguous. | ||
115 | |||
57 | 116 | ||
58 | Configuring a pin | 117 | Configuring a pin |
59 | ----------------- | 118 | ----------------- |
@@ -71,6 +130,8 @@ Configuring a pin | |||
71 | which would turn GPA(0) into the lowest Address line A0, and set | 130 | which would turn GPA(0) into the lowest Address line A0, and set |
72 | GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line. | 131 | GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line. |
73 | 132 | ||
133 | The s3c_gpio_cfgpin() call is a functional replacement for this call. | ||
134 | |||
74 | 135 | ||
75 | Reading the current configuration | 136 | Reading the current configuration |
76 | --------------------------------- | 137 | --------------------------------- |
@@ -82,6 +143,9 @@ Reading the current configuration | |||
82 | The return value will be from the same set of values which can be | 143 | The return value will be from the same set of values which can be |
83 | passed to s3c2410_gpio_cfgpin(). | 144 | passed to s3c2410_gpio_cfgpin(). |
84 | 145 | ||
146 | The s3c_gpio_getcfg() call should be a functional replacement for | ||
147 | this call. | ||
148 | |||
85 | 149 | ||
86 | Configuring a pull-up resistor | 150 | Configuring a pull-up resistor |
87 | ------------------------------ | 151 | ------------------------------ |
@@ -95,6 +159,10 @@ Configuring a pull-up resistor | |||
95 | Where the to value is zero to set the pull-up off, and 1 to enable | 159 | Where the to value is zero to set the pull-up off, and 1 to enable |
96 | the specified pull-up. Any other values are currently undefined. | 160 | the specified pull-up. Any other values are currently undefined. |
97 | 161 | ||
162 | The s3c_gpio_setpull() offers similar functionality, but with the | ||
163 | ability to encode whether the pull is up or down. Currently there | ||
164 | is no 'just on' state, so up or down must be selected. | ||
165 | |||
98 | 166 | ||
99 | Getting the state of a PIN | 167 | Getting the state of a PIN |
100 | -------------------------- | 168 | -------------------------- |
@@ -106,6 +174,9 @@ Getting the state of a PIN | |||
106 | This will return either zero or non-zero. Do not count on this | 174 | This will return either zero or non-zero. Do not count on this |
107 | function returning 1 if the pin is set. | 175 | function returning 1 if the pin is set. |
108 | 176 | ||
177 | This call is now implemented by the relevant gpiolib calls, convert | ||
178 | your board or driver to use gpiolib. | ||
179 | |||
109 | 180 | ||
110 | Setting the state of a PIN | 181 | Setting the state of a PIN |
111 | -------------------------- | 182 | -------------------------- |
@@ -117,6 +188,9 @@ Setting the state of a PIN | |||
117 | Which sets the given pin to the value. Use 0 to write 0, and 1 to | 188 | Which sets the given pin to the value. Use 0 to write 0, and 1 to |
118 | set the output to 1. | 189 | set the output to 1. |
119 | 190 | ||
191 | This call is now implemented by the relevant gpiolib calls, convert | ||
192 | your board or driver to use gpiolib. | ||
193 | |||
120 | 194 | ||
121 | Getting the IRQ number associated with a PIN | 195 | Getting the IRQ number associated with a PIN |
122 | -------------------------------------------- | 196 | -------------------------------------------- |
@@ -128,6 +202,9 @@ Getting the IRQ number associated with a PIN | |||
128 | 202 | ||
129 | Note, not all pins have an IRQ. | 203 | Note, not all pins have an IRQ. |
130 | 204 | ||
205 | This call is now implemented by the relevant gpiolib calls, convert | ||
206 | your board or driver to use gpiolib. | ||
207 | |||
131 | 208 | ||
132 | Authour | 209 | Authour |
133 | ------- | 210 | ------- |
diff --git a/Documentation/arm/Samsung-S3C24XX/Overview.txt b/Documentation/arm/Samsung-S3C24XX/Overview.txt index 081892df4fda..c12bfc1a00c9 100644 --- a/Documentation/arm/Samsung-S3C24XX/Overview.txt +++ b/Documentation/arm/Samsung-S3C24XX/Overview.txt | |||
@@ -8,10 +8,16 @@ Introduction | |||
8 | 8 | ||
9 | The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported | 9 | The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported |
10 | by the 's3c2410' architecture of ARM Linux. Currently the S3C2410, | 10 | by the 's3c2410' architecture of ARM Linux. Currently the S3C2410, |
11 | S3C2412, S3C2413, S3C2440, S3C2442 and S3C2443 devices are supported. | 11 | S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443 and S3C2450 devices |
12 | are supported. | ||
12 | 13 | ||
13 | Support for the S3C2400 and S3C24A0 series are in progress. | 14 | Support for the S3C2400 and S3C24A0 series are in progress. |
14 | 15 | ||
16 | The S3C2416 and S3C2450 devices are very similar and S3C2450 support is | ||
17 | included under the arch/arm/mach-s3c2416 directory. Note, whilst core | ||
18 | support for these SoCs is in, work on some of the extra peripherals | ||
19 | and extra interrupts is still ongoing. | ||
20 | |||
15 | 21 | ||
16 | Configuration | 22 | Configuration |
17 | ------------- | 23 | ------------- |
@@ -209,6 +215,13 @@ GPIO | |||
209 | Newer kernels carry GPIOLIB, and support is being moved towards | 215 | Newer kernels carry GPIOLIB, and support is being moved towards |
210 | this with some of the older support in line to be removed. | 216 | this with some of the older support in line to be removed. |
211 | 217 | ||
218 | As of v2.6.34, the move towards using gpiolib support is almost | ||
219 | complete, and very few of the old calls are left. | ||
220 | |||
221 | See Documentation/arm/Samsung-S3C24XX/GPIO.txt for the S3C24XX specific | ||
222 | support and Documentation/arm/Samsung/GPIO.txt for the core Samsung | ||
223 | implementation. | ||
224 | |||
212 | 225 | ||
213 | Clock Management | 226 | Clock Management |
214 | ---------------- | 227 | ---------------- |
diff --git a/Documentation/arm/Samsung/GPIO.txt b/Documentation/arm/Samsung/GPIO.txt new file mode 100644 index 000000000000..05850c62abeb --- /dev/null +++ b/Documentation/arm/Samsung/GPIO.txt | |||
@@ -0,0 +1,42 @@ | |||
1 | Samsung GPIO implementation | ||
2 | =========================== | ||
3 | |||
4 | Introduction | ||
5 | ------------ | ||
6 | |||
7 | This outlines the Samsung GPIO implementation and the architecture | ||
8 | specific calls provided alongside the drivers/gpio core. | ||
9 | |||
10 | |||
11 | S3C24XX (Legacy) | ||
12 | ---------------- | ||
13 | |||
14 | See Documentation/arm/Samsung-S3C24XX/GPIO.txt for more information | ||
15 | about these devices. Their implementation is being brought into line | ||
16 | with the core samsung implementation described in this document. | ||
17 | |||
18 | |||
19 | GPIOLIB integration | ||
20 | ------------------- | ||
21 | |||
22 | The gpio implementation uses gpiolib as much as possible, only providing | ||
23 | specific calls for the items that require Samsung specific handling, such | ||
24 | as pin special-function or pull resistor control. | ||
25 | |||
26 | GPIO numbering is synchronised between the Samsung and gpiolib system. | ||
27 | |||
28 | |||
29 | PIN configuration | ||
30 | ----------------- | ||
31 | |||
32 | Pin configuration is specific to the Samsung architecture, with each SoC | ||
33 | registering the necessary information for the core gpio configuration | ||
34 | implementation to configure pins as necessary. | ||
35 | |||
36 | The s3c_gpio_cfgpin() and s3c_gpio_setpull() provide the means for a | ||
37 | driver or machine to change gpio configuration. | ||
38 | |||
39 | See arch/arm/plat-samsung/include/plat/gpio-cfg.h for more information | ||
40 | on these functions. | ||
41 | |||
42 | |||
diff --git a/Documentation/arm/Samsung/Overview.txt b/Documentation/arm/Samsung/Overview.txt index 7cced1fea9c3..c3094ea51aa7 100644 --- a/Documentation/arm/Samsung/Overview.txt +++ b/Documentation/arm/Samsung/Overview.txt | |||
@@ -13,9 +13,10 @@ Introduction | |||
13 | 13 | ||
14 | - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list | 14 | - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list |
15 | - S3C64XX: S3C6400 and S3C6410 | 15 | - S3C64XX: S3C6400 and S3C6410 |
16 | - S5PC6440 | 16 | - S5P6440 |
17 | 17 | - S5P6442 | |
18 | S5PC100 and S5PC110 support is currently being merged | 18 | - S5PC100 |
19 | - S5PC110 / S5PV210 | ||
19 | 20 | ||
20 | 21 | ||
21 | S3C24XX Systems | 22 | S3C24XX Systems |
@@ -35,7 +36,10 @@ Configuration | |||
35 | unifying all the SoCs into one kernel. | 36 | unifying all the SoCs into one kernel. |
36 | 37 | ||
37 | s5p6440_defconfig - S5P6440 specific default configuration | 38 | s5p6440_defconfig - S5P6440 specific default configuration |
39 | s5p6442_defconfig - S5P6442 specific default configuration | ||
38 | s5pc100_defconfig - S5PC100 specific default configuration | 40 | s5pc100_defconfig - S5PC100 specific default configuration |
41 | s5pc110_defconfig - S5PC110 specific default configuration | ||
42 | s5pv210_defconfig - S5PV210 specific default configuration | ||
39 | 43 | ||
40 | 44 | ||
41 | Layout | 45 | Layout |
@@ -50,18 +54,27 @@ Layout | |||
50 | specific information. It contains the base clock, GPIO and device definitions | 54 | specific information. It contains the base clock, GPIO and device definitions |
51 | to get the system running. | 55 | to get the system running. |
52 | 56 | ||
53 | plat-s3c is the s3c24xx/s3c64xx platform directory, although it is currently | ||
54 | involved in other builds this will be phased out once the relevant code is | ||
55 | moved elsewhere. | ||
56 | |||
57 | plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs. | 57 | plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs. |
58 | 58 | ||
59 | plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs. | 59 | plat-s5p is for s5p specific builds, and contains common support for the |
60 | S5P specific systems. Not all S5Ps use all the features in this directory | ||
61 | due to differences in the hardware. | ||
62 | |||
63 | |||
64 | Layout changes | ||
65 | -------------- | ||
66 | |||
67 | The old plat-s3c and plat-s5pc1xx directories have been removed, with | ||
68 | support moved to either plat-samsung or plat-s5p as necessary. These moves | ||
69 | were to simplify the include and dependency issues involved with having | ||
70 | so many different platform directories. | ||
60 | 71 | ||
61 | plat-s5p is for s5p specific builds, more to be added. | 72 | It was decided to remove plat-s5pc1xx as some of the support was already |
73 | in plat-s5p or plat-samsung, with the S5PC110 support added with S5PV210 | ||
74 | the only user was the S5PC100. The S5PC100 specific items were moved to | ||
75 | arch/arm/mach-s5pc100. | ||
62 | 76 | ||
63 | 77 | ||
64 | [ to finish ] | ||
65 | 78 | ||
66 | 79 | ||
67 | Port Contributors | 80 | Port Contributors |
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 630879cd9a42..48e0b21b0059 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt | |||
@@ -17,6 +17,9 @@ HOWTO | |||
17 | You can do a very simple testing of running two dd threads in two different | 17 | You can do a very simple testing of running two dd threads in two different |
18 | cgroups. Here is what you can do. | 18 | cgroups. Here is what you can do. |
19 | 19 | ||
20 | - Enable Block IO controller | ||
21 | CONFIG_BLK_CGROUP=y | ||
22 | |||
20 | - Enable group scheduling in CFQ | 23 | - Enable group scheduling in CFQ |
21 | CONFIG_CFQ_GROUP_IOSCHED=y | 24 | CONFIG_CFQ_GROUP_IOSCHED=y |
22 | 25 | ||
@@ -54,32 +57,52 @@ cgroups. Here is what you can do. | |||
54 | 57 | ||
55 | Various user visible config options | 58 | Various user visible config options |
56 | =================================== | 59 | =================================== |
57 | CONFIG_CFQ_GROUP_IOSCHED | ||
58 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
59 | creation is allowed. | ||
60 | |||
61 | CONFIG_DEBUG_CFQ_IOSCHED | ||
62 | - Enables some debugging messages in blktrace. Also creates extra | ||
63 | cgroup file blkio.dequeue. | ||
64 | |||
65 | Config options selected automatically | ||
66 | ===================================== | ||
67 | These config options are not user visible and are selected/deselected | ||
68 | automatically based on IO scheduler configuration. | ||
69 | |||
70 | CONFIG_BLK_CGROUP | 60 | CONFIG_BLK_CGROUP |
71 | - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. | 61 | - Block IO controller. |
72 | 62 | ||
73 | CONFIG_DEBUG_BLK_CGROUP | 63 | CONFIG_DEBUG_BLK_CGROUP |
74 | - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. | 64 | - Debug help. Right now some additional stats files show up in cgroup |
65 | if this option is enabled. | ||
66 | |||
67 | CONFIG_CFQ_GROUP_IOSCHED | ||
68 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
69 | creation is allowed. | ||
75 | 70 | ||
76 | Details of cgroup files | 71 | Details of cgroup files |
77 | ======================= | 72 | ======================= |
78 | - blkio.weight | 73 | - blkio.weight |
79 | - Specifies per cgroup weight. | 74 | - Specifies per cgroup weight. This is default weight of the group |
80 | 75 | on all the devices until and unless overridden by per device rule. | |
76 | (See blkio.weight_device). | ||
81 | Currently allowed range of weights is from 100 to 1000. | 77 | Currently allowed range of weights is from 100 to 1000. |
82 | 78 | ||
79 | - blkio.weight_device | ||
80 | - One can specify per cgroup per device rules using this interface. | ||
81 | These rules override the default value of group weight as specified | ||
82 | by blkio.weight. | ||
83 | |||
84 | Following is the format. | ||
85 | |||
86 | #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device | ||
87 | Configure weight=300 on /dev/sdb (8:16) in this cgroup | ||
88 | # echo 8:16 300 > blkio.weight_device | ||
89 | # cat blkio.weight_device | ||
90 | dev weight | ||
91 | 8:16 300 | ||
92 | |||
93 | Configure weight=500 on /dev/sda (8:0) in this cgroup | ||
94 | # echo 8:0 500 > blkio.weight_device | ||
95 | # cat blkio.weight_device | ||
96 | dev weight | ||
97 | 8:0 500 | ||
98 | 8:16 300 | ||
99 | |||
100 | Remove specific weight for /dev/sda in this cgroup | ||
101 | # echo 8:0 0 > blkio.weight_device | ||
102 | # cat blkio.weight_device | ||
103 | dev weight | ||
104 | 8:16 300 | ||
105 | |||
83 | - blkio.time | 106 | - blkio.time |
84 | - disk time allocated to cgroup per device in milliseconds. First | 107 | - disk time allocated to cgroup per device in milliseconds. First |
85 | two fields specify the major and minor number of the device and | 108 | two fields specify the major and minor number of the device and |
@@ -92,13 +115,105 @@ Details of cgroup files | |||
92 | third field specifies the number of sectors transferred by the | 115 | third field specifies the number of sectors transferred by the |
93 | group to/from the device. | 116 | group to/from the device. |
94 | 117 | ||
118 | - blkio.io_service_bytes | ||
119 | - Number of bytes transferred to/from the disk by the group. These | ||
120 | are further divided by the type of operation - read or write, sync | ||
121 | or async. First two fields specify the major and minor number of the | ||
122 | device, third field specifies the operation type and the fourth field | ||
123 | specifies the number of bytes. | ||
124 | |||
125 | - blkio.io_serviced | ||
126 | - Number of IOs completed to/from the disk by the group. These | ||
127 | are further divided by the type of operation - read or write, sync | ||
128 | or async. First two fields specify the major and minor number of the | ||
129 | device, third field specifies the operation type and the fourth field | ||
130 | specifies the number of IOs. | ||
131 | |||
132 | - blkio.io_service_time | ||
133 | - Total amount of time between request dispatch and request completion | ||
134 | for the IOs done by this cgroup. This is in nanoseconds to make it | ||
135 | meaningful for flash devices too. For devices with queue depth of 1, | ||
136 | this time represents the actual service time. When queue_depth > 1, | ||
137 | that is no longer true as requests may be served out of order. This | ||
138 | may cause the service time for a given IO to include the service time | ||
139 | of multiple IOs when served out of order which may result in total | ||
140 | io_service_time > actual time elapsed. This time is further divided by | ||
141 | the type of operation - read or write, sync or async. First two fields | ||
142 | specify the major and minor number of the device, third field | ||
143 | specifies the operation type and the fourth field specifies the | ||
144 | io_service_time in ns. | ||
145 | |||
146 | - blkio.io_wait_time | ||
147 | - Total amount of time the IOs for this cgroup spent waiting in the | ||
148 | scheduler queues for service. This can be greater than the total time | ||
149 | elapsed since it is cumulative io_wait_time for all IOs. It is not a | ||
150 | measure of total time the cgroup spent waiting but rather a measure of | ||
151 | the wait_time for its individual IOs. For devices with queue_depth > 1 | ||
152 | this metric does not include the time spent waiting for service once | ||
153 | the IO is dispatched to the device but till it actually gets serviced | ||
154 | (there might be a time lag here due to re-ordering of requests by the | ||
155 | device). This is in nanoseconds to make it meaningful for flash | ||
156 | devices too. This time is further divided by the type of operation - | ||
157 | read or write, sync or async. First two fields specify the major and | ||
158 | minor number of the device, third field specifies the operation type | ||
159 | and the fourth field specifies the io_wait_time in ns. | ||
160 | |||
161 | - blkio.io_merged | ||
162 | - Total number of bios/requests merged into requests belonging to this | ||
163 | cgroup. This is further divided by the type of operation - read or | ||
164 | write, sync or async. | ||
165 | |||
166 | - blkio.io_queued | ||
167 | - Total number of requests queued up at any given instant for this | ||
168 | cgroup. This is further divided by the type of operation - read or | ||
169 | write, sync or async. | ||
170 | |||
171 | - blkio.avg_queue_size | ||
172 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
173 | The average queue size for this cgroup over the entire time of this | ||
174 | cgroup's existence. Queue size samples are taken each time one of the | ||
175 | queues of this cgroup gets a timeslice. | ||
176 | |||
177 | - blkio.group_wait_time | ||
178 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
179 | This is the amount of time the cgroup had to wait since it became busy | ||
180 | (i.e., went from 0 to 1 request queued) to get a timeslice for one of | ||
181 | its queues. This is different from the io_wait_time which is the | ||
182 | cumulative total of the amount of time spent by each IO in that cgroup | ||
183 | waiting in the scheduler queue. This is in nanoseconds. If this is | ||
184 | read when the cgroup is in a waiting (for timeslice) state, the stat | ||
185 | will only report the group_wait_time accumulated till the last time it | ||
186 | got a timeslice and will not include the current delta. | ||
187 | |||
188 | - blkio.empty_time | ||
189 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
190 | This is the amount of time a cgroup spends without any pending | ||
191 | requests when not being served, i.e., it does not include any time | ||
192 | spent idling for one of the queues of the cgroup. This is in | ||
193 | nanoseconds. If this is read when the cgroup is in an empty state, | ||
194 | the stat will only report the empty_time accumulated till the last | ||
195 | time it had a pending request and will not include the current delta. | ||
196 | |||
197 | - blkio.idle_time | ||
198 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
199 | This is the amount of time spent by the IO scheduler idling for a | ||
200 | given cgroup in anticipation of a better request than the existing ones | ||
201 | from other queues/cgroups. This is in nanoseconds. If this is read | ||
202 | when the cgroup is in an idling state, the stat will only report the | ||
203 | idle_time accumulated till the last idle period and will not include | ||
204 | the current delta. | ||
205 | |||
95 | - blkio.dequeue | 206 | - blkio.dequeue |
96 | - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This | 207 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This |
97 | gives the statistics about how many times a group was dequeued | 208 | gives the statistics about how many times a group was dequeued |
98 | from service tree of the device. First two fields specify the major | 209 | from service tree of the device. First two fields specify the major |
99 | and minor number of the device and third field specifies the number | 210 | and minor number of the device and third field specifies the number |
100 | of times a group was dequeued from a particular device. | 211 | of times a group was dequeued from a particular device. |
101 | 212 | ||
213 | - blkio.reset_stats | ||
214 | - Writing an int to this file will result in resetting all the stats | ||
215 | for that cgroup. | ||
216 | |||
102 | CFQ sysfs tunable | 217 | CFQ sysfs tunable |
103 | ================= | 218 | ================= |
104 | /sys/block/<disk>/queue/iosched/group_isolation | 219 | /sys/block/<disk>/queue/iosched/group_isolation |
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 57444c2609fc..b34823ff1646 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
@@ -339,7 +339,7 @@ To mount a cgroup hierarchy with all available subsystems, type: | |||
339 | The "xxx" is not interpreted by the cgroup code, but will appear in | 339 | The "xxx" is not interpreted by the cgroup code, but will appear in |
340 | /proc/mounts so may be any useful identifying string that you like. | 340 | /proc/mounts so may be any useful identifying string that you like. |
341 | 341 | ||
342 | To mount a cgroup hierarchy with just the cpuset and numtasks | 342 | To mount a cgroup hierarchy with just the cpuset and memory |
343 | subsystems, type: | 343 | subsystems, type: |
344 | # mount -t cgroup -o cpuset,memory hier1 /dev/cgroup | 344 | # mount -t cgroup -o cpuset,memory hier1 /dev/cgroup |
345 | 345 | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6cab1f29da4c..7781857dc940 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -1,18 +1,15 @@ | |||
1 | Memory Resource Controller | 1 | Memory Resource Controller |
2 | 2 | ||
3 | NOTE: The Memory Resource Controller has generically been referred | 3 | NOTE: The Memory Resource Controller has generically been referred |
4 | to as the memory controller in this document. Do not confuse memory controller | 4 | to as the memory controller in this document. Do not confuse memory |
5 | used here with the memory controller that is used in hardware. | 5 | controller used here with the memory controller that is used in hardware. |
6 | 6 | ||
7 | Salient features | 7 | (For editors) |
8 | 8 | In this document: | |
9 | a. Enable control of Anonymous, Page Cache (mapped and unmapped) and | 9 | When we mention a cgroup (cgroupfs's directory) with memory controller, |
10 | Swap Cache memory pages. | 10 | we call it "memory cgroup". When you see git-log and source code, you'll |
11 | b. The infrastructure allows easy addition of other types of memory to control | 11 | see patch's title and function names tend to use "memcg". |
12 | c. Provides *zero overhead* for non memory controller users | 12 | In this document, we avoid using it. |
13 | d. Provides a double LRU: global memory pressure causes reclaim from the | ||
14 | global LRU; a cgroup on hitting a limit, reclaims from the per | ||
15 | cgroup LRU | ||
16 | 13 | ||
17 | Benefits and Purpose of the memory controller | 14 | Benefits and Purpose of the memory controller |
18 | 15 | ||
@@ -33,6 +30,45 @@ d. A CD/DVD burner could control the amount of memory used by the | |||
33 | e. There are several other use cases, find one or use the controller just | 30 | e. There are several other use cases, find one or use the controller just |
34 | for fun (to learn and hack on the VM subsystem). | 31 | for fun (to learn and hack on the VM subsystem). |
35 | 32 | ||
33 | Current Status: linux-2.6.34-mmotm(development version of 2010/April) | ||
34 | |||
35 | Features: | ||
36 | - accounting anonymous pages, file caches, swap caches usage and limiting them. | ||
37 | - private LRU and reclaim routine. (system's global LRU and private LRU | ||
38 | work independently from each other) | ||
39 | - optionally, memory+swap usage can be accounted and limited. | ||
40 | - hierarchical accounting | ||
41 | - soft limit | ||
42 | - moving(recharging) account at moving a task is selectable. | ||
43 | - usage threshold notifier | ||
44 | - oom-killer disable knob and oom-notifier | ||
45 | - Root cgroup has no limit controls. | ||
46 | |||
47 | Kernel memory and Hugepages are not under control yet. We just manage | ||
48 | pages on LRU. To add more controls, we have to take care of performance. | ||
49 | |||
50 | Brief summary of control files. | ||
51 | |||
52 | tasks # attach a task(thread) and show list of threads | ||
53 | cgroup.procs # show list of processes | ||
54 | cgroup.event_control # an interface for event_fd() | ||
55 | memory.usage_in_bytes # show current memory(RSS+Cache) usage. | ||
56 | memory.memsw.usage_in_bytes # show current memory+Swap usage | ||
57 | memory.limit_in_bytes # set/show limit of memory usage | ||
58 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage | ||
59 | memory.failcnt # show the number of memory usage hits limits | ||
60 | memory.memsw.failcnt # show the number of memory+Swap hits limits | ||
61 | memory.max_usage_in_bytes # show max memory usage recorded | ||
62 | memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded | ||
63 | memory.soft_limit_in_bytes # set/show soft limit of memory usage | ||
64 | memory.stat # show various statistics | ||
65 | memory.use_hierarchy # set/show hierarchical account enabled | ||
66 | memory.force_empty # trigger forced move charge to parent | ||
67 | memory.swappiness # set/show swappiness parameter of vmscan | ||
68 | (See sysctl's vm.swappiness) | ||
69 | memory.move_charge_at_immigrate # set/show controls of moving charges | ||
70 | memory.oom_control # set/show oom controls. | ||
71 | |||
36 | 1. History | 72 | 1. History |
37 | 73 | ||
38 | The memory controller has a long history. A request for comments for the memory | 74 | The memory controller has a long history. A request for comments for the memory |
@@ -106,14 +142,14 @@ the necessary data structures and check if the cgroup that is being charged | |||
106 | is over its limit. If it is then reclaim is invoked on the cgroup. | 142 | is over its limit. If it is then reclaim is invoked on the cgroup. |
107 | More details can be found in the reclaim section of this document. | 143 | More details can be found in the reclaim section of this document. |
108 | If everything goes well, a page meta-data-structure called page_cgroup is | 144 | If everything goes well, a page meta-data-structure called page_cgroup is |
109 | allocated and associated with the page. This routine also adds the page to | 145 | updated. page_cgroup has its own LRU on cgroup. |
110 | the per cgroup LRU. | 146 | (*) page_cgroup structure is allocated at boot/memory-hotplug time. |
111 | 147 | ||
112 | 2.2.1 Accounting details | 148 | 2.2.1 Accounting details |
113 | 149 | ||
114 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. | 150 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. |
115 | (some pages which never be reclaimable and will not be on global LRU | 151 | Some pages which are never reclaimable and will not be on the global LRU |
116 | are not accounted. we just accounts pages under usual vm management.) | 152 | are not accounted. We just account pages under usual VM management. |
117 | 153 | ||
118 | RSS pages are accounted at page_fault unless they've already been accounted | 154 | RSS pages are accounted at page_fault unless they've already been accounted |
119 | for earlier. A file page will be accounted for as Page Cache when it's | 155 | for earlier. A file page will be accounted for as Page Cache when it's |
@@ -121,12 +157,19 @@ inserted into inode (radix-tree). While it's mapped into the page tables of | |||
121 | processes, duplicate accounting is carefully avoided. | 157 | processes, duplicate accounting is carefully avoided. |
122 | 158 | ||
123 | A RSS page is unaccounted when it's fully unmapped. A PageCache page is | 159 | A RSS page is unaccounted when it's fully unmapped. A PageCache page is |
124 | unaccounted when it's removed from radix-tree. | 160 | unaccounted when it's removed from radix-tree. Even if RSS pages are fully |
161 | unmapped (by kswapd), they may exist as SwapCache in the system until they | ||
162 | are really freed. Such SwapCaches are also accounted. | ||
163 | A swapped-in page is not accounted until it's mapped. | ||
164 | |||
165 | Note: The kernel does swapin-readahead and read multiple swaps at once. | ||
166 | This means swapped-in pages may contain pages for other tasks than a task | ||
167 | causing page fault. So, we avoid accounting at swap-in I/O. | ||
125 | 168 | ||
126 | At page migration, accounting information is kept. | 169 | At page migration, accounting information is kept. |
127 | 170 | ||
128 | Note: we just account pages-on-lru because our purpose is to control amount | 171 | Note: we just account pages-on-LRU because our purpose is to control amount |
129 | of used pages. not-on-lru pages are tend to be out-of-control from vm view. | 172 | of used pages; not-on-LRU pages tend to be out-of-control from VM view. |
130 | 173 | ||
131 | 2.3 Shared Page Accounting | 174 | 2.3 Shared Page Accounting |
132 | 175 | ||
@@ -143,6 +186,7 @@ caller of swapoff rather than the users of shmem. | |||
143 | 186 | ||
144 | 187 | ||
145 | 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) | 188 | 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) |
189 | |||
146 | Swap Extension allows you to record charge for swap. A swapped-in page is | 190 | Swap Extension allows you to record charge for swap. A swapped-in page is |
147 | charged back to original page allocator if possible. | 191 | charged back to original page allocator if possible. |
148 | 192 | ||
@@ -150,13 +194,20 @@ When swap is accounted, following files are added. | |||
150 | - memory.memsw.usage_in_bytes. | 194 | - memory.memsw.usage_in_bytes. |
151 | - memory.memsw.limit_in_bytes. | 195 | - memory.memsw.limit_in_bytes. |
152 | 196 | ||
153 | usage of mem+swap is limited by memsw.limit_in_bytes. | 197 | memsw means memory+swap. Usage of memory+swap is limited by |
198 | memsw.limit_in_bytes. | ||
154 | 199 | ||
155 | * why 'mem+swap' rather than swap. | 200 | Example: Assume a system with 4G of swap. A task which allocates 6G of memory |
201 | (by mistake) under 2G memory limitation will use all swap. | ||
202 | In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. | ||
203 | By using memsw limit, you can avoid system OOM which can be caused by swap | ||
204 | shortage. | ||
205 | |||
206 | * why 'memory+swap' rather than swap. | ||
156 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means | 207 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means |
157 | to move account from memory to swap...there is no change in usage of | 208 | to move account from memory to swap...there is no change in usage of |
158 | mem+swap. In other words, when we want to limit the usage of swap without | 209 | memory+swap. In other words, when we want to limit the usage of swap without |
159 | affecting global LRU, mem+swap limit is better than just limiting swap from | 210 | affecting global LRU, memory+swap limit is better than just limiting swap from |
160 | OS point of view. | 211 | OS point of view. |
161 | 212 | ||
162 | * What happens when a cgroup hits memory.memsw.limit_in_bytes | 213 | * What happens when a cgroup hits memory.memsw.limit_in_bytes |
@@ -168,12 +219,12 @@ it by cgroup. | |||
168 | 219 | ||
169 | 2.5 Reclaim | 220 | 2.5 Reclaim |
170 | 221 | ||
171 | Each cgroup maintains a per cgroup LRU that consists of an active | 222 | Each cgroup maintains a per cgroup LRU which has the same structure as |
172 | and inactive list. When a cgroup goes over its limit, we first try | 223 | global VM. When a cgroup goes over its limit, we first try |
173 | to reclaim memory from the cgroup so as to make space for the new | 224 | to reclaim memory from the cgroup so as to make space for the new |
174 | pages that the cgroup has touched. If the reclaim is unsuccessful, | 225 | pages that the cgroup has touched. If the reclaim is unsuccessful, |
175 | an OOM routine is invoked to select and kill the bulkiest task in the | 226 | an OOM routine is invoked to select and kill the bulkiest task in the |
176 | cgroup. | 227 | cgroup. (See 10. OOM Control below.) |
177 | 228 | ||
178 | The reclaim algorithm has not been modified for cgroups, except that | 229 | The reclaim algorithm has not been modified for cgroups, except that |
179 | pages that are selected for reclaiming come from the per cgroup LRU | 230 | pages that are selected for reclaiming come from the per cgroup LRU |
@@ -184,13 +235,22 @@ limits on the root cgroup. | |||
184 | 235 | ||
185 | Note2: When panic_on_oom is set to "2", the whole system will panic. | 236 | Note2: When panic_on_oom is set to "2", the whole system will panic. |
186 | 237 | ||
187 | 2. Locking | 238 | When oom event notifier is registered, event will be delivered. |
239 | (See oom_control section) | ||
240 | |||
241 | 2.6 Locking | ||
188 | 242 | ||
189 | The memory controller uses the following hierarchy | 243 | lock_page_cgroup()/unlock_page_cgroup() should not be called under |
244 | mapping->tree_lock. | ||
190 | 245 | ||
191 | 1. zone->lru_lock is used for selecting pages to be isolated | 246 | Other lock order is following: |
192 | 2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) | 247 | PG_locked. |
193 | 3. lock_page_cgroup() is used to protect page->page_cgroup | 248 | mm->page_table_lock |
249 | zone->lru_lock | ||
250 | lock_page_cgroup. | ||
251 | In many cases, just lock_page_cgroup() is called. | ||
252 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by | ||
253 | zone->lru_lock, it has no lock of its own. | ||
194 | 254 | ||
195 | 3. User Interface | 255 | 3. User Interface |
196 | 256 | ||
@@ -199,6 +259,7 @@ The memory controller uses the following hierarchy | |||
199 | a. Enable CONFIG_CGROUPS | 259 | a. Enable CONFIG_CGROUPS |
200 | b. Enable CONFIG_RESOURCE_COUNTERS | 260 | b. Enable CONFIG_RESOURCE_COUNTERS |
201 | c. Enable CONFIG_CGROUP_MEM_RES_CTLR | 261 | c. Enable CONFIG_CGROUP_MEM_RES_CTLR |
262 | d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension) | ||
202 | 263 | ||
203 | 1. Prepare the cgroups | 264 | 1. Prepare the cgroups |
204 | # mkdir -p /cgroups | 265 | # mkdir -p /cgroups |
@@ -206,31 +267,28 @@ c. Enable CONFIG_CGROUP_MEM_RES_CTLR | |||
206 | 267 | ||
207 | 2. Make the new group and move bash into it | 268 | 2. Make the new group and move bash into it |
208 | # mkdir /cgroups/0 | 269 | # mkdir /cgroups/0 |
209 | # echo $$ > /cgroups/0/tasks | 270 | # echo $$ > /cgroups/0/tasks |
210 | 271 | ||
211 | Since now we're in the 0 cgroup, | 272 | Since now we're in the 0 cgroup, we can alter the memory limit: |
212 | We can alter the memory limit: | ||
213 | # echo 4M > /cgroups/0/memory.limit_in_bytes | 273 | # echo 4M > /cgroups/0/memory.limit_in_bytes |
214 | 274 | ||
215 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, | 275 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, |
216 | mega or gigabytes. | 276 | mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) |
277 | |||
217 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). | 278 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). |
218 | NOTE: We cannot set limits on the root cgroup any more. | 279 | NOTE: We cannot set limits on the root cgroup any more. |
219 | 280 | ||
220 | # cat /cgroups/0/memory.limit_in_bytes | 281 | # cat /cgroups/0/memory.limit_in_bytes |
221 | 4194304 | 282 | 4194304 |
222 | 283 | ||
223 | NOTE: The interface has now changed to display the usage in bytes | ||
224 | instead of pages | ||
225 | |||
226 | We can check the usage: | 284 | We can check the usage: |
227 | # cat /cgroups/0/memory.usage_in_bytes | 285 | # cat /cgroups/0/memory.usage_in_bytes |
228 | 1216512 | 286 | 1216512 |
229 | 287 | ||
230 | A successful write to this file does not guarantee a successful set of | 288 | A successful write to this file does not guarantee a successful set of |
231 | this limit to the value written into the file. This can be due to a | 289 | this limit to the value written into the file. This can be due to a |
232 | number of factors, such as rounding up to page boundaries or the total | 290 | number of factors, such as rounding up to page boundaries or the total |
233 | availability of memory on the system. The user is required to re-read | 291 | availability of memory on the system. The user is required to re-read |
234 | this file after a write to guarantee the value committed by the kernel. | 292 | this file after a write to guarantee the value committed by the kernel. |
235 | 293 | ||
236 | # echo 1 > memory.limit_in_bytes | 294 | # echo 1 > memory.limit_in_bytes |
@@ -245,15 +303,23 @@ caches, RSS and Active pages/Inactive pages are shown. | |||
245 | 303 | ||
246 | 4. Testing | 304 | 4. Testing |
247 | 305 | ||
248 | Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. | 306 | For testing features and implementation, see memcg_test.txt. |
249 | Apart from that v6 has been tested with several applications and regular | 307 | |
250 | daily use. The controller has also been tested on the PPC64, x86_64 and | 308 | Performance test is also important. To see pure memory controller's overhead, |
251 | UML platforms. | 309 | testing on tmpfs will give you good numbers of small overheads. |
310 | Example: do kernel make on tmpfs. | ||
311 | |||
312 | Page-fault scalability is also important. When measuring parallel | ||
313 | page-fault performance, a multi-process test may be better than a multi-thread | ||
314 | test because the latter has noise from shared objects/status. | ||
315 | |||
316 | But the above two are testing extreme situations. | ||
317 | Trying usual test under memory controller is always helpful. | ||
252 | 318 | ||
253 | 4.1 Troubleshooting | 319 | 4.1 Troubleshooting |
254 | 320 | ||
255 | Sometimes a user might find that the application under a cgroup is | 321 | Sometimes a user might find that the application under a cgroup is |
256 | terminated. There are several causes for this: | 322 | terminated by OOM killer. There are several causes for this: |
257 | 323 | ||
258 | 1. The cgroup limit is too low (just too low to do anything useful) | 324 | 1. The cgroup limit is too low (just too low to do anything useful) |
259 | 2. The user is using anonymous memory and swap is turned off or too low | 325 | 2. The user is using anonymous memory and swap is turned off or too low |
@@ -261,6 +327,9 @@ terminated. There are several causes for this: | |||
261 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of | 327 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of |
262 | some of the pages cached in the cgroup (page cache pages). | 328 | some of the pages cached in the cgroup (page cache pages). |
263 | 329 | ||
330 | To find out what is happening, disable the OOM killer (see 10. OOM Control below) | ||
331 | and observe how the cgroup behaves. | ||
332 | |||
264 | 4.2 Task migration | 333 | 4.2 Task migration |
265 | 334 | ||
266 | When a task migrates from one cgroup to another, its charge is not | 335 | When a task migrates from one cgroup to another, its charge is not |
@@ -268,16 +337,19 @@ carried forward by default. The pages allocated from the original cgroup still | |||
268 | remain charged to it, the charge is dropped when the page is freed or | 337 | remain charged to it, the charge is dropped when the page is freed or |
269 | reclaimed. | 338 | reclaimed. |
270 | 339 | ||
271 | Note: You can move charges of a task along with task migration. See 8. | 340 | You can move charges of a task along with task migration. |
341 | See 8. "Move charges at task migration" | ||
272 | 342 | ||
273 | 4.3 Removing a cgroup | 343 | 4.3 Removing a cgroup |
274 | 344 | ||
275 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a | 345 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
276 | cgroup might have some charge associated with it, even though all | 346 | cgroup might have some charge associated with it, even though all |
277 | tasks have migrated away from it. | 347 | tasks have migrated away from it. (because we charge against pages, not |
278 | Such charges are freed(at default) or moved to its parent. When moved, | 348 | against tasks.) |
279 | both of RSS and CACHES are moved to parent. | 349 | |
280 | If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. | 350 | Such charges are freed or moved to their parent. At moving, both of RSS |
351 | and CACHES are moved to parent. | ||
352 | rmdir() may return -EBUSY if freeing/moving fails. See 5.1 also. | ||
281 | 353 | ||
282 | Charges recorded in swap information are not updated at removal of cgroup. | 354 | Charges recorded in swap information are not updated at removal of cgroup. |
283 | Recorded information is discarded and a cgroup which uses swap (swapcache) | 355 | Recorded information is discarded and a cgroup which uses swap (swapcache) |
@@ -293,10 +365,10 @@ will be charged as a new owner of it. | |||
293 | 365 | ||
294 | # echo 0 > memory.force_empty | 366 | # echo 0 > memory.force_empty |
295 | 367 | ||
296 | Almost all pages tracked by this memcg will be unmapped and freed. Some of | 368 | Almost all pages tracked by this memory cgroup will be unmapped and freed. |
297 | pages cannot be freed because it's locked or in-use. Such pages are moved | 369 | Some pages cannot be freed because they are locked or in-use. Such pages are |
298 | to parent and this cgroup will be empty. But this may return -EBUSY in | 370 | moved to parent and this cgroup will be empty. This may return -EBUSY if |
299 | some too busy case. | 371 | VM is too busy to free/move all pages immediately. |
300 | 372 | ||
301 | Typical use case of this interface is that calling this before rmdir(). | 373 | Typical use case of this interface is that calling this before rmdir(). |
302 | Because rmdir() moves all pages to parent, some out-of-use page caches can be | 374 | Because rmdir() moves all pages to parent, some out-of-use page caches can be |
@@ -306,19 +378,41 @@ will be charged as a new owner of it. | |||
306 | 378 | ||
307 | memory.stat file includes following statistics | 379 | memory.stat file includes following statistics |
308 | 380 | ||
381 | # per-memory cgroup local status | ||
309 | cache - # of bytes of page cache memory. | 382 | cache - # of bytes of page cache memory. |
310 | rss - # of bytes of anonymous and swap cache memory. | 383 | rss - # of bytes of anonymous and swap cache memory. |
384 | mapped_file - # of bytes of mapped file (includes tmpfs/shmem) | ||
311 | pgpgin - # of pages paged in (equivalent to # of charging events). | 385 | pgpgin - # of pages paged in (equivalent to # of charging events). |
312 | pgpgout - # of pages paged out (equivalent to # of uncharging events). | 386 | pgpgout - # of pages paged out (equivalent to # of uncharging events). |
313 | active_anon - # of bytes of anonymous and swap cache memory on active | 387 | swap - # of bytes of swap usage |
314 | lru list. | ||
315 | inactive_anon - # of bytes of anonymous memory and swap cache memory on | 388 | inactive_anon - # of bytes of anonymous memory and swap cache memory on |
316 | inactive lru list. | 389 | inactive LRU list. |
317 | active_file - # of bytes of file-backed memory on active lru list. | 390 | active_anon - # of bytes of anonymous and swap cache memory on active |
318 | inactive_file - # of bytes of file-backed memory on inactive lru list. | 391 | LRU list. |
392 | inactive_file - # of bytes of file-backed memory on inactive LRU list. | ||
393 | active_file - # of bytes of file-backed memory on active LRU list. | ||
319 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). | 394 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). |
320 | 395 | ||
321 | The following additional stats are dependent on CONFIG_DEBUG_VM. | 396 | # status considering hierarchy (see memory.use_hierarchy settings) |
397 | |||
398 | hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy | ||
399 | under which the memory cgroup is | ||
400 | hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to | ||
401 | hierarchy under which memory cgroup is. | ||
402 | |||
403 | total_cache - sum of all children's "cache" | ||
404 | total_rss - sum of all children's "rss" | ||
405 | total_mapped_file - sum of all children's "mapped_file" | ||
406 | total_pgpgin - sum of all children's "pgpgin" | ||
407 | total_pgpgout - sum of all children's "pgpgout" | ||
408 | total_swap - sum of all children's "swap" | ||
409 | total_inactive_anon - sum of all children's "inactive_anon" | ||
410 | total_active_anon - sum of all children's "active_anon" | ||
411 | total_inactive_file - sum of all children's "inactive_file" | ||
412 | total_active_file - sum of all children's "active_file" | ||
413 | total_unevictable - sum of all children's "unevictable" | ||
414 | |||
415 | # The following additional stats are dependent on CONFIG_DEBUG_VM. | ||
322 | 416 | ||
323 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) | 417 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) |
324 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) | 418 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) |
@@ -327,24 +421,37 @@ recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | |||
327 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | 421 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) |
328 | 422 | ||
329 | Memo: | 423 | Memo: |
330 | recent_rotated means recent frequency of lru rotation. | 424 | recent_rotated means recent frequency of LRU rotation. |
331 | recent_scanned means recent # of scans to lru. | 425 | recent_scanned means recent # of scans to LRU. |
332 | showing for better debug please see the code for meanings. | 426 | showing for better debug please see the code for meanings. |
333 | 427 | ||
334 | Note: | 428 | Note: |
335 | Only anonymous and swap cache memory is listed as part of 'rss' stat. | 429 | Only anonymous and swap cache memory is listed as part of 'rss' stat. |
336 | This should not be confused with the true 'resident set size' or the | 430 | This should not be confused with the true 'resident set size' or the |
337 | amount of physical memory used by the cgroup. Per-cgroup rss | 431 | amount of physical memory used by the cgroup. |
338 | accounting is not done yet. | 432 | 'rss + mapped_file' will give you the resident set size of the cgroup. |
433 | (Note: file and shmem may be shared among other cgroups. In that case, | ||
434 | mapped_file is accounted only when the memory cgroup is owner of page | ||
435 | cache.) | ||
339 | 436 | ||
340 | 5.3 swappiness | 437 | 5.3 swappiness |
341 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. | ||
342 | 438 | ||
343 | Following cgroups' swappiness can't be changed. | 439 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. |
344 | - root cgroup (uses /proc/sys/vm/swappiness). | ||
345 | - a cgroup which uses hierarchy and it has child cgroup. | ||
346 | - a cgroup which uses hierarchy and not the root of hierarchy. | ||
347 | 440 | ||
441 | Following cgroups' swappiness can't be changed. | ||
442 | - root cgroup (uses /proc/sys/vm/swappiness). | ||
443 | - a cgroup which uses hierarchy and it has other cgroup(s) below it. | ||
444 | - a cgroup which uses hierarchy and not the root of hierarchy. | ||
445 | |||
446 | 5.4 failcnt | ||
447 | |||
448 | A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. | ||
449 | This failcnt(== failure count) shows the number of times that a usage counter | ||
450 | hit its limit. When a memory cgroup hits a limit, failcnt increases and | ||
451 | memory under it will be reclaimed. | ||
452 | |||
453 | You can reset failcnt by writing 0 to failcnt file. | ||
454 | # echo 0 > .../memory.failcnt | ||
348 | 455 | ||
349 | 6. Hierarchy support | 456 | 6. Hierarchy support |
350 | 457 | ||
@@ -363,13 +470,13 @@ hierarchy | |||
363 | 470 | ||
364 | In the diagram above, with hierarchical accounting enabled, all memory | 471 | In the diagram above, with hierarchical accounting enabled, all memory |
365 | usage of e, is accounted to its ancestors up until the root (i.e, c and root), | 472 | usage of e, is accounted to its ancestors up until the root (i.e, c and root), |
366 | that has memory.use_hierarchy enabled. If one of the ancestors goes over its | 473 | that has memory.use_hierarchy enabled. If one of the ancestors goes over its |
367 | limit, the reclaim algorithm reclaims from the tasks in the ancestor and the | 474 | limit, the reclaim algorithm reclaims from the tasks in the ancestor and the |
368 | children of the ancestor. | 475 | children of the ancestor. |
369 | 476 | ||
370 | 6.1 Enabling hierarchical accounting and reclaim | 477 | 6.1 Enabling hierarchical accounting and reclaim |
371 | 478 | ||
372 | The memory controller by default disables the hierarchy feature. Support | 479 | A memory cgroup by default disables the hierarchy feature. Support |
373 | can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup | 480 | can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup |
374 | 481 | ||
375 | # echo 1 > memory.use_hierarchy | 482 | # echo 1 > memory.use_hierarchy |
@@ -379,10 +486,10 @@ The feature can be disabled by | |||
379 | # echo 0 > memory.use_hierarchy | 486 | # echo 0 > memory.use_hierarchy |
380 | 487 | ||
381 | NOTE1: Enabling/disabling will fail if the cgroup already has other | 488 | NOTE1: Enabling/disabling will fail if the cgroup already has other |
382 | cgroups created below it. | 489 | cgroups created below it. |
383 | 490 | ||
384 | NOTE2: When panic_on_oom is set to "2", the whole system will panic in | 491 | NOTE2: When panic_on_oom is set to "2", the whole system will panic in |
385 | case of an oom event in any cgroup. | 492 | case of an OOM event in any cgroup. |
386 | 493 | ||
387 | 7. Soft limits | 494 | 7. Soft limits |
388 | 495 | ||
@@ -392,7 +499,7 @@ is to allow control groups to use as much of the memory as needed, provided | |||
392 | a. There is no memory contention | 499 | a. There is no memory contention |
393 | b. They do not exceed their hard limit | 500 | b. They do not exceed their hard limit |
394 | 501 | ||
395 | When the system detects memory contention or low memory control groups | 502 | When the system detects memory contention or low memory, control groups |
396 | are pushed back to their soft limits. If the soft limit of each control | 503 | are pushed back to their soft limits. If the soft limit of each control |
397 | group is very high, they are pushed back as much as possible to make | 504 | group is very high, they are pushed back as much as possible to make |
398 | sure that one control group does not starve the others of memory. | 505 | sure that one control group does not starve the others of memory. |
@@ -406,7 +513,7 @@ it gets invoked from balance_pgdat (kswapd). | |||
406 | 7.1 Interface | 513 | 7.1 Interface |
407 | 514 | ||
408 | Soft limits can be setup by using the following commands (in this example we | 515 | Soft limits can be setup by using the following commands (in this example we |
409 | assume a soft limit of 256 megabytes) | 516 | assume a soft limit of 256 MiB) |
410 | 517 | ||
411 | # echo 256M > memory.soft_limit_in_bytes | 518 | # echo 256M > memory.soft_limit_in_bytes |
412 | 519 | ||
@@ -442,7 +549,7 @@ Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread | |||
442 | Note: If we cannot find enough space for the task in the destination cgroup, we | 549 | Note: If we cannot find enough space for the task in the destination cgroup, we |
443 | try to make space by reclaiming memory. Task migration may fail if we | 550 | try to make space by reclaiming memory. Task migration may fail if we |
444 | cannot make enough space. | 551 | cannot make enough space. |
445 | Note: It can take several seconds if you move charges in giga bytes order. | 552 | Note: It can take several seconds if you move a large amount of charges. |
446 | 553 | ||
447 | And if you want to disable it again: | 554 | And if you want to disable it again: |
448 | 555 | ||
@@ -451,21 +558,27 @@ And if you want disable it again: | |||
451 | 8.2 Type of charges which can be moved | 558 | 8.2 Type of charges which can be moved |
452 | 559 | ||
453 | Each bit of move_charge_at_immigrate has its own meaning about what type of | 560 | Each bit of move_charge_at_immigrate has its own meaning about what type of |
454 | charges should be moved. | 561 | charges should be moved. But in any case, it must be noted that an account of |
562 | a page or a swap can be moved only when it is charged to the task's current(old) | ||
563 | memory cgroup. | ||
455 | 564 | ||
456 | bit | what type of charges would be moved ? | 565 | bit | what type of charges would be moved ? |
457 | -----+------------------------------------------------------------------------ | 566 | -----+------------------------------------------------------------------------ |
458 | 0 | A charge of an anonymous page(or swap of it) used by the target task. | 567 | 0 | A charge of an anonymous page(or swap of it) used by the target task. |
459 | | Those pages and swaps must be used only by the target task. You must | 568 | | Those pages and swaps must be used only by the target task. You must |
460 | | enable Swap Extension(see 2.4) to enable move of swap charges. | 569 | | enable Swap Extension(see 2.4) to enable move of swap charges. |
461 | 570 | -----+------------------------------------------------------------------------ | |
462 | Note: Those pages and swaps must be charged to the old cgroup. | 571 | 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) |
463 | Note: More type of pages(e.g. file cache, shmem,) will be supported by other | 572 | | and swaps of tmpfs file) mmapped by the target task. Unlike the case of |
464 | bits in future. | 573 | | anonymous pages, file pages(and swaps) in the range mmapped by the task |
574 | | will be moved even if the task hasn't done page fault, i.e. they might | ||
575 | | not be the task's "RSS", but other task's "RSS" that maps the same file. | ||
576 | | And mapcount of the page is ignored(the page can be moved even if | ||
577 | | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to | ||
578 | | enable move of swap charges. | ||
465 | 579 | ||
466 | 8.3 TODO | 580 | 8.3 TODO |
467 | 581 | ||
468 | - Add support for other types of pages(e.g. file cache, shmem, etc.). | ||
469 | - Implement madvise(2) to let users decide the vma to be moved or not to be | 582 | - Implement madvise(2) to let users decide the vma to be moved or not to be |
470 | moved. | 583 | moved. |
471 | - All of moving charge operations are done under cgroup_mutex. It's not good | 584 | - All of moving charge operations are done under cgroup_mutex. It's not good |
@@ -473,22 +586,61 @@ Note: More type of pages(e.g. file cache, shmem,) will be supported by other | |||
473 | 586 | ||
474 | 9. Memory thresholds | 587 | 9. Memory thresholds |
475 | 588 | ||
476 | Memory controler implements memory thresholds using cgroups notification | 589 | Memory cgroup implements memory thresholds using cgroups notification |
477 | API (see cgroups.txt). It allows to register multiple memory and memsw | 590 | API (see cgroups.txt). It allows to register multiple memory and memsw |
478 | thresholds and gets notifications when it crosses. | 591 | thresholds and gets notifications when it crosses. |
479 | 592 | ||
480 | To register a threshold, an application needs to: | 593 | To register a threshold, an application needs to: |
481 | - create an eventfd using eventfd(2); | 594 | - create an eventfd using eventfd(2); |
482 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; | 595 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; |
483 | - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to | 596 | - write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to |
484 | cgroup.event_control. | 597 | cgroup.event_control. |
485 | 598 | ||
486 | Application will be notified through eventfd when memory usage crosses | 599 | Application will be notified through eventfd when memory usage crosses |
487 | threshold in any direction. | 600 | threshold in any direction. |
488 | 601 | ||
489 | It's applicable for root and non-root cgroup. | 602 | It's applicable for root and non-root cgroup. |
490 | 603 | ||
491 | 10. TODO | 604 | 10. OOM Control |
605 | |||
606 | memory.oom_control file is for OOM notification and other controls. | ||
607 | |||
608 | Memory cgroup implements OOM notifier using cgroup notification | ||
609 | API (See cgroups.txt). It allows to register multiple OOM notification | ||
610 | delivery and gets notification when OOM happens. | ||
611 | |||
612 | To register a notifier, an application needs to: | ||
613 | - create an eventfd using eventfd(2) | ||
614 | - open memory.oom_control file | ||
615 | - write string like "<event_fd> <fd of memory.oom_control>" to | ||
616 | cgroup.event_control | ||
617 | |||
618 | Application will be notified through eventfd when OOM happens. | ||
619 | OOM notification doesn't work for root cgroup. | ||
620 | |||
621 | You can disable OOM-killer by writing "1" to memory.oom_control file, as: | ||
622 | |||
623 | #echo 1 > memory.oom_control | ||
624 | |||
625 | This operation is only allowed to the top cgroup of sub-hierarchy. | ||
626 | If OOM-killer is disabled, tasks under cgroup will hang/sleep | ||
627 | in memory cgroup's OOM-waitqueue when they request accountable memory. | ||
628 | |||
629 | For running them, you have to relax the memory cgroup's OOM status by | ||
630 | * enlarge limit or reduce usage. | ||
631 | To reduce usage, | ||
632 | * kill some tasks. | ||
633 | * move some tasks to other group with account migration. | ||
634 | * remove some files (on tmpfs?) | ||
635 | |||
636 | Then, stopped tasks will work again. | ||
637 | |||
638 | At reading, current status of OOM is shown. | ||
639 | oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) | ||
640 | under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may | ||
641 | be stopped.) | ||
642 | |||
643 | 11. TODO | ||
492 | 644 | ||
493 | 1. Add support for accounting huge pages (as a separate controller) | 645 | 1. Add support for accounting huge pages (as a separate controller) |
494 | 2. Make per-cgroup scanner reclaim not-shared pages first | 646 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/Documentation/development-process/2.Process b/Documentation/development-process/2.Process index d750321acd5a..97726eba6102 100644 --- a/Documentation/development-process/2.Process +++ b/Documentation/development-process/2.Process | |||
@@ -151,7 +151,7 @@ The stages that a patch goes through are, generally: | |||
151 | well. | 151 | well. |
152 | 152 | ||
153 | - Wider review. When the patch is getting close to ready for mainline | 153 | - Wider review. When the patch is getting close to ready for mainline |
154 | inclusion, it will be accepted by a relevant subsystem maintainer - | 154 | inclusion, it should be accepted by a relevant subsystem maintainer - |
155 | though this acceptance is not a guarantee that the patch will make it | 155 | though this acceptance is not a guarantee that the patch will make it |
156 | all the way to the mainline. The patch will show up in the maintainer's | 156 | all the way to the mainline. The patch will show up in the maintainer's |
157 | subsystem tree and into the staging trees (described below). When the | 157 | subsystem tree and into the staging trees (described below). When the |
@@ -159,6 +159,15 @@ The stages that a patch goes through are, generally: | |||
159 | the discovery of any problems resulting from the integration of this | 159 | the discovery of any problems resulting from the integration of this |
160 | patch with work being done by others. | 160 | patch with work being done by others. |
161 | 161 | ||
162 | - Please note that most maintainers also have day jobs, so merging | ||
163 | your patch may not be their highest priority. If your patch is | ||
164 | getting feedback about changes that are needed, you should either | ||
165 | make those changes or justify why they should not be made. If your | ||
166 | patch has no review complaints but is not being merged by its | ||
167 | appropriate subsystem or driver maintainer, you should be persistent | ||
168 | in updating the patch to the current kernel so that it applies cleanly | ||
169 | and keep sending it for review and merging. | ||
170 | |||
162 | - Merging into the mainline. Eventually, a successful patch will be | 171 | - Merging into the mainline. Eventually, a successful patch will be |
163 | merged into the mainline repository managed by Linus Torvalds. More | 172 | merged into the mainline repository managed by Linus Torvalds. More |
164 | comments and/or problems may surface at this time; it is important that | 173 | comments and/or problems may surface at this time; it is important that |
@@ -258,12 +267,8 @@ an appropriate subsystem tree or be sent directly to Linus. In a typical | |||
258 | development cycle, approximately 10% of the patches going into the mainline | 267 | development cycle, approximately 10% of the patches going into the mainline |
259 | get there via -mm. | 268 | get there via -mm. |
260 | 269 | ||
261 | The current -mm patch can always be found from the front page of | 270 | The current -mm patch is available in the "mmotm" (-mm of the moment) |
262 | 271 | directory at: | |
263 | http://kernel.org/ | ||
264 | |||
265 | Those who want to see the current state of -mm can get the "-mm of the | ||
266 | moment" tree, found at: | ||
267 | 272 | ||
268 | http://userweb.kernel.org/~akpm/mmotm/ | 273 | http://userweb.kernel.org/~akpm/mmotm/ |
269 | 274 | ||
@@ -298,6 +303,12 @@ volatility of linux-next tends to make it a difficult development target. | |||
298 | See http://lwn.net/Articles/289013/ for more information on this topic, and | 303 | See http://lwn.net/Articles/289013/ for more information on this topic, and |
299 | stay tuned; much is still in flux where linux-next is involved. | 304 | stay tuned; much is still in flux where linux-next is involved. |
300 | 305 | ||
306 | Besides the mmotm and linux-next trees, the kernel source tree now contains | ||
307 | the drivers/staging/ directory and many sub-directories for drivers or | ||
308 | filesystems that are on their way to being added to the kernel tree | ||
309 | proper, but they remain in drivers/staging/ while they still need more | ||
310 | work. | ||
311 | |||
301 | 312 | ||
302 | 2.5: TOOLS | 313 | 2.5: TOOLS |
303 | 314 | ||
@@ -319,9 +330,9 @@ developers; even if they do not use it for their own work, they'll need git | |||
319 | to keep up with what other developers (and the mainline) are doing. | 330 | to keep up with what other developers (and the mainline) are doing. |
320 | 331 | ||
321 | Git is now packaged by almost all Linux distributions. There is a home | 332 | Git is now packaged by almost all Linux distributions. There is a home |
322 | page at | 333 | page at: |
323 | 334 | ||
324 | http://git.or.cz/ | 335 | http://git-scm.com/ |
325 | 336 | ||
326 | That page has pointers to documentation and tutorials. One should be | 337 | That page has pointers to documentation and tutorials. One should be |
327 | aware, in particular, of the Kernel Hacker's Guide to git, which has | 338 | aware, in particular, of the Kernel Hacker's Guide to git, which has |
diff --git a/Documentation/development-process/7.AdvancedTopics b/Documentation/development-process/7.AdvancedTopics index a2cf74093aa1..837179447e17 100644 --- a/Documentation/development-process/7.AdvancedTopics +++ b/Documentation/development-process/7.AdvancedTopics | |||
@@ -25,7 +25,7 @@ long document in its own right. Instead, the focus here will be on how git | |||
25 | fits into the kernel development process in particular. Developers who | 25 | fits into the kernel development process in particular. Developers who |
26 | wish to come up to speed with git will find more information at: | 26 | wish to come up to speed with git will find more information at: |
27 | 27 | ||
28 | http://git.or.cz/ | 28 | http://git-scm.com/ |
29 | 29 | ||
30 | http://www.kernel.org/pub/software/scm/git/docs/user-manual.html | 30 | http://www.kernel.org/pub/software/scm/git/docs/user-manual.html |
31 | 31 | ||
diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 53d64d382343..1d83d124056c 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt | |||
@@ -443,6 +443,8 @@ Your cooperation is appreciated. | |||
443 | 231 = /dev/snapshot System memory snapshot device | 443 | 231 = /dev/snapshot System memory snapshot device |
444 | 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) | 444 | 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) |
445 | 233 = /dev/kmview View-OS A process with a view | 445 | 233 = /dev/kmview View-OS A process with a view |
446 | 234 = /dev/btrfs-control Btrfs control device | ||
447 | 235 = /dev/autofs Autofs control device | ||
446 | 240-254 Reserved for local use | 448 | 240-254 Reserved for local use |
447 | 255 Reserved for MISC_DYNAMIC_MINOR | 449 | 255 Reserved for MISC_DYNAMIC_MINOR |
448 | 450 | ||
diff --git a/Documentation/edac.txt b/Documentation/edac.txt index 79c533223762..0b875e8da969 100644 --- a/Documentation/edac.txt +++ b/Documentation/edac.txt | |||
@@ -6,6 +6,8 @@ Written by Doug Thompson <dougthompson@xmission.com> | |||
6 | 7 Dec 2005 | 6 | 7 Dec 2005 |
7 | 17 Jul 2007 Updated | 7 | 17 Jul 2007 Updated |
8 | 8 | ||
9 | (c) Mauro Carvalho Chehab <mchehab@redhat.com> | ||
10 | 05 Aug 2009 Nehalem interface | ||
9 | 11 | ||
10 | EDAC is maintained and written by: | 12 | EDAC is maintained and written by: |
11 | 13 | ||
@@ -717,3 +719,153 @@ unique drivers for their hardware systems. | |||
717 | The 'test_device_edac' sample driver is located at the | 719 | The 'test_device_edac' sample driver is located at the |
718 | bluesmoke.sourceforge.net project site for EDAC. | 720 | bluesmoke.sourceforge.net project site for EDAC. |
719 | 721 | ||
722 | ======================================================================= | ||
723 | NEHALEM USAGE OF EDAC APIs | ||
724 | |||
725 | This chapter documents some EXPERIMENTAL mappings for EDAC API to handle | ||
726 | Nehalem EDAC driver. They will likely be changed on future versions | ||
727 | of the driver. | ||
728 | |||
729 | Due to the way Nehalem exports Memory Controller data, some adjustments | ||
730 | were done at i7core_edac driver. This chapter will cover those differences | ||
731 | |||
732 | 1) On Nehalem, there is one Memory Controller per Quick Path Interconnect | ||
733 | (QPI). At the driver, the term "socket" means one QPI. This is | ||
734 | associated with a physical CPU socket. | ||
735 | |||
736 | Each MC has 3 physical read channels, 3 physical write channels and | ||
737 | 3 logic channels. The driver currently sees it as just 3 channels. | ||
738 | Each channel can have up to 3 DIMMs. | ||
739 | |||
740 | The minimum known unit is the DIMM. There is no information about csrows. | ||
741 | As the EDAC API uses csrows as its minimum unit, the driver sequentially | ||
742 | maps channel/dimm into different csrows. | ||
743 | |||
744 | For example, supposing the following layout: | ||
745 | Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs | ||
746 | dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 | ||
747 | dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400 | ||
748 | Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs | ||
749 | dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 | ||
750 | Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs | ||
751 | dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 | ||
752 | The driver will map it as: | ||
753 | csrow0: channel 0, dimm0 | ||
754 | csrow1: channel 0, dimm1 | ||
755 | csrow2: channel 1, dimm0 | ||
756 | csrow3: channel 2, dimm0 | ||
757 | |||
758 | The driver exports one | ||
759 | DIMM per csrow. | ||
760 | |||
761 | Each QPI is exported as a different memory controller. | ||
762 | |||
763 | 2) Nehalem MC has the ability to generate errors. The driver implements this | ||
764 | functionality via some error injection nodes: | ||
765 | |||
766 | For injecting a memory error, there are some sysfs nodes, under | ||
767 | /sys/devices/system/edac/mc/mc?/: | ||
768 | |||
769 | inject_addrmatch/*: | ||
770 | Controls the error injection mask register. It is possible to specify | ||
771 | several characteristics of the address to match an error code: | ||
772 | dimm = the affected dimm. Numbers are relative to a channel; | ||
773 | rank = the memory rank; | ||
774 | channel = the channel that will generate an error; | ||
775 | bank = the affected bank; | ||
776 | page = the page address; | ||
777 | column (or col) = the address column. | ||
778 | each of the above values can be set to "any" to match any valid value. | ||
779 | |||
780 | At driver init, all values are set to any. | ||
781 | |||
782 | For example, to generate an error at rank 1 of dimm 2, for any channel, | ||
783 | any bank, any page, any column: | ||
784 | echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm | ||
785 | echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank | ||
786 | |||
787 | To return to the default behaviour of matching any, you can do: | ||
788 | echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm | ||
789 | echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank | ||
790 | |||
791 | inject_eccmask: | ||
792 | specifies what bits will have troubles, | ||
793 | |||
794 | inject_section: | ||
795 | specifies what ECC cache section will get the error: | ||
796 | 3 for both | ||
797 | 2 for the highest | ||
798 | 1 for the lowest | ||
799 | |||
800 | inject_type: | ||
801 | specifies the type of error, being a combination of the following bits: | ||
802 | bit 0 - repeat | ||
803 | bit 1 - ecc | ||
804 | bit 2 - parity | ||
805 | |||
806 | inject_enable starts the error generation when something different | ||
807 | than 0 is written. | ||
808 | |||
809 | All inject vars can be read. root permission is needed for write. | ||
810 | |||
811 | Datasheet states that the error will only be generated after a write on an | ||
812 | address that matches inject_addrmatch. It seems, however, that reading will | ||
813 | also produce an error. | ||
814 | |||
815 | For example, the following code will generate an error for any write access | ||
816 | at socket 0, on any DIMM/address on channel 2: | ||
817 | |||
818 | echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel | ||
819 | echo 2 >/sys/devices/system/edac/mc/mc0/inject_type | ||
820 | echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask | ||
821 | echo 3 >/sys/devices/system/edac/mc/mc0/inject_section | ||
822 | echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable | ||
823 | dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null | ||
824 | |||
825 | For socket 1, it is needed to replace "mc0" by "mc1" at the above | ||
826 | commands. | ||
827 | |||
828 | The generated error message will look like: | ||
829 | |||
830 | EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error)) | ||
831 | |||
832 | 3) Nehalem specific Corrected Error memory counters | ||
833 | |||
834 | Nehalem has some registers to count memory errors. The driver uses those | ||
835 | registers to report Corrected Errors on devices with Registered Dimms. | ||
836 | |||
837 | However, those counters don't work with Unregistered Dimms. As the chipset | ||
838 | offers some counters that also work with UDIMMS (but with a worse level of | ||
839 | granularity than the default ones), the driver exposes those registers for | ||
840 | UDIMM memories. | ||
841 | |||
842 | They can be read by looking at the contents of all_channel_counts/ | ||
843 | |||
844 | $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done | ||
845 | /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0 | ||
846 | 0 | ||
847 | /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1 | ||
848 | 0 | ||
849 | /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2 | ||
850 | 0 | ||
851 | |||
852 | What happens here is that errors on different csrows, but at the same | ||
853 | dimm number will increment the same counter. | ||
854 | So, in this memory mapping: | ||
855 | csrow0: channel 0, dimm0 | ||
856 | csrow1: channel 0, dimm1 | ||
857 | csrow2: channel 1, dimm0 | ||
858 | csrow3: channel 2, dimm0 | ||
859 | The hardware will increment udimm0 for an error at the first dimm at either | ||
860 | csrow0, csrow2 or csrow3; | ||
861 | The hardware will increment udimm1 for an error at the second dimm at either | ||
862 | csrow0, csrow2 or csrow3; | ||
863 | The hardware will increment udimm2 for an error at the third dimm at either | ||
864 | csrow0, csrow2 or csrow3; | ||
865 | |||
866 | 4) Standard error counters | ||
867 | |||
868 | The standard error counters are generated when an mcelog error is received | ||
869 | by the driver. Since, with udimm, this is counted by software, it is | ||
870 | possible that some errors could be lost. With rdimm's, they display the | ||
871 | contents of the registers | ||
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index a86152ae2f6f..c268783bc4e7 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -578,15 +578,6 @@ Who: Avi Kivity <avi@redhat.com> | |||
578 | 578 | ||
579 | ---------------------------- | 579 | ---------------------------- |
580 | 580 | ||
581 | What: "acpi=ht" boot option | ||
582 | When: 2.6.35 | ||
583 | Why: Useful in 2003, implementation is a hack. | ||
584 | Generally invoked by accident today. | ||
585 | Seen as doing more harm than good. | ||
586 | Who: Len Brown <len.brown@intel.com> | ||
587 | |||
588 | ---------------------------- | ||
589 | |||
590 | What: iwlwifi 50XX module parameters | 581 | What: iwlwifi 50XX module parameters |
591 | When: 2.6.40 | 582 | When: 2.6.40 |
592 | Why: The "..50" modules parameters were used to configure 5000 series and | 583 | Why: The "..50" modules parameters were used to configure 5000 series and |
@@ -646,3 +637,13 @@ Who: Thomas Gleixner <tglx@linutronix.de> | |||
646 | 637 | ||
647 | ---------------------------- | 638 | ---------------------------- |
648 | 639 | ||
640 | What: old ieee1394 subsystem (CONFIG_IEEE1394) | ||
641 | When: 2.6.37 | ||
642 | Files: drivers/ieee1394/ except init_ohci1394_dma.c | ||
643 | Why: superseded by drivers/firewire/ (CONFIG_FIREWIRE) which offers more | ||
644 | features, better performance, and better security, all with smaller | ||
645 | and more modern code base | ||
646 | Who: Stefan Richter <stefanr@s5r6.in-berlin.de> | ||
647 | |||
648 | ---------------------------- | ||
649 | |||
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index af1608070cd5..96d4293607ec 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -380,7 +380,7 @@ prototypes: | |||
380 | int (*open) (struct inode *, struct file *); | 380 | int (*open) (struct inode *, struct file *); |
381 | int (*flush) (struct file *); | 381 | int (*flush) (struct file *); |
382 | int (*release) (struct inode *, struct file *); | 382 | int (*release) (struct inode *, struct file *); |
383 | int (*fsync) (struct file *, struct dentry *, int datasync); | 383 | int (*fsync) (struct file *, int datasync); |
384 | int (*aio_fsync) (struct kiocb *, int datasync); | 384 | int (*aio_fsync) (struct kiocb *, int datasync); |
385 | int (*fasync) (int, struct file *, int); | 385 | int (*fasync) (int, struct file *, int); |
386 | int (*lock) (struct file *, int, struct file_lock *); | 386 | int (*lock) (struct file *, int, struct file_lock *); |
@@ -429,8 +429,9 @@ check_flags: no | |||
429 | implementations. If your fs is not using generic_file_llseek, you | 429 | implementations. If your fs is not using generic_file_llseek, you |
430 | need to acquire and release the appropriate locks in your ->llseek(). | 430 | need to acquire and release the appropriate locks in your ->llseek(). |
431 | For many filesystems, it is probably safe to acquire the inode | 431 | For many filesystems, it is probably safe to acquire the inode |
432 | mutex. Note some filesystems (i.e. remote ones) provide no | 432 | mutex or just to use i_size_read() instead. |
433 | protection for i_size so you will need to use the BKL. | 433 | Note: this does not protect the file->f_pos against concurrent modifications |
434 | since this is something the userspace has to take care of. | ||
434 | 435 | ||
435 | Note: ext2_release() was *the* source of contention on fs-intensive | 436 | Note: ext2_release() was *the* source of contention on fs-intensive |
436 | loads and dropping BKL on ->release() helps to get rid of that (we still | 437 | loads and dropping BKL on ->release() helps to get rid of that (we still |
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index b324c033035a..203f7202cc9e 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt | |||
@@ -38,7 +38,8 @@ Hard link support: yes no | |||
38 | Real inode numbers: yes no | 38 | Real inode numbers: yes no |
39 | 32-bit uids/gids: yes no | 39 | 32-bit uids/gids: yes no |
40 | File creation time: yes no | 40 | File creation time: yes no |
41 | Xattr and ACL support: no no | 41 | Xattr support: yes no |
42 | ACL support: no no | ||
42 | 43 | ||
43 | Squashfs compresses data, inodes and directories. In addition, inode and | 44 | Squashfs compresses data, inodes and directories. In addition, inode and |
44 | directory data are highly compacted, and packed on byte boundaries. Each | 45 | directory data are highly compacted, and packed on byte boundaries. Each |
@@ -58,7 +59,7 @@ obtained from this site also. | |||
58 | 3. SQUASHFS FILESYSTEM DESIGN | 59 | 3. SQUASHFS FILESYSTEM DESIGN |
59 | ----------------------------- | 60 | ----------------------------- |
60 | 61 | ||
61 | A squashfs filesystem consists of seven parts, packed together on a byte | 62 | A squashfs filesystem consists of a maximum of eight parts, packed together on a byte |
62 | alignment: | 63 | alignment: |
63 | 64 | ||
64 | --------------- | 65 | --------------- |
@@ -80,6 +81,9 @@ alignment: | |||
80 | |---------------| | 81 | |---------------| |
81 | | uid/gid | | 82 | | uid/gid | |
82 | | lookup table | | 83 | | lookup table | |
84 | |---------------| | ||
85 | | xattr | | ||
86 | | table | | ||
83 | --------------- | 87 | --------------- |
84 | 88 | ||
85 | Compressed data blocks are written to the filesystem as files are read from | 89 | Compressed data blocks are written to the filesystem as files are read from |
@@ -192,6 +196,26 @@ This table is stored compressed into metadata blocks. A second index table is | |||
192 | used to locate these. This second index table for speed of access (and because | 196 | used to locate these. This second index table for speed of access (and because |
193 | it is small) is read at mount time and cached in memory. | 197 | it is small) is read at mount time and cached in memory. |
194 | 198 | ||
199 | 3.7 Xattr table | ||
200 | --------------- | ||
201 | |||
202 | The xattr table contains extended attributes for each inode. The xattrs | ||
203 | for each inode are stored in a list, each list entry containing a type, | ||
204 | name and value field. The type field encodes the xattr prefix | ||
205 | ("user.", "trusted." etc) and it also encodes how the name/value fields | ||
206 | should be interpreted. Currently the type indicates whether the value | ||
207 | is stored inline (in which case the value field contains the xattr value), | ||
208 | or if it is stored out of line (in which case the value field stores a | ||
209 | reference to where the actual value is stored). This allows large values | ||
210 | to be stored out of line improving scanning and lookup performance and it | ||
211 | also allows values to be de-duplicated, the value being stored once, and | ||
212 | all other occurrences holding an out of line reference to that value. | ||
213 | |||
214 | The xattr lists are packed into compressed 8K metadata blocks. | ||
215 | To reduce overhead in inodes, rather than storing the on-disk | ||
216 | location of the xattr list inside each inode, a 32-bit xattr id | ||
217 | is stored. This xattr id is mapped into the location of the xattr | ||
218 | list using a second xattr id lookup table. | ||
195 | 219 | ||
196 | 4. TODOS AND OUTSTANDING ISSUES | 220 | 4. TODOS AND OUTSTANDING ISSUES |
197 | ------------------------------- | 221 | ------------------------------- |
@@ -199,9 +223,7 @@ it is small) is read at mount time and cached in memory. | |||
199 | 4.1 Todo list | 223 | 4.1 Todo list |
200 | ------------- | 224 | ------------- |
201 | 225 | ||
202 | Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks | 226 | Implement ACL support. |
203 | for these but the code has not been written. Once the code has been written | ||
204 | the existing layout should not require modification. | ||
205 | 227 | ||
206 | 4.2 Squashfs internal cache | 228 | 4.2 Squashfs internal cache |
207 | --------------------------- | 229 | --------------------------- |
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index fe09a2cb1858..98ef55124158 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt | |||
@@ -94,11 +94,19 @@ NodeList format is a comma-separated list of decimal numbers and ranges, | |||
94 | a range being two hyphen-separated decimal numbers, the smallest and | 94 | a range being two hyphen-separated decimal numbers, the smallest and |
95 | largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 | 95 | largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 |
96 | 96 | ||
97 | A memory policy with a valid NodeList will be saved, as specified, for | ||
98 | use at file creation time. When a task allocates a file in the file | ||
99 | system, the mount option memory policy will be applied with a NodeList, | ||
100 | if any, modified by the calling task's cpuset constraints | ||
101 | [See Documentation/cgroups/cpusets.txt] and any optional flags, listed | ||
102 | below. If the resulting NodeList is the empty set, the effective memory | ||
103 | policy for the file will revert to "default" policy. | ||
104 | |||
97 | NUMA memory allocation policies have optional flags that can be used in | 105 | NUMA memory allocation policies have optional flags that can be used in |
98 | conjunction with their modes. These optional flags can be specified | 106 | conjunction with their modes. These optional flags can be specified |
99 | when tmpfs is mounted by appending them to the mode before the NodeList. | 107 | when tmpfs is mounted by appending them to the mode before the NodeList. |
100 | See Documentation/vm/numa_memory_policy.txt for a list of all available | 108 | See Documentation/vm/numa_memory_policy.txt for a list of all available |
101 | memory allocation policy mode flags. | 109 | memory allocation policy mode flags and their effect on memory policy. |
102 | 110 | ||
103 | =static is equivalent to MPOL_F_STATIC_NODES | 111 | =static is equivalent to MPOL_F_STATIC_NODES |
104 | =relative is equivalent to MPOL_F_RELATIVE_NODES | 112 | =relative is equivalent to MPOL_F_RELATIVE_NODES |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b66858538df5..94677e7dcb13 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -401,11 +401,16 @@ otherwise noted. | |||
401 | started might not be in the page cache at the end of the | 401 | started might not be in the page cache at the end of the |
402 | walk). | 402 | walk). |
403 | 403 | ||
404 | truncate: called by the VFS to change the size of a file. The | 404 | truncate: Deprecated. This will not be called if ->setsize is defined. |
405 | Called by the VFS to change the size of a file. The | ||
405 | i_size field of the inode is set to the desired size by the | 406 | i_size field of the inode is set to the desired size by the |
406 | VFS before this method is called. This method is called by | 407 | VFS before this method is called. This method is called by |
407 | the truncate(2) system call and related functionality. | 408 | the truncate(2) system call and related functionality. |
408 | 409 | ||
410 | Note: ->truncate and vmtruncate are deprecated. Do not add new | ||
411 | instances/calls of these. Filesystems should be converted to do their | ||
412 | truncate sequence via ->setattr(). | ||
413 | |||
409 | permission: called by the VFS to check for access rights on a POSIX-like | 414 | permission: called by the VFS to check for access rights on a POSIX-like |
410 | filesystem. | 415 | filesystem. |
411 | 416 | ||
@@ -729,7 +734,7 @@ struct file_operations { | |||
729 | int (*open) (struct inode *, struct file *); | 734 | int (*open) (struct inode *, struct file *); |
730 | int (*flush) (struct file *); | 735 | int (*flush) (struct file *); |
731 | int (*release) (struct inode *, struct file *); | 736 | int (*release) (struct inode *, struct file *); |
732 | int (*fsync) (struct file *, struct dentry *, int datasync); | 737 | int (*fsync) (struct file *, int datasync); |
733 | int (*aio_fsync) (struct kiocb *, int datasync); | 738 | int (*aio_fsync) (struct kiocb *, int datasync); |
734 | int (*fasync) (int, struct file *, int); | 739 | int (*fasync) (int, struct file *, int); |
735 | int (*lock) (struct file *, int, struct file_lock *); | 740 | int (*lock) (struct file *, int, struct file_lock *); |
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt new file mode 100644 index 000000000000..96d0df28bed3 --- /dev/null +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt | |||
@@ -0,0 +1,811 @@ | |||
1 | XFS Delayed Logging Design | ||
2 | -------------------------- | ||
3 | |||
4 | Introduction to Re-logging in XFS | ||
5 | --------------------------------- | ||
6 | |||
7 | XFS logging is a combination of logical and physical logging. Some objects, | ||
8 | such as inodes and dquots, are logged in logical format where the details | ||
9 | logged are made up of the changes to in-core structures rather than on-disk | ||
10 | structures. Other objects - typically buffers - have their physical changes | ||
11 | logged. The reason for these differences is to reduce the amount of log space | ||
12 | required for objects that are frequently logged. Some parts of inodes are more | ||
13 | frequently logged than others, and inodes are typically more frequently logged | ||
14 | than any other object (except maybe the superblock buffer) so keeping the | ||
15 | amount of metadata logged low is of prime importance. | ||
16 | |||
17 | The reason that this is such a concern is that XFS allows multiple separate | ||
18 | modifications to a single object to be carried in the log at any given time. | ||
19 | This allows the log to avoid needing to flush each change to disk before | ||
20 | recording a new change to the object. XFS does this via a method called | ||
21 | "re-logging". Conceptually, this is quite simple - all it requires is that any | ||
22 | new change to the object is recorded with a *new copy* of all the existing | ||
23 | changes in the new transaction that is written to the log. | ||
24 | |||
25 | That is, if we have a sequence of changes A through to F, and the object was | ||
26 | written to disk after change D, we would see in the log the following series | ||
27 | of transactions, their contents and the log sequence number (LSN) of the | ||
28 | transaction: | ||
29 | |||
30 | Transaction Contents LSN | ||
31 | A A X | ||
32 | B A+B X+n | ||
33 | C A+B+C X+n+m | ||
34 | D A+B+C+D X+n+m+o | ||
35 | <object written to disk> | ||
36 | E E Y (> X+n+m+o) | ||
37 | F E+F Y+p | ||
38 | |||
39 | In other words, each time an object is relogged, the new transaction contains | ||
40 | the aggregation of all the previous changes currently held only in the log. | ||
41 | |||
42 | This relogging technique also allows objects to be moved forward in the log so | ||
43 | that an object being relogged does not prevent the tail of the log from ever | ||
44 | moving forward. This can be seen in the table above by the changing | ||
45 | (increasing) LSN of each subsequent transaction - the LSN is effectively a | ||
46 | direct encoding of the location in the log of the transaction. | ||
47 | |||
48 | This relogging is also used to implement long-running, multiple-commit | ||
49 | transactions. These transactions are known as rolling transactions, and require | ||
50 | a special log reservation known as a permanent transaction reservation. A | ||
51 | typical example of a rolling transaction is the removal of extents from an | ||
52 | inode which can only be done at a rate of two extents per transaction because | ||
53 | of reservation size limitations. Hence a rolling extent removal transaction | ||
54 | keeps relogging the inode and btree buffers as they get modified in each | ||
55 | removal operation. This keeps them moving forward in the log as the operation | ||
56 | progresses, ensuring that current operation never gets blocked by itself if the | ||
57 | log wraps around. | ||
58 | |||
59 | Hence it can be seen that the relogging operation is fundamental to the correct | ||
60 | working of the XFS journalling subsystem. From the above description, most | ||
61 | people should be able to see why the XFS metadata operations write so much to | ||
62 | the log - repeated operations to the same objects write the same changes to | ||
63 | the log over and over again. Worse is the fact that objects tend to get | ||
64 | dirtier as they get relogged, so each subsequent transaction is writing more | ||
65 | metadata into the log. | ||
66 | |||
67 | Another feature of the XFS transaction subsystem is that most transactions are | ||
68 | asynchronous. That is, they don't commit to disk until either a log buffer is | ||
69 | filled (a log buffer can hold multiple transactions) or a synchronous operation | ||
70 | forces the log buffers holding the transactions to disk. This means that XFS is | ||
71 | doing aggregation of transactions in memory - batching them, if you like - to | ||
72 | minimise the impact of the log IO on transaction throughput. | ||
73 | |||
74 | The limitation on asynchronous transaction throughput is the number and size of | ||
75 | log buffers made available by the log manager. By default there are 8 log | ||
76 | buffers available and the size of each is 32kB - the size can be increased up | ||
77 | to 256kB by use of a mount option. | ||
78 | |||
79 | Effectively, this gives us the maximum bound of outstanding metadata changes | ||
80 | that can be made to the filesystem at any point in time - if all the log | ||
81 | buffers are full and under IO, then no more transactions can be committed until | ||
82 | the current batch completes. It is now common for a single current CPU core to | ||
83 | be able to issue enough transactions to keep the log buffers full and under | ||
84 | IO permanently. Hence the XFS journalling subsystem can be considered to be IO | ||
85 | bound. | ||
86 | |||
87 | Delayed Logging: Concepts | ||
88 | ------------------------- | ||
89 | |||
90 | The key thing to note about the asynchronous logging combined with the | ||
91 | relogging technique XFS uses is that we can be relogging changed objects | ||
92 | multiple times before they are committed to disk in the log buffers. If we | ||
93 | return to the previous relogging example, it is entirely possible that | ||
94 | transactions A through D are committed to disk in the same log buffer. | ||
95 | |||
96 | That is, a single log buffer may contain multiple copies of the same object, | ||
97 | but only one of those copies needs to be there - the last one "D", as it | ||
98 | contains all the changes from the previous changes. In other words, we have one | ||
99 | necessary copy in the log buffer, and three stale copies that are simply | ||
100 | wasting space. When we are doing repeated operations on the same set of | ||
101 | objects, these "stale objects" can be over 90% of the space used in the log | ||
102 | buffers. It is clear that reducing the number of stale objects written to the | ||
103 | log would greatly reduce the amount of metadata we write to the log, and this | ||
104 | is the fundamental goal of delayed logging. | ||
105 | |||
106 | From a conceptual point of view, XFS is already doing relogging in memory (where | ||
107 | memory == log buffer), only it is doing it extremely inefficiently. It is using | ||
108 | logical to physical formatting to do the relogging because there is no | ||
109 | infrastructure to keep track of logical changes in memory prior to physically | ||
110 | formatting the changes in a transaction to the log buffer. Hence we cannot avoid | ||
111 | accumulating stale objects in the log buffers. | ||
112 | |||
113 | Delayed logging is the name we've given to keeping and tracking transactional | ||
114 | changes to objects in memory outside the log buffer infrastructure. Because of | ||
115 | the relogging concept fundamental to the XFS journalling subsystem, this is | ||
116 | actually relatively easy to do - all the changes to logged items are already | ||
117 | tracked in the current infrastructure. The big problem is how to accumulate | ||
118 | them and get them to the log in a consistent, recoverable manner. | ||
119 | Describing the problems and how they have been solved is the focus of this | ||
120 | document. | ||
121 | |||
122 | One of the key changes that delayed logging makes to the operation of the | ||
123 | journalling subsystem is that it disassociates the amount of outstanding | ||
124 | metadata changes from the size and number of log buffers available. In other | ||
125 | words, instead of there only being a maximum of 2MB of transaction changes not | ||
126 | written to the log at any point in time, there may be a much greater amount | ||
127 | being accumulated in memory. Hence the potential for loss of metadata on a | ||
128 | crash is much greater than for the existing logging mechanism. | ||
129 | |||
130 | It should be noted that this does not change the guarantee that log recovery | ||
131 | will result in a consistent filesystem. What it does mean is that as far as the | ||
132 | recovered filesystem is concerned, there may be many thousands of transactions | ||
133 | that simply did not occur as a result of the crash. This makes it even more | ||
134 | important that applications that care about their data use fsync() where they | ||
135 | need to ensure application level data integrity is maintained. | ||
136 | |||
137 | It should be noted that delayed logging is not an innovative new concept that | ||
138 | warrants rigorous proofs to determine whether it is correct or not. The method | ||
139 | of accumulating changes in memory for some period before writing them to the | ||
140 | log is used effectively in many filesystems including ext3 and ext4. Hence | ||
141 | no time is spent in this document trying to convince the reader that the | ||
142 | concept is sound. Instead it is simply considered a "solved problem" and as | ||
143 | such implementing it in XFS is purely an exercise in software engineering. | ||
144 | |||
145 | The fundamental requirements for delayed logging in XFS are simple: | ||
146 | |||
147 | 1. Reduce the amount of metadata written to the log by at least | ||
148 | an order of magnitude. | ||
149 | 2. Supply sufficient statistics to validate Requirement #1. | ||
150 | 3. Supply sufficient new tracing infrastructure to be able to debug | ||
151 | problems with the new code. | ||
152 | 4. No on-disk format change (metadata or log format). | ||
153 | 5. Enable and disable with a mount option. | ||
154 | 6. No performance regressions for synchronous transaction workloads. | ||
155 | |||
156 | Delayed Logging: Design | ||
157 | ----------------------- | ||
158 | |||
159 | Storing Changes | ||
160 | |||
161 | The problem with accumulating changes at a logical level (i.e. just using the | ||
162 | existing log item dirty region tracking) is that when it comes to writing the | ||
163 | changes to the log buffers, we need to ensure that the object we are formatting | ||
164 | is not changing while we do this. This requires locking the object to prevent | ||
165 | concurrent modification. Hence flushing the logical changes to the log would | ||
166 | require us to lock every object, format them, and then unlock them again. | ||
167 | |||
168 | This introduces lots of scope for deadlocks with transactions that are already | ||
169 | running. For example, a transaction has object A locked and modified, but needs | ||
170 | the delayed logging tracking lock to commit the transaction. However, the | ||
171 | flushing thread has the delayed logging tracking lock already held, and is | ||
172 | trying to get the lock on object A to flush it to the log buffer. This appears | ||
173 | to be an unsolvable deadlock condition, and it was solving this problem that | ||
174 | was the barrier to implementing delayed logging for so long. | ||
175 | |||
176 | The solution is relatively simple - it just took a long time to recognise it. | ||
177 | Put simply, the current logging code formats the changes to each item into a | ||
178 | vector array that points to the changed regions in the item. The log write code | ||
179 | simply copies the memory these vectors point to into the log buffer during | ||
180 | transaction commit while the item is locked in the transaction. Instead of | ||
181 | using the log buffer as the destination of the formatting code, we can use an | ||
182 | allocated memory buffer big enough to fit the formatted vector. | ||
183 | |||
184 | If we then copy the vector into the memory buffer and rewrite the vector to | ||
185 | point to the memory buffer rather than the object itself, we now have a copy of | ||
186 | the changes in a format that is compatible with the log buffer writing code | ||
187 | that does not require us to lock the item to access. This formatting and | ||
188 | rewriting can all be done while the object is locked during transaction commit, | ||
189 | resulting in a vector that is transactionally consistent and can be accessed | ||
190 | without needing to lock the owning item. | ||
191 | |||
192 | Hence we avoid the need to lock items when we need to flush outstanding | ||
193 | asynchronous transactions to the log. The differences between the existing | ||
194 | formatting method and the delayed logging formatting can be seen in the | ||
195 | diagram below. | ||
196 | |||
197 | Current format log vector: | ||
198 | |||
199 | Object +---------------------------------------------+ | ||
200 | Vector 1 +----+ | ||
201 | Vector 2 +----+ | ||
202 | Vector 3 +----------+ | ||
203 | |||
204 | After formatting: | ||
205 | |||
206 | Log Buffer +-V1-+-V2-+----V3----+ | ||
207 | |||
208 | Delayed logging vector: | ||
209 | |||
210 | Object +---------------------------------------------+ | ||
211 | Vector 1 +----+ | ||
212 | Vector 2 +----+ | ||
213 | Vector 3 +----------+ | ||
214 | |||
215 | After formatting: | ||
216 | |||
217 | Memory Buffer +-V1-+-V2-+----V3----+ | ||
218 | Vector 1 +----+ | ||
219 | Vector 2 +----+ | ||
220 | Vector 3 +----------+ | ||
221 | |||
222 | The memory buffer and associated vector need to be passed as a single object, | ||
223 | but still need to be associated with the parent object so if the object is | ||
224 | relogged we can replace the current memory buffer with a new memory buffer that | ||
225 | contains the latest changes. | ||
226 | |||
227 | The reason for keeping the vector around after we've formatted the memory | ||
228 | buffer is to support splitting vectors across log buffer boundaries correctly. | ||
229 | If we don't keep the vector around, we do not know where the region boundaries | ||
230 | are in the item, so we'd need a new encapsulation method for regions in the log | ||
231 | buffer writing (i.e. double encapsulation). This would be an on-disk format | ||
232 | change and as such is not desirable. It also means we'd have to write the log | ||
233 | region headers in the formatting stage, which is problematic as there is per | ||
234 | region state that needs to be placed into the headers during the log write. | ||
235 | |||
236 | Hence we need to keep the vector, but by attaching the memory buffer to it and | ||
237 | rewriting the vector addresses to point at the memory buffer we end up with a | ||
238 | self-describing object that can be passed to the log buffer write code to be | ||
239 | handled in exactly the same manner as the existing log vectors are handled. | ||
240 | Hence we avoid needing a new on-disk format to handle items that have been | ||
241 | relogged in memory. | ||
242 | |||
243 | |||
244 | Tracking Changes | ||
245 | |||
246 | Now that we can record transactional changes in memory in a form that allows | ||
247 | them to be used without limitations, we need to be able to track and accumulate | ||
248 | them so that they can be written to the log at some later point in time. The | ||
249 | log item is the natural place to store this vector and buffer, and also makes sense | ||
250 | to be the object that is used to track committed objects as it will always | ||
251 | exist once the object has been included in a transaction. | ||
252 | |||
253 | The log item is already used to track the log items that have been written to | ||
254 | the log but not yet written to disk. Such log items are considered "active" | ||
255 | and as such are stored in the Active Item List (AIL) which is an LSN-ordered | ||
256 | doubly linked list. Items are inserted into this list during log buffer IO | ||
257 | completion, after which they are unpinned and can be written to disk. An object | ||
258 | that is in the AIL can be relogged, which causes the object to be pinned again | ||
259 | and then moved forward in the AIL when the log buffer IO completes for that | ||
260 | transaction. | ||
261 | |||
262 | Essentially, this shows that an item that is in the AIL can still be modified | ||
263 | and relogged, so any tracking must be separate to the AIL infrastructure. As | ||
264 | such, we cannot reuse the AIL list pointers for tracking committed items, nor | ||
265 | can we store state in any field that is protected by the AIL lock. Hence the | ||
266 | committed item tracking needs its own locks, lists and state fields in the log | ||
267 | item. | ||
268 | |||
269 | Similar to the AIL, tracking of committed items is done through a new list | ||
270 | called the Committed Item List (CIL). The list tracks log items that have been | ||
271 | committed and have formatted memory buffers attached to them. It tracks objects | ||
272 | in transaction commit order, so when an object is relogged it is removed from | ||
273 | its place in the list and re-inserted at the tail. This is entirely arbitrary | ||
274 | and done to make it easy for debugging - the last items in the list are the | ||
275 | ones that are most recently modified. Ordering of the CIL is not necessary for | ||
276 | transactional integrity (as discussed in the next section) so the ordering is | ||
277 | done for convenience/sanity of the developers. | ||
278 | |||
279 | |||
280 | Delayed Logging: Checkpoints | ||
281 | |||
282 | When we have a log synchronisation event, commonly known as a "log force", | ||
283 | all the items in the CIL must be written into the log via the log buffers. | ||
284 | We need to write these items in the order that they exist in the CIL, and they | ||
285 | need to be written as an atomic transaction. The need for all the objects to be | ||
286 | written as an atomic transaction comes from the requirements of relogging and | ||
287 | log replay - all the changes in all the objects in a given transaction must | ||
288 | either be completely replayed during log recovery, or not replayed at all. If | ||
289 | a transaction is not replayed because it is not complete in the log, then | ||
290 | no later transactions should be replayed, either. | ||
291 | |||
292 | To fulfill this requirement, we need to write the entire CIL in a single log | ||
293 | transaction. Fortunately, the XFS log code has no fixed limit on the size of a | ||
294 | transaction, nor does the log replay code. The only fundamental limit is that | ||
295 | the transaction cannot be larger than just under half the size of the log. The | ||
296 | reason for this limit is that to find the head and tail of the log, there must | ||
297 | be at least one complete transaction in the log at any given time. If a | ||
298 | transaction is larger than half the log, then there is the possibility that a | ||
299 | crash during the write of such a transaction could partially overwrite the | ||
300 | only complete previous transaction in the log. This will result in a recovery | ||
301 | failure and an inconsistent filesystem and hence we must enforce the maximum | ||
302 | size of a checkpoint to be slightly less than half the log. | ||
303 | |||
304 | Apart from this size requirement, a checkpoint transaction looks no different | ||
305 | to any other transaction - it contains a transaction header, a series of | ||
306 | formatted log items and a commit record at the tail. From a recovery | ||
307 | perspective, the checkpoint transaction is also no different - just a lot | ||
308 | bigger with a lot more items in it. The worst case effect of this is that we | ||
309 | might need to tune the recovery transaction object hash size. | ||
310 | |||
311 | Because the checkpoint is just another transaction and all the changes to log | ||
312 | items are stored as log vectors, we can use the existing log buffer writing | ||
313 | code to write the changes into the log. To do this efficiently, we need to | ||
314 | minimise the time we hold the CIL locked while writing the checkpoint | ||
315 | transaction. The current log write code enables us to do this easily with the | ||
316 | way it separates the writing of the transaction contents (the log vectors) from | ||
317 | the transaction commit record, but tracking this requires us to have a | ||
318 | per-checkpoint context that travels through the log write process through to | ||
319 | checkpoint completion. | ||
320 | |||
321 | Hence a checkpoint has a context that tracks the state of the current | ||
322 | checkpoint from initiation to checkpoint completion. A new context is initiated | ||
323 | at the same time a checkpoint transaction is started. That is, when we remove | ||
324 | all the current items from the CIL during a checkpoint operation, we move all | ||
325 | those changes into the current checkpoint context. We then initialise a new | ||
326 | context and attach that to the CIL for aggregation of new transactions. | ||
327 | |||
328 | This allows us to unlock the CIL immediately after transfer of all the | ||
329 | committed items and effectively allow new transactions to be issued while we | ||
330 | are formatting the checkpoint into the log. It also allows concurrent | ||
331 | checkpoints to be written into the log buffers in the case of log force heavy | ||
332 | workloads, just like the existing transaction commit code does. This, however, | ||
333 | requires that we strictly order the commit records in the log so that | ||
334 | checkpoint sequence order is maintained during log replay. | ||
335 | |||
336 | To ensure that we can be writing an item into a checkpoint transaction at | ||
337 | the same time another transaction modifies the item and inserts the log item | ||
338 | into the new CIL, then the checkpoint transaction commit code cannot use log items | ||
339 | to store the list of log vectors that need to be written into the transaction. | ||
340 | Hence log vectors need to be able to be chained together to allow them to be | ||
341 | detached from the log items. That is, when the CIL is flushed the memory | ||
342 | buffer and log vector attached to each log item need to be attached to the | ||
343 | checkpoint context so that the log item can be released. In diagrammatic form, | ||
344 | the CIL would look like this before the flush: | ||
345 | |||
346 | CIL Head | ||
347 | | | ||
348 | V | ||
349 | Log Item <-> log vector 1 -> memory buffer | ||
350 | | -> vector array | ||
351 | V | ||
352 | Log Item <-> log vector 2 -> memory buffer | ||
353 | | -> vector array | ||
354 | V | ||
355 | ...... | ||
356 | | | ||
357 | V | ||
358 | Log Item <-> log vector N-1 -> memory buffer | ||
359 | | -> vector array | ||
360 | V | ||
361 | Log Item <-> log vector N -> memory buffer | ||
362 | -> vector array | ||
363 | |||
364 | And after the flush the CIL head is empty, and the checkpoint context log | ||
365 | vector list would look like: | ||
366 | |||
367 | Checkpoint Context | ||
368 | | | ||
369 | V | ||
370 | log vector 1 -> memory buffer | ||
371 | | -> vector array | ||
372 | | -> Log Item | ||
373 | V | ||
374 | log vector 2 -> memory buffer | ||
375 | | -> vector array | ||
376 | | -> Log Item | ||
377 | V | ||
378 | ...... | ||
379 | | | ||
380 | V | ||
381 | log vector N-1 -> memory buffer | ||
382 | | -> vector array | ||
383 | | -> Log Item | ||
384 | V | ||
385 | log vector N -> memory buffer | ||
386 | -> vector array | ||
387 | -> Log Item | ||
388 | |||
389 | Once this transfer is done, the CIL can be unlocked and new transactions can | ||
390 | start, while the checkpoint flush code works over the log vector chain to | ||
391 | commit the checkpoint. | ||
392 | |||
393 | Once the checkpoint is written into the log buffers, the checkpoint context is | ||
394 | attached to the log buffer that the commit record was written to along with a | ||
395 | completion callback. Log IO completion will call that callback, which can then | ||
396 | run transaction committed processing for the log items (i.e. insert into AIL | ||
397 | and unpin) in the log vector chain and then free the log vector chain and | ||
398 | checkpoint context. | ||
399 | |||
400 | Discussion Point: I am uncertain as to whether the log item is the most | ||
401 | efficient way to track vectors, even though it seems like the natural way to do | ||
402 | it. The fact that we walk the log items (in the CIL) just to chain the log | ||
403 | vectors and break the link between the log item and the log vector means that | ||
404 | we take a cache line hit for the log item list modification, then another for | ||
405 | the log vector chaining. If we track by the log vectors, then we only need to | ||
406 | break the link between the log item and the log vector, which means we should | ||
407 | dirty only the log item cachelines. Normally I wouldn't be concerned about one | ||
408 | vs two dirty cachelines except for the fact I've seen upwards of 80,000 log | ||
409 | vectors in one checkpoint transaction. I'd guess this is a "measure and | ||
410 | compare" situation that can be done after a working and reviewed implementation | ||
411 | is in the dev tree.... | ||
412 | |||
413 | Delayed Logging: Checkpoint Sequencing | ||
414 | |||
415 | One of the key aspects of the XFS transaction subsystem is that it tags | ||
416 | committed transactions with the log sequence number of the transaction commit. | ||
417 | This allows transactions to be issued asynchronously even though there may be | ||
418 | future operations that cannot be completed until that transaction is fully | ||
419 | committed to the log. In the rare case that a dependent operation occurs (e.g. | ||
420 | re-using a freed metadata extent for a data extent), a special, optimised log | ||
421 | force can be issued to force the dependent transaction to disk immediately. | ||
422 | |||
423 | To do this, transactions need to record the LSN of the commit record of the | ||
424 | transaction. This LSN comes directly from the log buffer the transaction is | ||
425 | written into. While this works just fine for the existing transaction | ||
426 | mechanism, it does not work for delayed logging because transactions are not | ||
427 | written directly into the log buffers. Hence some other method of sequencing | ||
428 | transactions is required. | ||
429 | |||
430 | As discussed in the checkpoint section, delayed logging uses per-checkpoint | ||
431 | contexts, and as such it is simple to assign a sequence number to each | ||
432 | checkpoint. Because the switching of checkpoint contexts must be done | ||
433 | atomically, it is simple to ensure that each new context has a monotonically | ||
434 | increasing sequence number assigned to it without the need for an external | ||
435 | atomic counter - we can just take the current context sequence number and add | ||
436 | one to it for the new context. | ||
437 | |||
438 | Then, instead of assigning a log buffer LSN to the transaction commit LSN | ||
439 | during the commit, we can assign the current checkpoint sequence. This allows | ||
440 | operations that track transactions that have not yet completed to know what | ||
441 | checkpoint sequence needs to be committed before they can continue. As a | ||
442 | result, the code that forces the log to a specific LSN now needs to ensure that | ||
443 | the log forces to a specific checkpoint. | ||
444 | |||
445 | To ensure that we can do this, we need to track all the checkpoint contexts | ||
446 | that are currently committing to the log. When we flush a checkpoint, the | ||
447 | context gets added to a "committing" list which can be searched. When a | ||
448 | checkpoint commit completes, it is removed from the committing list. Because | ||
449 | the checkpoint context records the LSN of the commit record for the checkpoint, | ||
450 | we can also wait on the log buffer that contains the commit record, thereby | ||
451 | using the existing log force mechanisms to execute synchronous forces. | ||
452 | |||
453 | It should be noted that the synchronous forces may need to be extended with | ||
454 | mitigation algorithms similar to the current log buffer code to allow | ||
455 | aggregation of multiple synchronous transactions if there are already | ||
456 | synchronous transactions being flushed. Investigation of the performance of the | ||
457 | current design is needed before making any decisions here. | ||
458 | |||
459 | The main concern with log forces is to ensure that all the previous checkpoints | ||
460 | are also committed to disk before the one we need to wait for. Therefore we | ||
461 | need to check that all the prior contexts in the committing list are also | ||
462 | complete before waiting on the one we need to complete. We do this | ||
463 | synchronisation in the log force code so that we don't need to wait anywhere | ||
464 | else for such serialisation - it only matters when we do a log force. | ||
465 | |||
466 | The only remaining complexity is that a log force now also has to handle the | ||
467 | case where the forcing sequence number is the same as the current context. That | ||
468 | is, we need to flush the CIL and potentially wait for it to complete. This is a | ||
469 | simple addition to the existing log forcing code to check the sequence numbers | ||
470 | and push if required. Indeed, placing the current sequence checkpoint flush in | ||
471 | the log force code enables the current mechanism for issuing synchronous | ||
472 | transactions to remain untouched (i.e. commit an asynchronous transaction, then | ||
473 | force the log at the LSN of that transaction) and so the higher level code | ||
474 | behaves the same regardless of whether delayed logging is being used or not. | ||
475 | |||
476 | Delayed Logging: Checkpoint Log Space Accounting | ||
477 | |||
478 | The big issue for a checkpoint transaction is the log space reservation for the | ||
479 | transaction. We don't know how big a checkpoint transaction is going to be | ||
480 | ahead of time, nor how many log buffers it will take to write out, nor the | ||
481 | number of split log vector regions that are going to be used. We can track the | ||
482 | amount of log space required as we add items to the commit item list, but we | ||
483 | still need to reserve the space in the log for the checkpoint. | ||
484 | |||
485 | A typical transaction reserves enough space in the log for the worst case space | ||
486 | usage of the transaction. The reservation accounts for log record headers, | ||
487 | transaction and region headers, headers for split regions, buffer tail padding, | ||
488 | etc. as well as the actual space for all the changed metadata in the | ||
489 | transaction. While some of this is fixed overhead, much of it is dependent on | ||
490 | the size of the transaction and the number of regions being logged (the number | ||
491 | of log vectors in the transaction). | ||
492 | |||
493 | An example of the differences would be logging directory changes versus logging | ||
494 | inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then | ||
495 | there are lots of transactions that only contain an inode core and an inode log | ||
496 | format structure. That is, two vectors totaling roughly 150 bytes. If we modify | ||
497 | 10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each | ||
498 | vector is 12 bytes, so the total to be logged is approximately 1.75MB. In | ||
499 | comparison, if we are logging full directory buffers, they are typically 4KB | ||
500 | each, so in 1.5MB of directory buffers we'd have roughly 400 buffers and a | ||
501 | buffer format structure for each buffer - roughly 800 vectors or 1.51MB total | ||
502 | space. From this, it should be obvious that a static log space reservation is | ||
503 | not particularly flexible and is difficult to select the "optimal value" for | ||
504 | all workloads. | ||
505 | |||
506 | Further, if we are going to use a static reservation, which bit of the entire | ||
507 | reservation does it cover? We account for space used by the transaction | ||
508 | reservation by tracking the space currently used by the object in the CIL and | ||
509 | then calculating the increase or decrease in space used as the object is | ||
510 | relogged. This allows for a checkpoint reservation to only have to account for | ||
511 | log buffer metadata used such as log header records. | ||
512 | |||
513 | However, even using a static reservation for just the log metadata is | ||
514 | problematic. Typically log record headers use at least 16KB of log space per | ||
515 | 1MB of log space consumed (512 bytes per 32k) and the reservation needs to be | ||
516 | large enough to handle arbitrary sized checkpoint transactions. This | ||
517 | reservation needs to be made before the checkpoint is started, and we need to | ||
518 | be able to reserve the space without sleeping. For an 8MB checkpoint, we need a | ||
519 | reservation of around 150KB, which is a non-trivial amount of space. | ||
520 | |||
521 | A static reservation needs to manipulate the log grant counters - we can take a | ||
522 | permanent reservation on the space, but we still need to make sure we refresh | ||
523 | the write reservation (the actual space available to the transaction) after | ||
524 | every checkpoint transaction completion. Unfortunately, if this space is not | ||
525 | available when required, then the regrant code will sleep waiting for it. | ||
526 | |||
527 | The problem with this is that it can lead to deadlocks as we may need to commit | ||
528 | checkpoints to be able to free up log space (refer back to the description of | ||
529 | rolling transactions for an example of this). Hence we *must* always have | ||
530 | space available in the log if we are to use static reservations, and that is | ||
531 | very difficult and complex to arrange. It is possible to do, but there is a | ||
532 | simpler way. | ||
533 | |||
534 | The simpler way of doing this is tracking the entire log space used by the | ||
535 | items in the CIL and using this to dynamically calculate the amount of log | ||
536 | space required by the log metadata. If this log metadata space changes as a | ||
537 | result of a transaction commit inserting a new memory buffer into the CIL, then | ||
538 | the difference in space required is removed from the transaction that causes | ||
539 | the change. Transactions at this level will *always* have enough space | ||
540 | available in their reservation for this as they have already reserved the | ||
541 | maximal amount of log metadata space they require, and such a delta reservation | ||
542 | will always be less than or equal to the maximal amount in the reservation. | ||
543 | |||
544 | Hence we can grow the checkpoint transaction reservation dynamically as items | ||
545 | are added to the CIL and avoid the need for reserving and regranting log space | ||
546 | up front. This avoids deadlocks and removes a blocking point from the | ||
547 | checkpoint flush code. | ||
548 | |||
549 | As mentioned earlier, transactions can't grow to more than half the size of the | ||
550 | log. Hence as part of the reservation growing, we need to also check the size | ||
551 | of the reservation against the maximum allowed transaction size. If we reach | ||
552 | the maximum threshold, we need to push the CIL to the log. This is effectively | ||
553 | a "background flush" and is done on demand. This is identical to | ||
554 | a CIL push triggered by a log force, only that there is no waiting for the | ||
555 | checkpoint commit to complete. This background push is checked and executed by | ||
556 | transaction commit code. | ||
557 | |||
558 | If the transaction subsystem goes idle while we still have items in the CIL, | ||
559 | they will be flushed by the periodic log force issued by the xfssyncd. This log | ||
560 | force will push the CIL to disk, and if the transaction subsystem stays idle, | ||
561 | allow the idle log to be covered (effectively marked clean) in exactly the same | ||
562 | manner that is done for the existing logging method. A discussion point is | ||
563 | whether this log force needs to be done more frequently than the current rate | ||
564 | which is once every 30s. | ||
565 | |||
566 | |||
567 | Delayed Logging: Log Item Pinning | ||
568 | |||
569 | Currently log items are pinned during transaction commit while the items are | ||
570 | still locked. This happens just after the items are formatted, though it could | ||
571 | be done any time before the items are unlocked. The result of this mechanism is | ||
572 | that items get pinned once for every transaction that is committed to the log | ||
573 | buffers. Hence items that are relogged in the log buffers will have a pin count | ||
574 | for every outstanding transaction they were dirtied in. When each of these | ||
575 | transactions is completed, they will unpin the item once. As a result, the item | ||
576 | only becomes unpinned when all the transactions complete and there are no | ||
577 | pending transactions. Thus the pinning and unpinning of a log item is symmetric | ||
578 | as there is a 1:1 relationship with transaction commit and log item completion. | ||
579 | |||
580 | For delayed logging, however, we have an asymmetric transaction commit to | ||
581 | completion relationship. Every time an object is relogged in the CIL it goes | ||
582 | through the commit process without a corresponding completion being registered. | ||
583 | That is, we now have a many-to-one relationship between transaction commit and | ||
584 | log item completion. The result of this is that pinning and unpinning of the | ||
585 | log items becomes unbalanced if we retain the "pin on transaction commit, unpin | ||
586 | on transaction completion" model. | ||
587 | |||
588 | To keep pin/unpin symmetry, the algorithm needs to change to a "pin on | ||
589 | insertion into the CIL, unpin on checkpoint completion". In other words, the | ||
590 | pinning and unpinning becomes symmetric around a checkpoint context. We have to | ||
591 | pin the object the first time it is inserted into the CIL - if it is already in | ||
592 | the CIL during a transaction commit, then we do not pin it again. Because there | ||
593 | can be multiple outstanding checkpoint contexts, we can still see elevated pin | ||
594 | counts, but as each checkpoint completes the pin count will retain the correct | ||
595 | value according to its context. | ||
596 | |||
597 | Just to make matters slightly more complex, this checkpoint level context | ||
598 | for the pin count means that the pinning of an item must take place under the | ||
599 | CIL commit/flush lock. If we pin the object outside this lock, we cannot | ||
600 | guarantee which context the pin count is associated with. This is because of | ||
601 | the fact pinning the item is dependent on whether the item is present in the | ||
602 | current CIL or not. If we don't lock the CIL first before we check and pin the | ||
603 | object, we have a race with CIL being flushed between the check and the pin | ||
604 | (or not pinning, as the case may be). Hence we must hold the CIL flush/commit | ||
605 | lock to guarantee that we pin the items correctly. | ||
606 | |||
607 | Delayed Logging: Concurrent Scalability | ||
608 | |||
609 | A fundamental requirement for the CIL is that accesses through transaction | ||
610 | commits must scale to many concurrent commits. The current transaction commit | ||
611 | code does not break down even when there are transactions coming from 2048 | ||
612 | processors at once. The current transaction code does not go any faster than if | ||
613 | there was only one CPU using it, but it does not slow down either. | ||
614 | |||
615 | As a result, the delayed logging transaction commit code needs to be designed | ||
616 | for concurrency from the ground up. It is obvious that there are serialisation | ||
617 | points in the design - the three important ones are: | ||
618 | |||
619 | 1. Locking out new transaction commits while flushing the CIL | ||
620 | 2. Adding items to the CIL and updating item space accounting | ||
621 | 3. Checkpoint commit ordering | ||
622 | |||
623 | Looking at the transaction commit and CIL flushing interactions, it is clear | ||
624 | that we have a many-to-one interaction here. That is, the only restriction on | ||
625 | the number of concurrent transactions that can be trying to commit at once is | ||
626 | the amount of space available in the log for their reservations. The practical | ||
627 | limit here is in the order of several hundred concurrent transactions for a | ||
628 | 128MB log, which means that it is generally one per CPU in a machine. | ||
629 | |||
630 | The amount of time a transaction commit needs to hold out a flush is a | ||
631 | relatively long period of time - the pinning of log items needs to be done | ||
632 | while we are holding out a CIL flush, so at the moment that means it is held | ||
633 | across the formatting of the objects into memory buffers (i.e. while memcpy()s | ||
634 | are in progress). Ultimately a two pass algorithm where the formatting is done | ||
635 | separately to the pinning of objects could be used to reduce the hold time of | ||
636 | the transaction commit side. | ||
637 | |||
638 | Because of the number of potential transaction commit side holders, the lock | ||
639 | really needs to be a sleeping lock - if the CIL flush takes the lock, we do not | ||
640 | want every other CPU in the machine spinning on the CIL lock. Given that | ||
641 | flushing the CIL could involve walking a list of tens of thousands of log | ||
642 | items, it will get held for a significant time and so spin contention is a | ||
643 | significant concern. Preventing lots of CPUs spinning doing nothing is the | ||
644 | main reason for choosing a sleeping lock even though nothing in either the | ||
645 | transaction commit or CIL flush side sleeps with the lock held. | ||
646 | |||
647 | It should also be noted that CIL flushing is a relatively rare operation | ||
648 | compared to transaction commit for asynchronous transaction workloads - only | ||
649 | time will tell if using a read-write semaphore for exclusion will limit | ||
650 | transaction commit concurrency due to cache line bouncing of the lock on the | ||
651 | read side. | ||
652 | |||
653 | The second serialisation point is on the transaction commit side where items | ||
654 | are inserted into the CIL. Because transactions can enter this code | ||
655 | concurrently, the CIL needs to be protected separately from the above | ||
656 | commit/flush exclusion. It also needs to be an exclusive lock but it is only | ||
657 | held for a very short time and so a spin lock is appropriate here. It is | ||
658 | possible that this lock will become a contention point, but given the short | ||
659 | hold time once per transaction I think that contention is unlikely. | ||
660 | |||
661 | The final serialisation point is the checkpoint commit record ordering code | ||
662 | that is run as part of the checkpoint commit and log force sequencing. The code | ||
663 | path that triggers a CIL flush (i.e. whatever triggers the log force) will enter | ||
664 | an ordering loop after writing all the log vectors into the log buffers but | ||
665 | before writing the commit record. This loop walks the list of committing | ||
666 | checkpoints and needs to block waiting for checkpoints to complete their commit | ||
667 | record write. As a result it needs a lock and a wait variable. Log force | ||
668 | sequencing also requires the same lock, list walk, and blocking mechanism to | ||
669 | ensure completion of checkpoints. | ||
670 | |||
671 | These two sequencing operations can use the same mechanism even though the | ||
672 | events they are waiting for are different. The checkpoint commit record | ||
673 | sequencing needs to wait until checkpoint contexts contain a commit LSN | ||
674 | (obtained through completion of a commit record write) while log force | ||
675 | sequencing needs to wait until previous checkpoint contexts are removed from | ||
676 | the committing list (i.e. they've completed). A simple wait variable and | ||
677 | broadcast wakeups (thundering herds) have been used to implement these two | ||
678 | serialisation queues. They use the same lock as the CIL, too. If we see too | ||
679 | much contention on the CIL lock, or too many context switches as a result of | ||
680 | the broadcast wakeups these operations can be put under a new spinlock and | ||
681 | given separate wait lists to reduce lock contention and the number of processes | ||
682 | woken by the wrong event. | ||
683 | |||
684 | |||
685 | Lifecycle Changes | ||
686 | |||
687 | The existing log item life cycle is as follows: | ||
688 | |||
689 | 1. Transaction allocate | ||
690 | 2. Transaction reserve | ||
691 | 3. Lock item | ||
692 | 4. Join item to transaction | ||
693 | If not already attached, | ||
694 | Allocate log item | ||
695 | Attach log item to owner item | ||
696 | Attach log item to transaction | ||
697 | 5. Modify item | ||
698 | Record modifications in log item | ||
699 | 6. Transaction commit | ||
700 | Pin item in memory | ||
701 | Format item into log buffer | ||
702 | Write commit LSN into transaction | ||
703 | Unlock item | ||
704 | Attach transaction to log buffer | ||
705 | |||
706 | <log buffer IO dispatched> | ||
707 | <log buffer IO completes> | ||
708 | |||
709 | 7. Transaction completion | ||
710 | Mark log item committed | ||
711 | Insert log item into AIL | ||
712 | Write commit LSN into log item | ||
713 | Unpin log item | ||
714 | 8. AIL traversal | ||
715 | Lock item | ||
716 | Mark log item clean | ||
717 | Flush item to disk | ||
718 | |||
719 | <item IO completion> | ||
720 | |||
721 | 9. Log item removed from AIL | ||
722 | Moves log tail | ||
723 | Item unlocked | ||
724 | |||
725 | Essentially, steps 1-6 operate independently from step 7, which is also | ||
726 | independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9 | ||
727 | at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur | ||
728 | at the same time. If the log item is in the AIL or between steps 6 and 7 | ||
729 | and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9 | ||
730 | are entered and completed is the object considered clean. | ||
731 | |||
732 | With delayed logging, there are new steps inserted into the life cycle: | ||
733 | |||
734 | 1. Transaction allocate | ||
735 | 2. Transaction reserve | ||
736 | 3. Lock item | ||
737 | 4. Join item to transaction | ||
738 | If not already attached, | ||
739 | Allocate log item | ||
740 | Attach log item to owner item | ||
741 | Attach log item to transaction | ||
742 | 5. Modify item | ||
743 | Record modifications in log item | ||
744 | 6. Transaction commit | ||
745 | Pin item in memory if not pinned in CIL | ||
746 | Format item into log vector + buffer | ||
747 | Attach log vector and buffer to log item | ||
748 | Insert log item into CIL | ||
749 | Write CIL context sequence into transaction | ||
750 | Unlock item | ||
751 | |||
752 | <next log force> | ||
753 | |||
754 | 7. CIL push | ||
755 | lock CIL flush | ||
756 | Chain log vectors and buffers together | ||
757 | Remove items from CIL | ||
758 | unlock CIL flush | ||
759 | write log vectors into log | ||
760 | sequence commit records | ||
761 | attach checkpoint context to log buffer | ||
762 | |||
763 | <log buffer IO dispatched> | ||
764 | <log buffer IO completes> | ||
765 | |||
766 | 8. Checkpoint completion | ||
767 | Mark log item committed | ||
768 | Insert item into AIL | ||
769 | Write commit LSN into log item | ||
770 | Unpin log item | ||
771 | 9. AIL traversal | ||
772 | Lock item | ||
773 | Mark log item clean | ||
774 | Flush item to disk | ||
775 | <item IO completion> | ||
776 | 10. Log item removed from AIL | ||
777 | Moves log tail | ||
778 | Item unlocked | ||
779 | |||
780 | From this, it can be seen that the only life cycle differences between the two | ||
781 | logging methods are in the middle of the life cycle - they still have the same | ||
782 | beginning and end and execution constraints. The only differences are in the | ||
783 | committing of the log items to the log itself and the completion processing. | ||
784 | Hence delayed logging should not introduce any constraints on log item | ||
785 | behaviour, allocation or freeing that don't already exist. | ||
786 | |||
787 | As a result of this zero-impact "insertion" of delayed logging infrastructure | ||
788 | and the design of the internal structures to avoid on disk format changes, we | ||
789 | can basically switch between delayed logging and the existing mechanism with a | ||
790 | mount option. Fundamentally, there is no reason why the log manager would not | ||
791 | be able to swap methods automatically and transparently depending on load | ||
792 | characteristics, but this should not be necessary if delayed logging works as | ||
793 | designed. | ||
794 | |||
795 | Roadmap: | ||
796 | |||
797 | 2.6.37 Remove experimental tag from mount option | ||
798 | => should be roughly 6 months after initial merge | ||
799 | => enough time to: | ||
800 | => gain confidence and fix problems reported by early | ||
801 | adopters (a.k.a. guinea pigs) | ||
802 | => address worst performance regressions and undesired | ||
803 | behaviours | ||
804 | => start tuning/optimising code for parallelism | ||
805 | => start tuning/optimising algorithms consuming | ||
806 | excessive CPU time | ||
807 | |||
808 | 2.6.39 Switch default mount option to use delayed logging | ||
809 | => should be roughly 12 months after initial merge | ||
810 | => enough time to shake out remaining problems before next round of | ||
811 | enterprise distro kernel rebases | ||
diff --git a/Documentation/hwmon/dme1737 b/Documentation/hwmon/dme1737 index 001d2e70bc11..fc5df7654d63 100644 --- a/Documentation/hwmon/dme1737 +++ b/Documentation/hwmon/dme1737 | |||
@@ -9,11 +9,15 @@ Supported chips: | |||
9 | * SMSC SCH3112, SCH3114, SCH3116 | 9 | * SMSC SCH3112, SCH3114, SCH3116 |
10 | Prefix: 'sch311x' | 10 | Prefix: 'sch311x' |
11 | Addresses scanned: none, address read from Super-I/O config space | 11 | Addresses scanned: none, address read from Super-I/O config space |
12 | Datasheet: http://www.nuhorizons.com/FeaturedProducts/Volume1/SMSC/311x.pdf | 12 | Datasheet: Available on the Internet |
13 | * SMSC SCH5027 | 13 | * SMSC SCH5027 |
14 | Prefix: 'sch5027' | 14 | Prefix: 'sch5027' |
15 | Addresses scanned: I2C 0x2c, 0x2d, 0x2e | 15 | Addresses scanned: I2C 0x2c, 0x2d, 0x2e |
16 | Datasheet: Provided by SMSC upon request and under NDA | 16 | Datasheet: Provided by SMSC upon request and under NDA |
17 | * SMSC SCH5127 | ||
18 | Prefix: 'sch5127' | ||
19 | Addresses scanned: none, address read from Super-I/O config space | ||
20 | Datasheet: Provided by SMSC upon request and under NDA | ||
17 | 21 | ||
18 | Authors: | 22 | Authors: |
19 | Juerg Haefliger <juergh@gmail.com> | 23 | Juerg Haefliger <juergh@gmail.com> |
@@ -36,8 +40,8 @@ Description | |||
36 | ----------- | 40 | ----------- |
37 | 41 | ||
38 | This driver implements support for the hardware monitoring capabilities of the | 42 | This driver implements support for the hardware monitoring capabilities of the |
39 | SMSC DME1737 and Asus A8000 (which are the same), SMSC SCH5027, and SMSC | 43 | SMSC DME1737 and Asus A8000 (which are the same), SMSC SCH5027, SCH311x, |
40 | SCH311x Super-I/O chips. These chips feature monitoring of 3 temp sensors | 44 | and SCH5127 Super-I/O chips. These chips feature monitoring of 3 temp sensors |
41 | temp[1-3] (2 remote diodes and 1 internal), 7 voltages in[0-6] (6 external and | 45 | temp[1-3] (2 remote diodes and 1 internal), 7 voltages in[0-6] (6 external and |
42 | 1 internal) and up to 6 fan speeds fan[1-6]. Additionally, the chips implement | 46 | 1 internal) and up to 6 fan speeds fan[1-6]. Additionally, the chips implement |
43 | up to 5 PWM outputs pwm[1-3,5-6] for controlling fan speeds both manually and | 47 | up to 5 PWM outputs pwm[1-3,5-6] for controlling fan speeds both manually and |
@@ -48,14 +52,14 @@ Fan[3-6] and pwm[3,5-6] are optional features and their availability depends on | |||
48 | the configuration of the chip. The driver will detect which features are | 52 | the configuration of the chip. The driver will detect which features are |
49 | present during initialization and create the sysfs attributes accordingly. | 53 | present during initialization and create the sysfs attributes accordingly. |
50 | 54 | ||
51 | For the SCH311x, fan[1-3] and pwm[1-3] are always present and fan[4-6] and | 55 | For the SCH311x and SCH5127, fan[1-3] and pwm[1-3] are always present and |
52 | pwm[5-6] don't exist. | 56 | fan[4-6] and pwm[5-6] don't exist. |
53 | 57 | ||
54 | The hardware monitoring features of the DME1737, A8000, and SCH5027 are only | 58 | The hardware monitoring features of the DME1737, A8000, and SCH5027 are only |
55 | accessible via SMBus, while the SCH311x only provides access via the ISA bus. | 59 | accessible via SMBus, while the SCH311x and SCH5127 only provide access via |
56 | The driver will therefore register itself as an I2C client driver if it detects | 60 | the ISA bus. The driver will therefore register itself as an I2C client driver |
57 | a DME1737, A8000, or SCH5027 and as a platform driver if it detects a SCH311x | 61 | if it detects a DME1737, A8000, or SCH5027 and as a platform driver if it |
58 | chip. | 62 | detects a SCH311x or SCH5127 chip. |
59 | 63 | ||
60 | 64 | ||
61 | Voltage Monitoring | 65 | Voltage Monitoring |
@@ -76,7 +80,7 @@ DME1737, A8000: | |||
76 | in6: Vbat (+3.0V) 0V - 4.38V | 80 | in6: Vbat (+3.0V) 0V - 4.38V |
77 | 81 | ||
78 | SCH311x: | 82 | SCH311x: |
79 | in0: +2.5V 0V - 6.64V | 83 | in0: +2.5V 0V - 3.32V |
80 | in1: Vccp (processor core) 0V - 2V | 84 | in1: Vccp (processor core) 0V - 2V |
81 | in2: VCC (internal +3.3V) 0V - 4.38V | 85 | in2: VCC (internal +3.3V) 0V - 4.38V |
82 | in3: +5V 0V - 6.64V | 86 | in3: +5V 0V - 6.64V |
@@ -93,6 +97,15 @@ SCH5027: | |||
93 | in5: VTR (+3.3V standby) 0V - 4.38V | 97 | in5: VTR (+3.3V standby) 0V - 4.38V |
94 | in6: Vbat (+3.0V) 0V - 4.38V | 98 | in6: Vbat (+3.0V) 0V - 4.38V |
95 | 99 | ||
100 | SCH5127: | ||
101 | in0: +2.5V 0V - 3.32V | ||
102 | in1: Vccp (processor core) 0V - 3V | ||
103 | in2: VCC (internal +3.3V) 0V - 4.38V | ||
104 | in3: V2_IN 0V - 1.5V | ||
105 | in4: V1_IN 0V - 1.5V | ||
106 | in5: VTR (+3.3V standby) 0V - 4.38V | ||
107 | in6: Vbat (+3.0V) 0V - 4.38V | ||
108 | |||
96 | Each voltage input has associated min and max limits which trigger an alarm | 109 | Each voltage input has associated min and max limits which trigger an alarm |
97 | when crossed. | 110 | when crossed. |
98 | 111 | ||
@@ -293,3 +306,21 @@ pwm[1-3]_auto_point1_pwm RW Auto PWM pwm point. Auto_point1 is the | |||
293 | pwm[1-3]_auto_point2_pwm RO Auto PWM pwm point. Auto_point2 is the | 306 | pwm[1-3]_auto_point2_pwm RO Auto PWM pwm point. Auto_point2 is the |
294 | full-speed duty-cycle which is hard- | 307 | full-speed duty-cycle which is hard- |
295 | wired to 255 (100% duty-cycle). | 308 | wired to 255 (100% duty-cycle). |
309 | |||
310 | Chip Differences | ||
311 | ---------------- | ||
312 | |||
313 | Feature dme1737 sch311x sch5027 sch5127 | ||
314 | ------------------------------------------------------- | ||
315 | temp[1-3]_offset yes yes | ||
316 | vid yes | ||
317 | zone3 yes yes yes | ||
318 | zone[1-3]_hyst yes yes | ||
319 | pwm min/off yes yes | ||
320 | fan3 opt yes opt yes | ||
321 | pwm3 opt yes opt yes | ||
322 | fan4 opt opt | ||
323 | fan5 opt opt | ||
324 | pwm5 opt opt | ||
325 | fan6 opt opt | ||
326 | pwm6 opt opt | ||
diff --git a/Documentation/hwmon/lm63 b/Documentation/hwmon/lm63 index 31660bf97979..b9843eab1afb 100644 --- a/Documentation/hwmon/lm63 +++ b/Documentation/hwmon/lm63 | |||
@@ -7,6 +7,11 @@ Supported chips: | |||
7 | Addresses scanned: I2C 0x4c | 7 | Addresses scanned: I2C 0x4c |
8 | Datasheet: Publicly available at the National Semiconductor website | 8 | Datasheet: Publicly available at the National Semiconductor website |
9 | http://www.national.com/pf/LM/LM63.html | 9 | http://www.national.com/pf/LM/LM63.html |
10 | * National Semiconductor LM64 | ||
11 | Prefix: 'lm64' | ||
12 | Addresses scanned: I2C 0x18 and 0x4e | ||
13 | Datasheet: Publicly available at the National Semiconductor website | ||
14 | http://www.national.com/pf/LM/LM64.html | ||
10 | 15 | ||
11 | Author: Jean Delvare <khali@linux-fr.org> | 16 | Author: Jean Delvare <khali@linux-fr.org> |
12 | 17 | ||
@@ -55,3 +60,5 @@ The lm63 driver will not update its values more frequently than every | |||
55 | second; reading them more often will do no harm, but will return 'old' | 60 | second; reading them more often will do no harm, but will return 'old' |
56 | values. | 61 | values. |
57 | 62 | ||
63 | The LM64 is effectively an LM63 with GPIO lines. The driver does not | ||
64 | support these GPIO lines at present. | ||
diff --git a/Documentation/hwmon/ltc4245 b/Documentation/hwmon/ltc4245 index 02838a47d862..86b5880d8502 100644 --- a/Documentation/hwmon/ltc4245 +++ b/Documentation/hwmon/ltc4245 | |||
@@ -72,9 +72,7 @@ in6_min_alarm 5v output undervoltage alarm | |||
72 | in7_min_alarm 3v output undervoltage alarm | 72 | in7_min_alarm 3v output undervoltage alarm |
73 | in8_min_alarm Vee (-12v) output undervoltage alarm | 73 | in8_min_alarm Vee (-12v) output undervoltage alarm |
74 | 74 | ||
75 | in9_input GPIO #1 voltage data | 75 | in9_input GPIO voltage data |
76 | in10_input GPIO #2 voltage data | ||
77 | in11_input GPIO #3 voltage data | ||
78 | 76 | ||
79 | power1_input 12v power usage (mW) | 77 | power1_input 12v power usage (mW) |
80 | power2_input 5v power usage (mW) | 78 | power2_input 5v power usage (mW) |
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface index 3de6b0bcb147..d4e2917c6f18 100644 --- a/Documentation/hwmon/sysfs-interface +++ b/Documentation/hwmon/sysfs-interface | |||
@@ -80,9 +80,9 @@ All entries (except name) are optional, and should only be created in a | |||
80 | given driver if the chip has the feature. | 80 | given driver if the chip has the feature. |
81 | 81 | ||
82 | 82 | ||
83 | ******** | 83 | ********************* |
84 | * Name * | 84 | * Global attributes * |
85 | ******** | 85 | ********************* |
86 | 86 | ||
87 | name The chip name. | 87 | name The chip name. |
88 | This should be a short, lowercase string, not containing | 88 | This should be a short, lowercase string, not containing |
@@ -91,6 +91,13 @@ name The chip name. | |||
91 | I2C devices get this attribute created automatically. | 91 | I2C devices get this attribute created automatically. |
92 | RO | 92 | RO |
93 | 93 | ||
94 | update_rate The rate at which the chip will update readings. | ||
95 | Unit: millisecond | ||
96 | RW | ||
97 | Some devices have a variable update rate. This attribute | ||
98 | can be used to change the update rate to the desired | ||
99 | frequency. | ||
100 | |||
94 | 101 | ||
95 | ************ | 102 | ************ |
96 | * Voltages * | 103 | * Voltages * |
diff --git a/Documentation/hwmon/tmp102 b/Documentation/hwmon/tmp102 new file mode 100644 index 000000000000..8454a7763122 --- /dev/null +++ b/Documentation/hwmon/tmp102 | |||
@@ -0,0 +1,26 @@ | |||
1 | Kernel driver tmp102 | ||
2 | ==================== | ||
3 | |||
4 | Supported chips: | ||
5 | * Texas Instruments TMP102 | ||
6 | Prefix: 'tmp102' | ||
7 | Addresses scanned: none | ||
8 | Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp102.html | ||
9 | |||
10 | Author: | ||
11 | Steven King <sfking@fdwdc.com> | ||
12 | |||
13 | Description | ||
14 | ----------- | ||
15 | |||
16 | The Texas Instruments TMP102 implements one temperature sensor. Limits can be | ||
17 | set through the Overtemperature Shutdown register and Hysteresis register. The | ||
18 | sensor is accurate to 0.5 degree over the range of -25 to +85 C, and to 1.0 | ||
19 | degree from -40 to +125 C. Resolution of the sensor is 0.0625 degree. The | ||
20 | operating temperature has a minimum of -55 C and a maximum of +150 C. | ||
21 | |||
22 | The TMP102 has a programmable update rate that can select between 8, 4, 1, and | ||
23 | 0.5 Hz. (Currently the driver only supports the default of 4 Hz). | ||
24 | |||
25 | The driver provides the common sysfs-interface for temperatures (see | ||
26 | Documentation/hwmon/sysfs-interface under Temperatures). | ||
diff --git a/Documentation/i2c/busses/i2c-ali1535 b/Documentation/i2c/busses/i2c-ali1535 index 0db3b4c74ad1..acbc65a08097 100644 --- a/Documentation/i2c/busses/i2c-ali1535 +++ b/Documentation/i2c/busses/i2c-ali1535 | |||
@@ -6,12 +6,12 @@ Supported adapters: | |||
6 | http://www.ali.com.tw/eng/support/datasheet_request.php | 6 | http://www.ali.com.tw/eng/support/datasheet_request.php |
7 | 7 | ||
8 | Authors: | 8 | Authors: |
9 | Frodo Looijaard <frodol@dds.nl>, | 9 | Frodo Looijaard <frodol@dds.nl>, |
10 | Philip Edelbrock <phil@netroedge.com>, | 10 | Philip Edelbrock <phil@netroedge.com>, |
11 | Mark D. Studebaker <mdsxyz123@yahoo.com>, | 11 | Mark D. Studebaker <mdsxyz123@yahoo.com>, |
12 | Dan Eaton <dan.eaton@rocketlogix.com>, | 12 | Dan Eaton <dan.eaton@rocketlogix.com>, |
13 | Stephen Rousset<stephen.rousset@rocketlogix.com> | 13 | Stephen Rousset<stephen.rousset@rocketlogix.com> |
14 | 14 | ||
15 | Description | 15 | Description |
16 | ----------- | 16 | ----------- |
17 | 17 | ||
diff --git a/Documentation/i2c/busses/i2c-ali1563 b/Documentation/i2c/busses/i2c-ali1563 index 99ad4b9bcc32..54691698d2dd 100644 --- a/Documentation/i2c/busses/i2c-ali1563 +++ b/Documentation/i2c/busses/i2c-ali1563 | |||
@@ -18,7 +18,7 @@ For an overview of these chips see http://www.acerlabs.com | |||
18 | The M1563 southbridge is deceptively similar to the M1533, with a few | 18 | The M1563 southbridge is deceptively similar to the M1533, with a few |
19 | notable exceptions. One of those happens to be the fact they upgraded the | 19 | notable exceptions. One of those happens to be the fact they upgraded the |
20 | i2c core to be SMBus 2.0 compliant, and happens to be almost identical to | 20 | i2c core to be SMBus 2.0 compliant, and happens to be almost identical to |
21 | the i2c controller found in the Intel 801 south bridges. | 21 | the i2c controller found in the Intel 801 south bridges. |
22 | 22 | ||
23 | Features | 23 | Features |
24 | -------- | 24 | -------- |
diff --git a/Documentation/i2c/busses/i2c-ali15x3 b/Documentation/i2c/busses/i2c-ali15x3 index ff28d381bebe..600da90b8f12 100644 --- a/Documentation/i2c/busses/i2c-ali15x3 +++ b/Documentation/i2c/busses/i2c-ali15x3 | |||
@@ -6,8 +6,8 @@ Supported adapters: | |||
6 | http://www.ali.com.tw/eng/support/datasheet_request.php | 6 | http://www.ali.com.tw/eng/support/datasheet_request.php |
7 | 7 | ||
8 | Authors: | 8 | Authors: |
9 | Frodo Looijaard <frodol@dds.nl>, | 9 | Frodo Looijaard <frodol@dds.nl>, |
10 | Philip Edelbrock <phil@netroedge.com>, | 10 | Philip Edelbrock <phil@netroedge.com>, |
11 | Mark D. Studebaker <mdsxyz123@yahoo.com> | 11 | Mark D. Studebaker <mdsxyz123@yahoo.com> |
12 | 12 | ||
13 | Module Parameters | 13 | Module Parameters |
@@ -40,10 +40,10 @@ M1541 and M1543C South Bridges. | |||
40 | The M1543C is a South bridge for desktop systems. | 40 | The M1543C is a South bridge for desktop systems. |
41 | The M1541 is a South bridge for portable systems. | 41 | The M1541 is a South bridge for portable systems. |
42 | They are part of the following ALI chipsets: | 42 | They are part of the following ALI chipsets: |
43 | 43 | ||
44 | * "Aladdin Pro 2" includes the M1621 Slot 1 North bridge with AGP and | 44 | * "Aladdin Pro 2" includes the M1621 Slot 1 North bridge with AGP and |
45 | 100MHz CPU Front Side bus | 45 | 100MHz CPU Front Side bus |
46 | * "Aladdin V" includes the M1541 Socket 7 North bridge with AGP and 100MHz | 46 | * "Aladdin V" includes the M1541 Socket 7 North bridge with AGP and 100MHz |
47 | CPU Front Side bus | 47 | CPU Front Side bus |
48 | Some Aladdin V motherboards: | 48 | Some Aladdin V motherboards: |
49 | Asus P5A | 49 | Asus P5A |
@@ -77,7 +77,7 @@ output of lspci will show something similar to the following: | |||
77 | ** then run lspci. | 77 | ** then run lspci. |
78 | ** If you see the 1533 and 5229 devices but NOT the 7101 device, | 78 | ** If you see the 1533 and 5229 devices but NOT the 7101 device, |
79 | ** then you must enable ACPI, the PMU, SMB, or something similar | 79 | ** then you must enable ACPI, the PMU, SMB, or something similar |
80 | ** in the BIOS. | 80 | ** in the BIOS. |
81 | ** The driver won't work if it can't find the M7101 device. | 81 | ** The driver won't work if it can't find the M7101 device. |
82 | 82 | ||
83 | The SMB controller is part of the M7101 device, which is an ACPI-compliant | 83 | The SMB controller is part of the M7101 device, which is an ACPI-compliant |
@@ -87,8 +87,8 @@ The whole M7101 device has to be enabled for the SMB to work. You can't | |||
87 | just enable the SMB alone. The SMB and the ACPI have separate I/O spaces. | 87 | just enable the SMB alone. The SMB and the ACPI have separate I/O spaces. |
88 | We make sure that the SMB is enabled. We leave the ACPI alone. | 88 | We make sure that the SMB is enabled. We leave the ACPI alone. |
89 | 89 | ||
90 | Features | 90 | Features |
91 | -------- | 91 | -------- |
92 | 92 | ||
93 | This driver controls the SMB Host only. The SMB Slave | 93 | This driver controls the SMB Host only. The SMB Slave |
94 | controller on the M15X3 is not enabled. This driver does not use | 94 | controller on the M15X3 is not enabled. This driver does not use |
diff --git a/Documentation/i2c/busses/i2c-pca-isa b/Documentation/i2c/busses/i2c-pca-isa index 6fc8f4c27c3c..b044e5265488 100644 --- a/Documentation/i2c/busses/i2c-pca-isa +++ b/Documentation/i2c/busses/i2c-pca-isa | |||
@@ -1,10 +1,10 @@ | |||
1 | Kernel driver i2c-pca-isa | 1 | Kernel driver i2c-pca-isa |
2 | 2 | ||
3 | Supported adapters: | 3 | Supported adapters: |
4 | This driver supports ISA boards using the Philips PCA 9564 | 4 | This driver supports ISA boards using the Philips PCA 9564 |
5 | Parallel bus to I2C bus controller | 5 | Parallel bus to I2C bus controller |
6 | 6 | ||
7 | Author: Ian Campbell <icampbell@arcom.com>, Arcom Control Systems | 7 | Author: Ian Campbell <icampbell@arcom.com>, Arcom Control Systems |
8 | 8 | ||
9 | Module Parameters | 9 | Module Parameters |
10 | ----------------- | 10 | ----------------- |
@@ -12,12 +12,12 @@ Module Parameters | |||
12 | * base int | 12 | * base int |
13 | I/O base address | 13 | I/O base address |
14 | * irq int | 14 | * irq int |
15 | IRQ interrupt | 15 | IRQ interrupt |
16 | * clock int | 16 | * clock int |
17 | Clock rate as described in table 1 of PCA9564 datasheet | 17 | Clock rate as described in table 1 of PCA9564 datasheet |
18 | 18 | ||
19 | Description | 19 | Description |
20 | ----------- | 20 | ----------- |
21 | 21 | ||
22 | This driver supports ISA boards using the Philips PCA 9564 | 22 | This driver supports ISA boards using the Philips PCA 9564 |
23 | Parallel bus to I2C bus controller | 23 | Parallel bus to I2C bus controller |
diff --git a/Documentation/i2c/busses/i2c-sis5595 b/Documentation/i2c/busses/i2c-sis5595 index cc47db7d00a9..ecd21fb49a8f 100644 --- a/Documentation/i2c/busses/i2c-sis5595 +++ b/Documentation/i2c/busses/i2c-sis5595 | |||
@@ -1,41 +1,41 @@ | |||
1 | Kernel driver i2c-sis5595 | 1 | Kernel driver i2c-sis5595 |
2 | 2 | ||
3 | Authors: | 3 | Authors: |
4 | Frodo Looijaard <frodol@dds.nl>, | 4 | Frodo Looijaard <frodol@dds.nl>, |
5 | Mark D. Studebaker <mdsxyz123@yahoo.com>, | 5 | Mark D. Studebaker <mdsxyz123@yahoo.com>, |
6 | Philip Edelbrock <phil@netroedge.com> | 6 | Philip Edelbrock <phil@netroedge.com> |
7 | 7 | ||
8 | Supported adapters: | 8 | Supported adapters: |
9 | * Silicon Integrated Systems Corp. SiS5595 Southbridge | 9 | * Silicon Integrated Systems Corp. SiS5595 Southbridge |
10 | Datasheet: Publicly available at the Silicon Integrated Systems Corp. site. | 10 | Datasheet: Publicly available at the Silicon Integrated Systems Corp. site. |
11 | 11 | ||
12 | Note: all have mfr. ID 0x1039. | 12 | Note: all have mfr. ID 0x1039. |
13 | 13 | ||
14 | SUPPORTED PCI ID | 14 | SUPPORTED PCI ID |
15 | 5595 0008 | 15 | 5595 0008 |
16 | 16 | ||
17 | Note: these chips contain a 0008 device which is incompatible with the | 17 | Note: these chips contain a 0008 device which is incompatible with the |
18 | 5595. We recognize these by the presence of the listed | 18 | 5595. We recognize these by the presence of the listed |
19 | "blacklist" PCI ID and refuse to load. | 19 | "blacklist" PCI ID and refuse to load. |
20 | 20 | ||
21 | NOT SUPPORTED PCI ID BLACKLIST PCI ID | 21 | NOT SUPPORTED PCI ID BLACKLIST PCI ID |
22 | 540 0008 0540 | 22 | 540 0008 0540 |
23 | 550 0008 0550 | 23 | 550 0008 0550 |
24 | 5513 0008 5511 | 24 | 5513 0008 5511 |
25 | 5581 0008 5597 | 25 | 5581 0008 5597 |
26 | 5582 0008 5597 | 26 | 5582 0008 5597 |
27 | 5597 0008 5597 | 27 | 5597 0008 5597 |
28 | 5598 0008 5597/5598 | 28 | 5598 0008 5597/5598 |
29 | 630 0008 0630 | 29 | 630 0008 0630 |
30 | 645 0008 0645 | 30 | 645 0008 0645 |
31 | 646 0008 0646 | 31 | 646 0008 0646 |
32 | 648 0008 0648 | 32 | 648 0008 0648 |
33 | 650 0008 0650 | 33 | 650 0008 0650 |
34 | 651 0008 0651 | 34 | 651 0008 0651 |
35 | 730 0008 0730 | 35 | 730 0008 0730 |
36 | 735 0008 0735 | 36 | 735 0008 0735 |
37 | 745 0008 0745 | 37 | 745 0008 0745 |
38 | 746 0008 0746 | 38 | 746 0008 0746 |
39 | 39 | ||
40 | Module Parameters | 40 | Module Parameters |
41 | ----------------- | 41 | ----------------- |
diff --git a/Documentation/i2c/busses/i2c-sis630 b/Documentation/i2c/busses/i2c-sis630 index 9aca6889f748..629ea2c356fd 100644 --- a/Documentation/i2c/busses/i2c-sis630 +++ b/Documentation/i2c/busses/i2c-sis630 | |||
@@ -14,9 +14,9 @@ Module Parameters | |||
14 | * force = [1|0] Forcibly enable the SIS630. DANGEROUS! | 14 | * force = [1|0] Forcibly enable the SIS630. DANGEROUS! |
15 | This can be interesting for chipsets not named | 15 | This can be interesting for chipsets not named |
16 | above to check if it works for your chipset, but DANGEROUS! | 16 | above to check if it works for your chipset, but DANGEROUS! |
17 | 17 | ||
18 | * high_clock = [1|0] Forcibly set Host Master Clock to 56KHz (default, | 18 | * high_clock = [1|0] Forcibly set Host Master Clock to 56KHz (default, |
19 | what your BIOS uses). DANGEROUS! This should be a bit | 19 | what your BIOS uses). DANGEROUS! This should be a bit |
20 | faster, but freezes some systems (i.e. my Laptop). | 20 | faster, but freezes some systems (i.e. my Laptop). |
21 | 21 | ||
22 | 22 | ||
@@ -44,6 +44,6 @@ Philip Edelbrock <phil@netroedge.com> | |||
44 | - testing SiS730 support | 44 | - testing SiS730 support |
45 | Mark M. Hoffman <mhoffman@lightlink.com> | 45 | Mark M. Hoffman <mhoffman@lightlink.com> |
46 | - bug fixes | 46 | - bug fixes |
47 | 47 | ||
48 | To anyone else whom I forgot here ;), thanks! | 48 | To anyone else whom I forgot here ;), thanks! |
49 | 49 | ||
diff --git a/Documentation/i2c/ten-bit-addresses b/Documentation/i2c/ten-bit-addresses index 200074f81360..e9890709c508 100644 --- a/Documentation/i2c/ten-bit-addresses +++ b/Documentation/i2c/ten-bit-addresses | |||
@@ -1,17 +1,17 @@ | |||
1 | The I2C protocol knows about two kinds of device addresses: normal 7 bit | 1 | The I2C protocol knows about two kinds of device addresses: normal 7 bit |
2 | addresses, and an extended set of 10 bit addresses. The sets of addresses | 2 | addresses, and an extended set of 10 bit addresses. The sets of addresses |
3 | do not intersect: the 7 bit address 0x10 is not the same as the 10 bit | 3 | do not intersect: the 7 bit address 0x10 is not the same as the 10 bit |
4 | address 0x10 (though a single device could respond to both of them). You | 4 | address 0x10 (though a single device could respond to both of them). You |
5 | select a 10 bit address by adding an extra byte after the address | 5 | select a 10 bit address by adding an extra byte after the address |
6 | byte: | 6 | byte: |
7 | S Addr7 Rd/Wr .... | 7 | S Addr7 Rd/Wr .... |
8 | becomes | 8 | becomes |
9 | S 11110 Addr10 Rd/Wr | 9 | S 11110 Addr10 Rd/Wr |
10 | S is the start bit, Rd/Wr the read/write bit, and if you count the number | 10 | S is the start bit, Rd/Wr the read/write bit, and if you count the number |
11 | of bits, you will see that there are 8 after the S bit for 7 bit addresses, | 11 | of bits, you will see that there are 8 after the S bit for 7 bit addresses, |
12 | and 16 after the S bit for 10 bit addresses. | 12 | and 16 after the S bit for 10 bit addresses. |
13 | 13 | ||
14 | WARNING! The current 10 bit address support is EXPERIMENTAL. There are | 14 | WARNING! The current 10 bit address support is EXPERIMENTAL. There are |
15 | several places in the code that will cause SEVERE PROBLEMS with 10 bit | 15 | several places in the code that will cause SEVERE PROBLEMS with 10 bit |
16 | addresses, even though there is some basic handling and hooks. Also, | 16 | addresses, even though there is some basic handling and hooks. Also, |
17 | almost no supported adapter handles the 10 bit addresses correctly. | 17 | almost no supported adapter handles the 10 bit addresses correctly. |
diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt index 6f8c1cabbc5d..634c625da8ce 100644 --- a/Documentation/kbuild/kbuild.txt +++ b/Documentation/kbuild/kbuild.txt | |||
@@ -65,7 +65,7 @@ CROSS_COMPILE | |||
65 | Specify an optional fixed part of the binutils filename. | 65 | Specify an optional fixed part of the binutils filename. |
66 | CROSS_COMPILE can be a part of the filename or the full path. | 66 | CROSS_COMPILE can be a part of the filename or the full path. |
67 | 67 | ||
68 | CROSS_COMPILE is also used for ccache is some setups. | 68 | CROSS_COMPILE is also used for ccache in some setups. |
69 | 69 | ||
70 | CF | 70 | CF |
71 | -------------------------------------------------- | 71 | -------------------------------------------------- |
@@ -162,3 +162,7 @@ For tags/TAGS/cscope targets, you can specify more than one arch | |||
162 | to be included in the databases, separated by blank space. E.g.: | 162 | to be included in the databases, separated by blank space. E.g.: |
163 | 163 | ||
164 | $ make ALLSOURCE_ARCHS="x86 mips arm" tags | 164 | $ make ALLSOURCE_ARCHS="x86 mips arm" tags |
165 | |||
166 | To get all available archs you can also specify all. E.g.: | ||
167 | |||
168 | $ make ALLSOURCE_ARCHS=all tags | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f5fce483930c..1808f1157f30 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -145,11 +145,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
145 | 145 | ||
146 | acpi= [HW,ACPI,X86] | 146 | acpi= [HW,ACPI,X86] |
147 | Advanced Configuration and Power Interface | 147 | Advanced Configuration and Power Interface |
148 | Format: { force | off | ht | strict | noirq | rsdt } | 148 | Format: { force | off | strict | noirq | rsdt } |
149 | force -- enable ACPI if default was off | 149 | force -- enable ACPI if default was off |
150 | off -- disable ACPI if default was on | 150 | off -- disable ACPI if default was on |
151 | noirq -- do not use ACPI for IRQ routing | 151 | noirq -- do not use ACPI for IRQ routing |
152 | ht -- run only enough ACPI to enable Hyper Threading | ||
153 | strict -- Be less tolerant of platforms that are not | 152 | strict -- Be less tolerant of platforms that are not |
154 | strictly ACPI specification compliant. | 153 | strictly ACPI specification compliant. |
155 | rsdt -- prefer RSDT over (default) XSDT | 154 | rsdt -- prefer RSDT over (default) XSDT |
@@ -290,9 +289,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
290 | advansys= [HW,SCSI] | 289 | advansys= [HW,SCSI] |
291 | See header of drivers/scsi/advansys.c. | 290 | See header of drivers/scsi/advansys.c. |
292 | 291 | ||
293 | advwdt= [HW,WDT] Advantech WDT | ||
294 | Format: <iostart>,<iostop> | ||
295 | |||
296 | aedsp16= [HW,OSS] Audio Excel DSP 16 | 292 | aedsp16= [HW,OSS] Audio Excel DSP 16 |
297 | Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> | 293 | Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> |
298 | See also header of sound/oss/aedsp16.c. | 294 | See also header of sound/oss/aedsp16.c. |
@@ -761,13 +757,14 @@ and is between 256 and 4096 characters. It is defined in the file | |||
761 | Default value is 0. | 757 | Default value is 0. |
762 | Value can be changed at runtime via /selinux/enforce. | 758 | Value can be changed at runtime via /selinux/enforce. |
763 | 759 | ||
760 | erst_disable [ACPI] | ||
761 | Disable Error Record Serialization Table (ERST) | ||
762 | support. | ||
763 | |||
764 | ether= [HW,NET] Ethernet cards parameters | 764 | ether= [HW,NET] Ethernet cards parameters |
765 | This option is obsoleted by the "netdev=" option, which | 765 | This option is obsoleted by the "netdev=" option, which |
766 | has equivalent usage. See its documentation for details. | 766 | has equivalent usage. See its documentation for details. |
767 | 767 | ||
768 | eurwdt= [HW,WDT] Eurotech CPU-1220/1410 onboard watchdog. | ||
769 | Format: <io>[,<irq>] | ||
770 | |||
771 | failslab= | 768 | failslab= |
772 | fail_page_alloc= | 769 | fail_page_alloc= |
773 | fail_make_request=[KNL] | 770 | fail_make_request=[KNL] |
@@ -858,6 +855,11 @@ and is between 256 and 4096 characters. It is defined in the file | |||
858 | hd= [EIDE] (E)IDE hard drive subsystem geometry | 855 | hd= [EIDE] (E)IDE hard drive subsystem geometry |
859 | Format: <cyl>,<head>,<sect> | 856 | Format: <cyl>,<head>,<sect> |
860 | 857 | ||
858 | hest_disable [ACPI] | ||
859 | Disable Hardware Error Source Table (HEST) support; | ||
860 | corresponding firmware-first mode error processing | ||
861 | logic will be disabled. | ||
862 | |||
861 | highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact | 863 | highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact |
862 | size of <nn>. This works even on boxes that have no | 864 | size of <nn>. This works even on boxes that have no |
863 | highmem otherwise. This also works to reduce highmem | 865 | highmem otherwise. This also works to reduce highmem |
@@ -1258,6 +1260,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1258 | * nohrst, nosrst, norst: suppress hard, soft | 1260 | * nohrst, nosrst, norst: suppress hard, soft |
1259 | and both resets. | 1261 | and both resets. |
1260 | 1262 | ||
1263 | * dump_id: dump IDENTIFY data. | ||
1264 | |||
1261 | If there are multiple matching configurations changing | 1265 | If there are multiple matching configurations changing |
1262 | the same attribute, the last one is used. | 1266 | the same attribute, the last one is used. |
1263 | 1267 | ||
@@ -2267,9 +2271,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2267 | 2271 | ||
2268 | sched_debug [KNL] Enables verbose scheduler debug messages. | 2272 | sched_debug [KNL] Enables verbose scheduler debug messages. |
2269 | 2273 | ||
2270 | sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver | ||
2271 | Format: <io>[,<timeout>[,<isapnp>]] | ||
2272 | |||
2273 | scsi_debug_*= [SCSI] | 2274 | scsi_debug_*= [SCSI] |
2274 | See drivers/scsi/scsi_debug.c. | 2275 | See drivers/scsi/scsi_debug.c. |
2275 | 2276 | ||
@@ -2858,8 +2859,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2858 | wd7000= [HW,SCSI] | 2859 | wd7000= [HW,SCSI] |
2859 | See header of drivers/scsi/wd7000.c. | 2860 | See header of drivers/scsi/wd7000.c. |
2860 | 2861 | ||
2861 | wdt= [WDT] Watchdog | 2862 | watchdog timers [HW,WDT] For information on watchdog timers, |
2862 | See Documentation/watchdog/wdt.txt. | 2863 | see Documentation/watchdog/watchdog-parameters.txt |
2864 | or other driver-specific files in the | ||
2865 | Documentation/watchdog/ directory. | ||
2863 | 2866 | ||
2864 | x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of | 2867 | x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of |
2865 | default x2apic cluster mode on platforms | 2868 | default x2apic cluster mode on platforms |
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index c6416a398163..a237518e51b9 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt | |||
@@ -656,6 +656,7 @@ struct kvm_clock_data { | |||
656 | 4.29 KVM_GET_VCPU_EVENTS | 656 | 4.29 KVM_GET_VCPU_EVENTS |
657 | 657 | ||
658 | Capability: KVM_CAP_VCPU_EVENTS | 658 | Capability: KVM_CAP_VCPU_EVENTS |
659 | Extended by: KVM_CAP_INTR_SHADOW | ||
659 | Architectures: x86 | 660 | Architectures: x86 |
660 | Type: vm ioctl | 661 | Type: vm ioctl |
661 | Parameters: struct kvm_vcpu_events (out) | 662 | Parameters: struct kvm_vcpu_events (out) |
@@ -676,7 +677,7 @@ struct kvm_vcpu_events { | |||
676 | __u8 injected; | 677 | __u8 injected; |
677 | __u8 nr; | 678 | __u8 nr; |
678 | __u8 soft; | 679 | __u8 soft; |
679 | __u8 pad; | 680 | __u8 shadow; |
680 | } interrupt; | 681 | } interrupt; |
681 | struct { | 682 | struct { |
682 | __u8 injected; | 683 | __u8 injected; |
@@ -688,9 +689,13 @@ struct kvm_vcpu_events { | |||
688 | __u32 flags; | 689 | __u32 flags; |
689 | }; | 690 | }; |
690 | 691 | ||
692 | KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that | ||
693 | interrupt.shadow contains a valid state. Otherwise, this field is undefined. | ||
694 | |||
691 | 4.30 KVM_SET_VCPU_EVENTS | 695 | 4.30 KVM_SET_VCPU_EVENTS |
692 | 696 | ||
693 | Capability: KVM_CAP_VCPU_EVENTS | 697 | Capability: KVM_CAP_VCPU_EVENTS |
698 | Extended by: KVM_CAP_INTR_SHADOW | ||
694 | Architectures: x86 | 699 | Architectures: x86 |
695 | Type: vm ioctl | 700 | Type: vm ioctl |
696 | Parameters: struct kvm_vcpu_events (in) | 701 | Parameters: struct kvm_vcpu_events (in) |
@@ -709,6 +714,183 @@ current in-kernel state. The bits are: | |||
709 | KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel | 714 | KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel |
710 | KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector | 715 | KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector |
711 | 716 | ||
717 | If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in | ||
718 | the flags field to signal that interrupt.shadow contains a valid state and | ||
719 | shall be written into the VCPU. | ||
720 | |||
721 | 4.32 KVM_GET_DEBUGREGS | ||
722 | |||
723 | Capability: KVM_CAP_DEBUGREGS | ||
724 | Architectures: x86 | ||
725 | Type: vm ioctl | ||
726 | Parameters: struct kvm_debugregs (out) | ||
727 | Returns: 0 on success, -1 on error | ||
728 | |||
729 | Reads debug registers from the vcpu. | ||
730 | |||
731 | struct kvm_debugregs { | ||
732 | __u64 db[4]; | ||
733 | __u64 dr6; | ||
734 | __u64 dr7; | ||
735 | __u64 flags; | ||
736 | __u64 reserved[9]; | ||
737 | }; | ||
738 | |||
739 | 4.33 KVM_SET_DEBUGREGS | ||
740 | |||
741 | Capability: KVM_CAP_DEBUGREGS | ||
742 | Architectures: x86 | ||
743 | Type: vm ioctl | ||
744 | Parameters: struct kvm_debugregs (in) | ||
745 | Returns: 0 on success, -1 on error | ||
746 | |||
747 | Writes debug registers into the vcpu. | ||
748 | |||
749 | See KVM_GET_DEBUGREGS for the data structure. The flags field is not used | ||
750 | yet and must be cleared on entry. | ||
751 | |||
752 | 4.34 KVM_SET_USER_MEMORY_REGION | ||
753 | |||
754 | Capability: KVM_CAP_USER_MEM | ||
755 | Architectures: all | ||
756 | Type: vm ioctl | ||
757 | Parameters: struct kvm_userspace_memory_region (in) | ||
758 | Returns: 0 on success, -1 on error | ||
759 | |||
760 | struct kvm_userspace_memory_region { | ||
761 | __u32 slot; | ||
762 | __u32 flags; | ||
763 | __u64 guest_phys_addr; | ||
764 | __u64 memory_size; /* bytes */ | ||
765 | __u64 userspace_addr; /* start of the userspace allocated memory */ | ||
766 | }; | ||
767 | |||
768 | /* for kvm_memory_region::flags */ | ||
769 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | ||
770 | |||
771 | This ioctl allows the user to create or modify a guest physical memory | ||
772 | slot. When changing an existing slot, it may be moved in the guest | ||
773 | physical memory space, or its flags may be modified. It may not be | ||
774 | resized. Slots may not overlap in guest physical address space. | ||
775 | |||
776 | Memory for the region is taken starting at the address denoted by the | ||
777 | field userspace_addr, which must point at user addressable memory for | ||
778 | the entire memory slot size. Any object may back this memory, including | ||
779 | anonymous memory, ordinary files, and hugetlbfs. | ||
780 | |||
781 | It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr | ||
782 | be identical. This allows large pages in the guest to be backed by large | ||
783 | pages in the host. | ||
784 | |||
785 | The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which | ||
786 | instructs kvm to keep track of writes to memory within the slot. See | ||
787 | the KVM_GET_DIRTY_LOG ioctl. | ||
788 | |||
789 | When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of the memory | ||
790 | region are automatically reflected into the guest. For example, an mmap() | ||
791 | that affects the region will be made visible immediately. Another example | ||
792 | is madvise(MADV_DROP). | ||
793 | |||
794 | It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. | ||
795 | The KVM_SET_MEMORY_REGION does not allow fine grained control over memory | ||
796 | allocation and is deprecated. | ||
797 | |||
798 | 4.35 KVM_SET_TSS_ADDR | ||
799 | |||
800 | Capability: KVM_CAP_SET_TSS_ADDR | ||
801 | Architectures: x86 | ||
802 | Type: vm ioctl | ||
803 | Parameters: unsigned long tss_address (in) | ||
804 | Returns: 0 on success, -1 on error | ||
805 | |||
806 | This ioctl defines the physical address of a three-page region in the guest | ||
807 | physical address space. The region must be within the first 4GB of the | ||
808 | guest physical address space and must not conflict with any memory slot | ||
809 | or any mmio address. The guest may malfunction if it accesses this memory | ||
810 | region. | ||
811 | |||
812 | This ioctl is required on Intel-based hosts. This is needed on Intel hardware | ||
813 | because of a quirk in the virtualization implementation (see the internals | ||
814 | documentation when it pops into existence). | ||
815 | |||
816 | 4.36 KVM_ENABLE_CAP | ||
817 | |||
818 | Capability: KVM_CAP_ENABLE_CAP | ||
819 | Architectures: ppc | ||
820 | Type: vcpu ioctl | ||
821 | Parameters: struct kvm_enable_cap (in) | ||
822 | Returns: 0 on success; -1 on error | ||
823 | |||
824 | Not all extensions are enabled by default. Using this ioctl the application | ||
825 | can enable an extension, making it available to the guest. | ||
826 | |||
827 | On systems that do not support this ioctl, it always fails. On systems that | ||
828 | do support it, it only works for extensions that are supported for enablement. | ||
829 | |||
830 | To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should | ||
831 | be used. | ||
832 | |||
833 | struct kvm_enable_cap { | ||
834 | /* in */ | ||
835 | __u32 cap; | ||
836 | |||
837 | The capability that is supposed to get enabled. | ||
838 | |||
839 | __u32 flags; | ||
840 | |||
841 | A bitfield indicating future enhancements. Has to be 0 for now. | ||
842 | |||
843 | __u64 args[4]; | ||
844 | |||
845 | Arguments for enabling a feature. If a feature needs initial values to | ||
846 | function properly, this is the place to put them. | ||
847 | |||
848 | __u8 pad[64]; | ||
849 | }; | ||
850 | |||
851 | 4.37 KVM_GET_MP_STATE | ||
852 | |||
853 | Capability: KVM_CAP_MP_STATE | ||
854 | Architectures: x86, ia64 | ||
855 | Type: vcpu ioctl | ||
856 | Parameters: struct kvm_mp_state (out) | ||
857 | Returns: 0 on success; -1 on error | ||
858 | |||
859 | struct kvm_mp_state { | ||
860 | __u32 mp_state; | ||
861 | }; | ||
862 | |||
863 | Returns the vcpu's current "multiprocessing state" (though also valid on | ||
864 | uniprocessor guests). | ||
865 | |||
866 | Possible values are: | ||
867 | |||
868 | - KVM_MP_STATE_RUNNABLE: the vcpu is currently running | ||
869 | - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) | ||
870 | which has not yet received an INIT signal | ||
871 | - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is | ||
872 | now ready for a SIPI | ||
873 | - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and | ||
874 | is waiting for an interrupt | ||
875 | - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector | ||
876 | accessible via KVM_GET_VCPU_EVENTS) | ||
877 | |||
878 | This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel | ||
879 | irqchip, the multiprocessing state must be maintained by userspace. | ||
880 | |||
881 | 4.38 KVM_SET_MP_STATE | ||
882 | |||
883 | Capability: KVM_CAP_MP_STATE | ||
884 | Architectures: x86, ia64 | ||
885 | Type: vcpu ioctl | ||
886 | Parameters: struct kvm_mp_state (in) | ||
887 | Returns: 0 on success; -1 on error | ||
888 | |||
889 | Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for | ||
890 | arguments. | ||
891 | |||
892 | This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel | ||
893 | irqchip, the multiprocessing state must be maintained by userspace. | ||
712 | 894 | ||
713 | 5. The kvm_run structure | 895 | 5. The kvm_run structure |
714 | 896 | ||
@@ -820,6 +1002,13 @@ executed a memory-mapped I/O instruction which could not be satisfied | |||
820 | by kvm. The 'data' member contains the written data if 'is_write' is | 1002 | by kvm. The 'data' member contains the written data if 'is_write' is |
821 | true, and should be filled by application code otherwise. | 1003 | true, and should be filled by application code otherwise. |
822 | 1004 | ||
1005 | NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding | ||
1006 | operations are complete (and guest state is consistent) only after userspace | ||
1007 | has re-entered the kernel with KVM_RUN. The kernel side will first finish | ||
1008 | incomplete operations and then check for pending signals. Userspace | ||
1009 | can re-enter the guest with an unmasked signal pending to complete | ||
1010 | pending operations. | ||
1011 | |||
823 | /* KVM_EXIT_HYPERCALL */ | 1012 | /* KVM_EXIT_HYPERCALL */ |
824 | struct { | 1013 | struct { |
825 | __u64 nr; | 1014 | __u64 nr; |
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise. | |||
829 | __u32 pad; | 1018 | __u32 pad; |
830 | } hypercall; | 1019 | } hypercall; |
831 | 1020 | ||
832 | Unused. | 1021 | Unused. This was once used for 'hypercall to userspace'. To implement |
1022 | such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). | ||
1023 | Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. | ||
833 | 1024 | ||
834 | /* KVM_EXIT_TPR_ACCESS */ | 1025 | /* KVM_EXIT_TPR_ACCESS */ |
835 | struct { | 1026 | struct { |
@@ -870,6 +1061,19 @@ s390 specific. | |||
870 | 1061 | ||
871 | powerpc specific. | 1062 | powerpc specific. |
872 | 1063 | ||
1064 | /* KVM_EXIT_OSI */ | ||
1065 | struct { | ||
1066 | __u64 gprs[32]; | ||
1067 | } osi; | ||
1068 | |||
1069 | MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch | ||
1070 | hypercalls and exit with this exit struct that contains all the guest gprs. | ||
1071 | |||
1072 | If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. | ||
1073 | Userspace can now handle the hypercall and when it's done modify the gprs as | ||
1074 | necessary. Upon guest entry all guest GPRs will then be replaced by the values | ||
1075 | in this struct. | ||
1076 | |||
873 | /* Fix the size of the union. */ | 1077 | /* Fix the size of the union. */ |
874 | char padding[256]; | 1078 | char padding[256]; |
875 | }; | 1079 | }; |
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt new file mode 100644 index 000000000000..14a12ea92b7f --- /dev/null +++ b/Documentation/kvm/cpuid.txt | |||
@@ -0,0 +1,42 @@ | |||
1 | KVM CPUID bits | ||
2 | Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010 | ||
3 | ===================================================== | ||
4 | |||
5 | A guest running on a kvm host can check some of its features using | ||
6 | cpuid. This is not always guaranteed to work, since userspace can | ||
7 | mask-out some, or even all KVM-related cpuid features before launching | ||
8 | a guest. | ||
9 | |||
10 | KVM cpuid functions are: | ||
11 | |||
12 | function: KVM_CPUID_SIGNATURE (0x40000000) | ||
13 | returns : eax = 0, | ||
14 | ebx = 0x4b4d564b, | ||
15 | ecx = 0x564b4d56, | ||
16 | edx = 0x4d. | ||
17 | Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". | ||
18 | This function queries the presence of KVM cpuid leafs. | ||
19 | |||
20 | |||
21 | function: KVM_CPUID_FEATURES (0x40000001) | ||
22 | returns : ebx, ecx, edx = 0 | ||
23 | eax = an OR'ed group of (1 << flag), where each flag is: | ||
24 | |||
25 | |||
26 | flag || value || meaning | ||
27 | ============================================================================= | ||
28 | KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs | ||
29 | || || 0x11 and 0x12. | ||
30 | ------------------------------------------------------------------------------ | ||
31 | KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays | ||
32 | || || on PIO operations. | ||
33 | ------------------------------------------------------------------------------ | ||
34 | KVM_FEATURE_MMU_OP || 2 || deprecated. | ||
35 | ------------------------------------------------------------------------------ | ||
36 | KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs | ||
37 | || || 0x4b564d00 and 0x4b564d01 | ||
38 | ------------------------------------------------------------------------------ | ||
39 | KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side | ||
40 | || || per-cpu warps are expected in | ||
41 | || || kvmclock. | ||
42 | ------------------------------------------------------------------------------ | ||
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt new file mode 100644 index 000000000000..aaed6ab9d7ab --- /dev/null +++ b/Documentation/kvm/mmu.txt | |||
@@ -0,0 +1,304 @@ | |||
1 | The x86 kvm shadow mmu | ||
2 | ====================== | ||
3 | |||
4 | The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible | ||
5 | for presenting a standard x86 mmu to the guest, while translating guest | ||
6 | physical addresses to host physical addresses. | ||
7 | |||
8 | The mmu code attempts to satisfy the following requirements: | ||
9 | |||
10 | - correctness: the guest should not be able to determine that it is running | ||
11 | on an emulated mmu except for timing (we attempt to comply | ||
12 | with the specification, not emulate the characteristics of | ||
13 | a particular implementation such as tlb size) | ||
14 | - security: the guest must not be able to touch host memory not assigned | ||
15 | to it | ||
16 | - performance: minimize the performance penalty imposed by the mmu | ||
17 | - scaling: need to scale to large memory and large vcpu guests | ||
18 | - hardware: support the full range of x86 virtualization hardware | ||
19 | - integration: Linux memory management code must be in control of guest memory | ||
20 | so that swapping, page migration, page merging, transparent | ||
21 | hugepages, and similar features work without change | ||
22 | - dirty tracking: report writes to guest memory to enable live migration | ||
23 | and framebuffer-based displays | ||
24 | - footprint: keep the amount of pinned kernel memory low (most memory | ||
25 | should be shrinkable) | ||
26 | - reliability: avoid multipage or GFP_ATOMIC allocations | ||
27 | |||
28 | Acronyms | ||
29 | ======== | ||
30 | |||
31 | pfn host page frame number | ||
32 | hpa host physical address | ||
33 | hva host virtual address | ||
34 | gfn guest frame number | ||
35 | gpa guest physical address | ||
36 | gva guest virtual address | ||
37 | ngpa nested guest physical address | ||
38 | ngva nested guest virtual address | ||
39 | pte page table entry (used also to refer generically to paging structure | ||
40 | entries) | ||
41 | gpte guest pte (referring to gfns) | ||
42 | spte shadow pte (referring to pfns) | ||
43 | tdp two dimensional paging (vendor neutral term for NPT and EPT) | ||
44 | |||
45 | Virtual and real hardware supported | ||
46 | =================================== | ||
47 | |||
48 | The mmu supports first-generation mmu hardware, which allows an atomic switch | ||
49 | of the current paging mode and cr3 during guest entry, as well as | ||
50 | two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware | ||
51 | it exposes is the traditional 2/3/4 level x86 mmu, with support for global | ||
52 | pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support | ||
53 | exposing NPT capable hardware on NPT capable hosts. | ||
54 | |||
55 | Translation | ||
56 | =========== | ||
57 | |||
58 | The primary job of the mmu is to program the processor's mmu to translate | ||
59 | addresses for the guest. Different translations are required at different | ||
60 | times: | ||
61 | |||
62 | - when guest paging is disabled, we translate guest physical addresses to | ||
63 | host physical addresses (gpa->hpa) | ||
64 | - when guest paging is enabled, we translate guest virtual addresses, to | ||
65 | guest physical addresses, to host physical addresses (gva->gpa->hpa) | ||
66 | - when the guest launches a guest of its own, we translate nested guest | ||
67 | virtual addresses, to nested guest physical addresses, to guest physical | ||
68 | addresses, to host physical addresses (ngva->ngpa->gpa->hpa) | ||
69 | |||
70 | The primary challenge is to encode between 1 and 3 translations into hardware | ||
71 | that supports only 1 (traditional) and 2 (tdp) translations. When the | ||
72 | number of required translations matches the hardware, the mmu operates in | ||
73 | direct mode; otherwise it operates in shadow mode (see below). | ||
74 | |||
75 | Memory | ||
76 | ====== | ||
77 | |||
78 | Guest memory (gpa) is part of the user address space of the process that is | ||
79 | using kvm. Userspace defines the translation between guest addresses and user | ||
80 | addresses (gpa->hva); note that two gpas may alias to the same hva, but not | ||
81 | vice versa. | ||
82 | |||
83 | These hvas may be backed using any method available to the host: anonymous | ||
84 | memory, file backed memory, and device memory. Memory might be paged by the | ||
85 | host at any time. | ||
86 | |||
87 | Events | ||
88 | ====== | ||
89 | |||
90 | The mmu is driven by events, some from the guest, some from the host. | ||
91 | |||
92 | Guest generated events: | ||
93 | - writes to control registers (especially cr3) | ||
94 | - invlpg/invlpga instruction execution | ||
95 | - access to missing or protected translations | ||
96 | |||
97 | Host generated events: | ||
98 | - changes in the gpa->hpa translation (either through gpa->hva changes or | ||
99 | through hva->hpa changes) | ||
100 | - memory pressure (the shrinker) | ||
101 | |||
102 | Shadow pages | ||
103 | ============ | ||
104 | |||
105 | The principal data structure is the shadow page, 'struct kvm_mmu_page'. A | ||
106 | shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A | ||
107 | shadow page may contain a mix of leaf and nonleaf sptes. | ||
108 | |||
109 | A nonleaf spte allows the hardware mmu to reach the leaf pages and | ||
110 | is not related to a translation directly. It points to other shadow pages. | ||
111 | |||
112 | A leaf spte corresponds to either one or two translations encoded into | ||
113 | one paging structure entry. These are always the lowest level of the | ||
114 | translation stack, with optional higher level translations left to NPT/EPT. | ||
115 | Leaf ptes point at guest pages. | ||
116 | |||
117 | The following table shows translations encoded by leaf ptes, with higher-level | ||
118 | translations in parentheses: | ||
119 | |||
120 | Non-nested guests: | ||
121 | nonpaging: gpa->hpa | ||
122 | paging: gva->gpa->hpa | ||
123 | paging, tdp: (gva->)gpa->hpa | ||
124 | Nested guests: | ||
125 | non-tdp: ngva->gpa->hpa (*) | ||
126 | tdp: (ngva->)ngpa->gpa->hpa | ||
127 | |||
128 | (*) the guest hypervisor will encode the ngva->gpa translation into its page | ||
129 | tables if npt is not present | ||
130 | |||
131 | Shadow pages contain the following information: | ||
132 | role.level: | ||
133 | The level in the shadow paging hierarchy that this shadow page belongs to. | ||
134 | 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. | ||
135 | role.direct: | ||
136 | If set, leaf sptes reachable from this page are for a linear range. | ||
137 | Examples include real mode translation, large guest pages backed by small | ||
138 | host pages, and gpa->hpa translations when NPT or EPT is active. | ||
139 | The linear range starts at (gfn << PAGE_SHIFT) and its size is determined | ||
140 | by role.level (2MB for first level, 1GB for second level, 0.5TB for third | ||
141 | level, 256TB for fourth level) | ||
142 | If clear, this page corresponds to a guest page table denoted by the gfn | ||
143 | field. | ||
144 | role.quadrant: | ||
145 | When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit | ||
146 | sptes. That means a guest page table contains more ptes than the host, | ||
147 | so multiple shadow pages are needed to shadow one guest page. | ||
148 | For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the | ||
149 | first or second 512-gpte block in the guest page table. For second-level | ||
150 | page tables, each 32-bit gpte is converted to two 64-bit sptes | ||
151 | (since each first-level guest page is shadowed by two first-level | ||
152 | shadow pages) so role.quadrant takes values in the range 0..3. Each | ||
153 | quadrant maps 1GB virtual address space. | ||
154 | role.access: | ||
155 | Inherited guest access permissions in the form uwx. Note execute | ||
156 | permission is positive, not negative. | ||
157 | role.invalid: | ||
158 | The page is invalid and should not be used. It is a root page that is | ||
159 | currently pinned (by a cpu hardware register pointing to it); once it is | ||
160 | unpinned it will be destroyed. | ||
161 | role.cr4_pae: | ||
162 | Contains the value of cr4.pae for which the page is valid (e.g. whether | ||
163 | 32-bit or 64-bit gptes are in use). | ||
164 | role.cr4_nxe: | ||
165 | Contains the value of efer.nxe for which the page is valid. | ||
166 | role.cr0_wp: | ||
167 | Contains the value of cr0.wp for which the page is valid. | ||
168 | gfn: | ||
169 | Either the guest page table containing the translations shadowed by this | ||
170 | page, or the base page frame for linear translations. See role.direct. | ||
171 | spt: | ||
172 | A pageful of 64-bit sptes containing the translations for this page. | ||
173 | Accessed by both kvm and hardware. | ||
174 | The page pointed to by spt will have its page->private pointing back | ||
175 | at the shadow page structure. | ||
176 | sptes in spt point either at guest pages, or at lower-level shadow pages. | ||
177 | Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point | ||
178 | at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. | ||
179 | The spt array forms a DAG structure with the shadow page as a node, and | ||
180 | guest pages as leaves. | ||
181 | gfns: | ||
182 | An array of 512 guest frame numbers, one for each present pte. Used to | ||
183 | perform a reverse map from a pte to a gfn. | ||
184 | slot_bitmap: | ||
185 | A bitmap containing one bit per memory slot. If the page contains a pte | ||
186 | mapping a page from memory slot n, then bit n of slot_bitmap will be set | ||
187 | (if a page is aliased among several slots, then it is not guaranteed that | ||
188 | all slots will be marked). | ||
189 | Used during dirty logging to avoid scanning a shadow page if none of its | ||
190 | pages need tracking. | ||
191 | root_count: | ||
192 | A counter keeping track of how many hardware registers (guest cr3 or | ||
193 | pdptrs) are now pointing at the page. While this counter is nonzero, the | ||
194 | page cannot be destroyed. See role.invalid. | ||
195 | multimapped: | ||
196 | Whether there exist multiple sptes pointing at this page. | ||
197 | parent_pte/parent_ptes: | ||
198 | If multimapped is zero, parent_pte points at the single spte that points at | ||
199 | this page's spt. Otherwise, parent_ptes points at a data structure | ||
200 | with a list of parent_ptes. | ||
201 | unsync: | ||
202 | If true, then the translations in this page may not match the guest's | ||
203 | translation. This is equivalent to the state of the tlb when a pte is | ||
204 | changed but before the tlb entry is flushed. Accordingly, unsync ptes | ||
205 | are synchronized when the guest executes invlpg or flushes its tlb by | ||
206 | other means. Valid for leaf pages. | ||
207 | unsync_children: | ||
208 | How many sptes in the page point at pages that are unsync (or have | ||
209 | unsynchronized children). | ||
210 | unsync_child_bitmap: | ||
211 | A bitmap indicating which sptes in spt point (directly or indirectly) at | ||
212 | pages that may be unsynchronized. Used to quickly locate all unsynchronized | ||
213 | pages reachable from a given page. | ||
214 | |||
215 | Reverse map | ||
216 | =========== | ||
217 | |||
218 | The mmu maintains a reverse mapping whereby all ptes mapping a page can be | ||
219 | reached given its gfn. This is used, for example, when swapping out a page. | ||
220 | |||
221 | Synchronized and unsynchronized pages | ||
222 | ===================================== | ||
223 | |||
224 | The guest uses two events to synchronize its tlb and page tables: tlb flushes | ||
225 | and page invalidations (invlpg). | ||
226 | |||
227 | A tlb flush means that we need to synchronize all sptes reachable from the | ||
228 | guest's cr3. This is expensive, so we keep all guest page tables write | ||
229 | protected, and synchronize sptes to gptes when a gpte is written. | ||
230 | |||
231 | A special case is when a guest page table is reachable from the current | ||
232 | guest cr3. In this case, the guest is obliged to issue an invlpg instruction | ||
233 | before using the translation. We take advantage of that by removing write | ||
234 | protection from the guest page, and allowing the guest to modify it freely. | ||
235 | We synchronize modified gptes when the guest invokes invlpg. This reduces | ||
236 | the amount of emulation we have to do when the guest modifies multiple gptes, | ||
237 | or when a guest page is no longer used as a page table and is used for | ||
238 | random guest data. | ||
239 | |||
240 | As a side effect we have to resynchronize all reachable unsynchronized shadow | ||
241 | pages on a tlb flush. | ||
242 | |||
243 | |||
244 | Reaction to events | ||
245 | ================== | ||
246 | |||
247 | - guest page fault (or npt page fault, or ept violation) | ||
248 | |||
249 | This is the most complicated event. The cause of a page fault can be: | ||
250 | |||
251 | - a true guest fault (the guest translation won't allow the access) (*) | ||
252 | - access to a missing translation | ||
253 | - access to a protected translation | ||
254 | - when logging dirty pages, memory is write protected | ||
255 | - synchronized shadow pages are write protected (*) | ||
256 | - access to untranslatable memory (mmio) | ||
257 | |||
258 | (*) not applicable in direct mode | ||
259 | |||
260 | Handling a page fault is performed as follows: | ||
261 | |||
262 | - if needed, walk the guest page tables to determine the guest translation | ||
263 | (gva->gpa or ngpa->gpa) | ||
264 | - if permissions are insufficient, reflect the fault back to the guest | ||
265 | - determine the host page | ||
266 | - if this is an mmio request, there is no host page; call the emulator | ||
267 | to emulate the instruction instead | ||
268 | - walk the shadow page table to find the spte for the translation, | ||
269 | instantiating missing intermediate page tables as necessary | ||
270 | - try to unsynchronize the page | ||
271 | - if successful, we can let the guest continue and modify the gpte | ||
272 | - emulate the instruction | ||
273 | - if failed, unshadow the page and let the guest continue | ||
274 | - update any translations that were modified by the instruction | ||
275 | |||
276 | invlpg handling: | ||
277 | |||
278 | - walk the shadow page hierarchy and drop affected translations | ||
279 | - try to reinstantiate the indicated translation in the hope that the | ||
280 | guest will use it in the near future | ||
281 | |||
282 | Guest control register updates: | ||
283 | |||
284 | - mov to cr3 | ||
285 | - look up new shadow roots | ||
286 | - synchronize newly reachable shadow pages | ||
287 | |||
288 | - mov to cr0/cr4/efer | ||
289 | - set up mmu context for new paging mode | ||
290 | - look up new shadow roots | ||
291 | - synchronize newly reachable shadow pages | ||
292 | |||
293 | Host translation updates: | ||
294 | |||
295 | - mmu notifier called with updated hva | ||
296 | - look up affected sptes through reverse map | ||
297 | - drop (or update) translations | ||
298 | |||
299 | Further reading | ||
300 | =============== | ||
301 | |||
302 | - NPT presentation from KVM Forum 2008 | ||
303 | http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf | ||
304 | |||
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt index 39c0a09d0105..fc15538d8b46 100644 --- a/Documentation/laptops/thinkpad-acpi.txt +++ b/Documentation/laptops/thinkpad-acpi.txt | |||
@@ -292,13 +292,13 @@ sysfs notes: | |||
292 | 292 | ||
293 | Warning: when in NVRAM mode, the volume up/down/mute | 293 | Warning: when in NVRAM mode, the volume up/down/mute |
294 | keys are synthesized according to changes in the mixer, | 294 | keys are synthesized according to changes in the mixer, |
295 | so you have to use volume up or volume down to unmute, | 295 | which uses a single volume up or volume down hotkey |
296 | as per the ThinkPad volume mixer user interface. When | 296 | press to unmute, as per the ThinkPad volume mixer user |
297 | in ACPI event mode, volume up/down/mute are reported as | 297 | interface. When in ACPI event mode, volume up/down/mute |
298 | separate events, but this behaviour may be corrected in | 298 | events are reported by the firmware and can behave |
299 | future releases of this driver, in which case the | 299 | differently (and that behaviour changes with firmware |
300 | ThinkPad volume mixer user interface semantics will be | 300 | version -- not just with firmware models -- as well as |
301 | enforced. | 301 | OSI(Linux) state). |
302 | 302 | ||
303 | hotkey_poll_freq: | 303 | hotkey_poll_freq: |
304 | frequency in Hz for hot key polling. It must be between | 304 | frequency in Hz for hot key polling. It must be between |
@@ -309,7 +309,7 @@ sysfs notes: | |||
309 | will cause hot key presses that require NVRAM polling | 309 | will cause hot key presses that require NVRAM polling |
310 | to never be reported. | 310 | to never be reported. |
311 | 311 | ||
312 | Setting hotkey_poll_freq too low will cause repeated | 312 | Setting hotkey_poll_freq too low may cause repeated |
313 | pressings of the same hot key to be misreported as a | 313 | pressings of the same hot key to be misreported as a |
314 | single key press, or to not even be detected at all. | 314 | single key press, or to not even be detected at all. |
315 | The recommended polling frequency is 10Hz. | 315 | The recommended polling frequency is 10Hz. |
@@ -397,6 +397,7 @@ ACPI Scan | |||
397 | event code Key Notes | 397 | event code Key Notes |
398 | 398 | ||
399 | 0x1001 0x00 FN+F1 - | 399 | 0x1001 0x00 FN+F1 - |
400 | |||
400 | 0x1002 0x01 FN+F2 IBM: battery (rare) | 401 | 0x1002 0x01 FN+F2 IBM: battery (rare) |
401 | Lenovo: Screen lock | 402 | Lenovo: Screen lock |
402 | 403 | ||
@@ -404,7 +405,8 @@ event code Key Notes | |||
404 | this hot key, even with hot keys | 405 | this hot key, even with hot keys |
405 | disabled or with Fn+F3 masked | 406 | disabled or with Fn+F3 masked |
406 | off | 407 | off |
407 | IBM: screen lock | 408 | IBM: screen lock, often turns |
409 | off the ThinkLight as side-effect | ||
408 | Lenovo: battery | 410 | Lenovo: battery |
409 | 411 | ||
410 | 0x1004 0x03 FN+F4 Sleep button (ACPI sleep button | 412 | 0x1004 0x03 FN+F4 Sleep button (ACPI sleep button |
@@ -433,7 +435,8 @@ event code Key Notes | |||
433 | Do you feel lucky today? | 435 | Do you feel lucky today? |
434 | 436 | ||
435 | 0x1008 0x07 FN+F8 IBM: toggle screen expand | 437 | 0x1008 0x07 FN+F8 IBM: toggle screen expand |
436 | Lenovo: configure UltraNav | 438 | Lenovo: configure UltraNav, |
439 | or toggle screen expand | ||
437 | 440 | ||
438 | 0x1009 0x08 FN+F9 - | 441 | 0x1009 0x08 FN+F9 - |
439 | .. .. .. | 442 | .. .. .. |
@@ -444,7 +447,7 @@ event code Key Notes | |||
444 | either through the ACPI event, | 447 | either through the ACPI event, |
445 | or through a hotkey event. | 448 | or through a hotkey event. |
446 | The firmware may refuse to | 449 | The firmware may refuse to |
447 | generate further FN+F4 key | 450 | generate further FN+F12 key |
448 | press events until a S3 or S4 | 451 | press events until a S3 or S4 |
449 | ACPI sleep cycle is performed, | 452 | ACPI sleep cycle is performed, |
450 | or some time passes. | 453 | or some time passes. |
@@ -512,15 +515,19 @@ events for switches: | |||
512 | SW_RFKILL_ALL T60 and later hardware rfkill rocker switch | 515 | SW_RFKILL_ALL T60 and later hardware rfkill rocker switch |
513 | SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A | 516 | SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A |
514 | 517 | ||
515 | Non hot-key ACPI HKEY event map: | 518 | Non hotkey ACPI HKEY event map: |
519 | ------------------------------- | ||
520 | |||
521 | Events that are not propagated by the driver, except for legacy | ||
522 | compatibility purposes when hotkey_report_mode is set to 1: | ||
523 | |||
516 | 0x5001 Lid closed | 524 | 0x5001 Lid closed |
517 | 0x5002 Lid opened | 525 | 0x5002 Lid opened |
518 | 0x5009 Tablet swivel: switched to tablet mode | 526 | 0x5009 Tablet swivel: switched to tablet mode |
519 | 0x500A Tablet swivel: switched to normal mode | 527 | 0x500A Tablet swivel: switched to normal mode |
520 | 0x7000 Radio Switch may have changed state | 528 | 0x7000 Radio Switch may have changed state |
521 | 529 | ||
522 | The above events are not propagated by the driver, except for legacy | 530 | Events that are never propagated by the driver: |
523 | compatibility purposes when hotkey_report_mode is set to 1. | ||
524 | 531 | ||
525 | 0x2304 System is waking up from suspend to undock | 532 | 0x2304 System is waking up from suspend to undock |
526 | 0x2305 System is waking up from suspend to eject bay | 533 | 0x2305 System is waking up from suspend to eject bay |
@@ -528,14 +535,39 @@ compatibility purposes when hotkey_report_mode is set to 1. | |||
528 | 0x2405 System is waking up from hibernation to eject bay | 535 | 0x2405 System is waking up from hibernation to eject bay |
529 | 0x5010 Brightness level changed/control event | 536 | 0x5010 Brightness level changed/control event |
530 | 537 | ||
531 | The above events are never propagated by the driver. | 538 | Events that are propagated by the driver to userspace: |
532 | 539 | ||
540 | 0x2313 ALARM: System is waking up from suspend because | ||
541 | the battery is nearly empty | ||
542 | 0x2413 ALARM: System is waking up from hibernation because | ||
543 | the battery is nearly empty | ||
533 | 0x3003 Bay ejection (see 0x2x05) complete, can sleep again | 544 | 0x3003 Bay ejection (see 0x2x05) complete, can sleep again |
545 | 0x3006 Bay hotplug request (hint to power up SATA link when | ||
546 | the optical drive tray is ejected) | ||
534 | 0x4003 Undocked (see 0x2x04), can sleep again | 547 | 0x4003 Undocked (see 0x2x04), can sleep again |
535 | 0x500B Tablet pen inserted into its storage bay | 548 | 0x500B Tablet pen inserted into its storage bay |
536 | 0x500C Tablet pen removed from its storage bay | 549 | 0x500C Tablet pen removed from its storage bay |
537 | 550 | 0x6011 ALARM: battery is too hot | |
538 | The above events are propagated by the driver. | 551 | 0x6012 ALARM: battery is extremely hot |
552 | 0x6021 ALARM: a sensor is too hot | ||
553 | 0x6022 ALARM: a sensor is extremely hot | ||
554 | 0x6030 System thermal table changed | ||
555 | |||
556 | Battery nearly empty alarms are a last resort attempt to get the | ||
557 | operating system to hibernate or shut down cleanly (0x2313), or shut down | ||
558 | cleanly (0x2413) before power is lost. They must be acted upon, as the | ||
559 | wake up caused by the firmware will have negated most safety nets... | ||
560 | |||
561 | When any of the "too hot" alarms happen, according to Lenovo the user | ||
562 | should suspend or hibernate the laptop (and in the case of battery | ||
563 | alarms, unplug the AC adapter) to let it cool down. These alarms do | ||
564 | signal that something is wrong; they should never happen under normal | ||
565 | operating conditions. | ||
566 | |||
567 | The "extremely hot" alarms are emergencies. According to Lenovo, the | ||
568 | operating system is to force either an immediate suspend or hibernate | ||
569 | cycle, or a system shutdown. Obviously, something is very wrong if this | ||
570 | happens. | ||
539 | 571 | ||
540 | Compatibility notes: | 572 | Compatibility notes: |
541 | 573 | ||
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt index aa60d1f627e5..c91ccc0720fa 100644 --- a/Documentation/mutex-design.txt +++ b/Documentation/mutex-design.txt | |||
@@ -66,14 +66,14 @@ of advantages of mutexes: | |||
66 | 66 | ||
67 | c0377ccb <mutex_lock>: | 67 | c0377ccb <mutex_lock>: |
68 | c0377ccb: f0 ff 08 lock decl (%eax) | 68 | c0377ccb: f0 ff 08 lock decl (%eax) |
69 | c0377cce: 78 0e js c0377cde <.text.lock.mutex> | 69 | c0377cce: 78 0e js c0377cde <.text..lock.mutex> |
70 | c0377cd0: c3 ret | 70 | c0377cd0: c3 ret |
71 | 71 | ||
72 | the unlocking fastpath is equally tight: | 72 | the unlocking fastpath is equally tight: |
73 | 73 | ||
74 | c0377cd1 <mutex_unlock>: | 74 | c0377cd1 <mutex_unlock>: |
75 | c0377cd1: f0 ff 00 lock incl (%eax) | 75 | c0377cd1: f0 ff 00 lock incl (%eax) |
76 | c0377cd4: 7e 0f jle c0377ce5 <.text.lock.mutex+0x7> | 76 | c0377cd4: 7e 0f jle c0377ce5 <.text..lock.mutex+0x7> |
77 | c0377cd6: c3 ret | 77 | c0377cd6: c3 ret |
78 | 78 | ||
79 | - 'struct mutex' semantics are well-defined and are enforced if | 79 | - 'struct mutex' semantics are well-defined and are enforced if |
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index c10c022b911c..6fe9001b9263 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt | |||
@@ -256,9 +256,13 @@ characters, each representing a particular tainted value. | |||
256 | 9: 'A' if the ACPI table has been overridden. | 256 | 9: 'A' if the ACPI table has been overridden. |
257 | 257 | ||
258 | 10: 'W' if a warning has previously been issued by the kernel. | 258 | 10: 'W' if a warning has previously been issued by the kernel. |
259 | (Though some warnings may set more specific taint flags.) | ||
259 | 260 | ||
260 | 11: 'C' if a staging driver has been loaded. | 261 | 11: 'C' if a staging driver has been loaded. |
261 | 262 | ||
263 | 12: 'I' if the kernel is working around a severe bug in the platform | ||
264 | firmware (BIOS or similar). | ||
265 | |||
262 | The primary reason for the 'Tainted: ' string is to tell kernel | 266 | The primary reason for the 'Tainted: ' string is to tell kernel |
263 | debuggers if this is a clean kernel or if anything unusual has | 267 | debuggers if this is a clean kernel or if anything unusual has |
264 | occurred. Tainting is permanent: even if an offending module is | 268 | occurred. Tainting is permanent: even if an offending module is |
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt index dd8fe43888d3..62328d76b55b 100644 --- a/Documentation/power/pci.txt +++ b/Documentation/power/pci.txt | |||
@@ -1,299 +1,1025 @@ | |||
1 | |||
2 | PCI Power Management | 1 | PCI Power Management |
3 | ~~~~~~~~~~~~~~~~~~~~ | ||
4 | 2 | ||
5 | An overview of the concepts and the related functions in the Linux kernel | 3 | Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. |
4 | |||
5 | An overview of concepts and the Linux kernel's interfaces related to PCI power | ||
6 | management. Based on previous work by Patrick Mochel <mochel@transmeta.com> | ||
7 | (and others). | ||
6 | 8 | ||
7 | Patrick Mochel <mochel@transmeta.com> | 9 | This document only covers the aspects of power management specific to PCI |
8 | (and others) | 10 | devices. For a general description of the kernel's interfaces related to device |
11 | power management refer to Documentation/power/devices.txt and | ||
12 | Documentation/power/runtime_pm.txt. | ||
9 | 13 | ||
10 | --------------------------------------------------------------------------- | 14 | --------------------------------------------------------------------------- |
11 | 15 | ||
12 | 1. Overview | 16 | 1. Hardware and Platform Support for PCI Power Management |
13 | 2. How the PCI Subsystem Does Power Management | 17 | 2. PCI Subsystem and Device Power Management |
14 | 3. PCI Utility Functions | 18 | 3. PCI Device Drivers and Power Management |
15 | 4. PCI Device Drivers | 19 | 4. Resources |
16 | 5. Resources | 20 | |
17 | 21 | ||
18 | 1. Overview | 22 | 1. Hardware and Platform Support for PCI Power Management |
19 | ~~~~~~~~~~~ | 23 | ========================================================= |
20 | 24 | ||
21 | The PCI Power Management Specification was introduced between the PCI 2.1 and | 25 | 1.1. Native and Platform-Based Power Management |
22 | PCI 2.2 Specifications. It a standard interface for controlling various | 26 | ----------------------------------------------- |
23 | power management operations. | 27 | In general, power management is a feature allowing one to save energy by putting |
24 | 28 | devices into states in which they draw less power (low-power states) at the | |
25 | Implementation of the PCI PM Spec is optional, as are several sub-components of | 29 | price of reduced functionality or performance. |
26 | it. If a device supports the PCI PM Spec, the device will have an 8 byte | 30 | |
27 | capability field in its PCI configuration space. This field is used to describe | 31 | Usually, a device is put into a low-power state when it is underutilized or |
28 | and control the standard PCI power management features. | 32 | completely inactive. However, when it is necessary to use the device once |
29 | 33 | again, it has to be put back into the "fully functional" state (full-power | |
30 | The PCI PM spec defines 4 operating states for devices (D0 - D3) and for buses | 34 | state). This may happen when there are some data for the device to handle or |
31 | (B0 - B3). The higher the number, the less power the device consumes. However, | 35 | as a result of an external event requiring the device to be active, which may |
32 | the higher the number, the longer the latency is for the device to return to | 36 | be signaled by the device itself. |
33 | an operational state (D0). | 37 | |
34 | 38 | PCI devices may be put into low-power states in two ways, by using the device | |
35 | There are actually two D3 states. When someone talks about D3, they usually | 39 | capabilities introduced by the PCI Bus Power Management Interface Specification, |
36 | mean D3hot, which corresponds to an ACPI D2 state (power is reduced, the | 40 | or with the help of platform firmware, such as an ACPI BIOS. In the first |
37 | device may lose some context). But they may also mean D3cold, which is an | 41 | approach, that is referred to as the native PCI power management (native PCI PM) |
38 | ACPI D3 state (power is fully off, all state was discarded); or both. | 42 | in what follows, the device power state is changed as a result of writing a |
39 | 43 | specific value into one of its standard configuration registers. The second | |
40 | Bus power management is not covered in this version of this document. | 44 | approach requires the platform firmware to provide special methods that may be |
41 | 45 | used by the kernel to change the device's power state. | |
42 | Note that all PCI devices support D0 and D3cold by default, regardless of | 46 | |
43 | whether or not they implement any of the PCI PM spec. | 47 | Devices supporting the native PCI PM usually can generate wakeup signals called |
44 | 48 | Power Management Events (PMEs) to let the kernel know about external events | |
45 | The possible state transitions that a device can undergo are: | 49 | requiring the device to be active. After receiving a PME the kernel is supposed |
46 | 50 | to put the device that sent it into the full-power state. However, the PCI Bus | |
47 | +---------------------------+ | 51 | Power Management Interface Specification doesn't define any standard method of |
48 | | Current State | New State | | 52 | delivering the PME from the device to the CPU and the operating system kernel. |
49 | +---------------------------+ | 53 | It is assumed that the platform firmware will perform this task and therefore, |
50 | | D0 | D1, D2, D3| | 54 | even though a PCI device is set up to generate PMEs, it also may be necessary to |
51 | +---------------------------+ | 55 | prepare the platform firmware for notifying the CPU of the PMEs coming from the |
52 | | D1 | D2, D3 | | 56 | device (e.g. by generating interrupts). |
53 | +---------------------------+ | 57 | |
54 | | D2 | D3 | | 58 | In turn, if the methods provided by the platform firmware are used for changing |
55 | +---------------------------+ | 59 | the power state of a device, usually the platform also provides a method for |
56 | | D1, D2, D3 | D0 | | 60 | preparing the device to generate wakeup signals. In that case, however, it |
57 | +---------------------------+ | 61 | often also is necessary to prepare the device for generating PMEs using the |
58 | 62 | native PCI PM mechanism, because the method provided by the platform depends on | |
59 | Note that when the system is entering a global suspend state, all devices will | 63 | that. |
60 | be placed into D3 and when resuming, all devices will be placed into D0. | 64 | |
61 | However, when the system is running, other state transitions are possible. | 65 | Thus in many situations both the native and the platform-based power management |
62 | 66 | mechanisms have to be used simultaneously to obtain the desired result. | |
63 | 2. How The PCI Subsystem Handles Power Management | 67 | |
64 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 68 | 1.2. Native PCI Power Management |
65 | 69 | -------------------------------- | |
66 | The PCI suspend/resume functionality is accessed indirectly via the Power | 70 | The PCI Bus Power Management Interface Specification (PCI PM Spec) was |
67 | Management subsystem. At boot, the PCI driver registers a power management | 71 | introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a |
68 | callback with that layer. Upon entering a suspend state, the PM layer iterates | 72 | standard interface for performing various operations related to power |
69 | through all of its registered callbacks. This currently takes place only during | 73 | management. |
70 | APM state transitions. | 74 | |
71 | 75 | The implementation of the PCI PM Spec is optional for conventional PCI devices, | |
72 | Upon going to sleep, the PCI subsystem walks its device tree twice. Both times, | 76 | but it is mandatory for PCI Express devices. If a device supports the PCI PM |
73 | it does a depth first walk of the device tree. The first walk saves each of the | 77 | Spec, it has an 8 byte power management capability field in its PCI |
74 | device's state and checks for devices that will prevent the system from entering | 78 | configuration space. This field is used to describe and control the standard |
75 | a global power state. The next walk then places the devices in a low power | 79 | features related to the native PCI power management. |
80 | |||
81 | The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses | ||
82 | (B0-B3). The higher the number, the less power is drawn by the device or bus | ||
83 | in that state. However, the higher the number, the longer the latency for | ||
84 | the device or bus to return to the full-power state (D0 or B0, respectively). | ||
85 | |||
86 | There are two variants of the D3 state defined by the specification. The first | ||
87 | one is D3hot, referred to as the software accessible D3, because devices can be | ||
88 | programmed to go into it. The second one, D3cold, is the state that PCI devices | ||
89 | are in when the supply voltage (Vcc) is removed from them. It is not possible | ||
90 | to program a PCI device to go into D3cold, although there may be a programmable | ||
91 | interface for putting the bus the device is on into a state in which Vcc is | ||
92 | removed from all devices on the bus. | ||
93 | |||
94 | PCI bus power management, however, is not supported by the Linux kernel at the | ||
95 | time of this writing and therefore it is not covered by this document. | ||
96 | |||
97 | Note that every PCI device can be in the full-power state (D0) or in D3cold, | ||
98 | regardless of whether or not it implements the PCI PM Spec. In addition to | ||
99 | that, if the PCI PM Spec is implemented by the device, it must support D3hot | ||
100 | as well as D0. The support for the D1 and D2 power states is optional. | ||
101 | |||
102 | PCI devices supporting the PCI PM Spec can be programmed to go to any of the | ||
103 | supported low-power states (except for D3cold). While in D1-D3hot the | ||
104 | standard configuration registers of the device must be accessible to software | ||
105 | (i.e. the device is required to respond to PCI configuration accesses), although | ||
106 | its I/O and memory spaces are then disabled. This allows the device to be | ||
107 | programmatically put into D0. Thus the kernel can switch the device back and | ||
108 | forth between D0 and the supported low-power states (except for D3cold) and the | ||
109 | possible power state transitions the device can undergo are the following: | ||
110 | |||
111 | +----------------------------+ | ||
112 | | Current State | New State | | ||
113 | +----------------------------+ | ||
114 | | D0 | D1, D2, D3 | | ||
115 | +----------------------------+ | ||
116 | | D1 | D2, D3 | | ||
117 | +----------------------------+ | ||
118 | | D2 | D3 | | ||
119 | +----------------------------+ | ||
120 | | D1, D2, D3 | D0 | | ||
121 | +----------------------------+ | ||
122 | |||
123 | The transition from D3cold to D0 occurs when the supply voltage is provided to | ||
124 | the device (i.e. power is restored). In that case the device returns to D0 with | ||
125 | a full power-on reset sequence and the power-on defaults are restored to the | ||
126 | device by hardware just as at initial power up. | ||
127 | |||
128 | PCI devices supporting the PCI PM Spec can be programmed to generate PMEs | ||
129 | while in a low-power state (D1-D3), but they are not required to be capable | ||
130 | of generating PMEs from all supported low-power states. In particular, the | ||
131 | capability of generating PMEs from D3cold is optional and depends on the | ||
132 | presence of additional voltage (3.3Vaux) allowing the device to remain | ||
133 | sufficiently active to generate a wakeup signal. | ||
134 | |||
135 | 1.3. ACPI Device Power Management | ||
136 | --------------------------------- | ||
137 | The platform firmware support for the power management of PCI devices is | ||
138 | system-specific. However, if the system in question is compliant with the | ||
139 | Advanced Configuration and Power Interface (ACPI) Specification, like the | ||
140 | majority of x86-based systems, it is supposed to implement device power | ||
141 | management interfaces defined by the ACPI standard. | ||
142 | |||
143 | For this purpose the ACPI BIOS provides special functions called "control | ||
144 | methods" that may be executed by the kernel to perform specific tasks, such as | ||
145 | putting a device into a low-power state. These control methods are encoded | ||
146 | using special byte-code language called the ACPI Machine Language (AML) and | ||
147 | stored in the machine's BIOS. The kernel loads them from the BIOS and executes | ||
148 | them as needed using an AML interpreter that translates the AML byte code into | ||
149 | computations and memory or I/O space accesses. This way, in theory, a BIOS | ||
150 | writer can provide the kernel with a means to perform actions depending | ||
151 | on the system design in a system-specific fashion. | ||
152 | |||
153 | ACPI control methods may be divided into global control methods, that are not | ||
154 | associated with any particular devices, and device control methods, that have | ||
155 | to be defined separately for each device supposed to be handled with the help of | ||
156 | the platform. This means, in particular, that ACPI device control methods can | ||
157 | only be used to handle devices that the BIOS writer knew about in advance. The | ||
158 | ACPI methods used for device power management fall into that category. | ||
159 | |||
160 | The ACPI specification assumes that devices can be in one of four power states | ||
161 | labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM | ||
162 | D0-D3 states (although the difference between D3hot and D3cold is not taken | ||
163 | into account by ACPI). Moreover, for each power state of a device there is a | ||
164 | set of power resources that have to be enabled for the device to be put into | ||
165 | that state. These power resources are controlled (i.e. enabled or disabled) | ||
166 | with the help of their own control methods, _ON and _OFF, that have to be | ||
167 | defined individually for each of them. | ||
168 | |||
169 | To put a device into the ACPI power state Dx (where x is a number between 0 and | ||
170 | 3 inclusive) the kernel is supposed to (1) enable the power resources required | ||
171 | by the device in this state using their _ON control methods and (2) execute the | ||
172 | _PSx control method defined for the device. In addition to that, if the device | ||
173 | is going to be put into a low-power state (D1-D3) and is supposed to generate | ||
174 | wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI | ||
175 | 3.0) control method defined for it has to be executed before _PSx. Power | ||
176 | resources that are not required by the device in the target power state and are | ||
177 | not required any more by any other device should be disabled (by executing their | ||
178 | _OFF control methods). If the current power state of the device is D3, it can | ||
179 | only be put into D0 this way. | ||
180 | |||
181 | However, quite often the power states of devices are changed during a | ||
182 | system-wide transition into a sleep state or back into the working state. ACPI | ||
183 | defines four system sleep states, S1, S2, S3, and S4, and denotes the system | ||
184 | working state as S0. In general, the target system sleep (or working) state | ||
185 | determines the highest power (lowest number) state the device can be put | ||
186 | into and the kernel is supposed to obtain this information by executing the | ||
187 | device's _SxD control method (where x is a number between 0 and 4 inclusive). | ||
188 | If the device is required to wake up the system from the target sleep state, the | ||
189 | lowest power (highest number) state it can be put into is also determined by the | ||
190 | target state of the system. The kernel is then supposed to use the device's | ||
191 | _SxW control method to obtain the number of that state. It also is supposed to | ||
192 | use the device's _PRW control method to learn which power resources need to be | ||
193 | enabled for the device to be able to generate wakeup signals. | ||
194 | |||
195 | 1.4. Wakeup Signaling | ||
196 | --------------------- | ||
197 | Wakeup signals generated by PCI devices, either as native PCI PMEs, or as | ||
198 | a result of the execution of the _DSW (or _PSW) ACPI control method before | ||
199 | putting the device into a low-power state, have to be caught and handled as | ||
200 | appropriate. If they are sent while the system is in the working state | ||
201 | (ACPI S0), they should be translated into interrupts so that the kernel can | ||
202 | put the devices generating them into the full-power state and take care of the | ||
203 | events that triggered them. In turn, if they are sent while the system is | ||
204 | sleeping, they should cause the system's core logic to trigger wakeup. | ||
205 | |||
206 | On ACPI-based systems wakeup signals sent by conventional PCI devices are | ||
207 | converted into ACPI General-Purpose Events (GPEs) which are hardware signals | ||
208 | from the system core logic generated in response to various events that need to | ||
209 | be acted upon. Every GPE is associated with one or more sources of potentially | ||
210 | interesting events. In particular, a GPE may be associated with a PCI device | ||
211 | capable of signaling wakeup. The information on the connections between GPEs | ||
212 | and event sources is recorded in the system's ACPI BIOS from where it can be | ||
213 | read by the kernel. | ||
214 | |||
215 | If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE | ||
216 | associated with it (if there is one) is triggered. The GPEs associated with PCI | ||
217 | bridges may also be triggered in response to a wakeup signal from one of the | ||
218 | devices below the bridge (this also is the case for root bridges) and, for | ||
219 | example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be | ||
220 | handled this way. | ||
221 | |||
222 | A GPE may be triggered when the system is sleeping (i.e. when it is in one of | ||
223 | the ACPI S1-S4 states), in which case system wakeup is started by its core logic | ||
224 | (the device that was the source of the signal causing the system wakeup to occur | ||
225 | may be identified later). The GPEs used in such situations are referred to as | ||
226 | wakeup GPEs. | ||
227 | |||
228 | Usually, however, GPEs are also triggered when the system is in the working | ||
229 | state (ACPI S0) and in that case the system's core logic generates a System | ||
230 | Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI | ||
231 | handler identifies the GPE that caused the interrupt to be generated which, | ||
232 | in turn, allows the kernel to identify the source of the event (that may be | ||
233 | a PCI device signaling wakeup). The GPEs used for notifying the kernel of | ||
234 | events occurring while the system is in the working state are referred to as | ||
235 | runtime GPEs. | ||
236 | |||
237 | Unfortunately, there is no standard way of handling wakeup signals sent by | ||
238 | conventional PCI devices on systems that are not ACPI-based, but there is one | ||
239 | for PCI Express devices. Namely, the PCI Express Base Specification introduced | ||
240 | a native mechanism for converting native PCI PMEs into interrupts generated by | ||
241 | root ports. For conventional PCI devices native PMEs are out-of-band, so they | ||
242 | are routed separately and they need not pass through bridges (in principle they | ||
243 | may be routed directly to the system's core logic), but for PCI Express devices | ||
244 | they are in-band messages that have to pass through the PCI Express hierarchy, | ||
245 | including the root port on the path from the device to the Root Complex. Thus | ||
246 | it was possible to introduce a mechanism by which a root port generates an | ||
247 | interrupt whenever it receives a PME message from one of the devices below it. | ||
248 | The PCI Express Requester ID of the device that sent the PME message is then | ||
249 | recorded in one of the root port's configuration registers from where it may be | ||
250 | read by the interrupt handler allowing the device to be identified. [PME | ||
251 | messages sent by PCI Express endpoints integrated with the Root Complex don't | ||
252 | pass through root ports, but instead they cause a Root Complex Event Collector | ||
253 | (if there is one) to generate interrupts.] | ||
254 | |||
255 | In principle the native PCI Express PME signaling may also be used on ACPI-based | ||
256 | systems along with the GPEs, but to use it the kernel has to ask the system's | ||
257 | ACPI BIOS to release control of root port configuration registers. The ACPI | ||
258 | BIOS, however, is not required to allow the kernel to control these registers | ||
259 | and if it doesn't do that, the kernel must not modify their contents. Of course | ||
260 | the native PCI Express PME signaling cannot be used by the kernel in that case. | ||
261 | |||
262 | |||
263 | 2. PCI Subsystem and Device Power Management | ||
264 | ============================================ | ||
265 | |||
266 | 2.1. Device Power Management Callbacks | ||
267 | -------------------------------------- | ||
268 | The PCI Subsystem participates in the power management of PCI devices in a | ||
269 | number of ways. First of all, it provides an intermediate code layer between | ||
270 | the device power management core (PM core) and PCI device drivers. | ||
271 | Specifically, the pm field of the PCI subsystem's struct bus_type object, | ||
272 | pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing | ||
273 | pointers to several device power management callbacks: | ||
274 | |||
275 | const struct dev_pm_ops pci_dev_pm_ops = { | ||
276 | .prepare = pci_pm_prepare, | ||
277 | .complete = pci_pm_complete, | ||
278 | .suspend = pci_pm_suspend, | ||
279 | .resume = pci_pm_resume, | ||
280 | .freeze = pci_pm_freeze, | ||
281 | .thaw = pci_pm_thaw, | ||
282 | .poweroff = pci_pm_poweroff, | ||
283 | .restore = pci_pm_restore, | ||
284 | .suspend_noirq = pci_pm_suspend_noirq, | ||
285 | .resume_noirq = pci_pm_resume_noirq, | ||
286 | .freeze_noirq = pci_pm_freeze_noirq, | ||
287 | .thaw_noirq = pci_pm_thaw_noirq, | ||
288 | .poweroff_noirq = pci_pm_poweroff_noirq, | ||
289 | .restore_noirq = pci_pm_restore_noirq, | ||
290 | .runtime_suspend = pci_pm_runtime_suspend, | ||
291 | .runtime_resume = pci_pm_runtime_resume, | ||
292 | .runtime_idle = pci_pm_runtime_idle, | ||
293 | }; | ||
294 | |||
295 | These callbacks are executed by the PM core in various situations related to | ||
296 | device power management and they, in turn, execute power management callbacks | ||
297 | provided by PCI device drivers. They also perform power management operations | ||
298 | involving some standard configuration registers of PCI devices that device | ||
299 | drivers need not know or care about. | ||
300 | |||
301 | The structure representing a PCI device, struct pci_dev, contains several fields | ||
302 | that these callbacks operate on: | ||
303 | |||
304 | struct pci_dev { | ||
305 | ... | ||
306 | pci_power_t current_state; /* Current operating state. */ | ||
307 | int pm_cap; /* PM capability offset in the | ||
308 | configuration space */ | ||
309 | unsigned int pme_support:5; /* Bitmask of states from which PME# | ||
310 | can be generated */ | ||
311 | unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ | ||
312 | unsigned int d1_support:1; /* Low power state D1 is supported */ | ||
313 | unsigned int d2_support:1; /* Low power state D2 is supported */ | ||
314 | unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ | ||
315 | unsigned int wakeup_prepared:1; /* Device prepared for wake up */ | ||
316 | unsigned int d3_delay; /* D3->D0 transition time in ms */ | ||
317 | ... | ||
318 | }; | ||
319 | |||
320 | They also indirectly use some fields of the struct device that is embedded in | ||
321 | struct pci_dev. | ||
322 | |||
323 | 2.2. Device Initialization | ||
324 | -------------------------- | ||
325 | The PCI subsystem's first task related to device power management is to | ||
326 | prepare the device for power management and initialize the fields of struct | ||
327 | pci_dev used for this purpose. This happens in two functions defined in | ||
328 | drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). | ||
329 | |||
330 | The first of these functions checks if the device supports native PCI PM | ||
331 | and if that's the case the offset of its power management capability structure | ||
332 | in the configuration space is stored in the pm_cap field of the device's struct | ||
333 | pci_dev object. Next, the function checks which PCI low-power states are | ||
334 | supported by the device and from which low-power states the device can generate | ||
335 | native PCI PMEs. The power management fields of the device's struct pci_dev and | ||
336 | the struct device embedded in it are updated accordingly and the generation of | ||
337 | PMEs by the device is disabled. | ||
338 | |||
339 | The second function checks if the device can be prepared to signal wakeup with | ||
340 | the help of the platform firmware, such as the ACPI BIOS. If that is the case, | ||
341 | the function updates the wakeup fields in struct device embedded in the | ||
342 | device's struct pci_dev and uses the firmware-provided method to prevent the | ||
343 | device from signaling wakeup. | ||
344 | |||
345 | At this point the device is ready for power management. For driverless devices, | ||
346 | however, this functionality is limited to a few basic operations carried out | ||
347 | during system-wide transitions to a sleep state and back to the working state. | ||
348 | |||
349 | 2.3. Runtime Device Power Management | ||
350 | ------------------------------------ | ||
351 | The PCI subsystem plays a vital role in the runtime power management of PCI | ||
352 | devices. For this purpose it uses the general runtime power management | ||
353 | (runtime PM) framework described in Documentation/power/runtime_pm.txt. | ||
354 | Namely, it provides subsystem-level callbacks: | ||
355 | |||
356 | pci_pm_runtime_suspend() | ||
357 | pci_pm_runtime_resume() | ||
358 | pci_pm_runtime_idle() | ||
359 | |||
360 | that are executed by the core runtime PM routines. It also implements the | ||
361 | entire mechanics necessary for handling runtime wakeup signals from PCI devices | ||
362 | in low-power states, which at the time of this writing works for both the native | ||
363 | PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in | ||
364 | Section 1. | ||
365 | |||
366 | First, a PCI device is put into a low-power state, or suspended, with the help | ||
367 | of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call | ||
368 | pci_pm_runtime_suspend() to do the actual job. For this to work, the device's | ||
369 | driver has to provide a pm->runtime_suspend() callback (see below), which is | ||
370 | run by pci_pm_runtime_suspend() as the first action. If the driver's callback | ||
371 | returns successfully, the device's standard configuration registers are saved, | ||
372 | the device is prepared to generate wakeup signals and, finally, it is put into | ||
373 | the target low-power state. | ||
374 | |||
375 | The low-power state to put the device into is the lowest-power (highest number) | ||
376 | state from which it can signal wakeup. The exact method of signaling wakeup is | ||
377 | system-dependent and is determined by the PCI subsystem on the basis of the | ||
378 | reported capabilities of the device and the platform firmware. To prepare the | ||
379 | device for signaling wakeup and put it into the selected low-power state, the | ||
380 | PCI subsystem can use the platform firmware as well as the device's native PCI | ||
381 | PM capabilities, if supported. | ||
382 | |||
383 | It is expected that the device driver's pm->runtime_suspend() callback will | ||
384 | not attempt to prepare the device for signaling wakeup or to put it into a | ||
385 | low-power state. The driver ought to leave these tasks to the PCI subsystem | ||
386 | that has all of the information necessary to perform them. | ||
387 | |||
388 | A suspended device is brought back into the "active" state, or resumed, | ||
389 | with the help of pm_request_resume() or pm_runtime_resume() which both call | ||
390 | pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's | ||
391 | driver provides a pm->runtime_resume() callback (see below). However, before | ||
392 | the driver's callback is executed, pci_pm_runtime_resume() brings the device | ||
393 | back into the full-power state, prevents it from signaling wakeup while in that | ||
394 | state and restores its standard configuration registers. Thus the driver's | ||
395 | callback need not worry about the PCI-specific aspects of the device resume. | ||
396 | |||
397 | Note that generally pci_pm_runtime_resume() may be called in two different | ||
398 | situations. First, it may be called at the request of the device's driver, for | ||
399 | example if there are some data for it to process. Second, it may be called | ||
400 | as a result of a wakeup signal from the device itself (this sometimes is | ||
401 | referred to as "remote wakeup"). Of course, for this purpose the wakeup signal | ||
402 | is handled in one of the ways described in Section 1 and finally converted into | ||
403 | a notification for the PCI subsystem after the source device has been | ||
404 | identified. | ||
405 | |||
406 | The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle() | ||
407 | and pm_request_idle(), executes the device driver's pm->runtime_idle() | ||
408 | callback, if defined, and if that callback doesn't return an error code (or is | ||
409 | not present at all), suspends the device with the help of pm_runtime_suspend(). | ||
410 | Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for | ||
411 | example, it is called right after the device has just been resumed), in which | ||
412 | case it is expected to suspend the device if that makes sense. Usually, | ||
413 | however, the PCI subsystem doesn't really know if the device can be | ||
414 | suspended, so it lets the device's driver decide by running its | ||
415 | pm->runtime_idle() callback. | ||
416 | |||
417 | 2.4. System-Wide Power Transitions | ||
418 | ---------------------------------- | ||
419 | There are a few different types of system-wide power transitions, described in | ||
420 | Documentation/power/devices.txt. Each of them requires devices to be handled | ||
421 | in a specific way and the PM core executes subsystem-level power management | ||
422 | callbacks for this purpose. They are executed in phases such that each phase | ||
423 | involves executing the same subsystem-level callback for every device belonging | ||
424 | to the given subsystem before the next phase begins. These phases always run | ||
425 | after tasks have been frozen. | ||
426 | |||
427 | 2.4.1. System Suspend | ||
428 | |||
429 | When the system is going into a sleep state in which the contents of memory will | ||
430 | be preserved, such as one of the ACPI sleep states S1-S3, the phases are: | ||
431 | |||
432 | prepare, suspend, suspend_noirq. | ||
433 | |||
434 | The following PCI bus type's callbacks, respectively, are used in these phases: | ||
435 | |||
436 | pci_pm_prepare() | ||
437 | pci_pm_suspend() | ||
438 | pci_pm_suspend_noirq() | ||
439 | |||
440 | The pci_pm_prepare() routine first puts the device into the "fully functional" | ||
441 | state with the help of pm_runtime_resume(). Then, it executes the device | ||
442 | driver's pm->prepare() callback if defined (i.e. if the driver's struct | ||
443 | dev_pm_ops object is present and the prepare pointer in that object is valid). | ||
444 | |||
445 | The pci_pm_suspend() routine first checks if the device's driver implements | ||
446 | legacy PCI suspend routines (see Section 3), in which case the driver's legacy | ||
447 | suspend callback is executed, if present, and its result is returned. Next, if | ||
448 | the device's driver doesn't provide a struct dev_pm_ops object (containing | ||
449 | pointers to the driver's callbacks), pci_pm_default_suspend() is called, which | ||
450 | simply turns off the device's bus master capability and runs | ||
451 | pcibios_disable_device() to disable it, unless the device is a bridge (PCI | ||
452 | bridges are ignored by this routine). Next, the device driver's pm->suspend() | ||
453 | callback is executed, if defined, and its result is returned if it fails. | ||
454 | Finally, pci_fixup_device() is called to apply hardware suspend quirks related | ||
455 | to the device if necessary. | ||
456 | |||
457 | Note that the suspend phase is carried out asynchronously for PCI devices, so | ||
458 | the pci_pm_suspend() callback may be executed in parallel for any pair of PCI | ||
459 | devices that don't depend on each other in a known way (i.e. none of the paths | ||
460 | in the device tree from the root bridge to a leaf device contains both of them). | ||
461 | |||
462 | The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has | ||
463 | been called, which means that the device driver's interrupt handler won't be | ||
464 | invoked while this routine is running. It first checks if the device's driver | ||
465 | implements legacy PCI suspend routines (Section 3), in which case the legacy | ||
466 | late suspend routine is called and its result is returned (the standard | ||
467 | configuration registers of the device are saved if the driver's callback hasn't | ||
468 | done that). Second, if the device driver's struct dev_pm_ops object is not | ||
469 | present, the device's standard configuration registers are saved and the routine | ||
470 | returns success. Otherwise the device driver's pm->suspend_noirq() callback is | ||
471 | executed, if present, and its result is returned if it fails. Next, if the | ||
472 | device's standard configuration registers haven't been saved yet (one of the | ||
473 | device driver's callbacks executed before might do that), pci_pm_suspend_noirq() | ||
474 | saves them, prepares the device to signal wakeup (if necessary) and puts it into | ||
475 | a low-power state. | ||
476 | |||
477 | The low-power state to put the device into is the lowest-power (highest number) | ||
478 | state from which it can signal wakeup while the system is in the target sleep | ||
479 | state. Just like in the runtime PM case described above, the mechanism of | ||
480 | signaling wakeup is system-dependent and determined by the PCI subsystem, which | ||
481 | is also responsible for preparing the device to signal wakeup from the system's | ||
482 | target sleep state as appropriate. | ||
483 | |||
484 | PCI device drivers (that don't implement legacy power management callbacks) are | ||
485 | generally not expected to prepare devices for signaling wakeup or to put them | ||
486 | into low-power states. However, if one of the driver's suspend callbacks | ||
487 | (pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration | ||
488 | registers, pci_pm_suspend_noirq() will assume that the device has been prepared | ||
489 | to signal wakeup and put into a low-power state by the driver (the driver is | ||
490 | then assumed to have used the helper functions provided by the PCI subsystem for | ||
491 | this purpose). PCI device drivers are not encouraged to do that, but in some | ||
492 | rare cases doing that in the driver may be the optimum approach. | ||
493 | |||
494 | 2.4.2. System Resume | ||
495 | |||
496 | When the system is undergoing a transition from a sleep state in which the | ||
497 | contents of memory have been preserved, such as one of the ACPI sleep states | ||
498 | S1-S3, into the working state (ACPI S0), the phases are: | ||
499 | |||
500 | resume_noirq, resume, complete. | ||
501 | |||
502 | The following PCI bus type's callbacks, respectively, are executed in these | ||
503 | phases: | ||
504 | |||
505 | pci_pm_resume_noirq() | ||
506 | pci_pm_resume() | ||
507 | pci_pm_complete() | ||
508 | |||
509 | The pci_pm_resume_noirq() routine first puts the device into the full-power | ||
510 | state, restores its standard configuration registers and applies early resume | ||
511 | hardware quirks related to the device, if necessary. This is done | ||
512 | unconditionally, regardless of whether or not the device's driver implements | ||
513 | legacy PCI power management callbacks (this way all PCI devices are in the | ||
514 | full-power state and their standard configuration registers have been restored | ||
515 | when their interrupt handlers are invoked for the first time during resume, | ||
516 | which allows the kernel to avoid problems with the handling of shared interrupts | ||
517 | by drivers whose devices are still suspended). If legacy PCI power management | ||
518 | callbacks (see Section 3) are implemented by the device's driver, the legacy | ||
519 | early resume callback is executed and its result is returned. Otherwise, the | ||
520 | device driver's pm->resume_noirq() callback is executed, if defined, and its | ||
521 | result is returned. | ||
522 | |||
523 | The pci_pm_resume() routine first checks if the device's standard configuration | ||
524 | registers have been restored and restores them if that's not the case (this | ||
525 | only is necessary in the error path during a failing suspend). Next, resume | ||
526 | hardware quirks related to the device are applied, if necessary, and if the | ||
527 | device's driver implements legacy PCI power management callbacks (see | ||
528 | Section 3), the driver's legacy resume callback is executed and its result is | ||
529 | returned. Otherwise, the device's wakeup signaling mechanisms are blocked and | ||
530 | its driver's pm->resume() callback is executed, if defined (the callback's | ||
531 | result is then returned). | ||
532 | |||
533 | The resume phase is carried out asynchronously for PCI devices, like the | ||
534 | suspend phase described above, which means that if two PCI devices don't depend | ||
535 | on each other in a known way, the pci_pm_resume() routine may be executed for | ||
536 | both of them in parallel. | ||
537 | |||
538 | The pci_pm_complete() routine only executes the device driver's pm->complete() | ||
539 | callback, if defined. | ||
540 | |||
541 | 2.4.3. System Hibernation | ||
542 | |||
543 | System hibernation is more complicated than system suspend, because it requires | ||
544 | a system image to be created and written into a persistent storage medium. The | ||
545 | image is created atomically and all devices are quiesced, or frozen, before that | ||
546 | happens. | ||
547 | |||
548 | The freezing of devices is carried out after enough memory has been freed (at | ||
549 | the time of this writing the image creation requires at least 50% of system RAM | ||
550 | to be free) in the following three phases: | ||
551 | |||
552 | prepare, freeze, freeze_noirq | ||
553 | |||
554 | that correspond to the PCI bus type's callbacks: | ||
555 | |||
556 | pci_pm_prepare() | ||
557 | pci_pm_freeze() | ||
558 | pci_pm_freeze_noirq() | ||
559 | |||
560 | This means that the prepare phase is exactly the same as for system suspend. | ||
561 | The other two phases, however, are different. | ||
562 | |||
563 | The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs | ||
564 | the device driver's pm->freeze() callback, if defined, instead of pm->suspend(), | ||
565 | and it doesn't apply the suspend-related hardware quirks. It is executed | ||
566 | asynchronously for different PCI devices that don't depend on each other in a | ||
567 | known way. | ||
568 | |||
569 | The pci_pm_freeze_noirq() routine, in turn, is similar to | ||
570 | pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq() | ||
571 | routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the | ||
572 | device for signaling wakeup and put it into a low-power state. Still, it saves | ||
573 | the device's standard configuration registers if they haven't been saved by one | ||
574 | of the driver's callbacks. | ||
575 | |||
576 | Once the image has been created, it has to be saved. However, at this point all | ||
577 | devices are frozen and they cannot handle I/O, while their ability to handle | ||
578 | I/O is obviously necessary for the image saving. Thus they have to be brought | ||
579 | back to the fully functional state and this is done in the following phases: | ||
580 | |||
581 | thaw_noirq, thaw, complete | ||
582 | |||
583 | using the following PCI bus type's callbacks: | ||
584 | |||
585 | pci_pm_thaw_noirq() | ||
586 | pci_pm_thaw() | ||
587 | pci_pm_complete() | ||
588 | |||
589 | respectively. | ||
590 | |||
591 | The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(), | ||
592 | but it doesn't put the device into the full power state and doesn't attempt to | ||
593 | restore its standard configuration registers. It also executes the device | ||
594 | driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq(). | ||
595 | |||
596 | The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device | ||
597 | driver's pm->thaw() callback instead of pm->resume(). It is executed | ||
598 | asynchronously for different PCI devices that don't depend on each other in a | ||
599 | known way. | ||
600 | |||
601 | The complete phase is the same as for system resume. | ||
602 | |||
603 | After saving the image, devices need to be powered down before the system can | ||
604 | enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in | ||
605 | three phases: | ||
606 | |||
607 | prepare, poweroff, poweroff_noirq | ||
608 | |||
609 | where the prepare phase is exactly the same as for system suspend. The other | ||
610 | two phases are analogous to the suspend and suspend_noirq phases, respectively. | ||
611 | The PCI subsystem-level callbacks they correspond to | ||
612 | |||
613 | pci_pm_poweroff() | ||
614 | pci_pm_poweroff_noirq() | ||
615 | |||
616 | work in analogy with pci_pm_suspend() and pci_pm_suspend_noirq(), respectively, | ||
617 | although they don't attempt to save the device's standard configuration | ||
618 | registers. | ||
619 | |||
620 | 2.4.4. System Restore | ||
621 | |||
622 | System restore requires a hibernation image to be loaded into memory and the | ||
623 | pre-hibernation memory contents to be restored before the pre-hibernation system | ||
624 | activity can be resumed. | ||
625 | |||
626 | As described in Documentation/power/devices.txt, the hibernation image is loaded | ||
627 | into memory by a fresh instance of the kernel, called the boot kernel, which in | ||
628 | turn is loaded and run by a boot loader in the usual way. After the boot kernel | ||
629 | has loaded the image, it needs to replace its own code and data with the code | ||
630 | and data of the "hibernated" kernel stored within the image, called the image | ||
631 | kernel. For this purpose all devices are frozen just like before creating | ||
632 | the image during hibernation, in the | ||
633 | |||
634 | prepare, freeze, freeze_noirq | ||
635 | |||
636 | phases described above. However, the devices affected by these phases are only | ||
637 | those having drivers in the boot kernel; other devices will still be in whatever | ||
638 | state the boot loader left them. | ||
639 | |||
640 | Should the restoration of the pre-hibernation memory contents fail, the boot | ||
641 | kernel would go through the "thawing" procedure described above, using the | ||
642 | thaw_noirq, thaw, and complete phases (that will only affect the devices having | ||
643 | drivers in the boot kernel), and then continue running normally. | ||
644 | |||
645 | If the pre-hibernation memory contents are restored successfully, which is the | ||
646 | usual situation, control is passed to the image kernel, which then becomes | ||
647 | responsible for bringing the system back to the working state. To achieve this, | ||
648 | it must restore the devices' pre-hibernation functionality, which is done much | ||
649 | like waking up from the memory sleep state, although it involves different | ||
650 | phases: | ||
651 | |||
652 | restore_noirq, restore, complete | ||
653 | |||
654 | The first two of these are analogous to the resume_noirq and resume phases | ||
655 | described above, respectively, and correspond to the following PCI subsystem | ||
656 | callbacks: | ||
657 | |||
658 | pci_pm_restore_noirq() | ||
659 | pci_pm_restore() | ||
660 | |||
661 | These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(), | ||
662 | respectively, but they execute the device driver's pm->restore_noirq() and | ||
663 | pm->restore() callbacks, if available. | ||
664 | |||
665 | The complete phase is carried out in exactly the same way as during system | ||
666 | resume. | ||
667 | |||
668 | |||
669 | 3. PCI Device Drivers and Power Management | ||
670 | ========================================== | ||
671 | |||
672 | 3.1. Power Management Callbacks | ||
673 | ------------------------------- | ||
674 | PCI device drivers participate in power management by providing callbacks to be | ||
675 | executed by the PCI subsystem's power management routines described above and by | ||
676 | controlling the runtime power management of their devices. | ||
677 | |||
678 | At the time of this writing there are two ways to define power management | ||
679 | callbacks for a PCI device driver, the recommended one, based on using a | ||
680 | dev_pm_ops structure described in Documentation/power/devices.txt, and the | ||
681 | "legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and | ||
682 | .resume() callbacks from struct pci_driver are used. The legacy approach, | ||
683 | however, doesn't allow one to define runtime power management callbacks and is | ||
684 | not really suitable for any new drivers. Therefore it is not covered by this | ||
685 | document (refer to the source code to learn more about it). | ||
686 | |||
687 | It is recommended that all PCI device drivers define a struct dev_pm_ops object | ||
688 | containing pointers to power management (PM) callbacks that will be executed by | ||
689 | the PCI subsystem's PM routines in various circumstances. A pointer to the | ||
690 | driver's struct dev_pm_ops object has to be assigned to the driver.pm field in | ||
691 | its struct pci_driver object. Once that has happened, the "legacy" PM callbacks | ||
692 | in struct pci_driver are ignored (even if they are not NULL). | ||
693 | |||
694 | The PM callbacks in struct dev_pm_ops are not mandatory and if they are not | ||
695 | defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI | ||
696 | subsystem will handle the device in a simplified default manner. If they are | ||
697 | defined, though, they are expected to behave as described in the following | ||
698 | subsections. | ||
699 | |||
700 | 3.1.1. prepare() | ||
701 | |||
702 | The prepare() callback is executed during system suspend, during hibernation | ||
703 | (when a hibernation image is about to be created), during power-off after | ||
704 | saving a hibernation image and during system restore, when a hibernation image | ||
705 | has just been loaded into memory. | ||
706 | |||
707 | This callback is only necessary if the driver's device has children that in | ||
708 | general may be registered at any time. In that case the role of the prepare() | ||
709 | callback is to prevent new children of the device from being registered until | ||
710 | one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run. | ||
711 | |||
712 | In addition to that the prepare() callback may carry out some operations | ||
713 | preparing the device to be suspended, although it should not allocate memory | ||
714 | (if additional memory is required to suspend the device, it has to be | ||
715 | preallocated earlier, for example in a suspend/hibernate notifier as described | ||
716 | in Documentation/power/notifiers.txt). | ||
717 | |||
718 | 3.1.2. suspend() | ||
719 | |||
720 | The suspend() callback is only executed during system suspend, after prepare() | ||
721 | callbacks have been executed for all devices in the system. | ||
722 | |||
723 | This callback is expected to quiesce the device and prepare it to be put into a | ||
724 | low-power state by the PCI subsystem. It is not required (in fact it even is | ||
725 | not recommended) that a PCI driver's suspend() callback save the standard | ||
726 | configuration registers of the device, prepare it for waking up the system, or | ||
727 | put it into a low-power state. All of these operations can very well be taken | ||
728 | care of by the PCI subsystem, without the driver's participation. | ||
729 | |||
730 | However, in some rare cases it is convenient to carry out these operations in | ||
731 | a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and | ||
732 | pci_set_power_state() should be used to save the device's standard configuration | ||
733 | registers, to prepare it for system wakeup (if necessary), and to put it into a | ||
734 | low-power state, respectively. Moreover, if the driver calls pci_save_state(), | ||
735 | the PCI subsystem will not execute either pci_prepare_to_sleep(), or | ||
736 | pci_set_power_state() for its device, so the driver is then responsible for | ||
737 | handling the device as appropriate. | ||
738 | |||
739 | While the suspend() callback is being executed, the driver's interrupt handler | ||
740 | can be invoked to handle an interrupt from the device, so all suspend-related | ||
741 | operations relying on the driver's ability to handle interrupts should be | ||
742 | carried out in this callback. | ||
743 | |||
744 | 3.1.3. suspend_noirq() | ||
745 | |||
746 | The suspend_noirq() callback is only executed during system suspend, after | ||
747 | suspend() callbacks have been executed for all devices in the system and | ||
748 | after device interrupts have been disabled by the PM core. | ||
749 | |||
750 | The difference between suspend_noirq() and suspend() is that the driver's | ||
751 | interrupt handler will not be invoked while suspend_noirq() is running. Thus | ||
752 | suspend_noirq() can carry out operations that would cause race conditions to | ||
753 | arise if they were performed in suspend(). | ||
754 | |||
755 | 3.1.4. freeze() | ||
756 | |||
757 | The freeze() callback is hibernation-specific and is executed in two situations, | ||
758 | during hibernation, after prepare() callbacks have been executed for all devices | ||
759 | in preparation for the creation of a system image, and during restore, | ||
760 | after a system image has been loaded into memory from persistent storage and the | ||
761 | prepare() callbacks have been executed for all devices. | ||
762 | |||
763 | The role of this callback is analogous to the role of the suspend() callback | ||
764 | described above. In fact, they only need to be different in the rare cases when | ||
765 | the driver takes the responsibility for putting the device into a low-power | ||
76 | state. | 766 | state. |
77 | 767 | ||
78 | The first walk allows a graceful recovery in the event of a failure, since none | 768 | In those cases the freeze() callback should not prepare the device for system wakeup |
79 | of the devices have actually been powered down. | 769 | or put it into a low-power state. Still, either it or freeze_noirq() should |
80 | 770 | save the device's standard configuration registers using pci_save_state(). | |
81 | In both walks, in particular the second, all children of a bridge are touched | ||
82 | before the actual bridge itself. This allows the bridge to retain power while | ||
83 | its children are being accessed. | ||
84 | |||
85 | Upon resuming from sleep, just the opposite must be true: all bridges must be | ||
86 | powered on and restored before their children are powered on. This is easily | ||
87 | accomplished with a breadth-first walk of the PCI device tree. | ||
88 | |||
89 | |||
90 | 3. PCI Utility Functions | ||
91 | ~~~~~~~~~~~~~~~~~~~~~~~~ | ||
92 | |||
93 | These are helper functions designed to be called by individual device drivers. | ||
94 | Assuming that a device behaves as advertised, these should be applicable in most | ||
95 | cases. However, results may vary. | ||
96 | |||
97 | Note that these functions are never implicitly called for the driver. The driver | ||
98 | is always responsible for deciding when and if to call these. | ||
99 | |||
100 | |||
101 | pci_save_state | ||
102 | -------------- | ||
103 | |||
104 | Usage: | ||
105 | pci_save_state(struct pci_dev *dev); | ||
106 | |||
107 | Description: | ||
108 | Save first 64 bytes of PCI config space, along with any additional | ||
109 | PCI-Express or PCI-X information. | ||
110 | |||
111 | |||
112 | pci_restore_state | ||
113 | ----------------- | ||
114 | |||
115 | Usage: | ||
116 | pci_restore_state(struct pci_dev *dev); | ||
117 | |||
118 | Description: | ||
119 | Restore previously saved config space. | ||
120 | |||
121 | |||
122 | pci_set_power_state | ||
123 | ------------------- | ||
124 | |||
125 | Usage: | ||
126 | pci_set_power_state(struct pci_dev *dev, pci_power_t state); | ||
127 | |||
128 | Description: | ||
129 | Transition device to low power state using PCI PM Capabilities | ||
130 | registers. | ||
131 | |||
132 | Will fail under one of the following conditions: | ||
133 | - If state is less than current state, but not D0 (illegal transition) | ||
134 | - Device doesn't support PM Capabilities | ||
135 | - Device does not support requested state | ||
136 | |||
137 | |||
138 | pci_enable_wake | ||
139 | --------------- | ||
140 | |||
141 | Usage: | ||
142 | pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); | ||
143 | |||
144 | Description: | ||
145 | Enable device to generate PME# during low power state using PCI PM | ||
146 | Capabilities. | ||
147 | |||
148 | Checks whether if device supports generating PME# from requested state | ||
149 | and fail if it does not, unless enable == 0 (request is to disable wake | ||
150 | events, which is implicit if it doesn't even support it in the first | ||
151 | place). | ||
152 | |||
153 | Note that the PMC Register in the device's PM Capabilities has a bitmask | ||
154 | of the states it supports generating PME# from. D3hot is bit 3 and | ||
155 | D3cold is bit 4. So, while a value of 4 as the state may not seem | ||
156 | semantically correct, it is. | ||
157 | |||
158 | |||
159 | 4. PCI Device Drivers | ||
160 | ~~~~~~~~~~~~~~~~~~~~~ | ||
161 | |||
162 | These functions are intended for use by individual drivers, and are defined in | ||
163 | struct pci_driver: | ||
164 | |||
165 | int (*suspend) (struct pci_dev *dev, pm_message_t state); | ||
166 | int (*resume) (struct pci_dev *dev); | ||
167 | |||
168 | |||
169 | suspend | ||
170 | ------- | ||
171 | |||
172 | Usage: | ||
173 | |||
174 | if (dev->driver && dev->driver->suspend) | ||
175 | dev->driver->suspend(dev,state); | ||
176 | |||
177 | A driver uses this function to actually transition the device into a low power | ||
178 | state. This should include disabling I/O, IRQs, and bus-mastering, as well as | ||
179 | physically transitioning the device to a lower power state; it may also include | ||
180 | calls to pci_enable_wake(). | ||
181 | |||
182 | Bus mastering may be disabled by doing: | ||
183 | |||
184 | pci_disable_device(dev); | ||
185 | |||
186 | For devices that support the PCI PM Spec, this may be used to set the device's | ||
187 | power state to match the suspend() parameter: | ||
188 | |||
189 | pci_set_power_state(dev,state); | ||
190 | |||
191 | The driver is also responsible for disabling any other device-specific features | ||
192 | (e.g blanking screen, turning off on-card memory, etc). | ||
193 | |||
194 | The driver should be sure to track the current state of the device, as it may | ||
195 | obviate the need for some operations. | ||
196 | |||
197 | The driver should update the current_state field in its pci_dev structure in | ||
198 | this function, except for PM-capable devices when pci_set_power_state is used. | ||
199 | |||
200 | resume | ||
201 | ------ | ||
202 | |||
203 | Usage: | ||
204 | |||
205 | if (dev->driver && dev->driver->resume) | ||
206 | dev->driver->resume(dev) | ||
207 | 771 | ||
208 | The resume callback may be called from any power state, and is always meant to | 772 | 3.1.5. freeze_noirq() |
209 | transition the device to the D0 state. | ||
210 | 773 | ||
211 | The driver is responsible for reenabling any features of the device that had | 774 | The freeze_noirq() callback is hibernation-specific. It is executed during |
212 | been disabled during previous suspend calls, such as IRQs and bus mastering, | 775 | hibernation, after prepare() and freeze() callbacks have been executed for all |
213 | as well as calling pci_restore_state(). | 776 | devices in preparation for the creation of a system image, and during restore, |
777 | after a system image has been loaded into memory and after prepare() and | ||
778 | freeze() callbacks have been executed for all devices. It is always executed | ||
779 | after device interrupts have been disabled by the PM core. | ||
214 | 780 | ||
215 | If the device is currently in D3, it may need to be reinitialized in resume(). | 781 | The role of this callback is analogous to the role of the suspend_noirq() |
782 | callback described above and it very rarely is necessary to define | ||
783 | freeze_noirq(). | ||
216 | 784 | ||
217 | * Some types of devices, like bus controllers, will preserve context in D3hot | 785 | The difference between freeze_noirq() and freeze() is analogous to the |
218 | (using Vcc power). Their drivers will often want to avoid re-initializing | 786 | difference between suspend_noirq() and suspend(). |
219 | them after re-entering D0 (perhaps to avoid resetting downstream devices). | ||
220 | 787 | ||
221 | * Other kinds of devices in D3hot will discard device context as part of a | 788 | 3.1.6. poweroff() |
222 | soft reset when re-entering the D0 state. | ||
223 | |||
224 | * Devices resuming from D3cold always go through a power-on reset. Some | ||
225 | device context can also be preserved using Vaux power. | ||
226 | 789 | ||
227 | * Some systems hide D3cold resume paths from drivers. For example, on PCs | 790 | The poweroff() callback is hibernation-specific. It is executed when the system |
228 | the resume path for suspend-to-disk often runs BIOS powerup code, which | 791 | is about to be powered off after saving a hibernation image to persistent |
229 | will sometimes re-initialize the device. | 792 | storage. prepare() callbacks are executed for all devices before poweroff() is |
793 | called. | ||
230 | 794 | ||
231 | To handle resets during D3 to D0 transitions, it may be convenient to share | 795 | The role of this callback is analogous to the role of the suspend() and freeze() |
232 | device initialization code between probe() and resume(). Device parameters | 796 | callbacks described above, although it does not need to save the contents of |
233 | can also be saved before the driver suspends into D3, avoiding re-probe. | 797 | the device's registers. In particular, if the driver wants to put the device |
798 | into a low-power state itself instead of allowing the PCI subsystem to do that, | ||
799 | the poweroff() callback should use pci_prepare_to_sleep() and | ||
800 | pci_set_power_state() to prepare the device for system wakeup and to put it | ||
801 | into a low-power state, respectively, but it need not save the device's standard | ||
802 | configuration registers. | ||
234 | 803 | ||
235 | If the device supports the PCI PM Spec, it can use this to physically transition | 804 | 3.1.7. poweroff_noirq() |
236 | the device to D0: | ||
237 | 805 | ||
238 | pci_set_power_state(dev,0); | 806 | The poweroff_noirq() callback is hibernation-specific. It is executed after |
807 | poweroff() callbacks have been executed for all devices in the system. | ||
239 | 808 | ||
240 | Note that if the entire system is transitioning out of a global sleep state, all | 809 | The role of this callback is analogous to the role of the suspend_noirq() and |
241 | devices will be placed in the D0 state, so this is not necessary. However, in | 810 | freeze_noirq() callbacks described above, but it does not need to save the |
242 | the event that the device is placed in the D3 state during normal operation, | 811 | contents of the device's registers. |
243 | this call is necessary. It is impossible to determine which of the two events is | ||
244 | taking place in the driver, so it is always a good idea to make that call. | ||
245 | 812 | ||
246 | The driver should take note of the state that it is resuming from in order to | 813 | The difference between poweroff_noirq() and poweroff() is analogous to the |
247 | ensure correct (and speedy) operation. | 814 | difference between suspend_noirq() and suspend(). |
248 | 815 | ||
249 | The driver should update the current_state field in its pci_dev structure in | 816 | 3.1.8. resume_noirq() |
250 | this function, except for PM-capable devices when pci_set_power_state is used. | ||
251 | 817 | ||
818 | The resume_noirq() callback is only executed during system resume, after the | ||
819 | PM core has enabled the non-boot CPUs. The driver's interrupt handler will not | ||
820 | be invoked while resume_noirq() is running, so this callback can carry out | ||
821 | operations that might race with the interrupt handler. | ||
252 | 822 | ||
823 | Since the PCI subsystem unconditionally puts all devices into the full power | ||
824 | state in the resume_noirq phase of system resume and restores their standard | ||
825 | configuration registers, resume_noirq() is usually not necessary. In general | ||
826 | it should only be used for performing operations that would lead to race | ||
827 | conditions if carried out by resume(). | ||
253 | 828 | ||
254 | A reference implementation | 829 | 3.1.9. resume() |
255 | ------------------------- | ||
256 | .suspend() | ||
257 | { | ||
258 | /* driver specific operations */ | ||
259 | 830 | ||
260 | /* Disable IRQ */ | 831 | The resume() callback is only executed during system resume, after |
261 | free_irq(); | 832 | resume_noirq() callbacks have been executed for all devices in the system and |
262 | /* If using MSI */ | 833 | device interrupts have been enabled by the PM core. |
263 | pci_disable_msi(); | ||
264 | 834 | ||
265 | pci_save_state(); | 835 | This callback is responsible for restoring the pre-suspend configuration of the |
266 | pci_enable_wake(); | 836 | device and bringing it back to the fully functional state. The device should be |
267 | /* Disable IO/bus master/irq router */ | 837 | able to process I/O in a usual way after resume() has returned. |
268 | pci_disable_device(); | ||
269 | pci_set_power_state(pci_choose_state()); | ||
270 | } | ||
271 | 838 | ||
272 | .resume() | 839 | 3.1.10. thaw_noirq() |
273 | { | ||
274 | pci_set_power_state(PCI_D0); | ||
275 | pci_restore_state(); | ||
276 | /* device's irq possibly is changed, driver should take care */ | ||
277 | pci_enable_device(); | ||
278 | pci_set_master(); | ||
279 | 840 | ||
280 | /* if using MSI, device's vector possibly is changed */ | 841 | The thaw_noirq() callback is hibernation-specific. It is executed after a |
281 | pci_enable_msi(); | 842 | system image has been created and the non-boot CPUs have been enabled by the PM |
843 | core, in the thaw_noirq phase of hibernation. It also may be executed if the | ||
844 | loading of a hibernation image fails during system restore (it is then executed | ||
845 | after enabling the non-boot CPUs). The driver's interrupt handler will not be | ||
846 | invoked while thaw_noirq() is running. | ||
282 | 847 | ||
283 | request_irq(); | 848 | The role of this callback is analogous to the role of resume_noirq(). The |
284 | /* driver specific operations; */ | 849 | difference between these two callbacks is that thaw_noirq() is executed after |
285 | } | 850 | freeze() and freeze_noirq(), so in general it does not need to modify the |
851 | contents of the device's registers. | ||
286 | 852 | ||
287 | This is a typical implementation. Drivers can slightly change the order | 853 | 3.1.11. thaw() |
288 | of the operations in the implementation, ignore some operations or add | ||
289 | more driver specific operations in it, but drivers should do something like | ||
290 | this on the whole. | ||
291 | 854 | ||
292 | 5. Resources | 855 | The thaw() callback is hibernation-specific. It is executed after thaw_noirq() |
293 | ~~~~~~~~~~~~ | 856 | callbacks have been executed for all devices in the system and after device |
857 | interrupts have been enabled by the PM core. | ||
294 | 858 | ||
295 | PCI Local Bus Specification | 859 | This callback is responsible for restoring the pre-freeze configuration of |
296 | PCI Bus Power Management Interface Specification | 860 | the device, so that it will work in a usual way after thaw() has returned. |
297 | 861 | ||
298 | http://www.pcisig.com | 862 | 3.1.12. restore_noirq() |
299 | 863 | ||
864 | The restore_noirq() callback is hibernation-specific. It is executed in the | ||
865 | restore_noirq phase of hibernation, when the boot kernel has passed control to | ||
866 | the image kernel and the non-boot CPUs have been enabled by the image kernel's | ||
867 | PM core. | ||
868 | |||
869 | This callback is analogous to resume_noirq() with the exception that it cannot | ||
870 | make any assumption on the previous state of the device, even if the BIOS (or | ||
871 | generally the platform firmware) is known to preserve that state over a | ||
872 | suspend-resume cycle. | ||
873 | |||
874 | For the vast majority of PCI device drivers there is no difference between | ||
875 | resume_noirq() and restore_noirq(). | ||
876 | |||
877 | 3.1.13. restore() | ||
878 | |||
879 | The restore() callback is hibernation-specific. It is executed after | ||
880 | restore_noirq() callbacks have been executed for all devices in the system and | ||
881 | after the PM core has enabled device drivers' interrupt handlers to be invoked. | ||
882 | |||
883 | This callback is analogous to resume(), just like restore_noirq() is analogous | ||
884 | to resume_noirq(). Consequently, the difference between restore_noirq() and | ||
885 | restore() is analogous to the difference between resume_noirq() and resume(). | ||
886 | |||
887 | For the vast majority of PCI device drivers there is no difference between | ||
888 | resume() and restore(). | ||
889 | |||
890 | 3.1.14. complete() | ||
891 | |||
892 | The complete() callback is executed in the following situations: | ||
893 | - during system resume, after resume() callbacks have been executed for all | ||
894 | devices, | ||
895 | - during hibernation, before saving the system image, after thaw() callbacks | ||
896 | have been executed for all devices, | ||
897 | - during system restore, when the system is going back to its pre-hibernation | ||
898 | state, after restore() callbacks have been executed for all devices. | ||
899 | It also may be executed if the loading of a hibernation image into memory fails | ||
900 | (in that case it is run after thaw() callbacks have been executed for all | ||
901 | devices that have drivers in the boot kernel). | ||
902 | |||
903 | This callback is entirely optional, although it may be necessary if the | ||
904 | prepare() callback performs operations that need to be reversed. | ||
905 | |||
906 | 3.1.15. runtime_suspend() | ||
907 | |||
908 | The runtime_suspend() callback is specific to device runtime power management | ||
909 | (runtime PM). It is executed by the PM core's runtime PM framework when the | ||
910 | device is about to be suspended (i.e. quiesced and put into a low-power state) | ||
911 | at run time. | ||
912 | |||
913 | This callback is responsible for freezing the device and preparing it to be | ||
914 | put into a low-power state, but it must allow the PCI subsystem to perform all | ||
915 | of the PCI-specific actions necessary for suspending the device. | ||
916 | |||
917 | 3.1.16. runtime_resume() | ||
918 | |||
919 | The runtime_resume() callback is specific to device runtime PM. It is executed | ||
920 | by the PM core's runtime PM framework when the device is about to be resumed | ||
921 | (i.e. put into the full-power state and programmed to process I/O normally) at | ||
922 | run time. | ||
923 | |||
924 | This callback is responsible for restoring the normal functionality of the | ||
925 | device after it has been put into the full-power state by the PCI subsystem. | ||
926 | The device is expected to be able to process I/O in the usual way after | ||
927 | runtime_resume() has returned. | ||
928 | |||
929 | 3.1.17. runtime_idle() | ||
930 | |||
931 | The runtime_idle() callback is specific to device runtime PM. It is executed | ||
932 | by the PM core's runtime PM framework whenever it may be desirable to suspend | ||
933 | the device according to the PM core's information. In particular, it is | ||
934 | automatically executed right after runtime_resume() has returned in case the | ||
935 | resume of the device has happened as a result of a spurious event. | ||
936 | |||
937 | This callback is optional, but if it is not implemented or if it returns 0, the | ||
938 | PCI subsystem will call pm_runtime_suspend() for the device, which in turn will | ||
939 | cause the driver's runtime_suspend() callback to be executed. | ||
940 | |||
941 | 3.1.18. Pointing Multiple Callback Pointers to One Routine | ||
942 | |||
943 | Although in principle each of the callbacks described in the previous | ||
944 | subsections can be defined as a separate function, it often is convenient to | ||
945 | point two or more members of struct dev_pm_ops to the same routine. There are | ||
946 | a few convenience macros that can be used for this purpose. | ||
947 | |||
948 | The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one | ||
949 | suspend routine pointed to by the .suspend(), .freeze(), and .poweroff() | ||
950 | members and one resume routine pointed to by the .resume(), .thaw(), and | ||
951 | .restore() members. The other function pointers in this struct dev_pm_ops are | ||
952 | unset. | ||
953 | |||
954 | The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it | ||
955 | additionally sets the .runtime_resume() pointer to the same value as | ||
956 | .resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to | ||
957 | the same value as .suspend() (and .freeze() and .poweroff()). | ||
958 | |||
959 | The SET_SYSTEM_SLEEP_PM_OPS macro can be used inside of a declaration of struct | ||
960 | dev_pm_ops to indicate that one suspend routine is to be pointed to by the | ||
961 | .suspend(), .freeze(), and .poweroff() members and one resume routine is to | ||
962 | be pointed to by the .resume(), .thaw(), and .restore() members. | ||
963 | |||
964 | 3.2. Device Runtime Power Management | ||
965 | ------------------------------------ | ||
966 | In addition to providing device power management callbacks PCI device drivers | ||
967 | are responsible for controlling the runtime power management (runtime PM) of | ||
968 | their devices. | ||
969 | |||
970 | The PCI device runtime PM is optional, but it is recommended that PCI device | ||
971 | drivers implement it at least in the cases where there is a reliable way of | ||
972 | verifying that the device is not used (like when the network cable is detached | ||
973 | from an Ethernet adapter or there are no devices attached to a USB controller). | ||
974 | |||
975 | To support the PCI runtime PM the driver first needs to implement the | ||
976 | runtime_suspend() and runtime_resume() callbacks. It also may need to implement | ||
977 | the runtime_idle() callback to prevent the device from being suspended again | ||
978 | every time right after the runtime_resume() callback has returned | ||
979 | (alternatively, the runtime_suspend() callback will have to check if the | ||
980 | device should really be suspended and return -EAGAIN if that is not the case). | ||
981 | |||
982 | The runtime PM of PCI devices is disabled by default. It is also blocked by | ||
983 | pci_pm_init() that runs the pm_runtime_forbid() helper function. If a PCI | ||
984 | driver implements the runtime PM callbacks and intends to use the runtime PM | ||
985 | framework provided by the PM core and the PCI subsystem, it should enable this | ||
986 | feature by executing the pm_runtime_enable() helper function. However, the | ||
987 | driver should not call the pm_runtime_allow() helper function unblocking | ||
988 | the runtime PM of the device. Instead, it should allow user space or some | ||
989 | platform-specific code to do that (user space can do it via sysfs), although | ||
990 | once it has called pm_runtime_enable(), it must be prepared to handle the | ||
991 | runtime PM of the device correctly as soon as pm_runtime_allow() is called | ||
992 | (which may happen at any time). [It also is possible that user space causes | ||
993 | pm_runtime_allow() to be called via sysfs before the driver is loaded, so in | ||
994 | fact the driver has to be prepared to handle the runtime PM of the device as | ||
995 | soon as it calls pm_runtime_enable().] | ||
996 | |||
997 | The runtime PM framework works by processing requests to suspend or resume | ||
998 | devices, or to check if they are idle (in which case it is reasonable to | ||
999 | subsequently request that they be suspended). These requests are represented | ||
1000 | by work items put into the power management workqueue, pm_wq. Although there | ||
1001 | are a few situations in which power management requests are automatically | ||
1002 | queued by the PM core (for example, after processing a request to resume a | ||
1003 | device the PM core automatically queues a request to check if the device is | ||
1004 | idle), device drivers are generally responsible for queuing power management | ||
1005 | requests for their devices. For this purpose they should use the runtime PM | ||
1006 | helper functions provided by the PM core, discussed in | ||
1007 | Documentation/power/runtime_pm.txt. | ||
1008 | |||
1009 | Devices can also be suspended and resumed synchronously, without placing a | ||
1010 | request into pm_wq. In the majority of cases this also is done by their | ||
1011 | drivers that use helper functions provided by the PM core for this purpose. | ||
1012 | |||
1013 | For more information on the runtime PM of devices refer to | ||
1014 | Documentation/power/runtime_pm.txt. | ||
1015 | |||
1016 | |||
1017 | 4. Resources | ||
1018 | ============ | ||
1019 | |||
1020 | PCI Local Bus Specification, Rev. 3.0 | ||
1021 | PCI Bus Power Management Interface Specification, Rev. 1.2 | ||
1022 | Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b | ||
1023 | PCI Express Base Specification, Rev. 2.0 | ||
1024 | Documentation/power/devices.txt | ||
1025 | Documentation/power/runtime_pm.txt | ||
diff --git a/Documentation/spi/ep93xx_spi b/Documentation/spi/ep93xx_spi new file mode 100644 index 000000000000..6325f5b48635 --- /dev/null +++ b/Documentation/spi/ep93xx_spi | |||
@@ -0,0 +1,95 @@ | |||
1 | Cirrus EP93xx SPI controller driver HOWTO | ||
2 | ========================================= | ||
3 | |||
4 | ep93xx_spi driver brings SPI master support for EP93xx SPI controller. Chip | ||
5 | selects are implemented with GPIO lines. | ||
6 | |||
7 | NOTE: If possible, don't use SFRMOUT (SFRM1) signal as a chip select. It will | ||
8 | not work correctly (it cannot be controlled by software). Use GPIO lines | ||
9 | instead. | ||
10 | |||
11 | Sample configuration | ||
12 | ==================== | ||
13 | |||
14 | Typically driver configuration is done in platform board files (the files under | ||
15 | arch/arm/mach-ep93xx/*.c). In this example we configure MMC over SPI through | ||
16 | this driver on TS-7260 board. You can adapt the code to suit your needs. | ||
17 | |||
18 | This example uses EGPIO9 as SD/MMC card chip select (this is wired in DIO1 | ||
19 | header on the board). | ||
20 | |||
21 | You need to select CONFIG_MMC_SPI to use mmc_spi driver. | ||
22 | |||
23 | arch/arm/mach-ep93xx/ts72xx.c: | ||
24 | |||
25 | ... | ||
26 | #include <linux/gpio.h> | ||
27 | #include <linux/spi/spi.h> | ||
28 | |||
29 | #include <mach/ep93xx_spi.h> | ||
30 | |||
31 | /* this is our GPIO line used for chip select */ | ||
32 | #define MMC_CHIP_SELECT_GPIO EP93XX_GPIO_LINE_EGPIO9 | ||
33 | |||
34 | static int ts72xx_mmc_spi_setup(struct spi_device *spi) | ||
35 | { | ||
36 | int err; | ||
37 | |||
38 | err = gpio_request(MMC_CHIP_SELECT_GPIO, spi->modalias); | ||
39 | if (err) | ||
40 | return err; | ||
41 | |||
42 | gpio_direction_output(MMC_CHIP_SELECT_GPIO, 1); | ||
43 | |||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | static void ts72xx_mmc_spi_cleanup(struct spi_device *spi) | ||
48 | { | ||
49 | gpio_set_value(MMC_CHIP_SELECT_GPIO, 1); | ||
50 | gpio_direction_input(MMC_CHIP_SELECT_GPIO); | ||
51 | gpio_free(MMC_CHIP_SELECT_GPIO); | ||
52 | } | ||
53 | |||
54 | static void ts72xx_mmc_spi_cs_control(struct spi_device *spi, int value) | ||
55 | { | ||
56 | gpio_set_value(MMC_CHIP_SELECT_GPIO, value); | ||
57 | } | ||
58 | |||
59 | static struct ep93xx_spi_chip_ops ts72xx_mmc_spi_ops = { | ||
60 | .setup = ts72xx_mmc_spi_setup, | ||
61 | .cleanup = ts72xx_mmc_spi_cleanup, | ||
62 | .cs_control = ts72xx_mmc_spi_cs_control, | ||
63 | }; | ||
64 | |||
65 | static struct spi_board_info ts72xx_spi_devices[] __initdata = { | ||
66 | { | ||
67 | .modalias = "mmc_spi", | ||
68 | .controller_data = &ts72xx_mmc_spi_ops, | ||
69 | /* | ||
70 | * We use 10 MHz even though the maximum is 7.4 MHz. The driver | ||
71 | * will limit it automatically to max. frequency. | ||
72 | */ | ||
73 | .max_speed_hz = 10 * 1000 * 1000, | ||
74 | .bus_num = 0, | ||
75 | .chip_select = 0, | ||
76 | .mode = SPI_MODE_0, | ||
77 | }, | ||
78 | }; | ||
79 | |||
80 | static struct ep93xx_spi_info ts72xx_spi_info = { | ||
81 | .num_chipselect = ARRAY_SIZE(ts72xx_spi_devices), | ||
82 | }; | ||
83 | |||
84 | static void __init ts72xx_init_machine(void) | ||
85 | { | ||
86 | ... | ||
87 | ep93xx_register_spi(&ts72xx_spi_info, ts72xx_spi_devices, | ||
88 | ARRAY_SIZE(ts72xx_spi_devices)); | ||
89 | } | ||
90 | |||
91 | Thanks to | ||
92 | ========= | ||
93 | Martin Guy, H. Hartley Sweeten and others who helped me during development of | ||
94 | the driver. Simplemachines.it donated me a Sim.One board which I used for testing | ||
95 | the driver on EP9307. | ||
diff --git a/Documentation/spi/spidev_fdx.c b/Documentation/spi/spidev_fdx.c index fc354f760384..36ec0774ca0b 100644 --- a/Documentation/spi/spidev_fdx.c +++ b/Documentation/spi/spidev_fdx.c | |||
@@ -58,10 +58,10 @@ static void do_msg(int fd, int len) | |||
58 | len = sizeof buf; | 58 | len = sizeof buf; |
59 | 59 | ||
60 | buf[0] = 0xaa; | 60 | buf[0] = 0xaa; |
61 | xfer[0].tx_buf = (__u64) buf; | 61 | xfer[0].tx_buf = (unsigned long)buf; |
62 | xfer[0].len = 1; | 62 | xfer[0].len = 1; |
63 | 63 | ||
64 | xfer[1].rx_buf = (__u64) buf; | 64 | xfer[1].rx_buf = (unsigned long) buf; |
65 | xfer[1].len = len; | 65 | xfer[1].len = len; |
66 | 66 | ||
67 | status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); | 67 | status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); |
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6c7d18c53f84..5fdbb612aeb8 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -19,6 +19,7 @@ files can be found in mm/swap.c. | |||
19 | Currently, these files are in /proc/sys/vm: | 19 | Currently, these files are in /proc/sys/vm: |
20 | 20 | ||
21 | - block_dump | 21 | - block_dump |
22 | - compact_memory | ||
22 | - dirty_background_bytes | 23 | - dirty_background_bytes |
23 | - dirty_background_ratio | 24 | - dirty_background_ratio |
24 | - dirty_bytes | 25 | - dirty_bytes |
@@ -26,6 +27,7 @@ Currently, these files are in /proc/sys/vm: | |||
26 | - dirty_ratio | 27 | - dirty_ratio |
27 | - dirty_writeback_centisecs | 28 | - dirty_writeback_centisecs |
28 | - drop_caches | 29 | - drop_caches |
30 | - extfrag_threshold | ||
29 | - hugepages_treat_as_movable | 31 | - hugepages_treat_as_movable |
30 | - hugetlb_shm_group | 32 | - hugetlb_shm_group |
31 | - laptop_mode | 33 | - laptop_mode |
@@ -64,6 +66,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. | |||
64 | 66 | ||
65 | ============================================================== | 67 | ============================================================== |
66 | 68 | ||
69 | compact_memory | ||
70 | |||
71 | Available only when CONFIG_COMPACTION is set. When 1 is written to the file, | ||
72 | all zones are compacted such that free memory is available in contiguous | ||
73 | blocks where possible. This can be important for example in the allocation of | ||
74 | huge pages although processes will also directly compact memory as required. | ||
75 | |||
76 | ============================================================== | ||
77 | |||
67 | dirty_background_bytes | 78 | dirty_background_bytes |
68 | 79 | ||
69 | Contains the amount of dirty memory at which the pdflush background writeback | 80 | Contains the amount of dirty memory at which the pdflush background writeback |
@@ -139,6 +150,20 @@ user should run `sync' first. | |||
139 | 150 | ||
140 | ============================================================== | 151 | ============================================================== |
141 | 152 | ||
153 | extfrag_threshold | ||
154 | |||
155 | This parameter affects whether the kernel will compact memory or direct | ||
156 | reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what | ||
157 | the fragmentation index for each order is in each zone in the system. Values | ||
158 | tending towards 0 imply allocations would fail due to lack of memory, | ||
159 | values towards 1000 imply failures are due to fragmentation and -1 implies | ||
160 | that the allocation will succeed as long as watermarks are met. | ||
161 | |||
162 | The kernel will not compact memory in a zone if the | ||
163 | fragmentation index is <= extfrag_threshold. The default value is 500. | ||
164 | |||
165 | ============================================================== | ||
166 | |||
142 | hugepages_treat_as_movable | 167 | hugepages_treat_as_movable |
143 | 168 | ||
144 | This parameter is only useful when kernelcore= is specified at boot time to | 169 | This parameter is only useful when kernelcore= is specified at boot time to |
diff --git a/Documentation/timers/Makefile b/Documentation/timers/Makefile index c85625f4ab25..73f75f8a87dc 100644 --- a/Documentation/timers/Makefile +++ b/Documentation/timers/Makefile | |||
@@ -2,7 +2,7 @@ | |||
2 | obj- := dummy.o | 2 | obj- := dummy.o |
3 | 3 | ||
4 | # List of programs to build | 4 | # List of programs to build |
5 | hostprogs-y := hpet_example | 5 | hostprogs-$(CONFIG_X86) := hpet_example |
6 | 6 | ||
7 | # Tell kbuild to always build the programs | 7 | # Tell kbuild to always build the programs |
8 | always := $(hostprogs-y) | 8 | always := $(hostprogs-y) |
diff --git a/Documentation/timers/hpet_example.c b/Documentation/timers/hpet_example.c index f9ce2d9fdfd5..4bfafb7bc4c5 100644 --- a/Documentation/timers/hpet_example.c +++ b/Documentation/timers/hpet_example.c | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <sys/types.h> | 10 | #include <sys/types.h> |
11 | #include <sys/wait.h> | 11 | #include <sys/wait.h> |
12 | #include <signal.h> | 12 | #include <signal.h> |
13 | #include <fcntl.h> | ||
14 | #include <errno.h> | 13 | #include <errno.h> |
15 | #include <sys/time.h> | 14 | #include <sys/time.h> |
16 | #include <linux/hpet.h> | 15 | #include <linux/hpet.h> |
@@ -24,7 +23,6 @@ extern void hpet_read(int, const char **); | |||
24 | 23 | ||
25 | #include <sys/poll.h> | 24 | #include <sys/poll.h> |
26 | #include <sys/ioctl.h> | 25 | #include <sys/ioctl.h> |
27 | #include <signal.h> | ||
28 | 26 | ||
29 | struct hpet_command { | 27 | struct hpet_command { |
30 | char *command; | 28 | char *command; |
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134 index 070f2576707e..1387a69ae3aa 100644 --- a/Documentation/video4linux/CARDLIST.saa7134 +++ b/Documentation/video4linux/CARDLIST.saa7134 | |||
@@ -176,5 +176,6 @@ | |||
176 | 175 -> Leadtek Winfast DTV1000S [107d:6655] | 176 | 175 -> Leadtek Winfast DTV1000S [107d:6655] |
177 | 176 -> Beholder BeholdTV 505 RDS [0000:5051] | 177 | 176 -> Beholder BeholdTV 505 RDS [0000:5051] |
178 | 177 -> Hawell HW-404M7 | 178 | 177 -> Hawell HW-404M7 |
179 | 179 -> Beholder BeholdTV H7 [5ace:7190] | 179 | 178 -> Beholder BeholdTV H7 [5ace:7190] |
180 | 180 -> Beholder BeholdTV A7 [5ace:7090] | 180 | 179 -> Beholder BeholdTV A7 [5ace:7090] |
181 | 180 -> Avermedia M733A [1461:4155,1461:4255] | ||
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt index 8f3f5d33327c..f13eb036c439 100644 --- a/Documentation/video4linux/gspca.txt +++ b/Documentation/video4linux/gspca.txt | |||
@@ -290,6 +290,7 @@ sonixb 0c45:602e Genius VideoCam Messenger | |||
290 | sonixj 0c45:6040 Speed NVC 350K | 290 | sonixj 0c45:6040 Speed NVC 350K |
291 | sonixj 0c45:607c Sonix sn9c102p Hv7131R | 291 | sonixj 0c45:607c Sonix sn9c102p Hv7131R |
292 | sonixj 0c45:60c0 Sangha Sn535 | 292 | sonixj 0c45:60c0 Sangha Sn535 |
293 | sonixj 0c45:60ce USB-PC-Camera-168 (TALK-5067) | ||
293 | sonixj 0c45:60ec SN9C105+MO4000 | 294 | sonixj 0c45:60ec SN9C105+MO4000 |
294 | sonixj 0c45:60fb Surfer NoName | 295 | sonixj 0c45:60fb Surfer NoName |
295 | sonixj 0c45:60fc LG-LIC300 | 296 | sonixj 0c45:60fc LG-LIC300 |
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c index 9969c7d9f985..eda1a6d3578a 100644 --- a/Documentation/vm/map_hugetlb.c +++ b/Documentation/vm/map_hugetlb.c | |||
@@ -19,7 +19,7 @@ | |||
19 | #define PROTECTION (PROT_READ | PROT_WRITE) | 19 | #define PROTECTION (PROT_READ | PROT_WRITE) |
20 | 20 | ||
21 | #ifndef MAP_HUGETLB | 21 | #ifndef MAP_HUGETLB |
22 | #define MAP_HUGETLB 0x40 | 22 | #define MAP_HUGETLB 0x40000 /* arch specific */ |
23 | #endif | 23 | #endif |
24 | 24 | ||
25 | /* Only ia64 requires this */ | 25 | /* Only ia64 requires this */ |
diff --git a/Documentation/vm/numa b/Documentation/vm/numa index e93ad9425e2a..a200a386429d 100644 --- a/Documentation/vm/numa +++ b/Documentation/vm/numa | |||
@@ -1,41 +1,149 @@ | |||
1 | Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> | 1 | Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> |
2 | 2 | ||
3 | The intent of this file is to have an uptodate, running commentary | 3 | What is NUMA? |
4 | from different people about NUMA specific code in the Linux vm. | 4 | |
5 | 5 | This question can be answered from a couple of perspectives: the | |
6 | What is NUMA? It is an architecture where the memory access times | 6 | hardware view and the Linux software view. |
7 | for different regions of memory from a given processor varies | 7 | |
8 | according to the "distance" of the memory region from the processor. | 8 | From the hardware perspective, a NUMA system is a computer platform that |
9 | Each region of memory to which access times are the same from any | 9 | comprises multiple components or assemblies each of which may contain 0 |
10 | cpu, is called a node. On such architectures, it is beneficial if | 10 | or more CPUs, local memory, and/or IO buses. For brevity and to |
11 | the kernel tries to minimize inter node communications. Schemes | 11 | disambiguate the hardware view of these physical components/assemblies |
12 | for this range from kernel text and read-only data replication | 12 | from the software abstraction thereof, we'll call the components/assemblies |
13 | across nodes, and trying to house all the data structures that | 13 | 'cells' in this document. |
14 | key components of the kernel need on memory on that node. | 14 | |
15 | 15 | Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset | |
16 | Currently, all the numa support is to provide efficient handling | 16 | of the system--although some components necessary for a stand-alone SMP system |
17 | of widely discontiguous physical memory, so architectures which | 17 | may not be populated on any given cell. The cells of the NUMA system are |
18 | are not NUMA but can have huge holes in the physical address space | 18 | connected together with some sort of system interconnect--e.g., a crossbar or |
19 | can use the same code. All this code is bracketed by CONFIG_DISCONTIGMEM. | 19 | point-to-point link are common types of NUMA system interconnects. Both of |
20 | 20 | these types of interconnects can be aggregated to create NUMA platforms with | |
21 | The initial port includes NUMAizing the bootmem allocator code by | 21 | cells at multiple distances from other cells. |
22 | encapsulating all the pieces of information into a bootmem_data_t | 22 | |
23 | structure. Node specific calls have been added to the allocator. | 23 | For Linux, the NUMA platforms of interest are primarily what is known as Cache |
24 | In theory, any platform which uses the bootmem allocator should | 24 | Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible |
25 | be able to put the bootmem and mem_map data structures anywhere | 25 | to and accessible from any CPU attached to any cell and cache coherency |
26 | it deems best. | 26 | is handled in hardware by the processor caches and/or the system interconnect. |
27 | 27 | ||
28 | Each node's page allocation data structures have also been encapsulated | 28 | Memory access time and effective memory bandwidth vary depending on how far |
29 | into a pg_data_t. The bootmem_data_t is just one part of this. To | 29 | away the cell containing the CPU or IO bus making the memory access is from the |
30 | make the code look uniform between NUMA and regular UMA platforms, | 30 | cell containing the target memory. For example, access to memory by CPUs |
31 | UMA platforms have a statically allocated pg_data_t too (contig_page_data). | 31 | attached to the same cell will experience faster access times and higher |
32 | For the sake of uniformity, the function num_online_nodes() is also defined | 32 | bandwidths than accesses to memory on other, remote cells. NUMA platforms |
33 | for all platforms. As we run benchmarks, we might decide to NUMAize | 33 | can have cells at multiple remote distances from any given cell. |
34 | more variables like low_on_memory, nr_free_pages etc into the pg_data_t. | 34 | |
35 | 35 | Platform vendors don't build NUMA systems just to make software developers' | |
36 | The NUMA aware page allocation code currently tries to allocate pages | 36 | lives interesting. Rather, this architecture is a means to provide scalable |
37 | from different nodes in a round robin manner. This will be changed to | 37 | memory bandwidth. However, to achieve scalable memory bandwidth, system and |
38 | do concentratic circle search, starting from current node, once the | 38 | application software must arrange for a large majority of the memory references |
39 | NUMA port achieves more maturity. The call alloc_pages_node has been | 39 | [cache misses] to be to "local" memory--memory on the same cell, if any--or |
40 | added, so that drivers can make the call and not worry about whether | 40 | to the closest cell with memory. |
41 | it is running on a NUMA or UMA platform. | 41 | |
42 | This leads to the Linux software view of a NUMA system: | ||
43 | |||
44 | Linux divides the system's hardware resources into multiple software | ||
45 | abstractions called "nodes". Linux maps the nodes onto the physical cells | ||
46 | of the hardware platform, abstracting away some of the details for some | ||
47 | architectures. As with physical cells, software nodes may contain 0 or more | ||
48 | CPUs, memory and/or IO buses. And, again, memory accesses to memory on | ||
49 | "closer" nodes--nodes that map to closer cells--will generally experience | ||
50 | faster access times and higher effective bandwidth than accesses to more | ||
51 | remote cells. | ||
52 | |||
53 | For some architectures, such as x86, Linux will "hide" any node representing a | ||
54 | physical cell that has no memory attached, and reassign any CPUs attached to | ||
55 | that cell to a node representing a cell that does have memory. Thus, on | ||
56 | these architectures, one cannot assume that all CPUs that Linux associates with | ||
57 | a given node will see the same local memory access times and bandwidth. | ||
58 | |||
59 | In addition, for some architectures, again x86 is an example, Linux supports | ||
60 | the emulation of additional nodes. For NUMA emulation, Linux will carve up | ||
61 | the existing nodes--or the system memory for non-NUMA platforms--into multiple | ||
62 | nodes. Each emulated node will manage a fraction of the underlying cells' | ||
63 | physical memory. NUMA emulation is useful for testing NUMA kernel and | ||
64 | application features on non-NUMA platforms, and as a sort of memory resource | ||
65 | management mechanism when used together with cpusets. | ||
66 | [see Documentation/cgroups/cpusets.txt] | ||
67 | |||
68 | For each node with memory, Linux constructs an independent memory management | ||
69 | subsystem, complete with its own free page lists, in-use page lists, usage | ||
70 | statistics and locks to mediate access. In addition, Linux constructs for | ||
71 | each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], | ||
72 | an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a | ||
73 | selected zone/node cannot satisfy the allocation request. This situation, | ||
74 | when a zone has no available memory to satisfy a request, is called | ||
75 | "overflow" or "fallback". | ||
76 | |||
77 | Because some nodes contain multiple zones containing different types of | ||
78 | memory, Linux must decide whether to order the zonelists such that allocations | ||
79 | fall back to the same zone type on a different node, or to a different zone | ||
80 | type on the same node. This is an important consideration because some zones, | ||
81 | such as DMA or DMA32, represent relatively scarce resources. Linux chooses | ||
82 | a default zonelist order based on the sizes of the various zone types relative | ||
83 | to the total memory of the node and the total memory of the system. The | ||
84 | default zonelist order may be overridden using the numa_zonelist_order kernel | ||
85 | boot parameter or sysctl. [see Documentation/kernel-parameters.txt and | ||
86 | Documentation/sysctl/vm.txt] | ||
87 | |||
88 | By default, Linux will attempt to satisfy memory allocation requests from the | ||
89 | node to which the CPU that executes the request is assigned. Specifically, | ||
90 | Linux will attempt to allocate from the first node in the appropriate zonelist | ||
91 | for the node where the request originates. This is called "local allocation." | ||
92 | If the "local" node cannot satisfy the request, the kernel will examine other | ||
93 | nodes' zones in the selected zonelist looking for the first zone in the list | ||
94 | that can satisfy the request. | ||
95 | |||
96 | Local allocation will tend to keep subsequent access to the allocated memory | ||
97 | "local" to the underlying physical resources and off the system interconnect-- | ||
98 | as long as the task on whose behalf the kernel allocated some memory does not | ||
99 | later migrate away from that memory. The Linux scheduler is aware of the | ||
100 | NUMA topology of the platform--embodied in the "scheduling domains" data | ||
101 | structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler | ||
102 | attempts to minimize task migration to distant scheduling domains. However, | ||
103 | the scheduler does not take a task's NUMA footprint into account directly. | ||
104 | Thus, under sufficient imbalance, tasks can migrate between nodes, remote | ||
105 | from their initial node and kernel data structures. | ||
106 | |||
107 | System administrators and application designers can restrict a task's migration | ||
108 | to improve NUMA locality using various CPU affinity command line interfaces, | ||
109 | such as taskset(1) and numactl(1), and program interfaces such as | ||
110 | sched_setaffinity(2). Further, one can modify the kernel's default local | ||
111 | allocation behavior using Linux NUMA memory policy. | ||
112 | [see Documentation/vm/numa_memory_policy.txt] | ||
113 | |||
114 | System administrators can restrict the CPUs and nodes' memories that a non- | ||
115 | privileged user can specify in the scheduling or NUMA commands and functions | ||
116 | using control groups and cpusets. [see Documentation/cgroups/cpusets.txt] | ||
117 | |||
118 | On architectures that do not hide memoryless nodes, Linux will include only | ||
119 | zones [nodes] with memory in the zonelists. This means that for a memoryless | ||
120 | node the "local memory node"--the node of the first zone in CPU's node's | ||
121 | zonelist--will not be the node itself. Rather, it will be the node that the | ||
122 | kernel selected as the nearest node with memory when it built the zonelists. | ||
123 | So, default, local allocations will succeed with the kernel supplying the | ||
124 | closest available memory. This is a consequence of the same mechanism that | ||
125 | allows such allocations to fallback to other nearby nodes when a node that | ||
126 | does contain memory overflows. | ||
127 | |||
128 | Some kernel allocations do not want or cannot tolerate this allocation fallback | ||
129 | behavior. Rather they want to be sure they get memory from the specified node | ||
130 | or get notified that the node has no free memory. This is usually the case when | ||
131 | a subsystem allocates per CPU memory resources, for example. | ||
132 | |||
133 | A typical model for making such an allocation is to obtain the node id of the | ||
134 | node to which the "current CPU" is attached using one of the kernel's | ||
135 | numa_node_id() or cpu_to_node() functions and then request memory from only | ||
136 | the node id returned. When such an allocation fails, the requesting subsystem | ||
137 | may revert to its own fallback path. The slab kernel memory allocator is an | ||
138 | example of this. Or, the subsystem may choose to disable or not to enable | ||
139 | itself on allocation failure. The kernel profiling subsystem is an example of | ||
140 | this. | ||
141 | |||
142 | If the architecture supports--does not hide--memoryless nodes, then CPUs | ||
143 | attached to memoryless nodes would always incur the fallback path overhead | ||
144 | or some subsystems would fail to initialize if they attempted to allocate | ||
145 | memory exclusively from a node without memory. To support such | ||
146 | architectures transparently, kernel subsystems can use the numa_mem_id() | ||
147 | or cpu_to_mem() function to locate the "local memory node" for the calling or | ||
148 | specified CPU. Again, this is the same node from which default, local page | ||
149 | allocations will be attempted. | ||
diff --git a/Documentation/watchdog/00-INDEX b/Documentation/watchdog/00-INDEX index c3ea47e507fe..ee994513a9b1 100644 --- a/Documentation/watchdog/00-INDEX +++ b/Documentation/watchdog/00-INDEX | |||
@@ -1,10 +1,15 @@ | |||
1 | 00-INDEX | 1 | 00-INDEX |
2 | - this file. | 2 | - this file. |
3 | hpwdt.txt | ||
4 | - information on the HP iLO2 NMI watchdog | ||
3 | pcwd-watchdog.txt | 5 | pcwd-watchdog.txt |
4 | - documentation for Berkshire Products PC Watchdog ISA cards. | 6 | - documentation for Berkshire Products PC Watchdog ISA cards. |
5 | src/ | 7 | src/ |
6 | - directory holding watchdog related example programs. | 8 | - directory holding watchdog related example programs. |
7 | watchdog-api.txt | 9 | watchdog-api.txt |
8 | - description of the Linux Watchdog driver API. | 10 | - description of the Linux Watchdog driver API. |
11 | watchdog-parameters.txt | ||
12 | - information on driver parameters (for drivers other than | ||
13 | the ones that have driver-specific files here) | ||
9 | wdt.txt | 14 | wdt.txt |
10 | - description of the Watchdog Timer Interfaces for Linux. | 15 | - description of the Watchdog Timer Interfaces for Linux. |
diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt new file mode 100644 index 000000000000..41c95cc1dc1f --- /dev/null +++ b/Documentation/watchdog/watchdog-parameters.txt | |||
@@ -0,0 +1,390 @@ | |||
1 | This file provides information on the module parameters of many of | ||
2 | the Linux watchdog drivers. Watchdog driver parameter specs should | ||
3 | be listed here unless the driver has its own driver-specific information | ||
4 | file. | ||
5 | |||
6 | |||
7 | See Documentation/kernel-parameters.txt for information on | ||
8 | providing kernel parameters for builtin drivers versus loadable | ||
9 | modules. | ||
10 | |||
11 | |||
12 | ------------------------------------------------- | ||
13 | acquirewdt: | ||
14 | wdt_stop: Acquire WDT 'stop' io port (default 0x43) | ||
15 | wdt_start: Acquire WDT 'start' io port (default 0x443) | ||
16 | nowayout: Watchdog cannot be stopped once started | ||
17 | (default=kernel config parameter) | ||
18 | ------------------------------------------------- | ||
19 | advantechwdt: | ||
20 | wdt_stop: Advantech WDT 'stop' io port (default 0x443) | ||
21 | wdt_start: Advantech WDT 'start' io port (default 0x443) | ||
22 | timeout: Watchdog timeout in seconds. 1<= timeout <=63, default=60. | ||
23 | nowayout: Watchdog cannot be stopped once started | ||
24 | (default=kernel config parameter) | ||
25 | ------------------------------------------------- | ||
26 | alim1535_wdt: | ||
27 | timeout: Watchdog timeout in seconds. (0 < timeout < 18000, default=60) | ||
28 | nowayout: Watchdog cannot be stopped once started | ||
29 | (default=kernel config parameter) | ||
30 | ------------------------------------------------- | ||
31 | alim7101_wdt: | ||
32 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30) | ||
33 | use_gpio: Use the gpio watchdog (required by old cobalt boards). | ||
34 | default=0/off/no | ||
35 | nowayout: Watchdog cannot be stopped once started | ||
36 | (default=kernel config parameter) | ||
37 | ------------------------------------------------- | ||
38 | ar7_wdt: | ||
39 | margin: Watchdog margin in seconds (default=60) | ||
40 | nowayout: Disable watchdog shutdown on close | ||
41 | (default=kernel config parameter) | ||
42 | ------------------------------------------------- | ||
43 | at32ap700x_wdt: | ||
44 | timeout: Timeout value. Limited to be 1 or 2 seconds. (default=2) | ||
45 | nowayout: Watchdog cannot be stopped once started | ||
46 | (default=kernel config parameter) | ||
47 | ------------------------------------------------- | ||
48 | at91rm9200_wdt: | ||
49 | wdt_time: Watchdog time in seconds. (default=5) | ||
50 | nowayout: Watchdog cannot be stopped once started | ||
51 | (default=kernel config parameter) | ||
52 | ------------------------------------------------- | ||
53 | at91sam9_wdt: | ||
54 | heartbeat: Watchdog heartbeats in seconds. (default = 15) | ||
55 | nowayout: Watchdog cannot be stopped once started | ||
56 | (default=kernel config parameter) | ||
57 | ------------------------------------------------- | ||
58 | bcm47xx_wdt: | ||
59 | wdt_time: Watchdog time in seconds. (default=30) | ||
60 | nowayout: Watchdog cannot be stopped once started | ||
61 | (default=kernel config parameter) | ||
62 | ------------------------------------------------- | ||
63 | bfin_wdt: | ||
64 | timeout: Watchdog timeout in seconds. (1<=timeout<=((2^32)/SCLK), default=20) | ||
65 | nowayout: Watchdog cannot be stopped once started | ||
66 | (default=kernel config parameter) | ||
67 | ------------------------------------------------- | ||
68 | coh901327_wdt: | ||
69 | margin: Watchdog margin in seconds (default 60s) | ||
70 | ------------------------------------------------- | ||
71 | cpu5wdt: | ||
72 | port: base address of watchdog card, default is 0x91 | ||
73 | verbose: be verbose, default is 0 (no) | ||
74 | ticks: count down ticks, default is 10000 | ||
75 | ------------------------------------------------- | ||
76 | cpwd: | ||
77 | wd0_timeout: Default watchdog0 timeout in 1/10secs | ||
78 | wd1_timeout: Default watchdog1 timeout in 1/10secs | ||
79 | wd2_timeout: Default watchdog2 timeout in 1/10secs | ||
80 | ------------------------------------------------- | ||
81 | davinci_wdt: | ||
82 | heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60 | ||
83 | ------------------------------------------------- | ||
84 | ep93xx_wdt: | ||
85 | nowayout: Watchdog cannot be stopped once started | ||
86 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD) | ||
87 | ------------------------------------------------- | ||
88 | eurotechwdt: | ||
89 | nowayout: Watchdog cannot be stopped once started | ||
90 | (default=kernel config parameter) | ||
91 | io: Eurotech WDT io port (default=0x3f0) | ||
92 | irq: Eurotech WDT irq (default=10) | ||
93 | ev: Eurotech WDT event type (default is `int') | ||
94 | ------------------------------------------------- | ||
95 | gef_wdt: | ||
96 | nowayout: Watchdog cannot be stopped once started | ||
97 | (default=kernel config parameter) | ||
98 | ------------------------------------------------- | ||
99 | geodewdt: | ||
100 | timeout: Watchdog timeout in seconds. 1<= timeout <=131, default=60. | ||
101 | nowayout: Watchdog cannot be stopped once started | ||
102 | (default=kernel config parameter) | ||
103 | ------------------------------------------------- | ||
104 | i6300esb: | ||
105 | heartbeat: Watchdog heartbeat in seconds. (1<heartbeat<2046, default=30) | ||
106 | nowayout: Watchdog cannot be stopped once started | ||
107 | (default=kernel config parameter) | ||
108 | ------------------------------------------------- | ||
109 | iTCO_wdt: | ||
110 | heartbeat: Watchdog heartbeat in seconds. | ||
111 | (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30) | ||
112 | nowayout: Watchdog cannot be stopped once started | ||
113 | (default=kernel config parameter) | ||
114 | ------------------------------------------------- | ||
115 | iTCO_vendor_support: | ||
116 | vendorsupport: iTCO vendor specific support mode, default=0 (none), | ||
117 | 1=SuperMicro Pent3, 2=SuperMicro Pent4+, 911=Broken SMI BIOS | ||
118 | ------------------------------------------------- | ||
119 | ib700wdt: | ||
120 | timeout: Watchdog timeout in seconds. 0<= timeout <=30, default=30. | ||
121 | nowayout: Watchdog cannot be stopped once started | ||
122 | (default=kernel config parameter) | ||
123 | ------------------------------------------------- | ||
124 | ibmasr: | ||
125 | nowayout: Watchdog cannot be stopped once started | ||
126 | (default=kernel config parameter) | ||
127 | ------------------------------------------------- | ||
128 | indydog: | ||
129 | nowayout: Watchdog cannot be stopped once started | ||
130 | (default=kernel config parameter) | ||
131 | ------------------------------------------------- | ||
132 | iop_wdt: | ||
133 | nowayout: Watchdog cannot be stopped once started | ||
134 | (default=kernel config parameter) | ||
135 | ------------------------------------------------- | ||
136 | it8712f_wdt: | ||
137 | margin: Watchdog margin in seconds (default 60) | ||
138 | nowayout: Disable watchdog shutdown on close | ||
139 | (default=kernel config parameter) | ||
140 | ------------------------------------------------- | ||
141 | it87_wdt: | ||
142 | nogameport: Forbid the activation of game port, default=0 | ||
143 | exclusive: Watchdog exclusive device open, default=1 | ||
144 | timeout: Watchdog timeout in seconds, default=60 | ||
145 | testmode: Watchdog test mode (1 = no reboot), default=0 | ||
146 | nowayout: Watchdog cannot be stopped once started | ||
147 | (default=kernel config parameter) | ||
148 | ------------------------------------------------- | ||
149 | ixp2000_wdt: | ||
150 | heartbeat: Watchdog heartbeat in seconds (default 60s) | ||
151 | nowayout: Watchdog cannot be stopped once started | ||
152 | (default=kernel config parameter) | ||
153 | ------------------------------------------------- | ||
154 | ixp4xx_wdt: | ||
155 | heartbeat: Watchdog heartbeat in seconds (default 60s) | ||
156 | nowayout: Watchdog cannot be stopped once started | ||
157 | (default=kernel config parameter) | ||
158 | ------------------------------------------------- | ||
159 | ks8695_wdt: | ||
160 | wdt_time: Watchdog time in seconds. (default=5) | ||
161 | nowayout: Watchdog cannot be stopped once started | ||
162 | (default=kernel config parameter) | ||
163 | ------------------------------------------------- | ||
164 | machzwd: | ||
165 | nowayout: Watchdog cannot be stopped once started | ||
166 | (default=kernel config parameter) | ||
167 | action: after watchdog resets, generate: | ||
168 | 0 = RESET(*) 1 = SMI 2 = NMI 3 = SCI | ||
169 | ------------------------------------------------- | ||
170 | max63xx_wdt: | ||
171 | heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 60 | ||
172 | nowayout: Watchdog cannot be stopped once started | ||
173 | (default=kernel config parameter) | ||
174 | nodelay: Force selection of a timeout setting without initial delay | ||
175 | (max6373/74 only, default=0) | ||
176 | ------------------------------------------------- | ||
177 | mixcomwd: | ||
178 | nowayout: Watchdog cannot be stopped once started | ||
179 | (default=kernel config parameter) | ||
180 | ------------------------------------------------- | ||
181 | mpc8xxx_wdt: | ||
182 | timeout: Watchdog timeout in ticks. (0<timeout<65536, default=65535) | ||
183 | reset: Watchdog Interrupt/Reset Mode. 0 = interrupt, 1 = reset | ||
184 | nowayout: Watchdog cannot be stopped once started | ||
185 | (default=kernel config parameter) | ||
186 | ------------------------------------------------- | ||
187 | mpcore_wdt: | ||
188 | mpcore_margin: MPcore timer margin in seconds. | ||
189 | (0 < mpcore_margin < 65536, default=60) | ||
190 | nowayout: Watchdog cannot be stopped once started | ||
191 | (default=kernel config parameter) | ||
192 | mpcore_noboot: MPcore watchdog action, set to 1 to ignore reboots, | ||
193 | 0 to reboot (default=0) | ||
194 | ------------------------------------------------- | ||
195 | mv64x60_wdt: | ||
196 | nowayout: Watchdog cannot be stopped once started | ||
197 | (default=kernel config parameter) | ||
198 | ------------------------------------------------- | ||
199 | nuc900_wdt: | ||
200 | heartbeat: Watchdog heartbeats in seconds. | ||
201 | (default = 15) | ||
202 | nowayout: Watchdog cannot be stopped once started | ||
203 | (default=kernel config parameter) | ||
204 | ------------------------------------------------- | ||
205 | omap_wdt: | ||
206 | timer_margin: initial watchdog timeout (in seconds) | ||
207 | ------------------------------------------------- | ||
208 | orion_wdt: | ||
209 | heartbeat: Initial watchdog heartbeat in seconds | ||
210 | nowayout: Watchdog cannot be stopped once started | ||
211 | (default=kernel config parameter) | ||
212 | ------------------------------------------------- | ||
213 | pc87413_wdt: | ||
214 | io: pc87413 WDT I/O port (default: io). | ||
215 | timeout: Watchdog timeout in minutes (default=timeout). | ||
216 | nowayout: Watchdog cannot be stopped once started | ||
217 | (default=kernel config parameter) | ||
218 | ------------------------------------------------- | ||
219 | pika_wdt: | ||
220 | heartbeat: Watchdog heartbeats in seconds. (default = 15) | ||
221 | nowayout: Watchdog cannot be stopped once started | ||
222 | (default=kernel config parameter) | ||
223 | ------------------------------------------------- | ||
224 | pnx4008_wdt: | ||
225 | heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 19 | ||
226 | nowayout: Set to 1 to keep watchdog running after device release | ||
227 | ------------------------------------------------- | ||
228 | pnx833x_wdt: | ||
229 | timeout: Watchdog timeout in MHz. (68MHz clock), default=2040000000 (30 seconds) | ||
230 | nowayout: Watchdog cannot be stopped once started | ||
231 | (default=kernel config parameter) | ||
232 | start_enabled: Watchdog is started on module insertion (default=1) | ||
233 | ------------------------------------------------- | ||
234 | rc32434_wdt: | ||
235 | timeout: Watchdog timeout value, in seconds (default=20) | ||
236 | nowayout: Watchdog cannot be stopped once started | ||
237 | (default=kernel config parameter) | ||
238 | ------------------------------------------------- | ||
239 | riowd: | ||
240 | riowd_timeout: Watchdog timeout in minutes (default=1) | ||
241 | ------------------------------------------------- | ||
242 | s3c2410_wdt: | ||
243 | tmr_margin: Watchdog tmr_margin in seconds. (default=15) | ||
244 | tmr_atboot: Watchdog is started at boot time if set to 1, default=0 | ||
245 | nowayout: Watchdog cannot be stopped once started | ||
246 | (default=kernel config parameter) | ||
247 | soft_noboot: Watchdog action, set to 1 to ignore reboots, 0 to reboot | ||
248 | debug: Watchdog debug, set to >1 for debug, (default 0) | ||
249 | ------------------------------------------------- | ||
250 | sa1100_wdt: | ||
251 | margin: Watchdog margin in seconds (default 60s) | ||
252 | ------------------------------------------------- | ||
253 | sb_wdog: | ||
254 | timeout: Watchdog timeout in microseconds (max/default 8388607 or 8.3ish secs) | ||
255 | ------------------------------------------------- | ||
256 | sbc60xxwdt: | ||
257 | wdt_stop: SBC60xx WDT 'stop' io port (default 0x45) | ||
258 | wdt_start: SBC60xx WDT 'start' io port (default 0x443) | ||
259 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30) | ||
260 | nowayout: Watchdog cannot be stopped once started | ||
261 | (default=kernel config parameter) | ||
262 | ------------------------------------------------- | ||
263 | sbc7240_wdt: | ||
264 | timeout: Watchdog timeout in seconds. (1<=timeout<=255, default=30) | ||
265 | nowayout: Disable watchdog when closing device file | ||
266 | ------------------------------------------------- | ||
267 | sbc8360: | ||
268 | timeout: Index into timeout table (0-63) (default=27 (60s)) | ||
269 | nowayout: Watchdog cannot be stopped once started | ||
270 | (default=kernel config parameter) | ||
271 | ------------------------------------------------- | ||
272 | sbc_epx_c3: | ||
273 | nowayout: Watchdog cannot be stopped once started | ||
274 | (default=kernel config parameter) | ||
275 | ------------------------------------------------- | ||
276 | sbc_fitpc2_wdt: | ||
277 | margin: Watchdog margin in seconds (default 60s) | ||
278 | nowayout: Watchdog cannot be stopped once started | ||
279 | ------------------------------------------------- | ||
280 | sc1200wdt: | ||
281 | isapnp: When set to 0 driver ISA PnP support will be disabled (default=1) | ||
282 | io: io port | ||
283 | timeout: range is 0-255 minutes, default is 1 | ||
284 | nowayout: Watchdog cannot be stopped once started | ||
285 | (default=kernel config parameter) | ||
286 | ------------------------------------------------- | ||
287 | sc520_wdt: | ||
288 | timeout: Watchdog timeout in seconds. (1 <= timeout <= 3600, default=30) | ||
289 | nowayout: Watchdog cannot be stopped once started | ||
290 | (default=kernel config parameter) | ||
291 | ------------------------------------------------- | ||
292 | sch311x_wdt: | ||
293 | force_id: Override the detected device ID | ||
294 | therm_trip: Should a ThermTrip trigger the reset generator | ||
295 | timeout: Watchdog timeout in seconds. 1<= timeout <=15300, default=60 | ||
296 | nowayout: Watchdog cannot be stopped once started | ||
297 | (default=kernel config parameter) | ||
298 | ------------------------------------------------- | ||
299 | scx200_wdt: | ||
300 | margin: Watchdog margin in seconds | ||
301 | nowayout: Disable watchdog shutdown on close | ||
302 | ------------------------------------------------- | ||
303 | shwdt: | ||
304 | clock_division_ratio: Clock division ratio. Valid ranges are from 0x5 (1.31ms) | ||
305 | to 0x7 (5.25ms). (default=7) | ||
306 | heartbeat: Watchdog heartbeat in seconds. (1 <= heartbeat <= 3600, default=30) | ||
307 | nowayout: Watchdog cannot be stopped once started | ||
308 | (default=kernel config parameter) | ||
309 | ------------------------------------------------- | ||
310 | smsc37b787_wdt: | ||
311 | timeout: range is 1-255 units, default is 60 | ||
312 | nowayout: Watchdog cannot be stopped once started | ||
313 | (default=kernel config parameter) | ||
314 | ------------------------------------------------- | ||
315 | softdog: | ||
316 | soft_margin: Watchdog soft_margin in seconds. | ||
317 | (0 < soft_margin < 65536, default=60) | ||
318 | nowayout: Watchdog cannot be stopped once started | ||
319 | (default=kernel config parameter) | ||
320 | soft_noboot: Softdog action, set to 1 to ignore reboots, 0 to reboot | ||
321 | (default=0) | ||
322 | ------------------------------------------------- | ||
323 | stmp3xxx_wdt: | ||
324 | heartbeat: Watchdog heartbeat period in seconds from 1 to 4194304, default 19 | ||
325 | ------------------------------------------------- | ||
326 | ts72xx_wdt: | ||
327 | timeout: Watchdog timeout in seconds. (1 <= timeout <= 8, default=8) | ||
328 | nowayout: Disable watchdog shutdown on close | ||
329 | ------------------------------------------------- | ||
330 | twl4030_wdt: | ||
331 | nowayout: Watchdog cannot be stopped once started | ||
332 | (default=kernel config parameter) | ||
333 | ------------------------------------------------- | ||
334 | txx9wdt: | ||
335 | timeout: Watchdog timeout in seconds. (0<timeout<N, default=60) | ||
336 | nowayout: Watchdog cannot be stopped once started | ||
337 | (default=kernel config parameter) | ||
338 | ------------------------------------------------- | ||
339 | w83627hf_wdt: | ||
340 | wdt_io: w83627hf/thf WDT io port (default 0x2E) | ||
341 | timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60. | ||
342 | nowayout: Watchdog cannot be stopped once started | ||
343 | (default=kernel config parameter) | ||
344 | ------------------------------------------------- | ||
345 | w83697hf_wdt: | ||
346 | wdt_io: w83697hf/hg WDT io port (default 0x2e, 0 = autodetect) | ||
347 | timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60) | ||
348 | nowayout: Watchdog cannot be stopped once started | ||
349 | (default=kernel config parameter) | ||
350 | early_disable: Watchdog gets disabled at boot time (default=1) | ||
351 | ------------------------------------------------- | ||
352 | w83697ug_wdt: | ||
353 | wdt_io: w83697ug/uf WDT io port (default 0x2e) | ||
354 | timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60) | ||
355 | nowayout: Watchdog cannot be stopped once started | ||
356 | (default=kernel config parameter) | ||
357 | ------------------------------------------------- | ||
358 | w83877f_wdt: | ||
359 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30) | ||
360 | nowayout: Watchdog cannot be stopped once started | ||
361 | (default=kernel config parameter) | ||
362 | ------------------------------------------------- | ||
363 | w83977f_wdt: | ||
364 | timeout: Watchdog timeout in seconds (15..7635, default=45) | ||
365 | testmode: Watchdog testmode (1 = no reboot), default=0 | ||
366 | nowayout: Watchdog cannot be stopped once started | ||
367 | (default=kernel config parameter) | ||
368 | ------------------------------------------------- | ||
369 | wafer5823wdt: | ||
370 | timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60. | ||
371 | nowayout: Watchdog cannot be stopped once started | ||
372 | (default=kernel config parameter) | ||
373 | ------------------------------------------------- | ||
374 | wdt285: | ||
375 | soft_margin: Watchdog timeout in seconds (default=60) | ||
376 | ------------------------------------------------- | ||
377 | wdt977: | ||
378 | timeout: Watchdog timeout in seconds (60..15300, default=60) | ||
379 | testmode: Watchdog testmode (1 = no reboot), default=0 | ||
380 | nowayout: Watchdog cannot be stopped once started | ||
381 | (default=kernel config parameter) | ||
382 | ------------------------------------------------- | ||
383 | wm831x_wdt: | ||
384 | nowayout: Watchdog cannot be stopped once started | ||
385 | (default=kernel config parameter) | ||
386 | ------------------------------------------------- | ||
387 | wm8350_wdt: | ||
388 | nowayout: Watchdog cannot be stopped once started | ||
389 | (default=kernel config parameter) | ||
390 | ------------------------------------------------- | ||
diff --git a/Documentation/watchdog/wdt.txt b/Documentation/watchdog/wdt.txt index 03fd756d976d..061c2e35384f 100644 --- a/Documentation/watchdog/wdt.txt +++ b/Documentation/watchdog/wdt.txt | |||
@@ -14,14 +14,22 @@ reboot will depend on the state of the machines and interrupts. The hardware | |||
14 | boards physically pull the machine down off their own onboard timers and | 14 | boards physically pull the machine down off their own onboard timers and |
15 | will reboot from almost anything. | 15 | will reboot from almost anything. |
16 | 16 | ||
17 | A second temperature monitoring interface is available on the WDT501P cards | 17 | A second temperature monitoring interface is available on the WDT501P cards. |
18 | This provides /dev/temperature. This is the machine internal temperature in | 18 | This provides /dev/temperature. This is the machine internal temperature in |
19 | degrees Fahrenheit. Each read returns a single byte giving the temperature. | 19 | degrees Fahrenheit. Each read returns a single byte giving the temperature. |
20 | 20 | ||
21 | The third interface logs kernel messages on additional alert events. | 21 | The third interface logs kernel messages on additional alert events. |
22 | 22 | ||
23 | The wdt card cannot be safely probed for. Instead you need to pass | 23 | The ICS ISA-bus wdt card cannot be safely probed for. Instead you need to |
24 | wdt=ioaddr,irq as a boot parameter - eg "wdt=0x240,11". | 24 | pass IO address and IRQ boot parameters. E.g.: |
25 | wdt.io=0x240 wdt.irq=11 | ||
26 | |||
27 | Other "wdt" driver parameters are: | ||
28 | heartbeat Watchdog heartbeat in seconds (default 60) | ||
29 | nowayout Watchdog cannot be stopped once started (kernel | ||
30 | build parameter) | ||
31 | tachometer WDT501-P Fan Tachometer support (0=disable, default=0) | ||
32 | type WDT501-P Card type (500 or 501, default=500) | ||
25 | 33 | ||
26 | Features | 34 | Features |
27 | -------- | 35 | -------- |
@@ -40,4 +48,3 @@ Minor numbers are however allocated for it. | |||
40 | 48 | ||
41 | 49 | ||
42 | Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c | 50 | Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c |
43 | |||