diff options
Diffstat (limited to 'Documentation')
27 files changed, 1958 insertions, 492 deletions
diff --git a/Documentation/Changes b/Documentation/Changes index 783ddc3ce4e8..86b86399d61d 100644 --- a/Documentation/Changes +++ b/Documentation/Changes | |||
@@ -139,9 +139,14 @@ You'll probably want to upgrade. | |||
139 | Ksymoops | 139 | Ksymoops |
140 | -------- | 140 | -------- |
141 | 141 | ||
142 | If the unthinkable happens and your kernel oopses, you'll need a 2.4 | 142 | If the unthinkable happens and your kernel oopses, you may need the |
143 | version of ksymoops to decode the report; see REPORTING-BUGS in the | 143 | ksymoops tool to decode it, but in most cases you don't. |
144 | root of the Linux source for more information. | 144 | In the 2.6 kernel it is generally preferred to build the kernel with |
145 | CONFIG_KALLSYMS so that it produces readable dumps that can be used as-is | ||
146 | (this also produces better output than ksymoops). | ||
147 | If for some reason your kernel is not built with CONFIG_KALLSYMS and | ||
148 | you have no way to rebuild and reproduce the Oops with that option, then | ||
149 | you can still decode that Oops with ksymoops. | ||
145 | 150 | ||
146 | Module-Init-Tools | 151 | Module-Init-Tools |
147 | ----------------- | 152 | ----------------- |
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index fa3e29ad8a46..7018f5c6a447 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile | |||
@@ -10,7 +10,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \ | |||
10 | kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ | 10 | kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ |
11 | procfs-guide.xml writing_usb_driver.xml \ | 11 | procfs-guide.xml writing_usb_driver.xml \ |
12 | sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \ | 12 | sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \ |
13 | gadget.xml libata.xml mtdnand.xml librs.xml | 13 | gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml |
14 | 14 | ||
15 | ### | 15 | ### |
16 | # The build process is as follows (targets): | 16 | # The build process is as follows (targets): |
diff --git a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl index 341aaa4ce481..2077f9a28c19 100644 --- a/Documentation/DocBook/journal-api.tmpl +++ b/Documentation/DocBook/journal-api.tmpl | |||
@@ -306,7 +306,7 @@ an example. | |||
306 | </para> | 306 | </para> |
307 | <sect1><title>Journal Level</title> | 307 | <sect1><title>Journal Level</title> |
308 | !Efs/jbd/journal.c | 308 | !Efs/jbd/journal.c |
309 | !Efs/jbd/recovery.c | 309 | !Ifs/jbd/recovery.c |
310 | </sect1> | 310 | </sect1> |
311 | <sect1><title>Transaction Level</title> | 311 | <sect1><title>Transaction Level</title> |
312 | !Efs/jbd/transaction.c | 312 | !Efs/jbd/transaction.c |
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index ec474e5a25ed..a8316b1a3e3d 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl | |||
@@ -118,7 +118,7 @@ X!Ilib/string.c | |||
118 | </sect1> | 118 | </sect1> |
119 | <sect1><title>User Space Memory Access</title> | 119 | <sect1><title>User Space Memory Access</title> |
120 | !Iinclude/asm-i386/uaccess.h | 120 | !Iinclude/asm-i386/uaccess.h |
121 | !Iarch/i386/lib/usercopy.c | 121 | !Earch/i386/lib/usercopy.c |
122 | </sect1> | 122 | </sect1> |
123 | <sect1><title>More Memory Management Functions</title> | 123 | <sect1><title>More Memory Management Functions</title> |
124 | !Iinclude/linux/rmap.h | 124 | !Iinclude/linux/rmap.h |
@@ -174,7 +174,6 @@ X!Ilib/string.c | |||
174 | <title>The Linux VFS</title> | 174 | <title>The Linux VFS</title> |
175 | <sect1><title>The Filesystem types</title> | 175 | <sect1><title>The Filesystem types</title> |
176 | !Iinclude/linux/fs.h | 176 | !Iinclude/linux/fs.h |
177 | !Einclude/linux/fs.h | ||
178 | </sect1> | 177 | </sect1> |
179 | <sect1><title>The Directory Cache</title> | 178 | <sect1><title>The Directory Cache</title> |
180 | !Efs/dcache.c | 179 | !Efs/dcache.c |
@@ -266,7 +265,7 @@ X!Ekernel/module.c | |||
266 | <chapter id="hardware"> | 265 | <chapter id="hardware"> |
267 | <title>Hardware Interfaces</title> | 266 | <title>Hardware Interfaces</title> |
268 | <sect1><title>Interrupt Handling</title> | 267 | <sect1><title>Interrupt Handling</title> |
269 | !Ikernel/irq/manage.c | 268 | !Ekernel/irq/manage.c |
270 | </sect1> | 269 | </sect1> |
271 | 270 | ||
272 | <sect1><title>Resources Management</title> | 271 | <sect1><title>Resources Management</title> |
@@ -501,7 +500,7 @@ KAO --> | |||
501 | !Edrivers/video/modedb.c | 500 | !Edrivers/video/modedb.c |
502 | </sect1> | 501 | </sect1> |
503 | <sect1><title>Frame Buffer Macintosh Video Mode Database</title> | 502 | <sect1><title>Frame Buffer Macintosh Video Mode Database</title> |
504 | !Idrivers/video/macmodes.c | 503 | !Edrivers/video/macmodes.c |
505 | </sect1> | 504 | </sect1> |
506 | <sect1><title>Frame Buffer Fonts</title> | 505 | <sect1><title>Frame Buffer Fonts</title> |
507 | <para> | 506 | <para> |
diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl new file mode 100644 index 000000000000..1becf27ba27e --- /dev/null +++ b/Documentation/DocBook/rapidio.tmpl | |||
@@ -0,0 +1,160 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8"?> | ||
2 | <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" | ||
3 | "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [ | ||
4 | <!ENTITY rapidio SYSTEM "rapidio.xml"> | ||
5 | ]> | ||
6 | |||
7 | <book id="RapidIO-Guide"> | ||
8 | <bookinfo> | ||
9 | <title>RapidIO Subsystem Guide</title> | ||
10 | |||
11 | <authorgroup> | ||
12 | <author> | ||
13 | <firstname>Matt</firstname> | ||
14 | <surname>Porter</surname> | ||
15 | <affiliation> | ||
16 | <address> | ||
17 | <email>mporter@kernel.crashing.org</email> | ||
18 | <email>mporter@mvista.com</email> | ||
19 | </address> | ||
20 | </affiliation> | ||
21 | </author> | ||
22 | </authorgroup> | ||
23 | |||
24 | <copyright> | ||
25 | <year>2005</year> | ||
26 | <holder>MontaVista Software, Inc.</holder> | ||
27 | </copyright> | ||
28 | |||
29 | <legalnotice> | ||
30 | <para> | ||
31 | This documentation is free software; you can redistribute | ||
32 | it and/or modify it under the terms of the GNU General Public | ||
33 | License version 2 as published by the Free Software Foundation. | ||
34 | </para> | ||
35 | |||
36 | <para> | ||
37 | This program is distributed in the hope that it will be | ||
38 | useful, but WITHOUT ANY WARRANTY; without even the implied | ||
39 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
40 | See the GNU General Public License for more details. | ||
41 | </para> | ||
42 | |||
43 | <para> | ||
44 | You should have received a copy of the GNU General Public | ||
45 | License along with this program; if not, write to the Free | ||
46 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, | ||
47 | MA 02111-1307 USA | ||
48 | </para> | ||
49 | |||
50 | <para> | ||
51 | For more details see the file COPYING in the source | ||
52 | distribution of Linux. | ||
53 | </para> | ||
54 | </legalnotice> | ||
55 | </bookinfo> | ||
56 | |||
57 | <toc></toc> | ||
58 | |||
59 | <chapter id="intro"> | ||
60 | <title>Introduction</title> | ||
61 | <para> | ||
62 | RapidIO is a high speed switched fabric interconnect with | ||
63 | features aimed at the embedded market. RapidIO provides | ||
64 | support for memory-mapped I/O as well as message-based | ||
65 | transactions over the switched fabric network. RapidIO has | ||
66 | a standardized discovery mechanism not unlike the PCI bus | ||
67 | standard that allows simple detection of devices in a | ||
68 | network. | ||
69 | </para> | ||
70 | <para> | ||
71 | This documentation is provided for developers intending | ||
72 | to support RapidIO on new architectures, write new drivers, | ||
73 | or to understand the subsystem internals. | ||
74 | </para> | ||
75 | </chapter> | ||
76 | |||
77 | <chapter id="bugs"> | ||
78 | <title>Known Bugs and Limitations</title> | ||
79 | |||
80 | <sect1> | ||
81 | <title>Bugs</title> | ||
82 | <para>None. ;)</para> | ||
83 | </sect1> | ||
84 | <sect1> | ||
85 | <title>Limitations</title> | ||
86 | <para> | ||
87 | <orderedlist> | ||
88 | <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem> | ||
89 | <listitem><para>Multiple host enumeration is not supported</para></listitem> | ||
90 | </orderedlist> | ||
91 | </para> | ||
92 | </sect1> | ||
93 | </chapter> | ||
94 | |||
95 | <chapter id="drivers"> | ||
96 | <title>RapidIO driver interface</title> | ||
97 | <para> | ||
98 | Drivers are provided a set of calls in order | ||
99 | to interface with the subsystem to gather info | ||
100 | on devices, request/map memory region resources, | ||
101 | and manage mailboxes/doorbells. | ||
102 | </para> | ||
103 | <sect1> | ||
104 | <title>Functions</title> | ||
105 | !Iinclude/linux/rio_drv.h | ||
106 | !Edrivers/rapidio/rio-driver.c | ||
107 | !Edrivers/rapidio/rio.c | ||
108 | </sect1> | ||
109 | </chapter> | ||
110 | |||
111 | <chapter id="internals"> | ||
112 | <title>Internals</title> | ||
113 | |||
114 | <para> | ||
115 | This chapter contains the autogenerated documentation of the RapidIO | ||
116 | subsystem. | ||
117 | </para> | ||
118 | |||
119 | <sect1><title>Structures</title> | ||
120 | !Iinclude/linux/rio.h | ||
121 | </sect1> | ||
122 | <sect1><title>Enumeration and Discovery</title> | ||
123 | !Idrivers/rapidio/rio-scan.c | ||
124 | </sect1> | ||
125 | <sect1><title>Driver functionality</title> | ||
126 | !Idrivers/rapidio/rio.c | ||
127 | !Idrivers/rapidio/rio-access.c | ||
128 | </sect1> | ||
129 | <sect1><title>Device model support</title> | ||
130 | !Idrivers/rapidio/rio-driver.c | ||
131 | </sect1> | ||
132 | <sect1><title>Sysfs support</title> | ||
133 | !Idrivers/rapidio/rio-sysfs.c | ||
134 | </sect1> | ||
135 | <sect1><title>PPC32 support</title> | ||
136 | !Iarch/ppc/kernel/rio.c | ||
137 | !Earch/ppc/syslib/ppc85xx_rio.c | ||
138 | !Iarch/ppc/syslib/ppc85xx_rio.c | ||
139 | </sect1> | ||
140 | </chapter> | ||
141 | |||
142 | <chapter id="credits"> | ||
143 | <title>Credits</title> | ||
144 | <para> | ||
145 | The following people have contributed to the RapidIO | ||
146 | subsystem directly or indirectly: | ||
147 | <orderedlist> | ||
148 | <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> | ||
149 | <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem> | ||
150 | <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem> | ||
151 | </orderedlist> | ||
152 | </para> | ||
153 | <para> | ||
154 | The following people have contributed to this document: | ||
155 | <orderedlist> | ||
156 | <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> | ||
157 | </orderedlist> | ||
158 | </para> | ||
159 | </chapter> | ||
160 | </book> | ||
diff --git a/Documentation/MSI-HOWTO.txt b/Documentation/MSI-HOWTO.txt index 63edc5f847c4..3ec6c720b016 100644 --- a/Documentation/MSI-HOWTO.txt +++ b/Documentation/MSI-HOWTO.txt | |||
@@ -10,14 +10,22 @@ | |||
10 | This guide describes the basics of Message Signaled Interrupts (MSI), | 10 | This guide describes the basics of Message Signaled Interrupts (MSI), |
11 | the advantages of using MSI over traditional interrupt mechanisms, | 11 | the advantages of using MSI over traditional interrupt mechanisms, |
12 | and how to enable your driver to use MSI or MSI-X. Also included is | 12 | and how to enable your driver to use MSI or MSI-X. Also included is |
13 | a Frequently Asked Questions. | 13 | a Frequently Asked Questions (FAQ) section. |
14 | |||
15 | 1.1 Terminology | ||
16 | |||
17 | PCI devices can be single-function or multi-function. In either case, | ||
18 | when this text talks about enabling or disabling MSI on a "device | ||
19 | function," it is referring to one specific PCI device and function and | ||
20 | not to all functions on a PCI device (unless the PCI device has only | ||
21 | one function). | ||
14 | 22 | ||
15 | 2. Copyright 2003 Intel Corporation | 23 | 2. Copyright 2003 Intel Corporation |
16 | 24 | ||
17 | 3. What is MSI/MSI-X? | 25 | 3. What is MSI/MSI-X? |
18 | 26 | ||
19 | Message Signaled Interrupt (MSI), as described in the PCI Local Bus | 27 | Message Signaled Interrupt (MSI), as described in the PCI Local Bus |
20 | Specification Revision 2.3 or latest, is an optional feature, and a | 28 | Specification Revision 2.3 or later, is an optional feature, and a |
21 | required feature for PCI Express devices. MSI enables a device function | 29 | required feature for PCI Express devices. MSI enables a device function |
22 | to request service by sending an Inbound Memory Write on its PCI bus to | 30 | to request service by sending an Inbound Memory Write on its PCI bus to |
23 | the FSB as a Message Signaled Interrupt transaction. Because MSI is | 31 | the FSB as a Message Signaled Interrupt transaction. Because MSI is |
@@ -27,7 +35,7 @@ supported. | |||
27 | 35 | ||
28 | A PCI device that supports MSI must also support pin IRQ assertion | 36 | A PCI device that supports MSI must also support pin IRQ assertion |
29 | interrupt mechanism to provide backward compatibility for systems that | 37 | interrupt mechanism to provide backward compatibility for systems that |
30 | do not support MSI. In Systems, which support MSI, the bus driver is | 38 | do not support MSI. In systems which support MSI, the bus driver is |
31 | responsible for initializing the message address and message data of | 39 | responsible for initializing the message address and message data of |
32 | the device function's MSI/MSI-X capability structure during device | 40 | the device function's MSI/MSI-X capability structure during device |
33 | initial configuration. | 41 | initial configuration. |
@@ -61,17 +69,17 @@ over the MSI capability structure as described below. | |||
61 | 69 | ||
62 | - MSI and MSI-X both support per-vector masking. Per-vector | 70 | - MSI and MSI-X both support per-vector masking. Per-vector |
63 | masking is an optional extension of MSI but a required | 71 | masking is an optional extension of MSI but a required |
64 | feature for MSI-X. Per-vector masking provides the kernel | 72 | feature for MSI-X. Per-vector masking provides the kernel the |
65 | the ability to mask/unmask MSI when servicing its software | 73 | ability to mask/unmask a single MSI while running its |
66 | interrupt service routing handler. If per-vector masking is | 74 | interrupt service routine. If per-vector masking is |
67 | not supported, then the device driver should provide the | 75 | not supported, then the device driver should provide the |
68 | hardware/software synchronization to ensure that the device | 76 | hardware/software synchronization to ensure that the device |
69 | generates MSI when the driver wants it to do so. | 77 | generates MSI when the driver wants it to do so. |
70 | 78 | ||
71 | 4. Why use MSI? | 79 | 4. Why use MSI? |
72 | 80 | ||
73 | As a benefit the simplification of board design, MSI allows board | 81 | As a benefit to the simplification of board design, MSI allows board |
74 | designers to remove out of band interrupt routing. MSI is another | 82 | designers to remove out-of-band interrupt routing. MSI is another |
75 | step towards a legacy-free environment. | 83 | step towards a legacy-free environment. |
76 | 84 | ||
77 | Due to increasing pressure on chipset and processor packages to | 85 | Due to increasing pressure on chipset and processor packages to |
@@ -87,7 +95,7 @@ support. As a result, the PCI Express technology requires MSI | |||
87 | support for better interrupt performance. | 95 | support for better interrupt performance. |
88 | 96 | ||
89 | Using MSI enables the device functions to support two or more | 97 | Using MSI enables the device functions to support two or more |
90 | vectors, which can be configured to target different CPU's to | 98 | vectors, which can be configured to target different CPUs to |
91 | increase scalability. | 99 | increase scalability. |
92 | 100 | ||
93 | 5. Configuring a driver to use MSI/MSI-X | 101 | 5. Configuring a driver to use MSI/MSI-X |
@@ -119,13 +127,13 @@ pci_enable_msi() explicitly. | |||
119 | 127 | ||
120 | int pci_enable_msi(struct pci_dev *dev) | 128 | int pci_enable_msi(struct pci_dev *dev) |
121 | 129 | ||
122 | With this new API, any existing device driver, which like to have | 130 | With this new API, a device driver that wants to have MSI |
123 | MSI enabled on its device function, must call this API to enable MSI | 131 | enabled on its device function must call this API to enable MSI. |
124 | A successful call will initialize the MSI capability structure | 132 | A successful call will initialize the MSI capability structure |
125 | with ONE vector, regardless of whether a device function is | 133 | with ONE vector, regardless of whether a device function is |
126 | capable of supporting multiple messages. This vector replaces the | 134 | capable of supporting multiple messages. This vector replaces the |
127 | pre-assigned dev->irq with a new MSI vector. To avoid the conflict | 135 | pre-assigned dev->irq with a new MSI vector. To avoid a conflict |
128 | of new assigned vector with existing pre-assigned vector requires | 136 | of the new assigned vector with existing pre-assigned vector requires |
129 | a device driver to call this API before calling request_irq(). | 137 | a device driver to call this API before calling request_irq(). |
130 | 138 | ||
131 | 5.2.2 API pci_disable_msi | 139 | 5.2.2 API pci_disable_msi |
@@ -137,14 +145,14 @@ when a device driver is unloading. This API restores dev->irq with | |||
137 | the pre-assigned IOAPIC vector and switches a device's interrupt | 145 | the pre-assigned IOAPIC vector and switches a device's interrupt |
138 | mode to PCI pin-irq assertion/INTx emulation mode. | 146 | mode to PCI pin-irq assertion/INTx emulation mode. |
139 | 147 | ||
140 | Note that a device driver should always call free_irq() on MSI vector | 148 | Note that a device driver should always call free_irq() on the MSI vector |
141 | it has done request_irq() on before calling this API. Failure to do | 149 | that it has done request_irq() on before calling this API. Failure to do |
142 | so results a BUG_ON() and a device will be left with MSI enabled and | 150 | so results in a BUG_ON() and a device will be left with MSI enabled and |
143 | leaks its vector. | 151 | leaks its vector. |
144 | 152 | ||
145 | 5.2.3 MSI mode vs. legacy mode diagram | 153 | 5.2.3 MSI mode vs. legacy mode diagram |
146 | 154 | ||
147 | The below diagram shows the events, which switches the interrupt | 155 | The below diagram shows the events which switch the interrupt |
148 | mode on the MSI-capable device function between MSI mode and | 156 | mode on the MSI-capable device function between MSI mode and |
149 | PIN-IRQ assertion mode. | 157 | PIN-IRQ assertion mode. |
150 | 158 | ||
@@ -155,9 +163,9 @@ PIN-IRQ assertion mode. | |||
155 | ------------ pci_disable_msi ------------------------ | 163 | ------------ pci_disable_msi ------------------------ |
156 | 164 | ||
157 | 165 | ||
158 | Figure 1.0 MSI Mode vs. Legacy Mode | 166 | Figure 1. MSI Mode vs. Legacy Mode |
159 | 167 | ||
160 | In Figure 1.0, a device operates by default in legacy mode. Legacy | 168 | In Figure 1, a device operates by default in legacy mode. Legacy |
161 | in this context means PCI pin-irq assertion or PCI-Express INTx | 169 | in this context means PCI pin-irq assertion or PCI-Express INTx |
162 | emulation. A successful MSI request (using pci_enable_msi()) switches | 170 | emulation. A successful MSI request (using pci_enable_msi()) switches |
163 | a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector | 171 | a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector |
@@ -166,11 +174,11 @@ assigned MSI vector will replace dev->irq. | |||
166 | 174 | ||
167 | To return back to its default mode, a device driver should always call | 175 | To return back to its default mode, a device driver should always call |
168 | pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a | 176 | pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a |
169 | device driver should always call free_irq() on MSI vector it has done | 177 | device driver should always call free_irq() on the MSI vector it has |
170 | request_irq() on before calling pci_disable_msi(). Failure to do so | 178 | done request_irq() on before calling pci_disable_msi(). Failure to do |
171 | results a BUG_ON() and a device will be left with MSI enabled and | 179 | so results in a BUG_ON() and a device will be left with MSI enabled and |
172 | leaks its vector. Otherwise, the PCI subsystem restores a device's | 180 | leaks its vector. Otherwise, the PCI subsystem restores a device's |
173 | dev->irq with a pre-assigned IOAPIC vector and marks released | 181 | dev->irq with a pre-assigned IOAPIC vector and marks the released |
174 | MSI vector as unused. | 182 | MSI vector as unused. |
175 | 183 | ||
176 | Once being marked as unused, there is no guarantee that the PCI | 184 | Once being marked as unused, there is no guarantee that the PCI |
@@ -178,8 +186,8 @@ subsystem will reserve this MSI vector for a device. Depending on | |||
178 | the availability of current PCI vector resources and the number of | 186 | the availability of current PCI vector resources and the number of |
179 | MSI/MSI-X requests from other drivers, this MSI may be re-assigned. | 187 | MSI/MSI-X requests from other drivers, this MSI may be re-assigned. |
180 | 188 | ||
181 | For the case where the PCI subsystem re-assigned this MSI vector | 189 | For the case where the PCI subsystem re-assigns this MSI vector to |
182 | another driver, a request to switching back to MSI mode may result | 190 | another driver, a request to switch back to MSI mode may result |
183 | in being assigned a different MSI vector or a failure if no more | 191 | in being assigned a different MSI vector or a failure if no more |
184 | vectors are available. | 192 | vectors are available. |
185 | 193 | ||
@@ -208,12 +216,12 @@ Unlike the function pci_enable_msi(), the function pci_enable_msix() | |||
208 | does not replace the pre-assigned IOAPIC dev->irq with a new MSI | 216 | does not replace the pre-assigned IOAPIC dev->irq with a new MSI |
209 | vector because the PCI subsystem writes the 1:1 vector-to-entry mapping | 217 | vector because the PCI subsystem writes the 1:1 vector-to-entry mapping |
210 | into the field vector of each element contained in a second argument. | 218 | into the field vector of each element contained in a second argument. |
211 | Note that the pre-assigned IO-APIC dev->irq is valid only if the device | 219 | Note that the pre-assigned IOAPIC dev->irq is valid only if the device |
212 | operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt of | 220 | operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at |
213 | using dev->irq by the device driver to request for interrupt service | 221 | using dev->irq by the device driver to request for interrupt service |
214 | may result in unpredictable behavior. | 222 | may result in unpredictable behavior. |
215 | 223 | ||
216 | For each MSI-X vector granted, a device driver is responsible to call | 224 | For each MSI-X vector granted, a device driver is responsible for calling |
217 | other functions like request_irq(), enable_irq(), etc. to enable | 225 | other functions like request_irq(), enable_irq(), etc. to enable |
218 | this vector with its corresponding interrupt service handler. It is | 226 | this vector with its corresponding interrupt service handler. It is |
219 | a device driver's choice to assign all vectors with the same | 227 | a device driver's choice to assign all vectors with the same |
@@ -224,13 +232,13 @@ service handler. | |||
224 | 232 | ||
225 | The PCI 3.0 specification has implementation notes that MMIO address | 233 | The PCI 3.0 specification has implementation notes that MMIO address |
226 | space for a device's MSI-X structure should be isolated so that the | 234 | space for a device's MSI-X structure should be isolated so that the |
227 | software system can set different page for controlling accesses to | 235 | software system can set different pages for controlling accesses to the |
228 | the MSI-X structure. The implementation of MSI patch requires the PCI | 236 | MSI-X structure. The implementation of MSI support requires the PCI |
229 | subsystem, not a device driver, to maintain full control of the MSI-X | 237 | subsystem, not a device driver, to maintain full control of the MSI-X |
230 | table/MSI-X PBA and MMIO address space of the MSI-X table/MSI-X PBA. | 238 | table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X |
231 | A device driver is prohibited from requesting the MMIO address space | 239 | table/MSI-X PBA. A device driver is prohibited from requesting the MMIO |
232 | of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem will fail | 240 | address space of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem |
233 | enabling MSI-X on its hardware device when it calls the function | 241 | will fail enabling MSI-X on its hardware device when it calls the function |
234 | pci_enable_msix(). | 242 | pci_enable_msix(). |
235 | 243 | ||
236 | 5.3.2 Handling MSI-X allocation | 244 | 5.3.2 Handling MSI-X allocation |
@@ -274,9 +282,9 @@ For the case where fewer MSI-X vectors are allocated to a function | |||
274 | than requested, the function pci_enable_msix() will return the | 282 | than requested, the function pci_enable_msix() will return the |
275 | maximum number of MSI-X vectors available to the caller. A device | 283 | maximum number of MSI-X vectors available to the caller. A device |
276 | driver may re-send its request with fewer or equal vectors indicated | 284 | driver may re-send its request with fewer or equal vectors indicated |
277 | in a return. For example, if a device driver requests 5 vectors, but | 285 | in the return. For example, if a device driver requests 5 vectors, but |
278 | the number of available vectors is 3 vectors, a value of 3 will be a | 286 | the number of available vectors is 3 vectors, a value of 3 will be |
279 | return as a result of pci_enable_msix() call. A function could be | 287 | returned as a result of pci_enable_msix() call. A function could be |
280 | designed for its driver to use only 3 MSI-X table entries as | 288 | designed for its driver to use only 3 MSI-X table entries as |
281 | different combinations as ABC--, A-B-C, A--CB, etc. Note that this | 289 | different combinations as ABC--, A-B-C, A--CB, etc. Note that this |
282 | patch does not support multiple entries with the same vector. Such | 290 | patch does not support multiple entries with the same vector. Such |
@@ -285,49 +293,46 @@ as ABBCC, AABCC, BCCBA, etc will result as a failure by the function | |||
285 | pci_enable_msix(). Below are the reasons why supporting multiple | 293 | pci_enable_msix(). Below are the reasons why supporting multiple |
286 | entries with the same vector is an undesirable solution. | 294 | entries with the same vector is an undesirable solution. |
287 | 295 | ||
288 | - The PCI subsystem can not determine which entry, which | 296 | - The PCI subsystem cannot determine the entry that |
289 | generated the message, to mask/unmask MSI while handling | 297 | generated the message to mask/unmask MSI while handling |
290 | software driver ISR. Attempting to walk through all MSI-X | 298 | software driver ISR. Attempting to walk through all MSI-X |
291 | table entries (2048 max) to mask/unmask any match vector | 299 | table entries (2048 max) to mask/unmask any match vector |
292 | is an undesirable solution. | 300 | is an undesirable solution. |
293 | 301 | ||
294 | - Walk through all MSI-X table entries (2048 max) to handle | 302 | - Walking through all MSI-X table entries (2048 max) to handle |
295 | SMP affinity of any match vector is an undesirable solution. | 303 | SMP affinity of any match vector is an undesirable solution. |
296 | 304 | ||
297 | 5.3.4 API pci_enable_msix | 305 | 5.3.4 API pci_enable_msix |
298 | 306 | ||
299 | int pci_enable_msix(struct pci_dev *dev, u32 *entries, int nvec) | 307 | int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) |
300 | 308 | ||
301 | This API enables a device driver to request the PCI subsystem | 309 | This API enables a device driver to request the PCI subsystem |
302 | for enabling MSI-X messages on its hardware device. Depending on | 310 | to enable MSI-X messages on its hardware device. Depending on |
303 | the availability of PCI vector resources, the PCI subsystem enables | 311 | the availability of PCI vector resources, the PCI subsystem enables |
304 | either all or nothing. | 312 | either all or none of the requested vectors. |
305 | 313 | ||
306 | Argument dev points to the device (pci_dev) structure. | 314 | Argument 'dev' points to the device (pci_dev) structure. |
307 | 315 | ||
308 | Argument entries is a pointer of unsigned integer type. The number of | 316 | Argument 'entries' is a pointer to an array of msix_entry structs. |
309 | elements is indicated in argument nvec. The content of each element | 317 | The number of entries is indicated in argument 'nvec'. |
310 | will be mapped to the following struct defined in /driver/pci/msi.h. | 318 | struct msix_entry is defined in /driver/pci/msi.h: |
311 | 319 | ||
312 | struct msix_entry { | 320 | struct msix_entry { |
313 | u16 vector; /* kernel uses to write alloc vector */ | 321 | u16 vector; /* kernel uses to write alloc vector */ |
314 | u16 entry; /* driver uses to specify entry */ | 322 | u16 entry; /* driver uses to specify entry */ |
315 | }; | 323 | }; |
316 | 324 | ||
317 | A device driver is responsible for initializing the field entry of | 325 | A device driver is responsible for initializing the field 'entry' of |
318 | each element with unique entry supported by MSI-X table. Otherwise, | 326 | each element with a unique entry supported by MSI-X table. Otherwise, |
319 | -EINVAL will be returned as a result. A successful return of zero | 327 | -EINVAL will be returned as a result. A successful return of zero |
320 | indicates the PCI subsystem completes initializing each of requested | 328 | indicates the PCI subsystem completed initializing each of the requested |
321 | entries of the MSI-X table with message address and message data. | 329 | entries of the MSI-X table with message address and message data. |
322 | Last but not least, the PCI subsystem will write the 1:1 | 330 | Last but not least, the PCI subsystem will write the 1:1 |
323 | vector-to-entry mapping into the field vector of each element. A | 331 | vector-to-entry mapping into the field 'vector' of each element. A |
324 | device driver is responsible of keeping track of allocated MSI-X | 332 | device driver is responsible for keeping track of allocated MSI-X |
325 | vectors in its internal data structure. | 333 | vectors in its internal data structure. |
326 | 334 | ||
327 | Argument nvec is an integer indicating the number of messages | 335 | A return of zero indicates that the number of MSI-X vectors was |
328 | requested. | ||
329 | |||
330 | A return of zero indicates that the number of MSI-X vectors is | ||
331 | successfully allocated. A return of greater than zero indicates | 336 | successfully allocated. A return of greater than zero indicates |
332 | MSI-X vector shortage. Or a return of less than zero indicates | 337 | MSI-X vector shortage. Or a return of less than zero indicates |
333 | a failure. This failure may be a result of duplicate entries | 338 | a failure. This failure may be a result of duplicate entries |
@@ -341,12 +346,12 @@ void pci_disable_msix(struct pci_dev *dev) | |||
341 | This API should always be used to undo the effect of pci_enable_msix() | 346 | This API should always be used to undo the effect of pci_enable_msix() |
342 | when a device driver is unloading. Note that a device driver should | 347 | when a device driver is unloading. Note that a device driver should |
343 | always call free_irq() on all MSI-X vectors it has done request_irq() | 348 | always call free_irq() on all MSI-X vectors it has done request_irq() |
344 | on before calling this API. Failure to do so results a BUG_ON() and | 349 | on before calling this API. Failure to do so results in a BUG_ON() and |
345 | a device will be left with MSI-X enabled and leaks its vectors. | 350 | a device will be left with MSI-X enabled and leaks its vectors. |
346 | 351 | ||
347 | 5.3.6 MSI-X mode vs. legacy mode diagram | 352 | 5.3.6 MSI-X mode vs. legacy mode diagram |
348 | 353 | ||
349 | The below diagram shows the events, which switches the interrupt | 354 | The below diagram shows the events which switch the interrupt |
350 | mode on the MSI-X capable device function between MSI-X mode and | 355 | mode on the MSI-X capable device function between MSI-X mode and |
351 | PIN-IRQ assertion mode (legacy). | 356 | PIN-IRQ assertion mode (legacy). |
352 | 357 | ||
@@ -356,22 +361,22 @@ PIN-IRQ assertion mode (legacy). | |||
356 | | | ===============> | | | 361 | | | ===============> | | |
357 | ------------ pci_disable_msix ------------------------ | 362 | ------------ pci_disable_msix ------------------------ |
358 | 363 | ||
359 | Figure 2.0 MSI-X Mode vs. Legacy Mode | 364 | Figure 2. MSI-X Mode vs. Legacy Mode |
360 | 365 | ||
361 | In Figure 2.0, a device operates by default in legacy mode. A | 366 | In Figure 2, a device operates by default in legacy mode. A |
362 | successful MSI-X request (using pci_enable_msix()) switches a | 367 | successful MSI-X request (using pci_enable_msix()) switches a |
363 | device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector | 368 | device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector |
364 | stored in dev->irq will be saved by the PCI subsystem; however, | 369 | stored in dev->irq will be saved by the PCI subsystem; however, |
365 | unlike MSI mode, the PCI subsystem will not replace dev->irq with | 370 | unlike MSI mode, the PCI subsystem will not replace dev->irq with |
366 | assigned MSI-X vector because the PCI subsystem already writes the 1:1 | 371 | assigned MSI-X vector because the PCI subsystem already writes the 1:1 |
367 | vector-to-entry mapping into the field vector of each element | 372 | vector-to-entry mapping into the field 'vector' of each element |
368 | specified in second argument. | 373 | specified in second argument. |
369 | 374 | ||
370 | To return back to its default mode, a device driver should always call | 375 | To return back to its default mode, a device driver should always call |
371 | pci_disable_msix() to undo the effect of pci_enable_msix(). Note that | 376 | pci_disable_msix() to undo the effect of pci_enable_msix(). Note that |
372 | a device driver should always call free_irq() on all MSI-X vectors it | 377 | a device driver should always call free_irq() on all MSI-X vectors it |
373 | has done request_irq() on before calling pci_disable_msix(). Failure | 378 | has done request_irq() on before calling pci_disable_msix(). Failure |
374 | to do so results a BUG_ON() and a device will be left with MSI-X | 379 | to do so results in a BUG_ON() and a device will be left with MSI-X |
375 | enabled and leaks its vectors. Otherwise, the PCI subsystem switches a | 380 | enabled and leaks its vectors. Otherwise, the PCI subsystem switches a |
376 | device function's interrupt mode from MSI-X mode to legacy mode and | 381 | device function's interrupt mode from MSI-X mode to legacy mode and |
377 | marks all allocated MSI-X vectors as unused. | 382 | marks all allocated MSI-X vectors as unused. |
@@ -383,53 +388,56 @@ MSI/MSI-X requests from other drivers, these MSI-X vectors may be | |||
383 | re-assigned. | 388 | re-assigned. |
384 | 389 | ||
385 | For the case where the PCI subsystem re-assigned these MSI-X vectors | 390 | For the case where the PCI subsystem re-assigned these MSI-X vectors |
386 | to other driver, a request to switching back to MSI-X mode may result | 391 | to other drivers, a request to switch back to MSI-X mode may result |
387 | in being assigned with another set of MSI-X vectors or a failure if no | 392 | in being assigned with another set of MSI-X vectors or a failure if no |
388 | more vectors are available. | 393 | more vectors are available. |
389 | 394 | ||
390 | 5.4 Handling function implementng both MSI and MSI-X capabilities | 395 | 5.4 Handling function implementing both MSI and MSI-X capabilities |
391 | 396 | ||
392 | For the case where a function implements both MSI and MSI-X | 397 | For the case where a function implements both MSI and MSI-X |
393 | capabilities, the PCI subsystem enables a device to run either in MSI | 398 | capabilities, the PCI subsystem enables a device to run either in MSI |
394 | mode or MSI-X mode but not both. A device driver determines whether it | 399 | mode or MSI-X mode but not both. A device driver determines whether it |
395 | wants MSI or MSI-X enabled on its hardware device. Once a device | 400 | wants MSI or MSI-X enabled on its hardware device. Once a device |
396 | driver requests for MSI, for example, it is prohibited to request for | 401 | driver requests for MSI, for example, it is prohibited from requesting |
397 | MSI-X; in other words, a device driver is not permitted to ping-pong | 402 | MSI-X; in other words, a device driver is not permitted to ping-pong |
398 | between MSI mode and MSI-X mode during a run-time. | 403 | between MSI mode and MSI-X mode during a run-time. |
399 | 404 | ||
400 | 5.5 Hardware requirements for MSI/MSI-X support | 405 | 5.5 Hardware requirements for MSI/MSI-X support |
406 | |||
401 | MSI/MSI-X support requires support from both system hardware and | 407 | MSI/MSI-X support requires support from both system hardware and |
402 | individual hardware device functions. | 408 | individual hardware device functions. |
403 | 409 | ||
404 | 5.5.1 System hardware support | 410 | 5.5.1 System hardware support |
411 | |||
405 | Since the target of MSI address is the local APIC CPU, enabling | 412 | Since the target of MSI address is the local APIC CPU, enabling |
406 | MSI/MSI-X support in Linux kernel is dependent on whether existing | 413 | MSI/MSI-X support in the Linux kernel is dependent on whether existing |
407 | system hardware supports local APIC. Users should verify their | 414 | system hardware supports local APIC. Users should verify that their |
408 | system whether it runs when CONFIG_X86_LOCAL_APIC=y. | 415 | system supports local APIC operation by testing that it runs when |
416 | CONFIG_X86_LOCAL_APIC=y. | ||
409 | 417 | ||
410 | In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; | 418 | In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; |
411 | however, in UP environment, users must manually set | 419 | however, in UP environment, users must manually set |
412 | CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting | 420 | CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting |
413 | CONFIG_PCI_MSI enables the VECTOR based scheme and | 421 | CONFIG_PCI_MSI enables the VECTOR based scheme and the option for |
414 | the option for MSI-capable device drivers to selectively enable | 422 | MSI-capable device drivers to selectively enable MSI/MSI-X. |
415 | MSI/MSI-X. | ||
416 | 423 | ||
417 | Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X | 424 | Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X |
418 | vector is allocated new during runtime and MSI/MSI-X support does not | 425 | vector is allocated new during runtime and MSI/MSI-X support does not |
419 | depend on BIOS support. This key independency enables MSI/MSI-X | 426 | depend on BIOS support. This key independency enables MSI/MSI-X |
420 | support on future IOxAPIC free platform. | 427 | support on future IOxAPIC free platforms. |
421 | 428 | ||
422 | 5.5.2 Device hardware support | 429 | 5.5.2 Device hardware support |
430 | |||
423 | The hardware device function supports MSI by indicating the | 431 | The hardware device function supports MSI by indicating the |
424 | MSI/MSI-X capability structure on its PCI capability list. By | 432 | MSI/MSI-X capability structure on its PCI capability list. By |
425 | default, this capability structure will not be initialized by | 433 | default, this capability structure will not be initialized by |
426 | the kernel to enable MSI during the system boot. In other words, | 434 | the kernel to enable MSI during the system boot. In other words, |
427 | the device function is running on its default pin assertion mode. | 435 | the device function is running on its default pin assertion mode. |
428 | Note that in many cases the hardware supporting MSI have bugs, | 436 | Note that in many cases the hardware supporting MSI have bugs, |
429 | which may result in system hang. The software driver of specific | 437 | which may result in system hangs. The software driver of specific |
430 | MSI-capable hardware is responsible for whether calling | 438 | MSI-capable hardware is responsible for deciding whether to call |
431 | pci_enable_msi or not. A return of zero indicates the kernel | 439 | pci_enable_msi or not. A return of zero indicates the kernel |
432 | successfully initializes the MSI/MSI-X capability structure of the | 440 | successfully initialized the MSI/MSI-X capability structure of the |
433 | device function. The device function is now running on MSI/MSI-X mode. | 441 | device function. The device function is now running on MSI/MSI-X mode. |
434 | 442 | ||
435 | 5.6 How to tell whether MSI/MSI-X is enabled on device function | 443 | 5.6 How to tell whether MSI/MSI-X is enabled on device function |
@@ -439,10 +447,10 @@ pci_enable_msi()/pci_enable_msix() indicates to a device driver that | |||
439 | its device function is initialized successfully and ready to run in | 447 | its device function is initialized successfully and ready to run in |
440 | MSI/MSI-X mode. | 448 | MSI/MSI-X mode. |
441 | 449 | ||
442 | At the user level, users can use command 'cat /proc/interrupts' | 450 | At the user level, users can use the command 'cat /proc/interrupts' |
443 | to display the vector allocated for a device and its interrupt | 451 | to display the vectors allocated for devices and their interrupt |
444 | MSI/MSI-X mode ("PCI MSI"/"PCI MSIX"). Below shows below MSI mode is | 452 | MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is |
445 | enabled on a SCSI Adaptec 39320D Ultra320. | 453 | enabled on a SCSI Adaptec 39320D Ultra320 controller. |
446 | 454 | ||
447 | CPU0 CPU1 | 455 | CPU0 CPU1 |
448 | 0: 324639 0 IO-APIC-edge timer | 456 | 0: 324639 0 IO-APIC-edge timer |
@@ -453,8 +461,8 @@ enabled on a SCSI Adaptec 39320D Ultra320. | |||
453 | 15: 1 0 IO-APIC-edge ide1 | 461 | 15: 1 0 IO-APIC-edge ide1 |
454 | 169: 0 0 IO-APIC-level uhci-hcd | 462 | 169: 0 0 IO-APIC-level uhci-hcd |
455 | 185: 0 0 IO-APIC-level uhci-hcd | 463 | 185: 0 0 IO-APIC-level uhci-hcd |
456 | 193: 138 10 PCI MSI aic79xx | 464 | 193: 138 10 PCI-MSI aic79xx |
457 | 201: 30 0 PCI MSI aic79xx | 465 | 201: 30 0 PCI-MSI aic79xx |
458 | 225: 30 0 IO-APIC-level aic7xxx | 466 | 225: 30 0 IO-APIC-level aic7xxx |
459 | 233: 30 0 IO-APIC-level aic7xxx | 467 | 233: 30 0 IO-APIC-level aic7xxx |
460 | NMI: 0 0 | 468 | NMI: 0 0 |
@@ -490,8 +498,8 @@ target address set as 0xfeexxxxx, as conformed to PCI | |||
490 | specification 2.3 or latest, then it should work. | 498 | specification 2.3 or latest, then it should work. |
491 | 499 | ||
492 | Q4. From the driver point of view, if the MSI is lost because | 500 | Q4. From the driver point of view, if the MSI is lost because |
493 | of the errors occur during inbound memory write, then it may | 501 | of errors occurring during inbound memory write, then it may |
494 | wait for ever. Is there a mechanism for it to recover? | 502 | wait forever. Is there a mechanism for it to recover? |
495 | 503 | ||
496 | A4. Since the target of the transaction is an inbound memory | 504 | A4. Since the target of the transaction is an inbound memory |
497 | write, all transaction termination conditions (Retry, | 505 | write, all transaction termination conditions (Retry, |
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 354d89c78377..15da16861fa3 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt | |||
@@ -772,8 +772,6 @@ RCU pointer/list traversal: | |||
772 | list_for_each_entry_rcu | 772 | list_for_each_entry_rcu |
773 | list_for_each_continue_rcu (to be deprecated in favor of new | 773 | list_for_each_continue_rcu (to be deprecated in favor of new |
774 | list_for_each_entry_continue_rcu) | 774 | list_for_each_entry_continue_rcu) |
775 | hlist_for_each_rcu (to be deprecated in favor of | ||
776 | hlist_for_each_entry_rcu) | ||
777 | hlist_for_each_entry_rcu | 775 | hlist_for_each_entry_rcu |
778 | 776 | ||
779 | RCU pointer update: | 777 | RCU pointer update: |
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt index dca274ff4005..a5009c8300f3 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.txt | |||
@@ -19,7 +19,6 @@ There are two dm targets available: snapshot and snapshot-origin. | |||
19 | *) snapshot-origin <origin> | 19 | *) snapshot-origin <origin> |
20 | 20 | ||
21 | which will normally have one or more snapshots based on it. | 21 | which will normally have one or more snapshots based on it. |
22 | You must create the snapshot-origin device before you can create snapshots. | ||
23 | Reads will be mapped directly to the backing device. For each write, the | 22 | Reads will be mapped directly to the backing device. For each write, the |
24 | original data will be saved in the <COW device> of each snapshot to keep | 23 | original data will be saved in the <COW device> of each snapshot to keep |
25 | its visible content unchanged, at least until the <COW device> fills up. | 24 | its visible content unchanged, at least until the <COW device> fills up. |
@@ -27,7 +26,7 @@ its visible content unchanged, at least until the <COW device> fills up. | |||
27 | 26 | ||
28 | *) snapshot <origin> <COW device> <persistent?> <chunksize> | 27 | *) snapshot <origin> <COW device> <persistent?> <chunksize> |
29 | 28 | ||
30 | A snapshot is created of the <origin> block device. Changed chunks of | 29 | A snapshot of the <origin> block device is created. Changed chunks of |
31 | <chunksize> sectors will be stored on the <COW device>. Writes will | 30 | <chunksize> sectors will be stored on the <COW device>. Writes will |
32 | only go to the <COW device>. Reads will come from the <COW device> or | 31 | only go to the <COW device>. Reads will come from the <COW device> or |
33 | from <origin> for unchanged data. <COW device> will often be | 32 | from <origin> for unchanged data. <COW device> will often be |
@@ -37,6 +36,8 @@ the amount of free space and expand the <COW device> before it fills up. | |||
37 | 36 | ||
38 | <persistent?> is P (Persistent) or N (Not persistent - will not survive | 37 | <persistent?> is P (Persistent) or N (Not persistent - will not survive |
39 | after reboot). | 38 | after reboot). |
39 | The difference is that for transient snapshots less metadata must be | ||
40 | saved on disk - they can be kept in memory by the kernel. | ||
40 | 41 | ||
41 | 42 | ||
42 | How this is used by LVM2 | 43 | How this is used by LVM2 |
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt index 62db6758d1c1..ee277dd204b0 100644 --- a/Documentation/fb/vesafb.txt +++ b/Documentation/fb/vesafb.txt | |||
@@ -146,10 +146,10 @@ pmipal Use the protected mode interface for palette changes. | |||
146 | 146 | ||
147 | mtrr:n setup memory type range registers for the vesafb framebuffer | 147 | mtrr:n setup memory type range registers for the vesafb framebuffer |
148 | where n: | 148 | where n: |
149 | 0 - disabled (equivalent to nomtrr) | 149 | 0 - disabled (equivalent to nomtrr) (default) |
150 | 1 - uncachable | 150 | 1 - uncachable |
151 | 2 - write-back | 151 | 2 - write-back |
152 | 3 - write-combining (default) | 152 | 3 - write-combining |
153 | 4 - write-through | 153 | 4 - write-through |
154 | 154 | ||
155 | If you see the following in dmesg, choose the type that matches the | 155 | If you see the following in dmesg, choose the type that matches the |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b67189a8d8d4..decdf9917e0d 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -69,6 +69,22 @@ Who: Grant Coady <gcoady@gmail.com> | |||
69 | 69 | ||
70 | --------------------------- | 70 | --------------------------- |
71 | 71 | ||
72 | What: remove EXPORT_SYMBOL(panic_timeout) | ||
73 | When: April 2006 | ||
74 | Files: kernel/panic.c | ||
75 | Why: No modular usage in the kernel. | ||
76 | Who: Adrian Bunk <bunk@stusta.de> | ||
77 | |||
78 | --------------------------- | ||
79 | |||
80 | What: remove EXPORT_SYMBOL(insert_resource) | ||
81 | When: April 2006 | ||
82 | Files: kernel/resource.c | ||
83 | Why: No modular usage in the kernel. | ||
84 | Who: Adrian Bunk <bunk@stusta.de> | ||
85 | |||
86 | --------------------------- | ||
87 | |||
72 | What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) | 88 | What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) |
73 | When: November 2005 | 89 | When: November 2005 |
74 | Files: drivers/pcmcia/: pcmcia_ioctl.c | 90 | Files: drivers/pcmcia/: pcmcia_ioctl.c |
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt new file mode 100644 index 000000000000..4c0c575a4012 --- /dev/null +++ b/Documentation/filesystems/dentry-locking.txt | |||
@@ -0,0 +1,173 @@ | |||
1 | RCU-based dcache locking model | ||
2 | ============================== | ||
3 | |||
4 | On many workloads, the most common operation on dcache is to look up a | ||
5 | dentry, given a parent dentry and the name of the child. Typically, | ||
6 | for every open(), stat() etc., the dentry corresponding to the | ||
7 | pathname will be looked up by walking the tree starting with the first | ||
8 | component of the pathname and using that dentry along with the next | ||
9 | component to look up the next level and so on. Since it is a frequent | ||
10 | operation for workloads like multiuser environments and web servers, | ||
11 | it is important to optimize this path. | ||
12 | |||
13 | Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in | ||
14 | every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
15 | algorithm changed this by holding the dcache_lock at the beginning and | ||
16 | walking as many cached path component dentries as possible. This | ||
17 | significantly decreases the number of acquisitions of | ||
18 | dcache_lock. However it also increases the lock hold time | ||
19 | significantly and affects performance in large SMP machines. Since | ||
20 | 2.5.62 kernel, dcache has been using a new locking model that uses RCU | ||
21 | to make dcache look-up lock-free. | ||
22 | |||
23 | The current dcache locking model is not very different from the | ||
24 | existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock | ||
25 | protected the hash chain, d_child, d_alias, d_lru lists as well as | ||
26 | d_inode and several other things like mount look-up. RCU-based changes | ||
27 | affect only the way the hash chain is protected. For everything else | ||
28 | the dcache_lock must be taken for both traversing as well as | ||
29 | updating. The hash chain updates too take the dcache_lock. The | ||
30 | significant change is the way d_lookup traverses the hash chain, it | ||
31 | doesn't acquire the dcache_lock for this and relies on RCU to ensure | ||
32 | that the dentry has not been *freed*. | ||
33 | |||
34 | |||
35 | Dcache locking details | ||
36 | ====================== | ||
37 | |||
38 | For many multi-user workloads, open() and stat() on files are very | ||
39 | frequently occurring operations. Both involve walking of path names to | ||
40 | find the dentry corresponding to the concerned file. In 2.4 kernel, | ||
41 | dcache_lock was held during look-up of each path component. Contention | ||
42 | and cache-line bouncing of this global lock caused significant | ||
43 | scalability problems. With the introduction of RCU in Linux kernel, | ||
44 | this was worked around by making the look-up of path components during | ||
45 | path walking lock-free. | ||
46 | |||
47 | |||
48 | Safe lock-free look-up of dcache hash table | ||
49 | =========================================== | ||
50 | |||
51 | Dcache is a complex data structure with the hash table entries also | ||
52 | linked together in other lists. In 2.4 kernel, dcache_lock protected | ||
53 | all the lists. We applied RCU only on hash chain walking. The rest of | ||
54 | the lists are still protected by dcache_lock. Some of the important | ||
55 | changes are : | ||
56 | |||
57 | 1. The deletion from hash chain is done using hlist_del_rcu() macro | ||
58 | which doesn't initialize next pointer of the deleted dentry and | ||
59 | this allows us to walk safely lock-free while a deletion is | ||
60 | happening. | ||
61 | |||
62 | 2. Insertion of a dentry into the hash table is done using | ||
63 | hlist_add_head_rcu() which takes care of ordering the writes - the | ||
64 | writes to the dentry must be visible before the dentry is | ||
65 | inserted. This works in conjunction with hlist_for_each_rcu() while | ||
66 | walking the hash chain. The only requirement is that all | ||
67 | initialization to the dentry must be done before | ||
68 | hlist_add_head_rcu() since we don't have dcache_lock protection | ||
69 | while traversing the hash chain. This isn't different from the | ||
70 | existing code. | ||
71 | |||
72 | 3. The dentry looked up without holding dcache_lock cannot be | ||
73 | returned for walking if it is unhashed. It then may have a NULL | ||
74 | d_inode or other bogosity since RCU doesn't protect the other | ||
75 | fields in the dentry. We therefore use a flag DCACHE_UNHASHED to | ||
76 | indicate unhashed dentries and use this in conjunction with a | ||
77 | per-dentry lock (d_lock). Once looked up without the dcache_lock, | ||
78 | we acquire the per-dentry lock (d_lock) and check if the dentry is | ||
79 | unhashed. If so, the look-up is failed. If not, the reference count | ||
80 | of the dentry is increased and the dentry is returned. | ||
81 | |||
82 | 4. Once a dentry is looked up, it must be ensured during the path walk | ||
83 | for that component it doesn't go away. In pre-2.5.10 code, this was | ||
84 | done holding a reference to the dentry. dcache_rcu does the same. | ||
85 | In some sense, dcache_rcu path walking looks like the pre-2.5.10 | ||
86 | version. | ||
87 | |||
88 | 5. All dentry hash chain updates must take the dcache_lock as well as | ||
89 | the per-dentry lock in that order. dput() does this to ensure that | ||
90 | a dentry that has just been looked up in another CPU doesn't get | ||
91 | deleted before dget() can be done on it. | ||
92 | |||
93 | 6. There are several ways to do reference counting of RCU protected | ||
94 | objects. One such example is in ipv4 route cache where deferred | ||
95 | freeing (using call_rcu()) is done as soon as the reference count | ||
96 | goes to zero. This cannot be done in the case of dentries because | ||
97 | tearing down of dentries requires blocking (dentry_iput()) which | ||
98 | isn't supported from RCU callbacks. Instead, tearing down of | ||
99 | dentries happens synchronously in dput(), but actual freeing happens | ||
100 | later when RCU grace period is over. This allows safe lock-free | ||
101 | walking of the hash chains, but a matched dentry may have been | ||
102 | partially torn down. The checking of DCACHE_UNHASHED flag with | ||
103 | d_lock held detects such dentries and prevents them from being | ||
104 | returned from look-up. | ||
105 | |||
106 | |||
107 | Maintaining POSIX rename semantics | ||
108 | ================================== | ||
109 | |||
110 | Since look-up of dentries is lock-free, it can race against a | ||
111 | concurrent rename operation. For example, during rename of file A to | ||
112 | B, look-up of either A or B must succeed. So, if look-up of B happens | ||
113 | after A has been removed from the hash chain but not added to the new | ||
114 | hash chain, it may fail. Also, a comparison while the name is being | ||
115 | written concurrently by a rename may result in false positive matches | ||
116 | violating rename semantics. Issues related to race with rename are | ||
117 | handled as described below : | ||
118 | |||
119 | 1. Look-up can be done in two ways - d_lookup() which is safe from | ||
120 | simultaneous renames and __d_lookup() which is not. If | ||
121 | __d_lookup() fails, it must be followed up by a d_lookup() to | ||
122 | correctly determine whether a dentry is in the hash table or | ||
123 | not. d_lookup() protects look-ups using a sequence lock | ||
124 | (rename_lock). | ||
125 | |||
126 | 2. The name associated with a dentry (d_name) may be changed if a | ||
127 | rename is allowed to happen simultaneously. To avoid memcmp() in | ||
128 | __d_lookup() going out of bounds due to a rename and false positive | ||
129 | comparison, the name comparison is done while holding the | ||
130 | per-dentry lock. This prevents concurrent renames during this | ||
131 | operation. | ||
132 | |||
133 | 3. Hash table walking during look-up may move to a different bucket as | ||
134 | the current dentry is moved to a different bucket due to rename. | ||
135 | But we use hlists in dcache hash table and they are | ||
136 | null-terminated. So, even if a dentry moves to a different bucket, | ||
137 | hash chain walk will terminate. [with a list_head list, it may not | ||
138 | since termination is when the list_head in the original bucket is | ||
139 | reached]. Since we redo the d_parent check and compare name while | ||
140 | holding d_lock, lock-free look-up will not race against d_move(). | ||
141 | |||
142 | 4. There can be a theoretical race when a dentry keeps coming back to | ||
143 | original bucket due to double moves. Due to this look-up may | ||
144 | consider that it has never moved and can end up in an infinite loop. | ||
145 | But this is not any worse than theoretical livelocks we already | ||
146 | have in the kernel. | ||
147 | |||
148 | |||
149 | Important guidelines for filesystem developers related to dcache_rcu | ||
150 | ==================================================================== | ||
151 | |||
152 | 1. Existing dcache interfaces (pre-2.5.62) exported to filesystem | ||
153 | don't change. Only dcache internal implementation changes. However | ||
154 | filesystems *must not* delete from the dentry hash chains directly | ||
155 | using the list macros as was allowed earlier. They must use dcache | ||
156 | APIs like d_drop() or __d_drop() depending on the situation. | ||
157 | |||
158 | 2. d_flags is now protected by a per-dentry lock (d_lock). All access | ||
159 | to d_flags must be protected by it. | ||
160 | |||
161 | 3. For a hashed dentry, checking of d_count needs to be protected by | ||
162 | d_lock. | ||
163 | |||
164 | |||
165 | Papers and other documentation on dcache locking | ||
166 | ================================================ | ||
167 | |||
168 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
169 | |||
170 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
171 | |||
172 | |||
173 | |||
diff --git a/Documentation/filesystems/devfs/README b/Documentation/filesystems/devfs/README index 54366ecc241f..aabfba24bc2e 100644 --- a/Documentation/filesystems/devfs/README +++ b/Documentation/filesystems/devfs/README | |||
@@ -1812,11 +1812,6 @@ it may overflow the messages buffer, but try to get as much of it as | |||
1812 | you can | 1812 | you can |
1813 | 1813 | ||
1814 | 1814 | ||
1815 | if you get an Oops, run ksymoops to decode it so that the | ||
1816 | names of the offending functions are provided. A non-decoded Oops is | ||
1817 | pretty useless | ||
1818 | |||
1819 | |||
1820 | send a copy of your devfsd configuration file(s) | 1815 | send a copy of your devfsd configuration file(s) |
1821 | 1816 | ||
1822 | send the bug report to me first. | 1817 | send the bug report to me first. |
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt new file mode 100644 index 000000000000..b3404a032596 --- /dev/null +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt | |||
@@ -0,0 +1,195 @@ | |||
1 | ramfs, rootfs and initramfs | ||
2 | October 17, 2005 | ||
3 | Rob Landley <rob@landley.net> | ||
4 | ============================= | ||
5 | |||
6 | What is ramfs? | ||
7 | -------------- | ||
8 | |||
9 | Ramfs is a very simple filesystem that exports Linux's disk caching | ||
10 | mechanisms (the page cache and dentry cache) as a dynamically resizable | ||
11 | ram-based filesystem. | ||
12 | |||
13 | Normally all files are cached in memory by Linux. Pages of data read from | ||
14 | backing store (usually the block device the filesystem is mounted on) are kept | ||
15 | around in case they're needed again, but marked as clean (freeable) in case the | ||
16 | Virtual Memory system needs the memory for something else. Similarly, data | ||
17 | written to files is marked clean as soon as it has been written to backing | ||
18 | store, but kept around for caching purposes until the VM reallocates the | ||
19 | memory. A similar mechanism (the dentry cache) greatly speeds up access to | ||
20 | directories. | ||
21 | |||
22 | With ramfs, there is no backing store. Files written into ramfs allocate | ||
23 | dentries and page cache as usual, but there's nowhere to write them to. | ||
24 | This means the pages are never marked clean, so they can't be freed by the | ||
25 | VM when it's looking to recycle memory. | ||
26 | |||
27 | The amount of code required to implement ramfs is tiny, because all the | ||
28 | work is done by the existing Linux caching infrastructure. Basically, | ||
29 | you're mounting the disk cache as a filesystem. Because of this, ramfs is not | ||
30 | an optional component removable via menuconfig, since there would be negligible | ||
31 | space savings. | ||
32 | |||
33 | ramfs and ramdisk: | ||
34 | ------------------ | ||
35 | |||
36 | The older "ram disk" mechanism created a synthetic block device out of | ||
37 | an area of ram and used it as backing store for a filesystem. This block | ||
38 | device was of fixed size, so the filesystem mounted on it was of fixed | ||
39 | size. Using a ram disk also required unnecessarily copying memory from the | ||
40 | fake block device into the page cache (and copying changes back out), as well | ||
41 | as creating and destroying dentries. Plus it needed a filesystem driver | ||
42 | (such as ext2) to format and interpret this data. | ||
43 | |||
44 | Compared to ramfs, this wastes memory (and memory bus bandwidth), creates | ||
45 | unnecessary work for the CPU, and pollutes the CPU caches. (There are tricks | ||
46 | to avoid this copying by playing with the page tables, but they're unpleasantly | ||
47 | complicated and turn out to be about as expensive as the copying anyway.) | ||
48 | More to the point, all the work ramfs is doing has to happen _anyway_, | ||
49 | since all file access goes through the page and dentry caches. The ram | ||
50 | disk is simply unnecessary; ramfs is internally much simpler. | ||
51 | |||
52 | Another reason ramdisks are semi-obsolete is that the introduction of | ||
53 | loopback devices offered a more flexible and convenient way to create | ||
54 | synthetic block devices, now from files instead of from chunks of memory. | ||
55 | See losetup (8) for details. | ||
56 | |||
57 | ramfs and tmpfs: | ||
58 | ---------------- | ||
59 | |||
60 | One downside of ramfs is you can keep writing data into it until you fill | ||
61 | up all memory, and the VM can't free it because the VM thinks that files | ||
62 | should get written to backing store (rather than swap space), but ramfs hasn't | ||
63 | got any backing store. Because of this, only root (or a trusted user) should | ||
64 | be allowed write access to a ramfs mount. | ||
65 | |||
66 | A ramfs derivative called tmpfs was created to add size limits, and the ability | ||
67 | to write the data to swap space. Normal users can be allowed write access to | ||
68 | tmpfs mounts. See Documentation/filesystems/tmpfs.txt for more information. | ||
69 | |||
70 | What is rootfs? | ||
71 | --------------- | ||
72 | |||
73 | Rootfs is a special instance of ramfs, which is always present in 2.6 systems. | ||
74 | (It's used internally as the starting and stopping point for searches of the | ||
75 | kernel's doubly-linked list of mount points.) | ||
76 | |||
77 | Most systems just mount another filesystem over it and ignore it. The | ||
78 | amount of space an empty instance of ramfs takes up is tiny. | ||
79 | |||
80 | What is initramfs? | ||
81 | ------------------ | ||
82 | |||
83 | All 2.6 Linux kernels contain a gzipped "cpio" format archive, which is | ||
84 | extracted into rootfs when the kernel boots up. After extracting, the kernel | ||
85 | checks to see if rootfs contains a file "init", and if so it executes it as PID | ||
86 | 1. If found, this init process is responsible for bringing the system the | ||
87 | rest of the way up, including locating and mounting the real root device (if | ||
88 | any). If rootfs does not contain an init program after the embedded cpio | ||
89 | archive is extracted into it, the kernel will fall through to the older code | ||
90 | to locate and mount a root partition, then exec some variant of /sbin/init | ||
91 | out of that. | ||
92 | |||
93 | All this differs from the old initrd in several ways: | ||
94 | |||
95 | - The old initrd was a separate file, while the initramfs archive is linked | ||
96 | into the linux kernel image. (The directory linux-*/usr is devoted to | ||
97 | generating this archive during the build.) | ||
98 | |||
99 | - The old initrd file was a gzipped filesystem image (in some file format, | ||
100 | such as ext2, that had to be built into the kernel), while the new | ||
101 | initramfs archive is a gzipped cpio archive (like tar only simpler, | ||
102 | see cpio(1) and Documentation/early-userspace/buffer-format.txt). | ||
103 | |||
104 | - The program run by the old initrd (which was called /initrd, not /init) did | ||
105 | some setup and then returned to the kernel, while the init program from | ||
106 | initramfs is not expected to return to the kernel. (If /init needs to hand | ||
107 | off control it can overmount / with a new root device and exec another init | ||
108 | program. See the switch_root utility, below.) | ||
109 | |||
110 | - When switching to another root device, initrd would pivot_root and then | ||
111 | umount the ramdisk. But initramfs is rootfs: you can neither pivot_root | ||
112 | rootfs, nor unmount it. Instead delete everything out of rootfs to | ||
113 | free up the space (find / -xdev -exec rm '{}' ';'), overmount rootfs | ||
114 | with the new root (cd /newmount; mount --move . /; chroot .), attach | ||
115 | stdin/stdout/stderr to the new /dev/console, and exec the new init. | ||
116 | |||
117 | Since this is a remarkably persnickety process (and involves deleting | ||
118 | commands before you can run them), the klibc package introduced a helper | ||
119 | program (utils/run_init.c) to do all this for you. Most other packages | ||
120 | (such as busybox) have named this command "switch_root". | ||
121 | |||
122 | Populating initramfs: | ||
123 | --------------------- | ||
124 | |||
125 | The 2.6 kernel build process always creates a gzipped cpio format initramfs | ||
126 | archive and links it into the resulting kernel binary. By default, this | ||
127 | archive is empty (consuming 134 bytes on x86). The config option | ||
128 | CONFIG_INITRAMFS_SOURCE (for some reason buried under devices->block devices | ||
129 | in menuconfig, and living in usr/Kconfig) can be used to specify a source for | ||
130 | the initramfs archive, which will automatically be incorporated into the | ||
131 | resulting binary. This option can point to an existing gzipped cpio archive, a | ||
132 | directory containing files to be archived, or a text file specification such | ||
133 | as the following example: | ||
134 | |||
135 | dir /dev 755 0 0 | ||
136 | nod /dev/console 644 0 0 c 5 1 | ||
137 | nod /dev/loop0 644 0 0 b 7 0 | ||
138 | dir /bin 755 1000 1000 | ||
139 | slink /bin/sh busybox 777 0 0 | ||
140 | file /bin/busybox initramfs/busybox 755 0 0 | ||
141 | dir /proc 755 0 0 | ||
142 | dir /sys 755 0 0 | ||
143 | dir /mnt 755 0 0 | ||
144 | file /init initramfs/init.sh 755 0 0 | ||
145 | |||
146 | One advantage of the text file is that root access is not required to | ||
147 | set permissions or create device nodes in the new archive. (Note that those | ||
148 | two example "file" entries expect to find files named "init.sh" and "busybox" in | ||
149 | a directory called "initramfs", under the linux-2.6.* directory. See | ||
150 | Documentation/early-userspace/README for more details.) | ||
151 | |||
152 | If you don't already understand what shared libraries, devices, and paths | ||
153 | you need to get a minimal root filesystem up and running, here are some | ||
154 | references: | ||
155 | http://www.tldp.org/HOWTO/Bootdisk-HOWTO/ | ||
156 | http://www.tldp.org/HOWTO/From-PowerUp-To-Bash-Prompt-HOWTO.html | ||
157 | http://www.linuxfromscratch.org/lfs/view/stable/ | ||
158 | |||
159 | The "klibc" package (http://www.kernel.org/pub/linux/libs/klibc) is | ||
160 | designed to be a tiny C library to statically link early userspace | ||
161 | code against, along with some related utilities. It is BSD licensed. | ||
162 | |||
163 | I use uClibc (http://www.uclibc.org) and busybox (http://www.busybox.net) | ||
164 | myself. These are LGPL and GPL, respectively. | ||
165 | |||
166 | In theory you could use glibc, but that's not well suited for small embedded | ||
167 | uses like this. (A "hello world" program statically linked against glibc is | ||
168 | over 400k. With uClibc it's 7k. Also note that glibc dlopens libnss to do | ||
169 | name lookups, even when otherwise statically linked.) | ||
170 | |||
171 | Future directions: | ||
172 | ------------------ | ||
173 | |||
174 | Today (2.6.14), initramfs is always compiled in, but not always used. The | ||
175 | kernel falls back to legacy boot code that is reached only if initramfs does | ||
176 | not contain an /init program. The fallback is legacy code, there to ensure a | ||
177 | smooth transition and to allow early boot functionality to gradually move to | ||
178 | "early userspace" (i.e. initramfs). | ||
179 | |||
180 | The move to early userspace is necessary because finding and mounting the real | ||
181 | root device is complex. Root partitions can span multiple devices (raid or | ||
182 | separate journal). They can be out on the network (requiring dhcp, setting a | ||
183 | specific mac address, logging into a server, etc). They can live on removable | ||
184 | media, with dynamically allocated major/minor numbers and persistent naming | ||
185 | issues requiring a full udev implementation to sort out. They can be | ||
186 | compressed, encrypted, copy-on-write, loopback mounted, strangely partitioned, | ||
187 | and so on. | ||
188 | |||
189 | This kind of complexity (which inevitably includes policy) is rightly handled | ||
190 | in userspace. Both klibc and busybox/uClibc are working on simple initramfs | ||
191 | packages to drop into a kernel build, and when standard solutions are ready | ||
192 | and widely deployed, the kernel's legacy early boot code will become obsolete | ||
193 | and a candidate for the feature removal schedule. | ||
194 | |||
195 | But that's a while off yet. | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index f042c12e0ed2..ee4c0a8b8db7 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -3,7 +3,7 @@ | |||
3 | 3 | ||
4 | Original author: Richard Gooch <rgooch@atnf.csiro.au> | 4 | Original author: Richard Gooch <rgooch@atnf.csiro.au> |
5 | 5 | ||
6 | Last updated on August 25, 2005 | 6 | Last updated on October 28, 2005 |
7 | 7 | ||
8 | Copyright (C) 1999 Richard Gooch | 8 | Copyright (C) 1999 Richard Gooch |
9 | Copyright (C) 2005 Pekka Enberg | 9 | Copyright (C) 2005 Pekka Enberg |
@@ -11,62 +11,61 @@ | |||
11 | This file is released under the GPLv2. | 11 | This file is released under the GPLv2. |
12 | 12 | ||
13 | 13 | ||
14 | What is it? | 14 | Introduction |
15 | =========== | 15 | ============ |
16 | 16 | ||
17 | The Virtual File System (otherwise known as the Virtual Filesystem | 17 | The Virtual File System (also known as the Virtual Filesystem Switch) |
18 | Switch) is the software layer in the kernel that provides the | 18 | is the software layer in the kernel that provides the filesystem |
19 | filesystem interface to userspace programs. It also provides an | 19 | interface to userspace programs. It also provides an abstraction |
20 | abstraction within the kernel which allows different filesystem | 20 | within the kernel which allows different filesystem implementations to |
21 | implementations to coexist. | 21 | coexist. |
22 | 22 | ||
23 | VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so | ||
24 | on are called from a process context. Filesystem locking is described | ||
25 | in the document Documentation/filesystems/Locking. | ||
23 | 26 | ||
24 | A Quick Look At How It Works | ||
25 | ============================ | ||
26 | 27 | ||
27 | In this section I'll briefly describe how things work, before | 28 | Directory Entry Cache (dcache) |
28 | launching into the details. I'll start with describing what happens | 29 | ------------------------------ |
29 | when user programs open and manipulate files, and then look from the | ||
30 | other view which is how a filesystem is supported and subsequently | ||
31 | mounted. | ||
32 | |||
33 | |||
34 | Opening a File | ||
35 | -------------- | ||
36 | |||
37 | The VFS implements the open(2), stat(2), chmod(2) and similar system | ||
38 | calls. The pathname argument is used by the VFS to search through the | ||
39 | directory entry cache (dentry cache or "dcache"). This provides a very | ||
40 | fast look-up mechanism to translate a pathname (filename) into a | ||
41 | specific dentry. | ||
42 | |||
43 | An individual dentry usually has a pointer to an inode. Inodes are the | ||
44 | things that live on disc drives, and can be regular files (you know: | ||
45 | those things that you write data into), directories, FIFOs and other | ||
46 | beasts. Dentries live in RAM and are never saved to disc: they exist | ||
47 | only for performance. Inodes live on disc and are copied into memory | ||
48 | when required. Later any changes are written back to disc. The inode | ||
49 | that lives in RAM is a VFS inode, and it is this which the dentry | ||
50 | points to. A single inode can be pointed to by multiple dentries | ||
51 | (think about hardlinks). | ||
52 | |||
53 | The dcache is meant to be a view into your entire filespace. Unlike | ||
54 | Linus, most of us losers can't fit enough dentries into RAM to cover | ||
55 | all of our filespace, so the dcache has bits missing. In order to | ||
56 | resolve your pathname into a dentry, the VFS may have to resort to | ||
57 | creating dentries along the way, and then loading the inode. This is | ||
58 | done by looking up the inode. | ||
59 | |||
60 | To look up an inode (usually read from disc) requires that the VFS | ||
61 | calls the lookup() method of the parent directory inode. This method | ||
62 | is installed by the specific filesystem implementation that the inode | ||
63 | lives in. There will be more on this later. | ||
64 | 30 | ||
65 | Once the VFS has the required dentry (and hence the inode), we can do | 31 | The VFS implements the open(2), stat(2), chmod(2), and similar system |
66 | all those boring things like open(2) the file, or stat(2) it to peek | 32 | calls. The pathname argument that is passed to them is used by the VFS |
67 | at the inode data. The stat(2) operation is fairly simple: once the | 33 | to search through the directory entry cache (also known as the dentry |
68 | VFS has the dentry, it peeks at the inode data and passes some of it | 34 | cache or dcache). This provides a very fast look-up mechanism to |
69 | back to userspace. | 35 | translate a pathname (filename) into a specific dentry. Dentries live |
36 | in RAM and are never saved to disc: they exist only for performance. | ||
37 | |||
38 | The dentry cache is meant to be a view into your entire filespace. As | ||
39 | most computers cannot fit all dentries in the RAM at the same time, | ||
40 | some bits of the cache are missing. In order to resolve your pathname | ||
41 | into a dentry, the VFS may have to resort to creating dentries along | ||
42 | the way, and then loading the inode. This is done by looking up the | ||
43 | inode. | ||
44 | |||
45 | |||
46 | The Inode Object | ||
47 | ---------------- | ||
48 | |||
49 | An individual dentry usually has a pointer to an inode. Inodes are | ||
50 | filesystem objects such as regular files, directories, FIFOs and other | ||
51 | beasts. They live either on the disc (for block device filesystems) | ||
52 | or in the memory (for pseudo filesystems). Inodes that live on the | ||
53 | disc are copied into the memory when required and changes to the inode | ||
54 | are written back to disc. A single inode can be pointed to by multiple | ||
55 | dentries (hard links, for example, do this). | ||
56 | |||
57 | To look up an inode requires that the VFS calls the lookup() method of | ||
58 | the parent directory inode. This method is installed by the specific | ||
59 | filesystem implementation that the inode lives in. Once the VFS has | ||
60 | the required dentry (and hence the inode), we can do all those boring | ||
61 | things like open(2) the file, or stat(2) it to peek at the inode | ||
62 | data. The stat(2) operation is fairly simple: once the VFS has the | ||
63 | dentry, it peeks at the inode data and passes some of it back to | ||
64 | userspace. | ||
65 | |||
66 | |||
67 | The File Object | ||
68 | --------------- | ||
70 | 69 | ||
71 | Opening a file requires another operation: allocation of a file | 70 | Opening a file requires another operation: allocation of a file |
72 | structure (this is the kernel-side implementation of file | 71 | structure (this is the kernel-side implementation of file |
@@ -74,51 +73,39 @@ descriptors). The freshly allocated file structure is initialized with | |||
74 | a pointer to the dentry and a set of file operation member functions. | 73 | a pointer to the dentry and a set of file operation member functions. |
75 | These are taken from the inode data. The open() file method is then | 74 | These are taken from the inode data. The open() file method is then |
76 | called so the specific filesystem implementation can do its work. You | 75 | called so the specific filesystem implementation can do its work. You |
77 | can see that this is another switch performed by the VFS. | 76 | can see that this is another switch performed by the VFS. The file |
78 | 77 | structure is placed into the file descriptor table for the process. | |
79 | The file structure is placed into the file descriptor table for the | ||
80 | process. | ||
81 | 78 | ||
82 | Reading, writing and closing files (and other assorted VFS operations) | 79 | Reading, writing and closing files (and other assorted VFS operations) |
83 | is done by using the userspace file descriptor to grab the appropriate | 80 | is done by using the userspace file descriptor to grab the appropriate |
84 | file structure, and then calling the required file structure method | 81 | file structure, and then calling the required file structure method to |
85 | function to do whatever is required. | 82 | do whatever is required. For as long as the file is open, it keeps the |
86 | 83 | dentry in use, which in turn means that the VFS inode is still in use. | |
87 | For as long as the file is open, it keeps the dentry "open" (in use), | ||
88 | which in turn means that the VFS inode is still in use. | ||
89 | |||
90 | All VFS system calls (i.e. open(2), stat(2), read(2), write(2), | ||
91 | chmod(2) and so on) are called from a process context. You should | ||
92 | assume that these calls are made without any kernel locks being | ||
93 | held. This means that the processes may be executing the same piece of | ||
94 | filesystem or driver code at the same time, on different | ||
95 | processors. You should ensure that access to shared resources is | ||
96 | protected by appropriate locks. | ||
97 | 84 | ||
98 | 85 | ||
99 | Registering and Mounting a Filesystem | 86 | Registering and Mounting a Filesystem |
100 | ------------------------------------- | 87 | ===================================== |
101 | 88 | ||
102 | If you want to support a new kind of filesystem in the kernel, all you | 89 | To register and unregister a filesystem, use the following API |
103 | need to do is call register_filesystem(). You pass a structure | 90 | functions: |
104 | describing the filesystem implementation (struct file_system_type) | ||
105 | which is then added to an internal table of supported filesystems. You | ||
106 | can do: | ||
107 | 91 | ||
108 | % cat /proc/filesystems | 92 | #include <linux/fs.h> |
109 | 93 | ||
110 | to see what filesystems are currently available on your system. | 94 | extern int register_filesystem(struct file_system_type *); |
95 | extern int unregister_filesystem(struct file_system_type *); | ||
111 | 96 | ||
112 | When a request is made to mount a block device onto a directory in | 97 | The passed struct file_system_type describes your filesystem. When a |
113 | your filespace the VFS will call the appropriate method for the | 98 | request is made to mount a device onto a directory in your filespace, |
114 | specific filesystem. The dentry for the mount point will then be | 99 | the VFS will call the appropriate get_sb() method for the specific |
115 | updated to point to the root inode for the new filesystem. | 100 | filesystem. The dentry for the mount point will then be updated to |
101 | point to the root inode for the new filesystem. | ||
116 | 102 | ||
117 | It's now time to look at things in more detail. | 103 | You can see all filesystems that are registered to the kernel in the |
104 | file /proc/filesystems. | ||
118 | 105 | ||
119 | 106 | ||
120 | struct file_system_type | 107 | struct file_system_type |
121 | ======================= | 108 | ----------------------- |
122 | 109 | ||
123 | This describes the filesystem. As of kernel 2.6.13, the following | 110 | This describes the filesystem. As of kernel 2.6.13, the following |
124 | members are defined: | 111 | members are defined: |
@@ -197,8 +184,14 @@ A fill_super() method implementation has the following arguments: | |||
197 | int silent: whether or not to be silent on error | 184 | int silent: whether or not to be silent on error |
198 | 185 | ||
199 | 186 | ||
187 | The Superblock Object | ||
188 | ===================== | ||
189 | |||
190 | A superblock object represents a mounted filesystem. | ||
191 | |||
192 | |||
200 | struct super_operations | 193 | struct super_operations |
201 | ======================= | 194 | ----------------------- |
202 | 195 | ||
203 | This describes how the VFS can manipulate the superblock of your | 196 | This describes how the VFS can manipulate the superblock of your |
204 | filesystem. As of kernel 2.6.13, the following members are defined: | 197 | filesystem. As of kernel 2.6.13, the following members are defined: |
@@ -286,9 +279,9 @@ or bottom half). | |||
286 | a superblock. The second parameter indicates whether the method | 279 | a superblock. The second parameter indicates whether the method |
287 | should wait until the write out has been completed. Optional. | 280 | should wait until the write out has been completed. Optional. |
288 | 281 | ||
289 | write_super_lockfs: called when VFS is locking a filesystem and forcing | 282 | write_super_lockfs: called when VFS is locking a filesystem and |
290 | it into a consistent state. This function is currently used by the | 283 | forcing it into a consistent state. This method is currently |
291 | Logical Volume Manager (LVM). | 284 | used by the Logical Volume Manager (LVM). |
292 | 285 | ||
293 | unlockfs: called when VFS is unlocking a filesystem and making it writable | 286 | unlockfs: called when VFS is unlocking a filesystem and making it writable |
294 | again. | 287 | again. |
@@ -317,8 +310,14 @@ field. This is a pointer to a "struct inode_operations" which | |||
317 | describes the methods that can be performed on individual inodes. | 310 | describes the methods that can be performed on individual inodes. |
318 | 311 | ||
319 | 312 | ||
313 | The Inode Object | ||
314 | ================ | ||
315 | |||
316 | An inode object represents an object within the filesystem. | ||
317 | |||
318 | |||
320 | struct inode_operations | 319 | struct inode_operations |
321 | ======================= | 320 | ----------------------- |
322 | 321 | ||
323 | This describes how the VFS can manipulate an inode in your | 322 | This describes how the VFS can manipulate an inode in your |
324 | filesystem. As of kernel 2.6.13, the following members are defined: | 323 | filesystem. As of kernel 2.6.13, the following members are defined: |
@@ -394,51 +393,62 @@ otherwise noted. | |||
394 | will probably need to call d_instantiate() just as you would | 393 | will probably need to call d_instantiate() just as you would |
395 | in the create() method | 394 | in the create() method |
396 | 395 | ||
396 | rename: called by the rename(2) system call to rename the object to | ||
397 | have the parent and name given by the second inode and dentry. | ||
398 | |||
397 | readlink: called by the readlink(2) system call. Only required if | 399 | readlink: called by the readlink(2) system call. Only required if |
398 | you want to support reading symbolic links | 400 | you want to support reading symbolic links |
399 | 401 | ||
400 | follow_link: called by the VFS to follow a symbolic link to the | 402 | follow_link: called by the VFS to follow a symbolic link to the |
401 | inode it points to. Only required if you want to support | 403 | inode it points to. Only required if you want to support |
402 | symbolic links. This function returns a void pointer cookie | 404 | symbolic links. This method returns a void pointer cookie |
403 | that is passed to put_link(). | 405 | that is passed to put_link(). |
404 | 406 | ||
405 | put_link: called by the VFS to release resources allocated by | 407 | put_link: called by the VFS to release resources allocated by |
406 | follow_link(). The cookie returned by follow_link() is passed to | 408 | follow_link(). The cookie returned by follow_link() is passed |
407 | to this function as the last parameter. It is used by filesystems | 409 | to to this method as the last parameter. It is used by |
408 | such as NFS where page cache is not stable (i.e. page that was | 410 | filesystems such as NFS where page cache is not stable |
409 | installed when the symbolic link walk started might not be in the | 411 | (i.e. page that was installed when the symbolic link walk |
410 | page cache at the end of the walk). | 412 | started might not be in the page cache at the end of the |
411 | 413 | walk). | |
412 | truncate: called by the VFS to change the size of a file. The i_size | 414 | |
413 | field of the inode is set to the desired size by the VFS before | 415 | truncate: called by the VFS to change the size of a file. The |
414 | this function is called. This function is called by the truncate(2) | 416 | i_size field of the inode is set to the desired size by the |
415 | system call and related functionality. | 417 | VFS before this method is called. This method is called by |
418 | the truncate(2) system call and related functionality. | ||
416 | 419 | ||
417 | permission: called by the VFS to check for access rights on a POSIX-like | 420 | permission: called by the VFS to check for access rights on a POSIX-like |
418 | filesystem. | 421 | filesystem. |
419 | 422 | ||
420 | setattr: called by the VFS to set attributes for a file. This function is | 423 | setattr: called by the VFS to set attributes for a file. This method |
421 | called by chmod(2) and related system calls. | 424 | is called by chmod(2) and related system calls. |
422 | 425 | ||
423 | getattr: called by the VFS to get attributes of a file. This function is | 426 | getattr: called by the VFS to get attributes of a file. This method |
424 | called by stat(2) and related system calls. | 427 | is called by stat(2) and related system calls. |
425 | 428 | ||
426 | setxattr: called by the VFS to set an extended attribute for a file. | 429 | setxattr: called by the VFS to set an extended attribute for a file. |
427 | Extended attribute is a name:value pair associated with an inode. This | 430 | Extended attribute is a name:value pair associated with an |
428 | function is called by setxattr(2) system call. | 431 | inode. This method is called by setxattr(2) system call. |
432 | |||
433 | getxattr: called by the VFS to retrieve the value of an extended | ||
434 | attribute name. This method is called by getxattr(2) system | ||
435 | call. | ||
429 | 436 | ||
430 | getxattr: called by the VFS to retrieve the value of an extended attribute | 437 | listxattr: called by the VFS to list all extended attributes for a |
431 | name. This function is called by getxattr(2) function call. | 438 | given file. This method is called by listxattr(2) system call. |
432 | 439 | ||
433 | listxattr: called by the VFS to list all extended attributes for a given | 440 | removexattr: called by the VFS to remove an extended attribute from |
434 | file. This function is called by listxattr(2) system call. | 441 | a file. This method is called by removexattr(2) system call. |
435 | 442 | ||
436 | removexattr: called by the VFS to remove an extended attribute from a file. | 443 | |
437 | This function is called by removexattr(2) system call. | 444 | The Address Space Object |
445 | ======================== | ||
446 | |||
447 | The address space object is used to identify pages in the page cache. | ||
438 | 448 | ||
439 | 449 | ||
440 | struct address_space_operations | 450 | struct address_space_operations |
441 | =============================== | 451 | ------------------------------- |
442 | 452 | ||
443 | This describes how the VFS can manipulate mapping of a file to page cache in | 453 | This describes how the VFS can manipulate mapping of a file to page cache in |
444 | your filesystem. As of kernel 2.6.13, the following members are defined: | 454 | your filesystem. As of kernel 2.6.13, the following members are defined: |
@@ -502,8 +512,14 @@ struct address_space_operations { | |||
502 | it. An example implementation can be found in fs/ext2/xip.c. | 512 | it. An example implementation can be found in fs/ext2/xip.c. |
503 | 513 | ||
504 | 514 | ||
515 | The File Object | ||
516 | =============== | ||
517 | |||
518 | A file object represents a file opened by a process. | ||
519 | |||
520 | |||
505 | struct file_operations | 521 | struct file_operations |
506 | ====================== | 522 | ---------------------- |
507 | 523 | ||
508 | This describes how the VFS can manipulate an open file. As of kernel | 524 | This describes how the VFS can manipulate an open file. As of kernel |
509 | 2.6.13, the following members are defined: | 525 | 2.6.13, the following members are defined: |
@@ -661,7 +677,7 @@ of child dentries. Child dentries are basically like files in a | |||
661 | directory. | 677 | directory. |
662 | 678 | ||
663 | 679 | ||
664 | Directory Entry Cache APIs | 680 | Directory Entry Cache API |
665 | -------------------------- | 681 | -------------------------- |
666 | 682 | ||
667 | There are a number of functions defined which permit a filesystem to | 683 | There are a number of functions defined which permit a filesystem to |
@@ -705,178 +721,24 @@ manipulate dentries: | |||
705 | and the dentry is returned. The caller must use dput() | 721 | and the dentry is returned. The caller must use dput() |
706 | to free the dentry when it finishes using it. | 722 | to free the dentry when it finishes using it. |
707 | 723 | ||
724 | For further information on dentry locking, please refer to the document | ||
725 | Documentation/filesystems/dentry-locking.txt. | ||
708 | 726 | ||
709 | RCU-based dcache locking model | ||
710 | ------------------------------ | ||
711 | 727 | ||
712 | On many workloads, the most common operation on dcache is | 728 | Resources |
713 | to look up a dentry, given a parent dentry and the name | 729 | ========= |
714 | of the child. Typically, for every open(), stat() etc., | 730 | |
715 | the dentry corresponding to the pathname will be looked | 731 | (Note some of these resources are not up-to-date with the latest kernel |
716 | up by walking the tree starting with the first component | 732 | version.) |
717 | of the pathname and using that dentry along with the next | 733 | |
718 | component to look up the next level and so on. Since it | 734 | Creating Linux virtual filesystems. 2002 |
719 | is a frequent operation for workloads like multiuser | 735 | <http://lwn.net/Articles/13325/> |
720 | environments and web servers, it is important to optimize | 736 | |
721 | this path. | 737 | The Linux Virtual File-system Layer by Neil Brown. 1999 |
722 | 738 | <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html> | |
723 | Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus | 739 | |
724 | in every component during path look-up. Since 2.5.10 onwards, | 740 | A tour of the Linux VFS by Michael K. Johnson. 1996 |
725 | fast-walk algorithm changed this by holding the dcache_lock | 741 | <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html> |
726 | at the beginning and walking as many cached path component | ||
727 | dentries as possible. This significantly decreases the number | ||
728 | of acquisition of dcache_lock. However it also increases the | ||
729 | lock hold time significantly and affects performance in large | ||
730 | SMP machines. Since 2.5.62 kernel, dcache has been using | ||
731 | a new locking model that uses RCU to make dcache look-up | ||
732 | lock-free. | ||
733 | |||
734 | The current dcache locking model is not very different from the existing | ||
735 | dcache locking model. Prior to 2.5.62 kernel, dcache_lock | ||
736 | protected the hash chain, d_child, d_alias, d_lru lists as well | ||
737 | as d_inode and several other things like mount look-up. RCU-based | ||
738 | changes affect only the way the hash chain is protected. For everything | ||
739 | else the dcache_lock must be taken for both traversing as well as | ||
740 | updating. The hash chain updates too take the dcache_lock. | ||
741 | The significant change is the way d_lookup traverses the hash chain, | ||
742 | it doesn't acquire the dcache_lock for this and rely on RCU to | ||
743 | ensure that the dentry has not been *freed*. | ||
744 | |||
745 | |||
746 | Dcache locking details | ||
747 | ---------------------- | ||
748 | 742 | ||
749 | For many multi-user workloads, open() and stat() on files are | 743 | A small trail through the Linux kernel by Andries Brouwer. 2001 |
750 | very frequently occurring operations. Both involve walking | 744 | <http://www.win.tue.nl/~aeb/linux/vfs/trail.html> |
751 | of path names to find the dentry corresponding to the | ||
752 | concerned file. In 2.4 kernel, dcache_lock was held | ||
753 | during look-up of each path component. Contention and | ||
754 | cache-line bouncing of this global lock caused significant | ||
755 | scalability problems. With the introduction of RCU | ||
756 | in Linux kernel, this was worked around by making | ||
757 | the look-up of path components during path walking lock-free. | ||
758 | |||
759 | |||
760 | Safe lock-free look-up of dcache hash table | ||
761 | =========================================== | ||
762 | |||
763 | Dcache is a complex data structure with the hash table entries | ||
764 | also linked together in other lists. In 2.4 kernel, dcache_lock | ||
765 | protected all the lists. We applied RCU only on hash chain | ||
766 | walking. The rest of the lists are still protected by dcache_lock. | ||
767 | Some of the important changes are : | ||
768 | |||
769 | 1. The deletion from hash chain is done using hlist_del_rcu() macro which | ||
770 | doesn't initialize next pointer of the deleted dentry and this | ||
771 | allows us to walk safely lock-free while a deletion is happening. | ||
772 | |||
773 | 2. Insertion of a dentry into the hash table is done using | ||
774 | hlist_add_head_rcu() which take care of ordering the writes - | ||
775 | the writes to the dentry must be visible before the dentry | ||
776 | is inserted. This works in conjunction with hlist_for_each_rcu() | ||
777 | while walking the hash chain. The only requirement is that | ||
778 | all initialization to the dentry must be done before hlist_add_head_rcu() | ||
779 | since we don't have dcache_lock protection while traversing | ||
780 | the hash chain. This isn't different from the existing code. | ||
781 | |||
782 | 3. The dentry looked up without holding dcache_lock by cannot be | ||
783 | returned for walking if it is unhashed. It then may have a NULL | ||
784 | d_inode or other bogosity since RCU doesn't protect the other | ||
785 | fields in the dentry. We therefore use a flag DCACHE_UNHASHED to | ||
786 | indicate unhashed dentries and use this in conjunction with a | ||
787 | per-dentry lock (d_lock). Once looked up without the dcache_lock, | ||
788 | we acquire the per-dentry lock (d_lock) and check if the | ||
789 | dentry is unhashed. If so, the look-up is failed. If not, the | ||
790 | reference count of the dentry is increased and the dentry is returned. | ||
791 | |||
792 | 4. Once a dentry is looked up, it must be ensured during the path | ||
793 | walk for that component it doesn't go away. In pre-2.5.10 code, | ||
794 | this was done holding a reference to the dentry. dcache_rcu does | ||
795 | the same. In some sense, dcache_rcu path walking looks like | ||
796 | the pre-2.5.10 version. | ||
797 | |||
798 | 5. All dentry hash chain updates must take the dcache_lock as well as | ||
799 | the per-dentry lock in that order. dput() does this to ensure | ||
800 | that a dentry that has just been looked up in another CPU | ||
801 | doesn't get deleted before dget() can be done on it. | ||
802 | |||
803 | 6. There are several ways to do reference counting of RCU protected | ||
804 | objects. One such example is in ipv4 route cache where | ||
805 | deferred freeing (using call_rcu()) is done as soon as | ||
806 | the reference count goes to zero. This cannot be done in | ||
807 | the case of dentries because tearing down of dentries | ||
808 | require blocking (dentry_iput()) which isn't supported from | ||
809 | RCU callbacks. Instead, tearing down of dentries happen | ||
810 | synchronously in dput(), but actual freeing happens later | ||
811 | when RCU grace period is over. This allows safe lock-free | ||
812 | walking of the hash chains, but a matched dentry may have | ||
813 | been partially torn down. The checking of DCACHE_UNHASHED | ||
814 | flag with d_lock held detects such dentries and prevents | ||
815 | them from being returned from look-up. | ||
816 | |||
817 | |||
818 | Maintaining POSIX rename semantics | ||
819 | ================================== | ||
820 | |||
821 | Since look-up of dentries is lock-free, it can race against | ||
822 | a concurrent rename operation. For example, during rename | ||
823 | of file A to B, look-up of either A or B must succeed. | ||
824 | So, if look-up of B happens after A has been removed from the | ||
825 | hash chain but not added to the new hash chain, it may fail. | ||
826 | Also, a comparison while the name is being written concurrently | ||
827 | by a rename may result in false positive matches violating | ||
828 | rename semantics. Issues related to race with rename are | ||
829 | handled as described below : | ||
830 | |||
831 | 1. Look-up can be done in two ways - d_lookup() which is safe | ||
832 | from simultaneous renames and __d_lookup() which is not. | ||
833 | If __d_lookup() fails, it must be followed up by a d_lookup() | ||
834 | to correctly determine whether a dentry is in the hash table | ||
835 | or not. d_lookup() protects look-ups using a sequence | ||
836 | lock (rename_lock). | ||
837 | |||
838 | 2. The name associated with a dentry (d_name) may be changed if | ||
839 | a rename is allowed to happen simultaneously. To avoid memcmp() | ||
840 | in __d_lookup() go out of bounds due to a rename and false | ||
841 | positive comparison, the name comparison is done while holding the | ||
842 | per-dentry lock. This prevents concurrent renames during this | ||
843 | operation. | ||
844 | |||
845 | 3. Hash table walking during look-up may move to a different bucket as | ||
846 | the current dentry is moved to a different bucket due to rename. | ||
847 | But we use hlists in dcache hash table and they are null-terminated. | ||
848 | So, even if a dentry moves to a different bucket, hash chain | ||
849 | walk will terminate. [with a list_head list, it may not since | ||
850 | termination is when the list_head in the original bucket is reached]. | ||
851 | Since we redo the d_parent check and compare name while holding | ||
852 | d_lock, lock-free look-up will not race against d_move(). | ||
853 | |||
854 | 4. There can be a theoretical race when a dentry keeps coming back | ||
855 | to original bucket due to double moves. Due to this look-up may | ||
856 | consider that it has never moved and can end up in a infinite loop. | ||
857 | But this is not any worse that theoretical livelocks we already | ||
858 | have in the kernel. | ||
859 | |||
860 | |||
861 | Important guidelines for filesystem developers related to dcache_rcu | ||
862 | ==================================================================== | ||
863 | |||
864 | 1. Existing dcache interfaces (pre-2.5.62) exported to filesystem | ||
865 | don't change. Only dcache internal implementation changes. However | ||
866 | filesystems *must not* delete from the dentry hash chains directly | ||
867 | using the list macros like allowed earlier. They must use dcache | ||
868 | APIs like d_drop() or __d_drop() depending on the situation. | ||
869 | |||
870 | 2. d_flags is now protected by a per-dentry lock (d_lock). All | ||
871 | access to d_flags must be protected by it. | ||
872 | |||
873 | 3. For a hashed dentry, checking of d_count needs to be protected | ||
874 | by d_lock. | ||
875 | |||
876 | |||
877 | Papers and other documentation on dcache locking | ||
878 | ================================================ | ||
879 | |||
880 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
881 | |||
882 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
diff --git a/Documentation/hpet.txt b/Documentation/hpet.txt index 4e7cc8d3359b..e52457581f47 100644 --- a/Documentation/hpet.txt +++ b/Documentation/hpet.txt | |||
@@ -1,18 +1,21 @@ | |||
1 | High Precision Event Timer Driver for Linux | 1 | High Precision Event Timer Driver for Linux |
2 | 2 | ||
3 | The High Precision Event Timer (HPET) hardware is the future replacement for the 8254 and Real | 3 | The High Precision Event Timer (HPET) hardware is the future replacement |
4 | Time Clock (RTC) periodic timer functionality. Each HPET can have up two 32 timers. It is possible | 4 | for the 8254 and Real Time Clock (RTC) periodic timer functionality. |
5 | to configure the first two timers as legacy replacements for 8254 and RTC periodic. A specification | 5 | Each HPET can have up to 32 timers. It is possible to configure the |
6 | done by INTEL and Microsoft can be found at http://www.intel.com/labs/platcomp/hpet/hpetspec.htm. | 6 | first two timers as legacy replacements for 8254 and RTC periodic timers. |
7 | 7 | A specification done by Intel and Microsoft can be found at | |
8 | The driver supports detection of HPET driver allocation and initialization of the HPET before the | 8 | <http://www.intel.com/hardwaredesign/hpetspec.htm>. |
9 | driver module_init routine is called. This enables platform code which uses timer 0 or 1 as the | 9 | |
10 | main timer to intercept HPET initialization. An example of this initialization can be found in | 10 | The driver supports detection of HPET driver allocation and initialization |
11 | of the HPET before the driver module_init routine is called. This enables | ||
12 | platform code which uses timer 0 or 1 as the main timer to intercept HPET | ||
13 | initialization. An example of this initialization can be found in | ||
11 | arch/i386/kernel/time_hpet.c. | 14 | arch/i386/kernel/time_hpet.c. |
12 | 15 | ||
13 | The driver provides two APIs which are very similar to the API found in the rtc.c driver. | 16 | The driver provides two APIs which are very similar to the API found in |
14 | There is a user space API and a kernel space API. An example user space program is provided | 17 | the rtc.c driver. There is a user space API and a kernel space API. |
15 | below. | 18 | An example user space program is provided below. |
16 | 19 | ||
17 | #include <stdio.h> | 20 | #include <stdio.h> |
18 | #include <stdlib.h> | 21 | #include <stdlib.h> |
@@ -290,9 +293,8 @@ The kernel API has three interfaces exported from the driver: | |||
290 | hpet_unregister(struct hpet_task *tp) | 293 | hpet_unregister(struct hpet_task *tp) |
291 | hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg) | 294 | hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg) |
292 | 295 | ||
293 | The kernel module using this interface fills in the ht_func and ht_data members of the | 296 | The kernel module using this interface fills in the ht_func and ht_data |
294 | hpet_task structure before calling hpet_register. hpet_control simply vectors to the hpet_ioctl | 297 | members of the hpet_task structure before calling hpet_register. |
295 | routine and has the same commands and respective arguments as the user API. hpet_unregister | 298 | hpet_control simply vectors to the hpet_ioctl routine and has the same |
299 | commands and respective arguments as the user API. hpet_unregister | ||
296 | is used to terminate usage of the HPET timer reserved by hpet_register. | 300 | is used to terminate usage of the HPET timer reserved by hpet_register. |
297 | |||
298 | |||
diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt index 769f925c8526..87f4d052e39c 100644 --- a/Documentation/ioctl-number.txt +++ b/Documentation/ioctl-number.txt | |||
@@ -130,8 +130,6 @@ Code Seq# Include File Comments | |||
130 | <mailto:zapman@interlan.net> | 130 | <mailto:zapman@interlan.net> |
131 | 'i' 00-3F linux/i2o.h | 131 | 'i' 00-3F linux/i2o.h |
132 | 'j' 00-3F linux/joystick.h | 132 | 'j' 00-3F linux/joystick.h |
133 | 'k' all asm-sparc/kbio.h | ||
134 | asm-sparc64/kbio.h | ||
135 | 'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system | 133 | 'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system |
136 | <http://mikonos.dia.unisa.it/tcfs> | 134 | <http://mikonos.dia.unisa.it/tcfs> |
137 | 'l' 40-7F linux/udf_fs_i.h in development: | 135 | 'l' 40-7F linux/udf_fs_i.h in development: |
diff --git a/Documentation/magic-number.txt b/Documentation/magic-number.txt index bd8eefa17587..af67faccf4de 100644 --- a/Documentation/magic-number.txt +++ b/Documentation/magic-number.txt | |||
@@ -120,7 +120,7 @@ ISDN_NET_MAGIC 0x49344C02 isdn_net_local_s drivers/isdn/i4l/isdn_net_li | |||
120 | SAVEKMSG_MAGIC2 0x4B4D5347 savekmsg arch/*/amiga/config.c | 120 | SAVEKMSG_MAGIC2 0x4B4D5347 savekmsg arch/*/amiga/config.c |
121 | STLI_BOARDMAGIC 0x4bc6c825 stlibrd include/linux/istallion.h | 121 | STLI_BOARDMAGIC 0x4bc6c825 stlibrd include/linux/istallion.h |
122 | CS_STATE_MAGIC 0x4c4f4749 cs_state sound/oss/cs46xx.c | 122 | CS_STATE_MAGIC 0x4c4f4749 cs_state sound/oss/cs46xx.c |
123 | SLAB_C_MAGIC 0x4f17a36d kmem_cache_s mm/slab.c | 123 | SLAB_C_MAGIC 0x4f17a36d kmem_cache mm/slab.c |
124 | COW_MAGIC 0x4f4f4f4d cow_header_v1 arch/um/drivers/ubd_user.c | 124 | COW_MAGIC 0x4f4f4f4d cow_header_v1 arch/um/drivers/ubd_user.c |
125 | I810_CARD_MAGIC 0x5072696E i810_card sound/oss/i810_audio.c | 125 | I810_CARD_MAGIC 0x5072696E i810_card sound/oss/i810_audio.c |
126 | TRIDENT_CARD_MAGIC 0x5072696E trident_card sound/oss/trident.c | 126 | TRIDENT_CARD_MAGIC 0x5072696E trident_card sound/oss/trident.c |
diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt index c6bd25f5d61d..e6c39c5831f5 100644 --- a/Documentation/networking/decnet.txt +++ b/Documentation/networking/decnet.txt | |||
@@ -176,8 +176,6 @@ information (_most_ of which _is_ _essential_) includes: | |||
176 | - Which client caused the problem ? | 176 | - Which client caused the problem ? |
177 | - How much data was being transferred ? | 177 | - How much data was being transferred ? |
178 | - Was the network congested ? | 178 | - Was the network congested ? |
179 | - If there was a kernel panic, please run the output through ksymoops | ||
180 | before sending it to me, otherwise its _useless_. | ||
181 | - How can the problem be reproduced ? | 179 | - How can the problem be reproduced ? |
182 | - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of | 180 | - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of |
183 | tcpdump don't understand how to dump DECnet properly, so including | 181 | tcpdump don't understand how to dump DECnet properly, so including |
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 66eaaab7773d..c563842ed805 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | NOTE: ksymoops is useless on 2.6. Please use the Oops in its original format | 1 | NOTE: ksymoops is useless on 2.6. Please use the Oops in its original format |
2 | (from dmesg, etc). Ignore any references in this or other docs to "decoding | 2 | (from dmesg, etc). Ignore any references in this or other docs to "decoding |
3 | the Oops" or "running it through ksymoops". If you post an Oops fron 2.6 that | 3 | the Oops" or "running it through ksymoops". If you post an Oops from 2.6 that |
4 | has been run through ksymoops, people will just tell you to repost it. | 4 | has been run through ksymoops, people will just tell you to repost it. |
5 | 5 | ||
6 | Quick Summary | 6 | Quick Summary |
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt index 526d6dd267ea..912bed87c758 100644 --- a/Documentation/power/video.txt +++ b/Documentation/power/video.txt | |||
@@ -11,9 +11,9 @@ boot video card. (Kernel usually does not even contain video card | |||
11 | driver -- vesafb and vgacon are widely used). | 11 | driver -- vesafb and vgacon are widely used). |
12 | 12 | ||
13 | This is not problem for swsusp, because during swsusp resume, BIOS is | 13 | This is not problem for swsusp, because during swsusp resume, BIOS is |
14 | run normally so video card is normally initialized. S3 has absolutely | 14 | run normally so video card is normally initialized. It should not be |
15 | no chance of working with SMP/HT. Be sure it to turn it off before | 15 | problem for S1 standby, because hardware should retain its state over |
16 | testing (swsusp should work ok, OTOH). | 16 | that. |
17 | 17 | ||
18 | There are a few types of systems where video works after S3 resume: | 18 | There are a few types of systems where video works after S3 resume: |
19 | 19 | ||
@@ -64,7 +64,7 @@ your video card (good luck getting docs :-(). Maybe suspending from X | |||
64 | (proper X, knowing your hardware, not XF68_FBcon) might have better | 64 | (proper X, knowing your hardware, not XF68_FBcon) might have better |
65 | chance of working. | 65 | chance of working. |
66 | 66 | ||
67 | Table of known working systems: | 67 | Table of known working notebooks: |
68 | 68 | ||
69 | Model hack (or "how to do it") | 69 | Model hack (or "how to do it") |
70 | ------------------------------------------------------------------------------ | 70 | ------------------------------------------------------------------------------ |
@@ -73,7 +73,7 @@ Acer TM 242FX vbetool (6) | |||
73 | Acer TM C110 video_post (8) | 73 | Acer TM C110 video_post (8) |
74 | Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8) | 74 | Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8) |
75 | Acer TM 4052LCi s3_bios (2) | 75 | Acer TM 4052LCi s3_bios (2) |
76 | Acer TM 636Lci s3_bios vga=normal (2) | 76 | Acer TM 636Lci s3_bios,s3_mode (4) |
77 | Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back | 77 | Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back |
78 | Acer TM 660 ??? (*) | 78 | Acer TM 660 ??? (*) |
79 | Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6) | 79 | Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6) |
@@ -137,6 +137,13 @@ Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****) | |||
137 | Toshiba M30 (2) xor X with nvidia driver using internal AGP | 137 | Toshiba M30 (2) xor X with nvidia driver using internal AGP |
138 | Uniwill 244IIO ??? (*) | 138 | Uniwill 244IIO ??? (*) |
139 | 139 | ||
140 | Known working desktop systems | ||
141 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
142 | |||
143 | Mainboard Graphics card hack (or "how to do it") | ||
144 | ------------------------------------------------------------------------------ | ||
145 | Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) | ||
146 | |||
140 | 147 | ||
141 | (*) from http://www.ubuntulinux.org/wiki/HoaryPMResults, not sure | 148 | (*) from http://www.ubuntulinux.org/wiki/HoaryPMResults, not sure |
142 | which options to use. If you know, please tell me. | 149 | which options to use. If you know, please tell me. |
diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt index 19461958e2bd..df09758bf3fe 100644 --- a/Documentation/s390/driver-model.txt +++ b/Documentation/s390/driver-model.txt | |||
@@ -8,11 +8,10 @@ All devices which can be addressed by means of ccws are called 'CCW devices' - | |||
8 | even if they aren't actually driven by ccws. | 8 | even if they aren't actually driven by ccws. |
9 | 9 | ||
10 | All ccw devices are accessed via a subchannel, this is reflected in the | 10 | All ccw devices are accessed via a subchannel, this is reflected in the |
11 | structures under root/: | 11 | structures under devices/: |
12 | 12 | ||
13 | root/ | 13 | devices/ |
14 | - sys | 14 | - system/ |
15 | - legacy | ||
16 | - css0/ | 15 | - css0/ |
17 | - 0.0.0000/0.0.0815/ | 16 | - 0.0.0000/0.0.0815/ |
18 | - 0.0.0001/0.0.4711/ | 17 | - 0.0.0001/0.0.4711/ |
@@ -36,7 +35,7 @@ availability: Can be 'good' or 'boxed'; 'no path' or 'no device' for | |||
36 | 35 | ||
37 | online: An interface to set the device online and offline. | 36 | online: An interface to set the device online and offline. |
38 | In the special case of the device being disconnected (see the | 37 | In the special case of the device being disconnected (see the |
39 | notify function under 1.2), piping 0 to online will focibly delete | 38 | notify function under 1.2), piping 0 to online will forcibly delete |
40 | the device. | 39 | the device. |
41 | 40 | ||
42 | The device drivers can add entries to export per-device data and interfaces. | 41 | The device drivers can add entries to export per-device data and interfaces. |
@@ -222,7 +221,7 @@ and are called 'chp0.<chpid>'. They have no driver and do not belong to any bus. | |||
222 | Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect | 221 | Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect |
223 | only the logical state and not the physical state, since we cannot track the | 222 | only the logical state and not the physical state, since we cannot track the |
224 | latter consistently due to lacking machine support (we don't need to be aware | 223 | latter consistently due to lacking machine support (we don't need to be aware |
225 | of anyway). | 224 | of it anyway). |
226 | 225 | ||
227 | status - Can be 'online' or 'offline'. | 226 | status - Can be 'online' or 'offline'. |
228 | Piping 'on' or 'off' sets the chpid logically online/offline. | 227 | Piping 'on' or 'off' sets the chpid logically online/offline. |
@@ -235,12 +234,16 @@ status - Can be 'online' or 'offline'. | |||
235 | 3. System devices | 234 | 3. System devices |
236 | ----------------- | 235 | ----------------- |
237 | 236 | ||
238 | Note: cpus may yet be added here. | ||
239 | |||
240 | 3.1 xpram | 237 | 3.1 xpram |
241 | --------- | 238 | --------- |
242 | 239 | ||
243 | xpram shows up under sys/ as 'xpram'. | 240 | xpram shows up under devices/system/ as 'xpram'. |
241 | |||
242 | 3.2 cpus | ||
243 | -------- | ||
244 | |||
245 | For each cpu, a directory is created under devices/system/cpu/. Each cpu has an | ||
246 | attribute 'online' which can be 0 or 1. | ||
244 | 247 | ||
245 | 248 | ||
246 | 4. Other devices | 249 | 4. Other devices |
diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt new file mode 100644 index 000000000000..2d8f403eb6eb --- /dev/null +++ b/Documentation/sharedsubtree.txt | |||
@@ -0,0 +1,1060 @@ | |||
1 | Shared Subtrees | ||
2 | --------------- | ||
3 | |||
4 | Contents: | ||
5 | 1) Overview | ||
6 | 2) Features | ||
7 | 3) smount command | ||
8 | 4) Use-case | ||
9 | 5) Detailed semantics | ||
10 | 6) Quiz | ||
11 | 7) FAQ | ||
12 | 8) Implementation | ||
13 | |||
14 | |||
15 | 1) Overview | ||
16 | ----------- | ||
17 | |||
18 | Consider the following situation: | ||
19 | |||
20 | A process wants to clone its own namespace, but still wants to access the CD | ||
21 | that got mounted recently. Shared subtree semantics provide the necessary | ||
22 | mechanism to accomplish the above. | ||
23 | |||
24 | It provides the necessary building blocks for features like per-user-namespace | ||
25 | and versioned filesystem. | ||
26 | |||
27 | 2) Features | ||
28 | ----------- | ||
29 | |||
30 | Shared subtree provides four different flavors of mounts; struct vfsmount to be | ||
31 | precise | ||
32 | |||
33 | a. shared mount | ||
34 | b. slave mount | ||
35 | c. private mount | ||
36 | d. unbindable mount | ||
37 | |||
38 | |||
39 | 2a) A shared mount can be replicated to any number of mountpoints and all the | ||
40 | replicas continue to be exactly the same. | ||
41 | |||
42 | Here is an example: | ||
43 | |||
44 | Lets say /mnt has a mount that is shared. | ||
45 | mount --make-shared /mnt | ||
46 | |||
47 | note: mount command does not yet support the --make-shared flag. | ||
48 | I have included a small C program which does the same by executing | ||
49 | 'smount /mnt shared' | ||
50 | |||
51 | #mount --bind /mnt /tmp | ||
52 | The above command replicates the mount at /mnt to the mountpoint /tmp | ||
53 | and the contents of both the mounts remain identical. | ||
54 | |||
55 | #ls /mnt | ||
56 | a b c | ||
57 | |||
58 | #ls /tmp | ||
59 | a b c | ||
60 | |||
61 | Now lets say we mount a device at /tmp/a | ||
62 | #mount /dev/sd0 /tmp/a | ||
63 | |||
64 | #ls /tmp/a | ||
65 | t1 t2 t3 | ||
66 | |||
67 | #ls /mnt/a | ||
68 | t1 t2 t3 | ||
69 | |||
70 | Note that the mount has propagated to the mount at /mnt as well. | ||
71 | |||
72 | And the same is true even when /dev/sd0 is mounted on /mnt/a. The | ||
73 | contents will be visible under /tmp/a too. | ||
74 | |||
75 | |||
76 | 2b) A slave mount is like a shared mount except that mount and umount events | ||
77 | only propagate towards it. | ||
78 | |||
79 | All slave mounts have a master mount which is a shared mount. | ||
80 | |||
81 | Here is an example: | ||
82 | |||
83 | Lets say /mnt has a mount which is shared. | ||
84 | #mount --make-shared /mnt | ||
85 | |||
86 | Lets bind mount /mnt to /tmp | ||
87 | #mount --bind /mnt /tmp | ||
88 | |||
89 | the new mount at /tmp becomes a shared mount and it is a replica of | ||
90 | the mount at /mnt. | ||
91 | |||
92 | Now let's make the mount at /tmp a slave of /mnt | ||
93 | #mount --make-slave /tmp | ||
94 | [or smount /tmp slave] | ||
95 | |||
96 | lets mount /dev/sd0 on /mnt/a | ||
97 | #mount /dev/sd0 /mnt/a | ||
98 | |||
99 | #ls /mnt/a | ||
100 | t1 t2 t3 | ||
101 | |||
102 | #ls /tmp/a | ||
103 | t1 t2 t3 | ||
104 | |||
105 | Note the mount event has propagated to the mount at /tmp | ||
106 | |||
107 | However lets see what happens if we mount something on the mount at /tmp | ||
108 | |||
109 | #mount /dev/sd1 /tmp/b | ||
110 | |||
111 | #ls /tmp/b | ||
112 | s1 s2 s3 | ||
113 | |||
114 | #ls /mnt/b | ||
115 | |||
116 | Note how the mount event has not propagated to the mount at | ||
117 | /mnt | ||
118 | |||
119 | |||
120 | 2c) A private mount does not forward or receive propagation. | ||
121 | |||
122 | This is the mount we are familiar with. It's the default type. | ||
123 | |||
124 | |||
125 | 2d) An unbindable mount is an unbindable private mount | ||
126 | |||
127 | Let's say we have a mount at /mnt and we make it unbindable | ||
128 | |||
129 | #mount --make-unbindable /mnt | ||
130 | [ smount /mnt unbindable ] | ||
131 | |||
132 | Lets try to bind mount this mount somewhere else. | ||
133 | # mount --bind /mnt /tmp | ||
134 | mount: wrong fs type, bad option, bad superblock on /mnt, | ||
135 | or too many mounted file systems | ||
136 | |||
137 | Binding an unbindable mount is an invalid operation. | ||
138 | |||
139 | |||
140 | 3) smount command | ||
141 | |||
142 | Currently the mount command is not aware of shared subtree features. | ||
143 | Work is in progress to add the support in mount ( util-linux package ). | ||
144 | Till then use the following program. | ||
145 | |||
146 | ------------------------------------------------------------------------ | ||
147 | // | ||
148 | //this code was developed by Miklos Szeredi <miklos@szeredi.hu> | ||
149 | //and modified by Ram Pai <linuxram@us.ibm.com> | ||
150 | // sample usage: | ||
151 | // smount /tmp shared | ||
152 | // | ||
153 | #include <stdio.h> | ||
154 | #include <stdlib.h> | ||
155 | #include <unistd.h> | ||
156 | #include <sys/mount.h> | ||
157 | #include <sys/fsuid.h> | ||
158 | |||
159 | #ifndef MS_REC | ||
160 | #define MS_REC 0x4000 /* 16384: Recursive loopback */ | ||
161 | #endif | ||
162 | |||
163 | #ifndef MS_SHARED | ||
164 | #define MS_SHARED 1<<20 /* Shared */ | ||
165 | #endif | ||
166 | |||
167 | #ifndef MS_PRIVATE | ||
168 | #define MS_PRIVATE 1<<18 /* Private */ | ||
169 | #endif | ||
170 | |||
171 | #ifndef MS_SLAVE | ||
172 | #define MS_SLAVE 1<<19 /* Slave */ | ||
173 | #endif | ||
174 | |||
175 | #ifndef MS_UNBINDABLE | ||
176 | #define MS_UNBINDABLE 1<<17 /* Unbindable */ | ||
177 | #endif | ||
178 | |||
179 | int main(int argc, char *argv[]) | ||
180 | { | ||
181 | int type; | ||
182 | if(argc != 3) { | ||
183 | fprintf(stderr, "usage: %s dir " | ||
184 | "<rshared|rslave|rprivate|runbindable|shared|slave" | ||
185 | "|private|unbindable>\n" , argv[0]); | ||
186 | return 1; | ||
187 | } | ||
188 | |||
189 | fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]); | ||
190 | |||
191 | if (strcmp(argv[2],"rshared")==0) | ||
192 | type=(MS_SHARED|MS_REC); | ||
193 | else if (strcmp(argv[2],"rslave")==0) | ||
194 | type=(MS_SLAVE|MS_REC); | ||
195 | else if (strcmp(argv[2],"rprivate")==0) | ||
196 | type=(MS_PRIVATE|MS_REC); | ||
197 | else if (strcmp(argv[2],"runbindable")==0) | ||
198 | type=(MS_UNBINDABLE|MS_REC); | ||
199 | else if (strcmp(argv[2],"shared")==0) | ||
200 | type=MS_SHARED; | ||
201 | else if (strcmp(argv[2],"slave")==0) | ||
202 | type=MS_SLAVE; | ||
203 | else if (strcmp(argv[2],"private")==0) | ||
204 | type=MS_PRIVATE; | ||
205 | else if (strcmp(argv[2],"unbindable")==0) | ||
206 | type=MS_UNBINDABLE; | ||
207 | else { | ||
208 | fprintf(stderr, "invalid operation: %s\n", argv[2]); | ||
209 | return 1; | ||
210 | } | ||
211 | setfsuid(getuid()); | ||
212 | |||
213 | if(mount("", argv[1], "dontcare", type, "") == -1) { | ||
214 | perror("mount"); | ||
215 | return 1; | ||
216 | } | ||
217 | return 0; | ||
218 | } | ||
219 | ----------------------------------------------------------------------- | ||
220 | |||
221 | Copy the above code snippet into smount.c | ||
222 | gcc -o smount smount.c | ||
223 | |||
224 | |||
225 | (i) To mark all the mounts under /mnt as shared execute the following | ||
226 | command: | ||
227 | |||
228 | smount /mnt rshared | ||
229 | the corresponding syntax planned for mount command is | ||
230 | mount --make-rshared /mnt | ||
231 | |||
232 | just to mark a mount /mnt as shared, execute the following | ||
233 | command: | ||
234 | smount /mnt shared | ||
235 | the corresponding syntax planned for mount command is | ||
236 | mount --make-shared /mnt | ||
237 | |||
238 | (ii) To mark all the shared mounts under /mnt as slave execute the | ||
239 | following | ||
240 | |||
241 | command: | ||
242 | smount /mnt rslave | ||
243 | the corresponding syntax planned for mount command is | ||
244 | mount --make-rslave /mnt | ||
245 | |||
246 | just to mark a mount /mnt as slave, execute the following | ||
247 | command: | ||
248 | smount /mnt slave | ||
249 | the corresponding syntax planned for mount command is | ||
250 | mount --make-slave /mnt | ||
251 | |||
252 | (iii) To mark all the mounts under /mnt as private execute the | ||
253 | following command: | ||
254 | |||
255 | smount /mnt rprivate | ||
256 | the corresponding syntax planned for mount command is | ||
257 | mount --make-rprivate /mnt | ||
258 | |||
259 | just to mark a mount /mnt as private, execute the following | ||
260 | command: | ||
261 | smount /mnt private | ||
262 | the corresponding syntax planned for mount command is | ||
263 | mount --make-private /mnt | ||
264 | |||
265 | NOTE: by default all the mounts are created as private. But if | ||
266 | you want to change some shared/slave/unbindable mount as | ||
267 | private at a later point in time, this command can help. | ||
268 | |||
269 | (iv) To mark all the mounts under /mnt as unbindable execute the | ||
270 | following | ||
271 | |||
272 | command: | ||
273 | smount /mnt runbindable | ||
274 | the corresponding syntax planned for mount command is | ||
275 | mount --make-runbindable /mnt | ||
276 | |||
277 | just to mark a mount /mnt as unbindable, execute the following | ||
278 | command: | ||
279 | smount /mnt unbindable | ||
280 | the corresponding syntax planned for mount command is | ||
281 | mount --make-unbindable /mnt | ||
282 | |||
283 | |||
284 | 4) Use cases | ||
285 | ------------ | ||
286 | |||
287 | A) A process wants to clone its own namespace, but still wants to | ||
288 | access the CD that got mounted recently. | ||
289 | |||
290 | Solution: | ||
291 | |||
292 | The system administrator can make the mount at /cdrom shared | ||
293 | mount --bind /cdrom /cdrom | ||
294 | mount --make-shared /cdrom | ||
295 | |||
296 | Now any process that clones off a new namespace will have a | ||
297 | mount at /cdrom which is a replica of the same mount in the | ||
298 | parent namespace. | ||
299 | |||
300 | So when a CD is inserted and mounted at /cdrom that mount gets | ||
301 | propagated to the other mount at /cdrom in all the other clone | ||
302 | namespaces. | ||
303 | |||
304 | B) A process wants its mounts invisible to any other process, but | ||
305 | still be able to see the other system mounts. | ||
306 | |||
307 | Solution: | ||
308 | |||
309 | To begin with, the administrator can mark the entire mount tree | ||
310 | as shareable. | ||
311 | |||
312 | mount --make-rshared / | ||
313 | |||
314 | A new process can clone off a new namespace. And mark some part | ||
315 | of its namespace as slave | ||
316 | |||
317 | mount --make-rslave /myprivatetree | ||
318 | |||
319 | Henceforth any mounts within the /myprivatetree done by the | ||
320 | process will not show up in any other namespace. However mounts | ||
321 | done in the parent namespace under /myprivatetree still show | ||
322 | up in the process's namespace. | ||
323 | |||
324 | |||
325 | Apart from the above semantics this feature provides the | ||
326 | building blocks to solve the following problems: | ||
327 | |||
328 | C) Per-user namespace | ||
329 | |||
330 | The above semantics allows a way to share mounts across | ||
331 | namespaces. But namespaces are associated with processes. If | ||
332 | namespaces are made first class objects with user API to | ||
333 | associate/disassociate a namespace with userid, then each user | ||
334 | could have his/her own namespace and tailor it to his/her | ||
335 | requirements. Of course, this needs support from PAM. | ||
336 | |||
337 | D) Versioned files | ||
338 | |||
339 | If the entire mount tree is visible at multiple locations, then | ||
340 | an underlying versioning file system can return a different | ||
341 | version of the file depending on the path used to access that | ||
342 | file. | ||
343 | |||
344 | An example is: | ||
345 | |||
346 | mount --make-shared / | ||
347 | mount --rbind / /view/v1 | ||
348 | mount --rbind / /view/v2 | ||
349 | mount --rbind / /view/v3 | ||
350 | mount --rbind / /view/v4 | ||
351 | |||
352 | and if /usr has a versioning filesystem mounted, then that | ||
353 | mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and | ||
354 | /view/v4/usr too | ||
355 | |||
356 | A user can request v3 version of the file /usr/fs/namespace.c | ||
357 | by accessing /view/v3/usr/fs/namespace.c . The underlying | ||
358 | versioning filesystem can then decipher that v3 version of the | ||
359 | filesystem is being requested and return the corresponding | ||
360 | inode. | ||
361 | |||
362 | 5) Detailed semantics: | ||
363 | ------------------- | ||
364 | The section below explains the detailed semantics of | ||
365 | bind, rbind, move, mount, umount and clone-namespace operations. | ||
366 | |||
367 | Note: the word 'vfsmount' and the noun 'mount' have been used | ||
368 | to mean the same thing, throughout this document. | ||
369 | |||
370 | 5a) Mount states | ||
371 | |||
372 | A given mount can be in one of the following states | ||
373 | 1) shared | ||
374 | 2) slave | ||
375 | 3) shared and slave | ||
376 | 4) private | ||
377 | 5) unbindable | ||
378 | |||
379 | A 'propagation event' is defined as an event generated on a vfsmount | ||
380 | that leads to mount or unmount actions in other vfsmounts. | ||
381 | |||
382 | A 'peer group' is defined as a group of vfsmounts that propagate | ||
383 | events to each other. | ||
384 | |||
385 | (1) Shared mounts | ||
386 | |||
387 | A 'shared mount' is defined as a vfsmount that belongs to a | ||
388 | 'peer group'. | ||
389 | |||
390 | For example: | ||
391 | mount --make-shared /mnt | ||
392 | mount --bind /mnt /tmp | ||
393 | |||
394 | The mount at /mnt and that at /tmp are both shared and belong | ||
395 | to the same peer group. Anything mounted or unmounted under | ||
396 | /mnt or /tmp is reflected in all the other mounts of its peer | ||
397 | group. | ||
398 | |||
399 | |||
400 | (2) Slave mounts | ||
401 | |||
402 | A 'slave mount' is defined as a vfsmount that receives | ||
403 | propagation events and does not forward propagation events. | ||
404 | |||
405 | A slave mount as the name implies has a master mount from which | ||
406 | mount/unmount events are received. Events do not propagate from | ||
407 | the slave mount to the master. Only a shared mount can be made | ||
408 | a slave by executing the following command | ||
409 | |||
410 | mount --make-slave mount | ||
411 | |||
412 | A shared mount that is made as a slave is no longer shared unless | ||
413 | modified to become shared. | ||
414 | |||
415 | (3) Shared and Slave | ||
416 | |||
417 | A vfsmount can be both shared as well as slave. This state | ||
418 | indicates that the mount is a slave of some vfsmount, and | ||
419 | has its own peer group too. This vfsmount receives propagation | ||
420 | events from its master vfsmount, and also forwards propagation | ||
421 | events to its 'peer group' and to its slave vfsmounts. | ||
422 | |||
423 | Strictly speaking, the vfsmount is shared having its own | ||
424 | peer group, and this peer-group is a slave of some other | ||
425 | peer group. | ||
426 | |||
427 | Only a slave vfsmount can be made as 'shared and slave' by | ||
428 | either executing the following command | ||
429 | mount --make-shared mount | ||
430 | or by moving the slave vfsmount under a shared vfsmount. | ||
431 | |||
432 | (4) Private mount | ||
433 | |||
434 | A 'private mount' is defined as a vfsmount that does not | ||
435 | receive or forward any propagation events. | ||
436 | |||
437 | (5) Unbindable mount | ||
438 | |||
439 | An 'unbindable mount' is defined as a vfsmount that does not | ||
440 | receive or forward any propagation events and cannot | ||
441 | be bind mounted. | ||
442 | |||
443 | |||
444 | State diagram: | ||
445 | The state diagram below explains the state transition of a mount, | ||
446 | in response to various commands. | ||
447 | ------------------------------------------------------------------------ | ||
448 | | |make-shared | make-slave | make-private |make-unbindab| | ||
449 | --------------|------------|--------------|--------------|-------------| | ||
450 | |shared |shared |*slave/private| private | unbindable | | ||
451 | | | | | | | | ||
452 | |-------------|------------|--------------|--------------|-------------| | ||
453 | |slave |shared | **slave | private | unbindable | | ||
454 | | |and slave | | | | | ||
455 | |-------------|------------|--------------|--------------|-------------| | ||
456 | |shared |shared | slave | private | unbindable | | ||
457 | |and slave |and slave | | | | | ||
458 | |-------------|------------|--------------|--------------|-------------| | ||
459 | |private |shared | **private | private | unbindable | | ||
460 | |-------------|------------|--------------|--------------|-------------| | ||
461 | |unbindable |shared |**unbindable | private | unbindable | | ||
462 | ------------------------------------------------------------------------ | ||
463 | |||
464 | * if the shared mount is the only mount in its peer group, making it | ||
465 | slave, makes it private automatically. Note that there is no master to | ||
466 | which it can be slaved. | ||
467 | |||
468 | ** slaving a non-shared mount has no effect on the mount. | ||
469 | |||
470 | Apart from the commands listed above, the 'move' operation also changes | ||
471 | the state of a mount depending on type of the destination mount. It's | ||
472 | explained in section 5d. | ||
473 | |||
474 | 5b) Bind semantics | ||
475 | |||
476 | Consider the following command | ||
477 | |||
478 | mount --bind A/a B/b | ||
479 | |||
480 | where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B' | ||
481 | is the destination mount and 'b' is the dentry in the destination mount. | ||
482 | |||
483 | The outcome depends on the type of mount of 'A' and 'B'. The table | ||
484 | below contains quick reference. | ||
485 | --------------------------------------------------------------------------- | ||
486 | | BIND MOUNT OPERATION | | ||
487 | |************************************************************************** | ||
488 | |source(A)->| shared | private | slave | unbindable | | ||
489 | | dest(B) | | | | | | ||
490 | | | | | | | | | ||
491 | | v | | | | | | ||
492 | |************************************************************************** | ||
493 | | shared | shared | shared | shared & slave | invalid | | ||
494 | | | | | | | | ||
495 | |non-shared| shared | private | slave | invalid | | ||
496 | *************************************************************************** | ||
497 | |||
498 | Details: | ||
499 | |||
500 | 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' | ||
501 | which is clone of 'A', is created. Its root dentry is 'a' . 'C' is | ||
502 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | ||
503 | are created and mounted at the dentry 'b' on all mounts where 'B' | ||
504 | propagates to. A new propagation tree containing 'C1',..,'Cn' is | ||
505 | created. This propagation tree is identical to the propagation tree of | ||
506 | 'B'. And finally the peer-group of 'C' is merged with the peer group | ||
507 | of 'A'. | ||
508 | |||
509 | 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' | ||
510 | which is clone of 'A', is created. Its root dentry is 'a'. 'C' is | ||
511 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | ||
512 | are created and mounted at the dentry 'b' on all mounts where 'B' | ||
513 | propagates to. A new propagation tree is set containing all new mounts | ||
514 | 'C', 'C1', .., 'Cn' with exactly the same configuration as the | ||
515 | propagation tree for 'B'. | ||
516 | |||
517 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new | ||
518 | mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . | ||
519 | 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', | ||
520 | 'C3' ... are created and mounted at the dentry 'b' on all mounts where | ||
521 | 'B' propagates to. A new propagation tree containing the new mounts | ||
522 | 'C','C1',.. 'Cn' is created. This propagation tree is identical to the | ||
523 | propagation tree for 'B'. And finally the mount 'C' and its peer group | ||
524 | is made the slave of mount 'Z'. In other words, mount 'C' is in the | ||
525 | state 'slave and shared'. | ||
526 | |||
527 | 4. 'A' is an unbindable mount and 'B' is a shared mount. This is an | ||
528 | invalid operation. | ||
529 | |||
530 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | ||
531 | unbindable) mount. A new mount 'C' which is clone of 'A', is created. | ||
532 | Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. | ||
533 | |||
534 | 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' | ||
535 | which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is | ||
536 | mounted on mount 'B' at dentry 'b'. 'C' is made a member of the | ||
537 | peer-group of 'A'. | ||
538 | |||
539 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A | ||
540 | new mount 'C' which is a clone of 'A' is created. Its root dentry is | ||
541 | 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a | ||
542 | slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of | ||
543 | 'Z'. All mount/unmount events on 'Z' propagate to 'A' and 'C'. But | ||
544 | mount/unmount on 'A' do not propagate anywhere else. Similarly | ||
545 | mount/unmount on 'C' do not propagate anywhere else. | ||
546 | |||
547 | 8. 'A' is an unbindable mount and 'B' is a non-shared mount. This is an | ||
548 | invalid operation. An unbindable mount cannot be bind mounted. | ||
549 | |||
550 | 5c) Rbind semantics | ||
551 | |||
552 | rbind is the same as bind, except that bind replicates only the specified | ||
553 | mount whereas rbind replicates all the mounts in the tree belonging to it. | ||
554 | Rbind mount is bind mount applied to all the mounts in the tree. | ||
555 | |||
556 | If the source tree that is rbound has some unbindable mounts, | ||
557 | then the subtree under the unbindable mount is pruned in the new | ||
558 | location. | ||
559 | |||
560 | e.g.: let's say we have the following mount tree. | ||
561 | |||
562 | A | ||
563 | / \ | ||
564 | B C | ||
565 | / \ / \ | ||
566 | D E F G | ||
567 | |||
568 | Let's say all the mounts except the mount C in the tree are | ||
569 | of a type other than unbindable. | ||
570 | |||
571 | If this tree is rbound to say Z | ||
572 | |||
573 | We will have the following tree at the new location. | ||
574 | |||
575 | Z | ||
576 | | | ||
577 | A' | ||
578 | / | ||
579 | B' Note how the tree under C is pruned | ||
580 | / \ in the new location. | ||
581 | D' E' | ||
582 | |||
583 | |||
584 | |||
585 | 5d) Move semantics | ||
586 | |||
587 | Consider the following command | ||
588 | |||
589 | mount --move A B/b | ||
590 | |||
591 | where 'A' is the source mount, 'B' is the destination mount and 'b' is | ||
592 | the dentry in the destination mount. | ||
593 | |||
594 | The outcome depends on the type of the mount of 'A' and 'B'. The table | ||
595 | below is a quick reference. | ||
596 | --------------------------------------------------------------------------- | ||
597 | | MOVE MOUNT OPERATION | | ||
598 | |************************************************************************** | ||
599 | | source(A)->| shared | private | slave | unbindable | | ||
600 | | dest(B) | | | | | | ||
601 | | | | | | | | | ||
602 | | v | | | | | | ||
603 | |************************************************************************** | ||
604 | | shared | shared | shared |shared and slave| invalid | | ||
605 | | | | | | | | ||
606 | |non-shared| shared | private | slave | unbindable | | ||
607 | *************************************************************************** | ||
608 | NOTE: moving a mount residing under a shared mount is invalid. | ||
609 | |||
610 | Details follow: | ||
611 | |||
612 | 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is | ||
613 | mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An' | ||
614 | are created and mounted at dentry 'b' on all mounts that receive | ||
615 | propagation from mount 'B'. A new propagation tree is created in the | ||
616 | exact same configuration as that of 'B'. This new propagation tree | ||
617 | contains all the new mounts 'A1', 'A2'... 'An'. And this new | ||
618 | propagation tree is appended to the already existing propagation tree | ||
619 | of 'A'. | ||
620 | |||
621 | 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is | ||
622 | mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An' | ||
623 | are created and mounted at dentry 'b' on all mounts that receive | ||
624 | propagation from mount 'B'. The mount 'A' becomes a shared mount and a | ||
625 | propagation tree is created which is identical to that of | ||
626 | 'B'. This new propagation tree contains all the new mounts 'A1', | ||
627 | 'A2'... 'An'. | ||
628 | |||
629 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The | ||
630 | mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', | ||
631 | 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that | ||
632 | receive propagation from mount 'B'. A new propagation tree is created | ||
633 | in the exact same configuration as that of 'B'. This new propagation | ||
634 | tree contains all the new mounts 'A1', 'A2'... 'An'. And this new | ||
635 | propagation tree is appended to the already existing propagation tree of | ||
636 | 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also | ||
637 | becomes 'shared'. | ||
638 | |||
639 | 4. 'A' is an unbindable mount and 'B' is a shared mount. The operation | ||
640 | is invalid. Because mounting anything on the shared mount 'B' can | ||
641 | create new mounts that get mounted on the mounts that receive | ||
642 | propagation from 'B'. And since the mount 'A' is unbindable, cloning | ||
643 | it to mount at other mountpoints is not possible. | ||
644 | |||
645 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | ||
646 | unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. | ||
647 | |||
648 | 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' | ||
649 | is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a | ||
650 | shared mount. | ||
651 | |||
652 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. | ||
653 | The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' | ||
654 | continues to be a slave mount of mount 'Z'. | ||
655 | |||
656 | 8. 'A' is an unbindable mount and 'B' is a non-shared mount. The mount | ||
657 | 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be an | ||
658 | unbindable mount. | ||
659 | |||
660 | 5e) Mount semantics | ||
661 | |||
662 | Consider the following command | ||
663 | |||
664 | mount device B/b | ||
665 | |||
666 | 'B' is the destination mount and 'b' is the dentry in the destination | ||
667 | mount. | ||
668 | |||
669 | The above operation is the same as bind operation with the exception | ||
670 | that the source mount is always a private mount. | ||
671 | |||
672 | |||
673 | 5f) Unmount semantics | ||
674 | |||
675 | Consider the following command | ||
676 | |||
677 | umount A | ||
678 | |||
679 | where 'A' is a mount mounted on mount 'B' at dentry 'b'. | ||
680 | |||
681 | If mount 'B' is shared, then all most-recently-mounted mounts at dentry | ||
682 | 'b' on mounts that receive propagation from mount 'B' and do not have | ||
683 | sub-mounts within them are unmounted. | ||
684 | |||
685 | Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to | ||
686 | each other. | ||
687 | |||
688 | lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount | ||
689 | 'B1', 'B2' and 'B3' respectively. | ||
690 | |||
691 | lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on | ||
692 | mount 'B1', 'B2' and 'B3' respectively. | ||
693 | |||
694 | if 'C1' is unmounted, all the mounts that are most-recently-mounted on | ||
695 | 'B1' and on the mounts that 'B1' propagates-to are unmounted. | ||
696 | |||
697 | 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount | ||
698 | on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'. | ||
699 | |||
700 | So all 'C1', 'C2' and 'C3' should be unmounted. | ||
701 | |||
702 | If any of 'C2' or 'C3' has some child mounts, then that mount is not | ||
703 | unmounted, but all other mounts are unmounted. However if 'C1' is told | ||
704 | to be unmounted and 'C1' has some sub-mounts, the umount operation | ||
705 | fails entirely. | ||
706 | |||
707 | 5g) Clone Namespace | ||
708 | |||
709 | A cloned namespace contains all the same mounts as the parent | ||
710 | namespace. | ||
711 | |||
712 | Lets say 'A' and 'B' are the corresponding mounts in the parent and the | ||
713 | child namespace. | ||
714 | |||
715 | If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to | ||
716 | each other. | ||
717 | |||
718 | If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of | ||
719 | 'Z'. | ||
720 | |||
721 | If 'A' is a private mount, then 'B' is a private mount too. | ||
722 | |||
723 | If 'A' is an unbindable mount, then 'B' is an unbindable mount too. | ||
724 | |||
725 | |||
726 | 6) Quiz | ||
727 | |||
728 | A. What is the result of the following command sequence? | ||
729 | |||
730 | mount --bind /mnt /mnt | ||
731 | mount --make-shared /mnt | ||
732 | mount --bind /mnt /tmp | ||
733 | mount --move /tmp /mnt/1 | ||
734 | |||
735 | what should the contents of /mnt, /mnt/1 and /mnt/1/1 be? | ||
736 | Should they all be identical? or should /mnt and /mnt/1 be | ||
737 | identical only? | ||
738 | |||
739 | |||
740 | B. What is the result of the following command sequence? | ||
741 | |||
742 | mount --make-rshared / | ||
743 | mkdir -p /v/1 | ||
744 | mount --rbind / /v/1 | ||
745 | |||
746 | what should the content of /v/1/v/1 be? | ||
747 | |||
748 | |||
749 | C. What is the result of the following command sequence? | ||
750 | |||
751 | mount --bind /mnt /mnt | ||
752 | mount --make-shared /mnt | ||
753 | mkdir -p /mnt/1/2/3 /mnt/1/test | ||
754 | mount --bind /mnt/1 /tmp | ||
755 | mount --make-slave /mnt | ||
756 | mount --make-shared /mnt | ||
757 | mount --bind /mnt/1/2 /tmp1 | ||
758 | mount --make-slave /mnt | ||
759 | |||
760 | At this point we have the first mount at /tmp and | ||
761 | its root dentry is 1. Lets call this mount 'A' | ||
762 | And then we have a second mount at /tmp1 with root | ||
763 | dentry 2. Lets call this mount 'B' | ||
764 | Next we have a third mount at /mnt with root dentry | ||
765 | mnt. Lets call this mount 'C' | ||
766 | |||
767 | 'B' is the slave of 'A' and 'C' is a slave of 'B' | ||
768 | A -> B -> C | ||
769 | |||
770 | at this point if we execute the following command | ||
771 | |||
772 | mount --bind /bin /tmp/test | ||
773 | |||
774 | The mount is attempted on 'A' | ||
775 | |||
776 | will the mount propagate to 'B' and 'C' ? | ||
777 | |||
778 | what would the contents of | ||
779 | /mnt/1/test be? | ||
780 | |||
781 | 7) FAQ | ||
782 | |||
783 | Q1. Why is bind mount needed? How is it different from symbolic links? | ||
784 | symbolic links can get stale if the destination mount gets | ||
785 | unmounted or moved. Bind mounts continue to exist even if the | ||
786 | other mount is unmounted or moved. | ||
787 | |||
788 | Q2. Why can't the shared subtree be implemented using exportfs? | ||
789 | |||
790 | exportfs is a heavyweight way of accomplishing part of what | ||
791 | shared subtree can do. I cannot imagine a way to implement the | ||
792 | semantics of slave mount using exportfs. | ||
793 | |||
794 | Q3. Why is unbindable mount needed? | ||
795 | |||
796 | Lets say we want to replicate the mount tree at multiple | ||
797 | locations within the same subtree. | ||
798 | |||
799 | if one rbind mounts a tree within the same subtree 'n' times | ||
800 | the number of mounts created is an exponential function of 'n'. | ||
801 | Having unbindable mount can help prune the unneeded bind | ||
802 | mounts. Here is an example. | ||
803 | |||
804 | step 1: | ||
805 | lets say the root tree has just two directories with | ||
806 | one vfsmount. | ||
807 | root | ||
808 | / \ | ||
809 | tmp usr | ||
810 | |||
811 | And we want to replicate the tree at multiple | ||
812 | mountpoints under /root/tmp | ||
813 | |||
814 | step2: | ||
815 | mount --make-shared /root | ||
816 | |||
817 | mkdir -p /tmp/m1 | ||
818 | |||
819 | mount --rbind /root /tmp/m1 | ||
820 | |||
821 | the new tree now looks like this: | ||
822 | |||
823 | root | ||
824 | / \ | ||
825 | tmp usr | ||
826 | / | ||
827 | m1 | ||
828 | / \ | ||
829 | tmp usr | ||
830 | / | ||
831 | m1 | ||
832 | |||
833 | it has two vfsmounts | ||
834 | |||
835 | step3: | ||
836 | mkdir -p /tmp/m2 | ||
837 | mount --rbind /root /tmp/m2 | ||
838 | |||
839 | the new tree now looks like this: | ||
840 | |||
841 | root | ||
842 | / \ | ||
843 | tmp usr | ||
844 | / \ | ||
845 | m1 m2 | ||
846 | / \ / \ | ||
847 | tmp usr tmp usr | ||
848 | / \ / | ||
849 | m1 m2 m1 | ||
850 | / \ / \ | ||
851 | tmp usr tmp usr | ||
852 | / / \ | ||
853 | m1 m1 m2 | ||
854 | / \ | ||
855 | tmp usr | ||
856 | / \ | ||
857 | m1 m2 | ||
858 | |||
859 | it has 6 vfsmounts | ||
860 | |||
861 | step 4: | ||
862 | mkdir -p /tmp/m3 | ||
863 | mount --rbind /root /tmp/m3 | ||
864 | |||
865 | I won't draw the tree... but it has 24 vfsmounts | ||
866 | |||
867 | |||
868 | at step i the number of vfsmounts is V[i] = i*V[i-1]. | ||
869 | This is factorial growth, faster than exponential. And this tree has way more | ||
870 | mounts than what we really needed in the first place. | ||
871 | |||
872 | One could use a series of umount at each step to prune | ||
873 | out the unneeded mounts. But there is a better solution. | ||
874 | Unbindable mounts come in handy here. | ||
875 | |||
876 | step 1: | ||
877 | lets say the root tree has just two directories with | ||
878 | one vfsmount. | ||
879 | root | ||
880 | / \ | ||
881 | tmp usr | ||
882 | |||
883 | How do we set up the same tree at multiple locations under | ||
884 | /root/tmp | ||
885 | |||
886 | step2: | ||
887 | mount --bind /root/tmp /root/tmp | ||
888 | |||
889 | mount --make-rshared /root | ||
890 | mount --make-unbindable /root/tmp | ||
891 | |||
892 | mkdir -p /tmp/m1 | ||
893 | |||
894 | mount --rbind /root /tmp/m1 | ||
895 | |||
896 | the new tree now looks like this: | ||
897 | |||
898 | root | ||
899 | / \ | ||
900 | tmp usr | ||
901 | / | ||
902 | m1 | ||
903 | / \ | ||
904 | tmp usr | ||
905 | |||
906 | step3: | ||
907 | mkdir -p /tmp/m2 | ||
908 | mount --rbind /root /tmp/m2 | ||
909 | |||
910 | the new tree now looks like this: | ||
911 | |||
912 | root | ||
913 | / \ | ||
914 | tmp usr | ||
915 | / \ | ||
916 | m1 m2 | ||
917 | / \ / \ | ||
918 | tmp usr tmp usr | ||
919 | |||
920 | step4: | ||
921 | |||
922 | mkdir -p /tmp/m3 | ||
923 | mount --rbind /root /tmp/m3 | ||
924 | |||
925 | the new tree now looks like this: | ||
926 | |||
927 | root | ||
928 | / \ | ||
929 | tmp usr | ||
930 | / \ \ | ||
931 | m1 m2 m3 | ||
932 | / \ / \ / \ | ||
933 | tmp usr tmp usr tmp usr | ||
934 | |||
935 | 8) Implementation | ||
936 | |||
937 | 8A) Datastructure | ||
938 | |||
939 | 4 new fields are introduced to struct vfsmount | ||
940 | ->mnt_share | ||
941 | ->mnt_slave_list | ||
942 | ->mnt_slave | ||
943 | ->mnt_master | ||
944 | |||
945 | ->mnt_share links together all the mounts to/from which this vfsmount | ||
946 | sends/receives propagation events. | ||
947 | |||
948 | ->mnt_slave_list links all the mounts to which this vfsmount | ||
949 | propagates. | ||
950 | |||
951 | ->mnt_slave links together all the slaves that its master vfsmount | ||
952 | propagates to. | ||
953 | |||
954 | ->mnt_master points to the master vfsmount from which this vfsmount | ||
955 | receives propagation. | ||
956 | |||
957 | ->mnt_flags takes two more flags to indicate the propagation status of | ||
958 | the vfsmount. MNT_SHARE indicates that the vfsmount is a shared | ||
959 | vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be | ||
960 | replicated. | ||
961 | |||
962 | All the shared vfsmounts in a peer group form a cyclic list through | ||
963 | ->mnt_share. | ||
964 | |||
965 | All vfsmounts with the same ->mnt_master form a cyclic list anchored | ||
966 | in ->mnt_master->mnt_slave_list and going through ->mnt_slave. | ||
967 | |||
968 | ->mnt_master can point to arbitrary (and possibly different) members | ||
969 | of master peer group. To find all immediate slaves of a peer group | ||
970 | you need to go through _all_ ->mnt_slave_list of its members. | ||
971 | Conceptually it's just a single set - distribution among the | ||
972 | individual lists does not affect propagation or the way propagation | ||
973 | tree is modified by operations. | ||
974 | |||
975 | An example propagation tree looks as shown in the figure below. | ||
976 | [ NOTE: Though it looks like a forest, if we consider all the shared | ||
977 | mounts as a conceptual entity called 'pnode', it becomes a tree] | ||
978 | |||
979 | |||
980 | A <--> B <--> C <---> D | ||
981 | /|\ /| |\ | ||
982 | / F G J K H I | ||
983 | / | ||
984 | E<-->K | ||
985 | /|\ | ||
986 | M L N | ||
987 | |||
988 | In the above figure A, B, C and D all are shared and propagate to each | ||
989 | other. 'A' has got 3 slave mounts 'E', 'F' and 'G'; 'C' has got 2 slave | ||
990 | mounts 'J' and 'K'; and 'D' has got two slave mounts 'H' and 'I'. | ||
991 | 'E' is also shared with 'K' and they propagate to each other. And | ||
992 | 'K' has 3 slaves 'M', 'L' and 'N'. | ||
993 | |||
994 | A's ->mnt_share links with the ->mnt_share of 'B' 'C' and 'D' | ||
995 | |||
996 | A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' | ||
997 | |||
998 | E's ->mnt_share links with ->mnt_share of K | ||
999 | 'E', 'K', 'F', 'G' have their ->mnt_master point to struct | ||
1000 | vfsmount of 'A' | ||
1001 | 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' | ||
1002 | K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' | ||
1003 | |||
1004 | C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' | ||
1005 | J and K's ->mnt_master points to struct vfsmount of C | ||
1006 | and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' | ||
1007 | 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. | ||
1008 | |||
1009 | |||
1010 | NOTE: The propagation tree is orthogonal to the mount tree. | ||
1011 | |||
1012 | |||
1013 | 8B) Algorithm: | ||
1014 | |||
1015 | The crux of the implementation resides in rbind/move operation. | ||
1016 | |||
1017 | The overall algorithm breaks the operation into 3 phases: (look at | ||
1018 | attach_recursive_mnt() and propagate_mnt()) | ||
1019 | |||
1020 | 1. prepare phase. | ||
1021 | 2. commit phase. | ||
1022 | 3. abort phase. | ||
1023 | |||
1024 | Prepare phase: | ||
1025 | |||
1026 | for each mount in the source tree: | ||
1027 | a) Create the necessary number of mount trees to | ||
1028 | be attached to each of the mounts that receive | ||
1029 | propagation from the destination mount. | ||
1030 | b) Do not attach any of the trees to its destination. | ||
1031 | However note down its ->mnt_parent and ->mnt_mountpoint | ||
1032 | c) Link all the new mounts to form a propagation tree that | ||
1033 | is identical to the propagation tree of the destination | ||
1034 | mount. | ||
1035 | |||
1036 | If this phase is successful, there should be 'n' new | ||
1037 | propagation trees; where 'n' is the number of mounts in the | ||
1038 | source tree. Go to the commit phase | ||
1039 | |||
1040 | Also there should be 'm' new mount trees, where 'm' is | ||
1041 | the number of mounts to which the destination mount | ||
1042 | propagates. | ||
1043 | |||
1044 | if any memory allocations fail, go to the abort phase. | ||
1045 | |||
1046 | Commit phase | ||
1047 | attach each of the mount trees to their corresponding | ||
1048 | destination mounts. | ||
1049 | |||
1050 | Abort phase | ||
1051 | delete all the newly created trees. | ||
1052 | |||
1053 | NOTE: all the propagation related functionality resides in the file | ||
1054 | pnode.c | ||
1055 | |||
1056 | |||
1057 | ------------------------------------------------------------------------ | ||
1058 | |||
1059 | version 0.1 (created the initial document, Ram Pai linuxram@us.ibm.com) | ||
1060 | version 0.2 (Incorporated comments from Al Viro) | ||
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 13cba955cb5a..2f27f391c7cc 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt | |||
@@ -167,7 +167,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
167 | spdif - Support SPDIF I/O | 167 | spdif - Support SPDIF I/O |
168 | - Default: disabled | 168 | - Default: disabled |
169 | 169 | ||
170 | Module supports autoprobe and multiple chips (max 8). | 170 | This module supports one chip and autoprobe. |
171 | 171 | ||
172 | The power-management is supported. | 172 | The power-management is supported. |
173 | 173 | ||
@@ -206,7 +206,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
206 | See "AC97 Quirk Option" section below. | 206 | See "AC97 Quirk Option" section below. |
207 | spdif_aclink - S/PDIF transfer over AC-link (default = 1) | 207 | spdif_aclink - S/PDIF transfer over AC-link (default = 1) |
208 | 208 | ||
209 | This module supports up to 8 cards and autoprobe. | 209 | This module supports one card and autoprobe. |
210 | 210 | ||
211 | ATI IXP has two different methods to control SPDIF output. One is | 211 | ATI IXP has two different methods to control SPDIF output. One is |
212 | over AC-link and another is over the "direct" SPDIF output. The | 212 | over AC-link and another is over the "direct" SPDIF output. The |
@@ -218,7 +218,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
218 | 218 | ||
219 | Module for ATI IXP 150/200/250 AC97 modem controllers. | 219 | Module for ATI IXP 150/200/250 AC97 modem controllers. |
220 | 220 | ||
221 | Module supports up to 8 cards. | 221 | This module supports one card and autoprobe. |
222 | 222 | ||
223 | Note: The default index value of this module is -2, i.e. the first | 223 | Note: The default index value of this module is -2, i.e. the first |
224 | slot is excluded. | 224 | slot is excluded. |
@@ -637,7 +637,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
637 | model - force the model name | 637 | model - force the model name |
638 | position_fix - Fix DMA pointer (0 = auto, 1 = none, 2 = POSBUF, 3 = FIFO size) | 638 | position_fix - Fix DMA pointer (0 = auto, 1 = none, 2 = POSBUF, 3 = FIFO size) |
639 | 639 | ||
640 | Module supports up to 8 cards. | 640 | This module supports one card and autoprobe. |
641 | 641 | ||
642 | Each codec may have a model table for different configurations. | 642 | Each codec may have a model table for different configurations. |
643 | If your machine isn't listed there, the default (usually minimal) | 643 | If your machine isn't listed there, the default (usually minimal) |
@@ -663,6 +663,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
663 | adjusted. Appearing only when compiled with | 663 | adjusted. Appearing only when compiled with |
664 | $CONFIG_SND_DEBUG=y | 664 | $CONFIG_SND_DEBUG=y |
665 | 665 | ||
666 | ALC260 | ||
667 | hp HP machines | ||
668 | fujitsu Fujitsu S7020 | ||
669 | |||
666 | CMI9880 | 670 | CMI9880 |
667 | minimal 3-jack in back | 671 | minimal 3-jack in back |
668 | min_fp 3-jack in back, 2-jack in front | 672 | min_fp 3-jack in back, 2-jack in front |
@@ -811,7 +815,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
811 | semaphores (e.g. on some ASUS laptops) | 815 | semaphores (e.g. on some ASUS laptops) |
812 | (default off) | 816 | (default off) |
813 | 817 | ||
814 | Module supports autoprobe and multiple bus-master chips (max 8). | 818 | This module supports one chip and autoprobe. |
815 | 819 | ||
816 | Note: the latest driver supports auto-detection of chip clock. | 820 | Note: the latest driver supports auto-detection of chip clock. |
817 | if you still encounter too fast playback, specify the clock | 821 | if you still encounter too fast playback, specify the clock |
@@ -830,7 +834,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
830 | 834 | ||
831 | ac97_clock - AC'97 codec clock base (0 = auto-detect) | 835 | ac97_clock - AC'97 codec clock base (0 = auto-detect) |
832 | 836 | ||
833 | This module supports up to 8 cards and autoprobe. | 837 | This module supports one card and autoprobe. |
834 | 838 | ||
835 | Note: The default index value of this module is -2, i.e. the first | 839 | Note: The default index value of this module is -2, i.e. the first |
836 | slot is excluded. | 840 | slot is excluded. |
@@ -950,8 +954,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
950 | use_cache - 0 or 1 (disabled by default) | 954 | use_cache - 0 or 1 (disabled by default) |
951 | vaio_hack - alias buffer_top=0x25a800 | 955 | vaio_hack - alias buffer_top=0x25a800 |
952 | reset_workaround - enable AC97 RESET workaround for some laptops | 956 | reset_workaround - enable AC97 RESET workaround for some laptops |
957 | reset_workaround2 - enable extended AC97 RESET workaround for some | ||
958 | other laptops | ||
953 | 959 | ||
954 | Module supports autoprobe and multiple chips (max 8). | 960 | This module supports one chip and autoprobe. |
955 | 961 | ||
956 | The power-management is supported. | 962 | The power-management is supported. |
957 | 963 | ||
@@ -980,6 +986,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
980 | workaround is enabled automatically. For other laptops with a | 986 | workaround is enabled automatically. For other laptops with a |
981 | hard freeze, you can try reset_workaround=1 option. | 987 | hard freeze, you can try reset_workaround=1 option. |
982 | 988 | ||
989 | Note: Dell Latitude CSx laptops have another problem regarding | ||
990 | AC97 RESET. On these laptops, reset_workaround2 option is | ||
991 | turned on by default. This option is worth trying if the | ||
992 | previous reset_workaround option doesn't help. | ||
993 | |||
983 | Note: This driver is really crappy. It's a porting from the | 994 | Note: This driver is really crappy. It's a porting from the |
984 | OSS driver, which is a result of black-magic reverse engineering. | 995 | OSS driver, which is a result of black-magic reverse engineering. |
985 | The detection of codec will fail if the driver is loaded *after* | 996 | The detection of codec will fail if the driver is loaded *after* |
@@ -1310,7 +1321,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1310 | ac97_quirk - AC'97 workaround for strange hardware | 1321 | ac97_quirk - AC'97 workaround for strange hardware |
1311 | See "AC97 Quirk Option" section below. | 1322 | See "AC97 Quirk Option" section below. |
1312 | 1323 | ||
1313 | Module supports autoprobe and multiple bus-master chips (max 8). | 1324 | This module supports one chip and autoprobe. |
1314 | 1325 | ||
1315 | Note: on some SMP motherboards like MSI 694D the interrupts might | 1326 | Note: on some SMP motherboards like MSI 694D the interrupts might |
1316 | not be generated properly. In such a case, please try to | 1327 | not be generated properly. In such a case, please try to |
@@ -1352,7 +1363,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1352 | 1363 | ||
1353 | ac97_clock - AC'97 codec clock base (default 48000Hz) | 1364 | ac97_clock - AC'97 codec clock base (default 48000Hz) |
1354 | 1365 | ||
1355 | Module supports up to 8 cards. | 1366 | This module supports one card and autoprobe. |
1356 | 1367 | ||
1357 | Note: The default index value of this module is -2, i.e. the first | 1368 | Note: The default index value of this module is -2, i.e. the first |
1358 | slot is excluded. | 1369 | slot is excluded. |
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl index 24e85520890b..260334c98d95 100644 --- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl +++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl | |||
@@ -18,8 +18,8 @@ | |||
18 | </affiliation> | 18 | </affiliation> |
19 | </author> | 19 | </author> |
20 | 20 | ||
21 | <date>March 6, 2005</date> | 21 | <date>October 6, 2005</date> |
22 | <edition>0.3.4</edition> | 22 | <edition>0.3.5</edition> |
23 | 23 | ||
24 | <abstract> | 24 | <abstract> |
25 | <para> | 25 | <para> |
@@ -30,7 +30,7 @@ | |||
30 | 30 | ||
31 | <legalnotice> | 31 | <legalnotice> |
32 | <para> | 32 | <para> |
33 | Copyright (c) 2002-2004 Takashi Iwai <email>tiwai@suse.de</email> | 33 | Copyright (c) 2002-2005 Takashi Iwai <email>tiwai@suse.de</email> |
34 | </para> | 34 | </para> |
35 | 35 | ||
36 | <para> | 36 | <para> |
@@ -1433,25 +1433,10 @@ | |||
1433 | <informalexample> | 1433 | <informalexample> |
1434 | <programlisting> | 1434 | <programlisting> |
1435 | <![CDATA[ | 1435 | <![CDATA[ |
1436 | if (chip->res_port) { | 1436 | release_and_free_resource(chip->res_port); |
1437 | release_resource(chip->res_port); | ||
1438 | kfree_nocheck(chip->res_port); | ||
1439 | } | ||
1440 | ]]> | 1437 | ]]> |
1441 | </programlisting> | 1438 | </programlisting> |
1442 | </informalexample> | 1439 | </informalexample> |
1443 | |||
1444 | As you can see, the resource pointer is also to be freed | ||
1445 | via <function>kfree_nocheck()</function> after | ||
1446 | <function>release_resource()</function> is called. You | ||
1447 | cannot use <function>kfree()</function> here, because on ALSA, | ||
1448 | <function>kfree()</function> may be a wrapper to its own | ||
1449 | allocator with the memory debugging. Since the resource pointer | ||
1450 | is allocated externally outside the ALSA, it must be released | ||
1451 | via the native | ||
1452 | <function>kfree()</function>. | ||
1453 | <function>kfree_nocheck()</function> is used for that; it calls | ||
1454 | the native <function>kfree()</function> without wrapper. | ||
1455 | </para> | 1440 | </para> |
1456 | 1441 | ||
1457 | <para> | 1442 | <para> |
@@ -2190,8 +2175,7 @@ struct _snd_pcm_runtime { | |||
2190 | unsigned int rate_den; | 2175 | unsigned int rate_den; |
2191 | 2176 | ||
2192 | /* -- SW params -- */ | 2177 | /* -- SW params -- */ |
2193 | int tstamp_timespec; /* use timeval (0) or timespec (1) */ | 2178 | struct timespec tstamp_mode; /* mmap timestamp is updated */ |
2194 | snd_pcm_tstamp_t tstamp_mode; /* mmap timestamp is updated */ | ||
2195 | unsigned int period_step; | 2179 | unsigned int period_step; |
2196 | unsigned int sleep_min; /* min ticks to sleep */ | 2180 | unsigned int sleep_min; /* min ticks to sleep */ |
2197 | snd_pcm_uframes_t xfer_align; /* xfer size need to be a multiple */ | 2181 | snd_pcm_uframes_t xfer_align; /* xfer size need to be a multiple */ |
@@ -3709,8 +3693,7 @@ struct _snd_pcm_runtime { | |||
3709 | <para> | 3693 | <para> |
3710 | Here, the chip instance is retrieved via | 3694 | Here, the chip instance is retrieved via |
3711 | <function>snd_kcontrol_chip()</function> macro. This macro | 3695 | <function>snd_kcontrol_chip()</function> macro. This macro |
3712 | converts from kcontrol->private_data to the type defined by | 3696 | just accesses kcontrol->private_data. The |
3713 | <type>chip_t</type>. The | ||
3714 | kcontrol->private_data field is | 3697 | kcontrol->private_data field is |
3715 | given as the argument of <function>snd_ctl_new()</function> | 3698 | given as the argument of <function>snd_ctl_new()</function> |
3716 | (see the later subsection | 3699 | (see the later subsection |
@@ -5998,32 +5981,23 @@ struct _snd_pcm_runtime { | |||
5998 | The first argument is the expression to evaluate, and the | 5981 | The first argument is the expression to evaluate, and the |
5999 | second argument is the action if it fails. When | 5982 | second argument is the action if it fails. When |
6000 | <constant>CONFIG_SND_DEBUG</constant> is set, it will show an | 5983 | <constant>CONFIG_SND_DEBUG</constant> is set, it will show an |
6001 | error message such as <computeroutput>BUG? (xxx) (called from | 5984 | error message such as <computeroutput>BUG? (xxx)</computeroutput> |
6002 | yyy)</computeroutput>. When no debug flag is set, this is | 5985 | together with a stack trace. |
6003 | ignored. | ||
6004 | </para> | 5986 | </para> |
6005 | </section> | ||
6006 | |||
6007 | <section id="useful-functions-snd-runtime-check"> | ||
6008 | <title><function>snd_runtime_check()</function></title> | ||
6009 | <para> | 5987 | <para> |
6010 | This macro is quite similar with | 5988 | When no debug flag is set, this macro is ignored. |
6011 | <function>snd_assert()</function>. Unlike | ||
6012 | <function>snd_assert()</function>, the expression is always | ||
6013 | evaluated regardless of | ||
6014 | <constant>CONFIG_SND_DEBUG</constant>. When | ||
6015 | <constant>CONFIG_SND_DEBUG</constant> is set, the macro will | ||
6016 | show a message like <computeroutput>ERROR (xx) (called from | ||
6017 | yyy)</computeroutput>. | ||
6018 | </para> | 5989 | </para> |
6019 | </section> | 5990 | </section> |
6020 | 5991 | ||
6021 | <section id="useful-functions-snd-bug"> | 5992 | <section id="useful-functions-snd-bug"> |
6022 | <title><function>snd_BUG()</function></title> | 5993 | <title><function>snd_BUG()</function></title> |
6023 | <para> | 5994 | <para> |
6024 | It calls <function>snd_assert(0,)</function> -- that is, just | 5995 | It shows the <computeroutput>BUG?</computeroutput> message and |
6025 | prints the error message at the point. It's useful to show that | 5996 | a stack trace, just as <function>snd_assert</function> does, at that point. |
6026 | a fatal error happens there. | 5997 | It's useful to show that a fatal error happens there. |
5998 | </para> | ||
5999 | <para> | ||
6000 | When no debug flag is set, this macro is ignored. | ||
6027 | </para> | 6001 | </para> |
6028 | </section> | 6002 | </section> |
6029 | </chapter> | 6003 | </chapter> |
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt index 1829009db771..3f1c5464b1c9 100644 --- a/Documentation/sparse.txt +++ b/Documentation/sparse.txt | |||
@@ -41,9 +41,9 @@ sure that bitwise types don't get mixed up (little-endian vs big-endian | |||
41 | vs cpu-endian vs whatever), and there the constant "0" really _is_ | 41 | vs cpu-endian vs whatever), and there the constant "0" really _is_ |
42 | special. | 42 | special. |
43 | 43 | ||
44 | Modify top-level Makefile to say | 44 | Use |
45 | 45 | ||
46 | CHECK = sparse -Wbitwise | 46 | make C=[12] CF=-Wbitwise |
47 | 47 | ||
48 | or you don't get any checking at all. | 48 | or you don't get any checking at all. |
49 | 49 | ||
diff --git a/Documentation/video4linux/bttv/README.freeze b/Documentation/video4linux/bttv/README.freeze index 51f8d4379a94..4259dccc8287 100644 --- a/Documentation/video4linux/bttv/README.freeze +++ b/Documentation/video4linux/bttv/README.freeze | |||
@@ -27,9 +27,9 @@ information out of a register+stack dump printed by the kernel on | |||
27 | protection faults (so-called "kernel oops"). | 27 | protection faults (so-called "kernel oops"). |
28 | 28 | ||
29 | If you run into some kind of deadlock, you can try to dump a call trace | 29 | If you run into some kind of deadlock, you can try to dump a call trace |
30 | for each process using sysrq-t (see Documentation/sysrq.txt). ksymoops | 30 | for each process using sysrq-t (see Documentation/sysrq.txt). |
31 | will translate these dumps into kernel symbols too. This way it is | 31 | This way it is possible to figure out where *exactly* some process in "D" |
32 | possible to figure where *exactly* some process in "D" state is stuck. | 32 | state is stuck. |
33 | 33 | ||
34 | I've seen reports that bttv 0.7.x crashes whereas 0.8.x works rock solid | 34 | I've seen reports that bttv 0.7.x crashes whereas 0.8.x works rock solid |
35 | for some people. Thus probably a small buglet left somewhere in bttv | 35 | for some people. Thus probably a small buglet left somewhere in bttv |
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 1b9bcd1fe98b..1ad9af1ca4d0 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt | |||
@@ -13,12 +13,13 @@ This optimization is more critical now as bigger and bigger physical memories | |||
13 | Users can use the huge page support in Linux kernel by either using the mmap | 13 | Users can use the huge page support in Linux kernel by either using the mmap |
14 | system call or standard SYSv shared memory system calls (shmget, shmat). | 14 | system call or standard SYSv shared memory system calls (shmget, shmat). |
15 | 15 | ||
16 | First the Linux kernel needs to be built with CONFIG_HUGETLB_PAGE (present | 16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS |
17 | under Processor types and feature) and CONFIG_HUGETLBFS (present under file | 17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected |
18 | system option on config menu) config options. | 18 | automatically when CONFIG_HUGETLBFS is selected) configuration |
19 | options. | ||
19 | 20 | ||
20 | The kernel built with hugepage support should show the number of configured | 21 | The kernel built with hugepage support should show the number of configured |
21 | hugepages in the system by running the "cat /proc/meminfo" command. | 22 | hugepages in the system by running the "cat /proc/meminfo" command. |
22 | 23 | ||
23 | /proc/meminfo also provides information about the total number of hugetlb | 24 | /proc/meminfo also provides information about the total number of hugetlb |
24 | pages configured in the kernel. It also displays information about the | 25 | pages configured in the kernel. It also displays information about the |
@@ -38,19 +39,19 @@ in the kernel. | |||
38 | 39 | ||
39 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb | 40 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb |
40 | pages in the kernel. Super user can dynamically request more (or free some | 41 | pages in the kernel. Super user can dynamically request more (or free some |
41 | pre-configured) hugepages. | 42 | pre-configured) hugepages. |
42 | The allocation( or deallocation) of hugetlb pages is posible only if there are | 43 | The allocation (or deallocation) of hugetlb pages is possible only if there are |
43 | enough physically contiguous free pages in system (freeing of hugepages is | 44 | enough physically contiguous free pages in system (freeing of hugepages is |
44 | possible only if there are enough hugetlb pages free that can be transferred | 45 | possible only if there are enough hugetlb pages free that can be transferred |
45 | back to regular memory pool). | 46 | back to regular memory pool). |
46 | 47 | ||
47 | Pages that are used as hugetlb pages are reserved inside the kernel and can | 48 | Pages that are used as hugetlb pages are reserved inside the kernel and can |
48 | not be used for other purposes. | 49 | not be used for other purposes. |
49 | 50 | ||
50 | Once the kernel with Hugetlb page support is built and running, a user can | 51 | Once the kernel with Hugetlb page support is built and running, a user can |
51 | use either the mmap system call or shared memory system calls to start using | 52 | use either the mmap system call or shared memory system calls to start using |
52 | the huge pages. It is required that the system administrator preallocate | 53 | the huge pages. It is required that the system administrator preallocate |
53 | enough memory for huge page purposes. | 54 | enough memory for huge page purposes. |
54 | 55 | ||
55 | Use the following command to dynamically allocate/deallocate hugepages: | 56 | Use the following command to dynamically allocate/deallocate hugepages: |
56 | 57 | ||
@@ -80,9 +81,9 @@ memory (huge pages) allowed for that filesystem (/mnt/huge). The size is | |||
80 | rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of | 81 | rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of |
81 | inodes that /mnt/huge can use. If the size or nr_inodes options are not | 82 | inodes that /mnt/huge can use. If the size or nr_inodes options are not |
82 | provided on command line then no limits are set. For size and nr_inodes | 83 | provided on command line then no limits are set. For size and nr_inodes |
83 | options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For | 84 | options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For |
84 | example, size=2K has the same meaning as size=2048. An example is given at | 85 | example, size=2K has the same meaning as size=2048. An example is given at |
85 | the end of this document. | 86 | the end of this document. |
86 | 87 | ||
87 | read and write system calls are not supported on files that reside on hugetlb | 88 | read and write system calls are not supported on files that reside on hugetlb |
88 | file systems. | 89 | file systems. |