diff options
Diffstat (limited to 'Documentation')
23 files changed, 863 insertions, 440 deletions
diff --git a/Documentation/Changes b/Documentation/Changes index 783ddc3ce4e8..86b86399d61d 100644 --- a/Documentation/Changes +++ b/Documentation/Changes | |||
@@ -139,9 +139,14 @@ You'll probably want to upgrade. | |||
139 | Ksymoops | 139 | Ksymoops |
140 | -------- | 140 | -------- |
141 | 141 | ||
142 | If the unthinkable happens and your kernel oopses, you'll need a 2.4 | 142 | If the unthinkable happens and your kernel oopses, you may need the |
143 | version of ksymoops to decode the report; see REPORTING-BUGS in the | 143 | ksymoops tool to decode it, but in most cases you don't. |
144 | root of the Linux source for more information. | 144 | In the 2.6 kernel it is generally preferred to build the kernel with |
145 | CONFIG_KALLSYMS so that it produces readable dumps that can be used as-is | ||
146 | (this also produces better output than ksymoops). | ||
147 | If for some reason your kernel is not built with CONFIG_KALLSYMS and | ||
148 | you have no way to rebuild and reproduce the Oops with that option, then | ||
149 | you can still decode that Oops with ksymoops. | ||
145 | 150 | ||
146 | Module-Init-Tools | 151 | Module-Init-Tools |
147 | ----------------- | 152 | ----------------- |
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index fa3e29ad8a46..7018f5c6a447 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile | |||
@@ -10,7 +10,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \ | |||
10 | kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ | 10 | kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ |
11 | procfs-guide.xml writing_usb_driver.xml \ | 11 | procfs-guide.xml writing_usb_driver.xml \ |
12 | sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \ | 12 | sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \ |
13 | gadget.xml libata.xml mtdnand.xml librs.xml | 13 | gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml |
14 | 14 | ||
15 | ### | 15 | ### |
16 | # The build process is as follows (targets): | 16 | # The build process is as follows (targets): |
diff --git a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl index 341aaa4ce481..2077f9a28c19 100644 --- a/Documentation/DocBook/journal-api.tmpl +++ b/Documentation/DocBook/journal-api.tmpl | |||
@@ -306,7 +306,7 @@ an example. | |||
306 | </para> | 306 | </para> |
307 | <sect1><title>Journal Level</title> | 307 | <sect1><title>Journal Level</title> |
308 | !Efs/jbd/journal.c | 308 | !Efs/jbd/journal.c |
309 | !Efs/jbd/recovery.c | 309 | !Ifs/jbd/recovery.c |
310 | </sect1> | 310 | </sect1> |
311 | <sect1><title>Transaction Level</title> | 311 | <sect1><title>Transaction Level</title> |
312 | !Efs/jbd/transaction.c | 312 | !Efs/jbd/transaction.c |
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index ec474e5a25ed..a8316b1a3e3d 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl | |||
@@ -118,7 +118,7 @@ X!Ilib/string.c | |||
118 | </sect1> | 118 | </sect1> |
119 | <sect1><title>User Space Memory Access</title> | 119 | <sect1><title>User Space Memory Access</title> |
120 | !Iinclude/asm-i386/uaccess.h | 120 | !Iinclude/asm-i386/uaccess.h |
121 | !Iarch/i386/lib/usercopy.c | 121 | !Earch/i386/lib/usercopy.c |
122 | </sect1> | 122 | </sect1> |
123 | <sect1><title>More Memory Management Functions</title> | 123 | <sect1><title>More Memory Management Functions</title> |
124 | !Iinclude/linux/rmap.h | 124 | !Iinclude/linux/rmap.h |
@@ -174,7 +174,6 @@ X!Ilib/string.c | |||
174 | <title>The Linux VFS</title> | 174 | <title>The Linux VFS</title> |
175 | <sect1><title>The Filesystem types</title> | 175 | <sect1><title>The Filesystem types</title> |
176 | !Iinclude/linux/fs.h | 176 | !Iinclude/linux/fs.h |
177 | !Einclude/linux/fs.h | ||
178 | </sect1> | 177 | </sect1> |
179 | <sect1><title>The Directory Cache</title> | 178 | <sect1><title>The Directory Cache</title> |
180 | !Efs/dcache.c | 179 | !Efs/dcache.c |
@@ -266,7 +265,7 @@ X!Ekernel/module.c | |||
266 | <chapter id="hardware"> | 265 | <chapter id="hardware"> |
267 | <title>Hardware Interfaces</title> | 266 | <title>Hardware Interfaces</title> |
268 | <sect1><title>Interrupt Handling</title> | 267 | <sect1><title>Interrupt Handling</title> |
269 | !Ikernel/irq/manage.c | 268 | !Ekernel/irq/manage.c |
270 | </sect1> | 269 | </sect1> |
271 | 270 | ||
272 | <sect1><title>Resources Management</title> | 271 | <sect1><title>Resources Management</title> |
@@ -501,7 +500,7 @@ KAO --> | |||
501 | !Edrivers/video/modedb.c | 500 | !Edrivers/video/modedb.c |
502 | </sect1> | 501 | </sect1> |
503 | <sect1><title>Frame Buffer Macintosh Video Mode Database</title> | 502 | <sect1><title>Frame Buffer Macintosh Video Mode Database</title> |
504 | !Idrivers/video/macmodes.c | 503 | !Edrivers/video/macmodes.c |
505 | </sect1> | 504 | </sect1> |
506 | <sect1><title>Frame Buffer Fonts</title> | 505 | <sect1><title>Frame Buffer Fonts</title> |
507 | <para> | 506 | <para> |
diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl new file mode 100644 index 000000000000..1becf27ba27e --- /dev/null +++ b/Documentation/DocBook/rapidio.tmpl | |||
@@ -0,0 +1,160 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8"?> | ||
2 | <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" | ||
3 | "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [ | ||
4 | <!ENTITY rapidio SYSTEM "rapidio.xml"> | ||
5 | ]> | ||
6 | |||
7 | <book id="RapidIO-Guide"> | ||
8 | <bookinfo> | ||
9 | <title>RapidIO Subsystem Guide</title> | ||
10 | |||
11 | <authorgroup> | ||
12 | <author> | ||
13 | <firstname>Matt</firstname> | ||
14 | <surname>Porter</surname> | ||
15 | <affiliation> | ||
16 | <address> | ||
17 | <email>mporter@kernel.crashing.org</email> | ||
18 | <email>mporter@mvista.com</email> | ||
19 | </address> | ||
20 | </affiliation> | ||
21 | </author> | ||
22 | </authorgroup> | ||
23 | |||
24 | <copyright> | ||
25 | <year>2005</year> | ||
26 | <holder>MontaVista Software, Inc.</holder> | ||
27 | </copyright> | ||
28 | |||
29 | <legalnotice> | ||
30 | <para> | ||
31 | This documentation is free software; you can redistribute | ||
32 | it and/or modify it under the terms of the GNU General Public | ||
33 | License version 2 as published by the Free Software Foundation. | ||
34 | </para> | ||
35 | |||
36 | <para> | ||
37 | This program is distributed in the hope that it will be | ||
38 | useful, but WITHOUT ANY WARRANTY; without even the implied | ||
39 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
40 | See the GNU General Public License for more details. | ||
41 | </para> | ||
42 | |||
43 | <para> | ||
44 | You should have received a copy of the GNU General Public | ||
45 | License along with this program; if not, write to the Free | ||
46 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, | ||
47 | MA 02111-1307 USA | ||
48 | </para> | ||
49 | |||
50 | <para> | ||
51 | For more details see the file COPYING in the source | ||
52 | distribution of Linux. | ||
53 | </para> | ||
54 | </legalnotice> | ||
55 | </bookinfo> | ||
56 | |||
57 | <toc></toc> | ||
58 | |||
59 | <chapter id="intro"> | ||
60 | <title>Introduction</title> | ||
61 | <para> | ||
62 | RapidIO is a high speed switched fabric interconnect with | ||
63 | features aimed at the embedded market. RapidIO provides | ||
64 | support for memory-mapped I/O as well as message-based | ||
65 | transactions over the switched fabric network. RapidIO has | ||
66 | a standardized discovery mechanism not unlike the PCI bus | ||
67 | standard that allows simple detection of devices in a | ||
68 | network. | ||
69 | </para> | ||
70 | <para> | ||
71 | This documentation is provided for developers intending | ||
72 | to support RapidIO on new architectures, write new drivers, | ||
73 | or to understand the subsystem internals. | ||
74 | </para> | ||
75 | </chapter> | ||
76 | |||
77 | <chapter id="bugs"> | ||
78 | <title>Known Bugs and Limitations</title> | ||
79 | |||
80 | <sect1> | ||
81 | <title>Bugs</title> | ||
82 | <para>None. ;)</para> | ||
83 | </sect1> | ||
84 | <sect1> | ||
85 | <title>Limitations</title> | ||
86 | <para> | ||
87 | <orderedlist> | ||
88 | <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem> | ||
89 | <listitem><para>Multiple host enumeration is not supported</para></listitem> | ||
90 | </orderedlist> | ||
91 | </para> | ||
92 | </sect1> | ||
93 | </chapter> | ||
94 | |||
95 | <chapter id="drivers"> | ||
96 | <title>RapidIO driver interface</title> | ||
97 | <para> | ||
98 | Drivers are provided a set of calls in order | ||
99 | to interface with the subsystem to gather info | ||
100 | on devices, request/map memory region resources, | ||
101 | and manage mailboxes/doorbells. | ||
102 | </para> | ||
103 | <sect1> | ||
104 | <title>Functions</title> | ||
105 | !Iinclude/linux/rio_drv.h | ||
106 | !Edrivers/rapidio/rio-driver.c | ||
107 | !Edrivers/rapidio/rio.c | ||
108 | </sect1> | ||
109 | </chapter> | ||
110 | |||
111 | <chapter id="internals"> | ||
112 | <title>Internals</title> | ||
113 | |||
114 | <para> | ||
115 | This chapter contains the autogenerated documentation of the RapidIO | ||
116 | subsystem. | ||
117 | </para> | ||
118 | |||
119 | <sect1><title>Structures</title> | ||
120 | !Iinclude/linux/rio.h | ||
121 | </sect1> | ||
122 | <sect1><title>Enumeration and Discovery</title> | ||
123 | !Idrivers/rapidio/rio-scan.c | ||
124 | </sect1> | ||
125 | <sect1><title>Driver functionality</title> | ||
126 | !Idrivers/rapidio/rio.c | ||
127 | !Idrivers/rapidio/rio-access.c | ||
128 | </sect1> | ||
129 | <sect1><title>Device model support</title> | ||
130 | !Idrivers/rapidio/rio-driver.c | ||
131 | </sect1> | ||
132 | <sect1><title>Sysfs support</title> | ||
133 | !Idrivers/rapidio/rio-sysfs.c | ||
134 | </sect1> | ||
135 | <sect1><title>PPC32 support</title> | ||
136 | !Iarch/ppc/kernel/rio.c | ||
137 | !Earch/ppc/syslib/ppc85xx_rio.c | ||
138 | !Iarch/ppc/syslib/ppc85xx_rio.c | ||
139 | </sect1> | ||
140 | </chapter> | ||
141 | |||
142 | <chapter id="credits"> | ||
143 | <title>Credits</title> | ||
144 | <para> | ||
145 | The following people have contributed to the RapidIO | ||
146 | subsystem directly or indirectly: | ||
147 | <orderedlist> | ||
148 | <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> | ||
149 | <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem> | ||
150 | <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem> | ||
151 | </orderedlist> | ||
152 | </para> | ||
153 | <para> | ||
154 | The following people have contributed to this document: | ||
155 | <orderedlist> | ||
156 | <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> | ||
157 | </orderedlist> | ||
158 | </para> | ||
159 | </chapter> | ||
160 | </book> | ||
diff --git a/Documentation/MSI-HOWTO.txt b/Documentation/MSI-HOWTO.txt index 63edc5f847c4..3ec6c720b016 100644 --- a/Documentation/MSI-HOWTO.txt +++ b/Documentation/MSI-HOWTO.txt | |||
@@ -10,14 +10,22 @@ | |||
10 | This guide describes the basics of Message Signaled Interrupts (MSI), | 10 | This guide describes the basics of Message Signaled Interrupts (MSI), |
11 | the advantages of using MSI over traditional interrupt mechanisms, | 11 | the advantages of using MSI over traditional interrupt mechanisms, |
12 | and how to enable your driver to use MSI or MSI-X. Also included is | 12 | and how to enable your driver to use MSI or MSI-X. Also included is |
13 | a Frequently Asked Questions. | 13 | a Frequently Asked Questions (FAQ) section. |
14 | |||
15 | 1.1 Terminology | ||
16 | |||
17 | PCI devices can be single-function or multi-function. In either case, | ||
18 | when this text talks about enabling or disabling MSI on a "device | ||
19 | function," it is referring to one specific PCI device and function and | ||
20 | not to all functions on a PCI device (unless the PCI device has only | ||
21 | one function). | ||
14 | 22 | ||
15 | 2. Copyright 2003 Intel Corporation | 23 | 2. Copyright 2003 Intel Corporation |
16 | 24 | ||
17 | 3. What is MSI/MSI-X? | 25 | 3. What is MSI/MSI-X? |
18 | 26 | ||
19 | Message Signaled Interrupt (MSI), as described in the PCI Local Bus | 27 | Message Signaled Interrupt (MSI), as described in the PCI Local Bus |
20 | Specification Revision 2.3 or latest, is an optional feature, and a | 28 | Specification Revision 2.3 or later, is an optional feature, and a |
21 | required feature for PCI Express devices. MSI enables a device function | 29 | required feature for PCI Express devices. MSI enables a device function |
22 | to request service by sending an Inbound Memory Write on its PCI bus to | 30 | to request service by sending an Inbound Memory Write on its PCI bus to |
23 | the FSB as a Message Signaled Interrupt transaction. Because MSI is | 31 | the FSB as a Message Signaled Interrupt transaction. Because MSI is |
@@ -27,7 +35,7 @@ supported. | |||
27 | 35 | ||
28 | A PCI device that supports MSI must also support pin IRQ assertion | 36 | A PCI device that supports MSI must also support pin IRQ assertion |
29 | interrupt mechanism to provide backward compatibility for systems that | 37 | interrupt mechanism to provide backward compatibility for systems that |
30 | do not support MSI. In Systems, which support MSI, the bus driver is | 38 | do not support MSI. In systems which support MSI, the bus driver is |
31 | responsible for initializing the message address and message data of | 39 | responsible for initializing the message address and message data of |
32 | the device function's MSI/MSI-X capability structure during device | 40 | the device function's MSI/MSI-X capability structure during device |
33 | initial configuration. | 41 | initial configuration. |
@@ -61,17 +69,17 @@ over the MSI capability structure as described below. | |||
61 | 69 | ||
62 | - MSI and MSI-X both support per-vector masking. Per-vector | 70 | - MSI and MSI-X both support per-vector masking. Per-vector |
63 | masking is an optional extension of MSI but a required | 71 | masking is an optional extension of MSI but a required |
64 | feature for MSI-X. Per-vector masking provides the kernel | 72 | feature for MSI-X. Per-vector masking provides the kernel the |
65 | the ability to mask/unmask MSI when servicing its software | 73 | ability to mask/unmask a single MSI while running its |
66 | interrupt service routing handler. If per-vector masking is | 74 | interrupt service routine. If per-vector masking is |
67 | not supported, then the device driver should provide the | 75 | not supported, then the device driver should provide the |
68 | hardware/software synchronization to ensure that the device | 76 | hardware/software synchronization to ensure that the device |
69 | generates MSI when the driver wants it to do so. | 77 | generates MSI when the driver wants it to do so. |
70 | 78 | ||
71 | 4. Why use MSI? | 79 | 4. Why use MSI? |
72 | 80 | ||
73 | As a benefit the simplification of board design, MSI allows board | 81 | As a benefit to the simplification of board design, MSI allows board |
74 | designers to remove out of band interrupt routing. MSI is another | 82 | designers to remove out-of-band interrupt routing. MSI is another |
75 | step towards a legacy-free environment. | 83 | step towards a legacy-free environment. |
76 | 84 | ||
77 | Due to increasing pressure on chipset and processor packages to | 85 | Due to increasing pressure on chipset and processor packages to |
@@ -87,7 +95,7 @@ support. As a result, the PCI Express technology requires MSI | |||
87 | support for better interrupt performance. | 95 | support for better interrupt performance. |
88 | 96 | ||
89 | Using MSI enables the device functions to support two or more | 97 | Using MSI enables the device functions to support two or more |
90 | vectors, which can be configured to target different CPU's to | 98 | vectors, which can be configured to target different CPUs to |
91 | increase scalability. | 99 | increase scalability. |
92 | 100 | ||
93 | 5. Configuring a driver to use MSI/MSI-X | 101 | 5. Configuring a driver to use MSI/MSI-X |
@@ -119,13 +127,13 @@ pci_enable_msi() explicitly. | |||
119 | 127 | ||
120 | int pci_enable_msi(struct pci_dev *dev) | 128 | int pci_enable_msi(struct pci_dev *dev) |
121 | 129 | ||
122 | With this new API, any existing device driver, which like to have | 130 | With this new API, a device driver that wants to have MSI |
123 | MSI enabled on its device function, must call this API to enable MSI | 131 | enabled on its device function must call this API to enable MSI. |
124 | A successful call will initialize the MSI capability structure | 132 | A successful call will initialize the MSI capability structure |
125 | with ONE vector, regardless of whether a device function is | 133 | with ONE vector, regardless of whether a device function is |
126 | capable of supporting multiple messages. This vector replaces the | 134 | capable of supporting multiple messages. This vector replaces the |
127 | pre-assigned dev->irq with a new MSI vector. To avoid the conflict | 135 | pre-assigned dev->irq with a new MSI vector. To avoid a conflict |
128 | of new assigned vector with existing pre-assigned vector requires | 136 | of the new assigned vector with existing pre-assigned vector requires |
129 | a device driver to call this API before calling request_irq(). | 137 | a device driver to call this API before calling request_irq(). |
130 | 138 | ||
131 | 5.2.2 API pci_disable_msi | 139 | 5.2.2 API pci_disable_msi |
@@ -137,14 +145,14 @@ when a device driver is unloading. This API restores dev->irq with | |||
137 | the pre-assigned IOAPIC vector and switches a device's interrupt | 145 | the pre-assigned IOAPIC vector and switches a device's interrupt |
138 | mode to PCI pin-irq assertion/INTx emulation mode. | 146 | mode to PCI pin-irq assertion/INTx emulation mode. |
139 | 147 | ||
140 | Note that a device driver should always call free_irq() on MSI vector | 148 | Note that a device driver should always call free_irq() on the MSI vector |
141 | it has done request_irq() on before calling this API. Failure to do | 149 | that it has done request_irq() on before calling this API. Failure to do |
142 | so results a BUG_ON() and a device will be left with MSI enabled and | 150 | so results in a BUG_ON() and a device will be left with MSI enabled and |
143 | leaks its vector. | 151 | leaks its vector. |
144 | 152 | ||
145 | 5.2.3 MSI mode vs. legacy mode diagram | 153 | 5.2.3 MSI mode vs. legacy mode diagram |
146 | 154 | ||
147 | The below diagram shows the events, which switches the interrupt | 155 | The below diagram shows the events which switch the interrupt |
148 | mode on the MSI-capable device function between MSI mode and | 156 | mode on the MSI-capable device function between MSI mode and |
149 | PIN-IRQ assertion mode. | 157 | PIN-IRQ assertion mode. |
150 | 158 | ||
@@ -155,9 +163,9 @@ PIN-IRQ assertion mode. | |||
155 | ------------ pci_disable_msi ------------------------ | 163 | ------------ pci_disable_msi ------------------------ |
156 | 164 | ||
157 | 165 | ||
158 | Figure 1.0 MSI Mode vs. Legacy Mode | 166 | Figure 1. MSI Mode vs. Legacy Mode |
159 | 167 | ||
160 | In Figure 1.0, a device operates by default in legacy mode. Legacy | 168 | In Figure 1, a device operates by default in legacy mode. Legacy |
161 | in this context means PCI pin-irq assertion or PCI-Express INTx | 169 | in this context means PCI pin-irq assertion or PCI-Express INTx |
162 | emulation. A successful MSI request (using pci_enable_msi()) switches | 170 | emulation. A successful MSI request (using pci_enable_msi()) switches |
163 | a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector | 171 | a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector |
@@ -166,11 +174,11 @@ assigned MSI vector will replace dev->irq. | |||
166 | 174 | ||
167 | To return back to its default mode, a device driver should always call | 175 | To return back to its default mode, a device driver should always call |
168 | pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a | 176 | pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a |
169 | device driver should always call free_irq() on MSI vector it has done | 177 | device driver should always call free_irq() on the MSI vector it has |
170 | request_irq() on before calling pci_disable_msi(). Failure to do so | 178 | done request_irq() on before calling pci_disable_msi(). Failure to do |
171 | results a BUG_ON() and a device will be left with MSI enabled and | 179 | so results in a BUG_ON() and a device will be left with MSI enabled and |
172 | leaks its vector. Otherwise, the PCI subsystem restores a device's | 180 | leaks its vector. Otherwise, the PCI subsystem restores a device's |
173 | dev->irq with a pre-assigned IOAPIC vector and marks released | 181 | dev->irq with a pre-assigned IOAPIC vector and marks the released |
174 | MSI vector as unused. | 182 | MSI vector as unused. |
175 | 183 | ||
176 | Once being marked as unused, there is no guarantee that the PCI | 184 | Once being marked as unused, there is no guarantee that the PCI |
@@ -178,8 +186,8 @@ subsystem will reserve this MSI vector for a device. Depending on | |||
178 | the availability of current PCI vector resources and the number of | 186 | the availability of current PCI vector resources and the number of |
179 | MSI/MSI-X requests from other drivers, this MSI may be re-assigned. | 187 | MSI/MSI-X requests from other drivers, this MSI may be re-assigned. |
180 | 188 | ||
181 | For the case where the PCI subsystem re-assigned this MSI vector | 189 | For the case where the PCI subsystem re-assigns this MSI vector to |
182 | another driver, a request to switching back to MSI mode may result | 190 | another driver, a request to switch back to MSI mode may result |
183 | in being assigned a different MSI vector or a failure if no more | 191 | in being assigned a different MSI vector or a failure if no more |
184 | vectors are available. | 192 | vectors are available. |
185 | 193 | ||
@@ -208,12 +216,12 @@ Unlike the function pci_enable_msi(), the function pci_enable_msix() | |||
208 | does not replace the pre-assigned IOAPIC dev->irq with a new MSI | 216 | does not replace the pre-assigned IOAPIC dev->irq with a new MSI |
209 | vector because the PCI subsystem writes the 1:1 vector-to-entry mapping | 217 | vector because the PCI subsystem writes the 1:1 vector-to-entry mapping |
210 | into the field vector of each element contained in a second argument. | 218 | into the field vector of each element contained in a second argument. |
211 | Note that the pre-assigned IO-APIC dev->irq is valid only if the device | 219 | Note that the pre-assigned IOAPIC dev->irq is valid only if the device |
212 | operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt of | 220 | operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at |
213 | using dev->irq by the device driver to request for interrupt service | 221 | using dev->irq by the device driver to request for interrupt service |
214 | may result in unpredictable behavior. | 222 | may result in unpredictable behavior. |
215 | 223 | ||
216 | For each MSI-X vector granted, a device driver is responsible to call | 224 | For each MSI-X vector granted, a device driver is responsible for calling |
217 | other functions like request_irq(), enable_irq(), etc. to enable | 225 | other functions like request_irq(), enable_irq(), etc. to enable |
218 | this vector with its corresponding interrupt service handler. It is | 226 | this vector with its corresponding interrupt service handler. It is |
219 | a device driver's choice to assign all vectors with the same | 227 | a device driver's choice to assign all vectors with the same |
@@ -224,13 +232,13 @@ service handler. | |||
224 | 232 | ||
225 | The PCI 3.0 specification has implementation notes that MMIO address | 233 | The PCI 3.0 specification has implementation notes that MMIO address |
226 | space for a device's MSI-X structure should be isolated so that the | 234 | space for a device's MSI-X structure should be isolated so that the |
227 | software system can set different page for controlling accesses to | 235 | software system can set different pages for controlling accesses to the |
228 | the MSI-X structure. The implementation of MSI patch requires the PCI | 236 | MSI-X structure. The implementation of MSI support requires the PCI |
229 | subsystem, not a device driver, to maintain full control of the MSI-X | 237 | subsystem, not a device driver, to maintain full control of the MSI-X |
230 | table/MSI-X PBA and MMIO address space of the MSI-X table/MSI-X PBA. | 238 | table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X |
231 | A device driver is prohibited from requesting the MMIO address space | 239 | table/MSI-X PBA. A device driver is prohibited from requesting the MMIO |
232 | of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem will fail | 240 | address space of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem |
233 | enabling MSI-X on its hardware device when it calls the function | 241 | will fail enabling MSI-X on its hardware device when it calls the function |
234 | pci_enable_msix(). | 242 | pci_enable_msix(). |
235 | 243 | ||
236 | 5.3.2 Handling MSI-X allocation | 244 | 5.3.2 Handling MSI-X allocation |
@@ -274,9 +282,9 @@ For the case where fewer MSI-X vectors are allocated to a function | |||
274 | than requested, the function pci_enable_msix() will return the | 282 | than requested, the function pci_enable_msix() will return the |
275 | maximum number of MSI-X vectors available to the caller. A device | 283 | maximum number of MSI-X vectors available to the caller. A device |
276 | driver may re-send its request with fewer or equal vectors indicated | 284 | driver may re-send its request with fewer or equal vectors indicated |
277 | in a return. For example, if a device driver requests 5 vectors, but | 285 | in the return. For example, if a device driver requests 5 vectors, but |
278 | the number of available vectors is 3 vectors, a value of 3 will be a | 286 | the number of available vectors is 3 vectors, a value of 3 will be |
279 | return as a result of pci_enable_msix() call. A function could be | 287 | returned as a result of pci_enable_msix() call. A function could be |
280 | designed for its driver to use only 3 MSI-X table entries as | 288 | designed for its driver to use only 3 MSI-X table entries as |
281 | different combinations as ABC--, A-B-C, A--CB, etc. Note that this | 289 | different combinations as ABC--, A-B-C, A--CB, etc. Note that this |
282 | patch does not support multiple entries with the same vector. Such | 290 | patch does not support multiple entries with the same vector. Such |
@@ -285,49 +293,46 @@ as ABBCC, AABCC, BCCBA, etc will result as a failure by the function | |||
285 | pci_enable_msix(). Below are the reasons why supporting multiple | 293 | pci_enable_msix(). Below are the reasons why supporting multiple |
286 | entries with the same vector is an undesirable solution. | 294 | entries with the same vector is an undesirable solution. |
287 | 295 | ||
288 | - The PCI subsystem can not determine which entry, which | 296 | - The PCI subsystem cannot determine the entry that |
289 | generated the message, to mask/unmask MSI while handling | 297 | generated the message to mask/unmask MSI while handling |
290 | software driver ISR. Attempting to walk through all MSI-X | 298 | software driver ISR. Attempting to walk through all MSI-X |
291 | table entries (2048 max) to mask/unmask any match vector | 299 | table entries (2048 max) to mask/unmask any match vector |
292 | is an undesirable solution. | 300 | is an undesirable solution. |
293 | 301 | ||
294 | - Walk through all MSI-X table entries (2048 max) to handle | 302 | - Walking through all MSI-X table entries (2048 max) to handle |
295 | SMP affinity of any match vector is an undesirable solution. | 303 | SMP affinity of any match vector is an undesirable solution. |
296 | 304 | ||
297 | 5.3.4 API pci_enable_msix | 305 | 5.3.4 API pci_enable_msix |
298 | 306 | ||
299 | int pci_enable_msix(struct pci_dev *dev, u32 *entries, int nvec) | 307 | int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) |
300 | 308 | ||
301 | This API enables a device driver to request the PCI subsystem | 309 | This API enables a device driver to request the PCI subsystem |
302 | for enabling MSI-X messages on its hardware device. Depending on | 310 | to enable MSI-X messages on its hardware device. Depending on |
303 | the availability of PCI vector resources, the PCI subsystem enables | 311 | the availability of PCI vector resources, the PCI subsystem enables |
304 | either all or nothing. | 312 | either all or none of the requested vectors. |
305 | 313 | ||
306 | Argument dev points to the device (pci_dev) structure. | 314 | Argument 'dev' points to the device (pci_dev) structure. |
307 | 315 | ||
308 | Argument entries is a pointer of unsigned integer type. The number of | 316 | Argument 'entries' is a pointer to an array of msix_entry structs. |
309 | elements is indicated in argument nvec. The content of each element | 317 | The number of entries is indicated in argument 'nvec'. |
310 | will be mapped to the following struct defined in /driver/pci/msi.h. | 318 | struct msix_entry is defined in /driver/pci/msi.h: |
311 | 319 | ||
312 | struct msix_entry { | 320 | struct msix_entry { |
313 | u16 vector; /* kernel uses to write alloc vector */ | 321 | u16 vector; /* kernel uses to write alloc vector */ |
314 | u16 entry; /* driver uses to specify entry */ | 322 | u16 entry; /* driver uses to specify entry */ |
315 | }; | 323 | }; |
316 | 324 | ||
317 | A device driver is responsible for initializing the field entry of | 325 | A device driver is responsible for initializing the field 'entry' of |
318 | each element with unique entry supported by MSI-X table. Otherwise, | 326 | each element with a unique entry supported by MSI-X table. Otherwise, |
319 | -EINVAL will be returned as a result. A successful return of zero | 327 | -EINVAL will be returned as a result. A successful return of zero |
320 | indicates the PCI subsystem completes initializing each of requested | 328 | indicates the PCI subsystem completed initializing each of the requested |
321 | entries of the MSI-X table with message address and message data. | 329 | entries of the MSI-X table with message address and message data. |
322 | Last but not least, the PCI subsystem will write the 1:1 | 330 | Last but not least, the PCI subsystem will write the 1:1 |
323 | vector-to-entry mapping into the field vector of each element. A | 331 | vector-to-entry mapping into the field 'vector' of each element. A |
324 | device driver is responsible of keeping track of allocated MSI-X | 332 | device driver is responsible for keeping track of allocated MSI-X |
325 | vectors in its internal data structure. | 333 | vectors in its internal data structure. |
326 | 334 | ||
327 | Argument nvec is an integer indicating the number of messages | 335 | A return of zero indicates that the number of MSI-X vectors was |
328 | requested. | ||
329 | |||
330 | A return of zero indicates that the number of MSI-X vectors is | ||
331 | successfully allocated. A return of greater than zero indicates | 336 | successfully allocated. A return of greater than zero indicates |
332 | MSI-X vector shortage. Or a return of less than zero indicates | 337 | MSI-X vector shortage. Or a return of less than zero indicates |
333 | a failure. This failure may be a result of duplicate entries | 338 | a failure. This failure may be a result of duplicate entries |
@@ -341,12 +346,12 @@ void pci_disable_msix(struct pci_dev *dev) | |||
341 | This API should always be used to undo the effect of pci_enable_msix() | 346 | This API should always be used to undo the effect of pci_enable_msix() |
342 | when a device driver is unloading. Note that a device driver should | 347 | when a device driver is unloading. Note that a device driver should |
343 | always call free_irq() on all MSI-X vectors it has done request_irq() | 348 | always call free_irq() on all MSI-X vectors it has done request_irq() |
344 | on before calling this API. Failure to do so results a BUG_ON() and | 349 | on before calling this API. Failure to do so results in a BUG_ON() and |
345 | a device will be left with MSI-X enabled and leaks its vectors. | 350 | a device will be left with MSI-X enabled and leaks its vectors. |
346 | 351 | ||
347 | 5.3.6 MSI-X mode vs. legacy mode diagram | 352 | 5.3.6 MSI-X mode vs. legacy mode diagram |
348 | 353 | ||
349 | The below diagram shows the events, which switches the interrupt | 354 | The below diagram shows the events which switch the interrupt |
350 | mode on the MSI-X capable device function between MSI-X mode and | 355 | mode on the MSI-X capable device function between MSI-X mode and |
351 | PIN-IRQ assertion mode (legacy). | 356 | PIN-IRQ assertion mode (legacy). |
352 | 357 | ||
@@ -356,22 +361,22 @@ PIN-IRQ assertion mode (legacy). | |||
356 | | | ===============> | | | 361 | | | ===============> | | |
357 | ------------ pci_disable_msix ------------------------ | 362 | ------------ pci_disable_msix ------------------------ |
358 | 363 | ||
359 | Figure 2.0 MSI-X Mode vs. Legacy Mode | 364 | Figure 2. MSI-X Mode vs. Legacy Mode |
360 | 365 | ||
361 | In Figure 2.0, a device operates by default in legacy mode. A | 366 | In Figure 2, a device operates by default in legacy mode. A |
362 | successful MSI-X request (using pci_enable_msix()) switches a | 367 | successful MSI-X request (using pci_enable_msix()) switches a |
363 | device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector | 368 | device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector |
364 | stored in dev->irq will be saved by the PCI subsystem; however, | 369 | stored in dev->irq will be saved by the PCI subsystem; however, |
365 | unlike MSI mode, the PCI subsystem will not replace dev->irq with | 370 | unlike MSI mode, the PCI subsystem will not replace dev->irq with |
366 | assigned MSI-X vector because the PCI subsystem already writes the 1:1 | 371 | assigned MSI-X vector because the PCI subsystem already writes the 1:1 |
367 | vector-to-entry mapping into the field vector of each element | 372 | vector-to-entry mapping into the field 'vector' of each element |
368 | specified in second argument. | 373 | specified in second argument. |
369 | 374 | ||
370 | To return back to its default mode, a device driver should always call | 375 | To return back to its default mode, a device driver should always call |
371 | pci_disable_msix() to undo the effect of pci_enable_msix(). Note that | 376 | pci_disable_msix() to undo the effect of pci_enable_msix(). Note that |
372 | a device driver should always call free_irq() on all MSI-X vectors it | 377 | a device driver should always call free_irq() on all MSI-X vectors it |
373 | has done request_irq() on before calling pci_disable_msix(). Failure | 378 | has done request_irq() on before calling pci_disable_msix(). Failure |
374 | to do so results a BUG_ON() and a device will be left with MSI-X | 379 | to do so results in a BUG_ON() and a device will be left with MSI-X |
375 | enabled and leaks its vectors. Otherwise, the PCI subsystem switches a | 380 | enabled and leaks its vectors. Otherwise, the PCI subsystem switches a |
376 | device function's interrupt mode from MSI-X mode to legacy mode and | 381 | device function's interrupt mode from MSI-X mode to legacy mode and |
377 | marks all allocated MSI-X vectors as unused. | 382 | marks all allocated MSI-X vectors as unused. |
@@ -383,53 +388,56 @@ MSI/MSI-X requests from other drivers, these MSI-X vectors may be | |||
383 | re-assigned. | 388 | re-assigned. |
384 | 389 | ||
385 | For the case where the PCI subsystem re-assigned these MSI-X vectors | 390 | For the case where the PCI subsystem re-assigned these MSI-X vectors |
386 | to other driver, a request to switching back to MSI-X mode may result | 391 | to other drivers, a request to switch back to MSI-X mode may result |
387 | in being assigned another set of MSI-X vectors or a failure if no | 392 | in being assigned another set of MSI-X vectors or a failure if no |
388 | more vectors are available. | 393 | more vectors are available. |
389 | 394 | ||
390 | 5.4 Handling function implementng both MSI and MSI-X capabilities | 395 | 5.4 Handling function implementing both MSI and MSI-X capabilities |
391 | 396 | ||
392 | For the case where a function implements both MSI and MSI-X | 397 | For the case where a function implements both MSI and MSI-X |
393 | capabilities, the PCI subsystem enables a device to run either in MSI | 398 | capabilities, the PCI subsystem enables a device to run either in MSI |
394 | mode or MSI-X mode but not both. A device driver determines whether it | 399 | mode or MSI-X mode but not both. A device driver determines whether it |
395 | wants MSI or MSI-X enabled on its hardware device. Once a device | 400 | wants MSI or MSI-X enabled on its hardware device. Once a device |
396 | driver requests for MSI, for example, it is prohibited to request for | 401 | driver requests for MSI, for example, it is prohibited from requesting |
397 | MSI-X; in other words, a device driver is not permitted to ping-pong | 402 | MSI-X; in other words, a device driver is not permitted to ping-pong |
398 | between MSI mode and MSI-X mode during run-time. | 403 | between MSI mode and MSI-X mode during run-time. |
399 | 404 | ||
400 | 5.5 Hardware requirements for MSI/MSI-X support | 405 | 5.5 Hardware requirements for MSI/MSI-X support |
406 | |||
401 | MSI/MSI-X support requires support from both system hardware and | 407 | MSI/MSI-X support requires support from both system hardware and |
402 | individual hardware device functions. | 408 | individual hardware device functions. |
403 | 409 | ||
404 | 5.5.1 System hardware support | 410 | 5.5.1 System hardware support |
411 | |||
405 | Since the target of MSI address is the local APIC CPU, enabling | 412 | Since the target of MSI address is the local APIC CPU, enabling |
406 | MSI/MSI-X support in Linux kernel is dependent on whether existing | 413 | MSI/MSI-X support in the Linux kernel is dependent on whether existing |
407 | system hardware supports local APIC. Users should verify their | 414 | system hardware supports local APIC. Users should verify that their |
408 | system whether it runs when CONFIG_X86_LOCAL_APIC=y. | 415 | system supports local APIC operation by testing that it runs when |
416 | CONFIG_X86_LOCAL_APIC=y. | ||
409 | 417 | ||
410 | In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; | 418 | In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; |
411 | however, in UP environment, users must manually set | 419 | however, in UP environment, users must manually set |
412 | CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting | 420 | CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting |
413 | CONFIG_PCI_MSI enables the VECTOR based scheme and | 421 | CONFIG_PCI_MSI enables the VECTOR based scheme and the option for |
414 | the option for MSI-capable device drivers to selectively enable | 422 | MSI-capable device drivers to selectively enable MSI/MSI-X. |
415 | MSI/MSI-X. | ||
416 | 423 | ||
417 | Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X | 424 | Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X |
418 | vector is allocated new during runtime and MSI/MSI-X support does not | 425 | vector is allocated new during runtime and MSI/MSI-X support does not |
419 | depend on BIOS support. This key independency enables MSI/MSI-X | 426 | depend on BIOS support. This key independency enables MSI/MSI-X |
420 | support on future IOxAPIC free platform. | 427 | support on future IOxAPIC free platforms. |
421 | 428 | ||
422 | 5.5.2 Device hardware support | 429 | 5.5.2 Device hardware support |
430 | |||
423 | The hardware device function supports MSI by indicating the | 431 | The hardware device function supports MSI by indicating the |
424 | MSI/MSI-X capability structure on its PCI capability list. By | 432 | MSI/MSI-X capability structure on its PCI capability list. By |
425 | default, this capability structure will not be initialized by | 433 | default, this capability structure will not be initialized by |
426 | the kernel to enable MSI during the system boot. In other words, | 434 | the kernel to enable MSI during the system boot. In other words, |
427 | the device function is running on its default pin assertion mode. | 435 | the device function is running on its default pin assertion mode. |
428 | Note that in many cases the hardware supporting MSI have bugs, | 436 | Note that in many cases the hardware supporting MSI have bugs, |
429 | which may result in system hang. The software driver of specific | 437 | which may result in system hangs. The software driver of specific |
430 | MSI-capable hardware is responsible for whether calling | 438 | MSI-capable hardware is responsible for deciding whether to call |
431 | pci_enable_msi or not. A return of zero indicates the kernel | 439 | pci_enable_msi or not. A return of zero indicates the kernel |
432 | successfully initializes the MSI/MSI-X capability structure of the | 440 | successfully initialized the MSI/MSI-X capability structure of the |
433 | device function. The device function is now running on MSI/MSI-X mode. | 441 | device function. The device function is now running on MSI/MSI-X mode. |
434 | 442 | ||
435 | 5.6 How to tell whether MSI/MSI-X is enabled on device function | 443 | 5.6 How to tell whether MSI/MSI-X is enabled on device function |
@@ -439,10 +447,10 @@ pci_enable_msi()/pci_enable_msix() indicates to a device driver that | |||
439 | its device function is initialized successfully and ready to run in | 447 | its device function is initialized successfully and ready to run in |
440 | MSI/MSI-X mode. | 448 | MSI/MSI-X mode. |
441 | 449 | ||
442 | At the user level, users can use command 'cat /proc/interrupts' | 450 | At the user level, users can use the command 'cat /proc/interrupts' |
443 | to display the vector allocated for a device and its interrupt | 451 | to display the vectors allocated for devices and their interrupt |
444 | MSI/MSI-X mode ("PCI MSI"/"PCI MSIX"). Below shows below MSI mode is | 452 | MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is |
445 | enabled on a SCSI Adaptec 39320D Ultra320. | 453 | enabled on a SCSI Adaptec 39320D Ultra320 controller. |
446 | 454 | ||
447 | CPU0 CPU1 | 455 | CPU0 CPU1 |
448 | 0: 324639 0 IO-APIC-edge timer | 456 | 0: 324639 0 IO-APIC-edge timer |
@@ -453,8 +461,8 @@ enabled on a SCSI Adaptec 39320D Ultra320. | |||
453 | 15: 1 0 IO-APIC-edge ide1 | 461 | 15: 1 0 IO-APIC-edge ide1 |
454 | 169: 0 0 IO-APIC-level uhci-hcd | 462 | 169: 0 0 IO-APIC-level uhci-hcd |
455 | 185: 0 0 IO-APIC-level uhci-hcd | 463 | 185: 0 0 IO-APIC-level uhci-hcd |
456 | 193: 138 10 PCI MSI aic79xx | 464 | 193: 138 10 PCI-MSI aic79xx |
457 | 201: 30 0 PCI MSI aic79xx | 465 | 201: 30 0 PCI-MSI aic79xx |
458 | 225: 30 0 IO-APIC-level aic7xxx | 466 | 225: 30 0 IO-APIC-level aic7xxx |
459 | 233: 30 0 IO-APIC-level aic7xxx | 467 | 233: 30 0 IO-APIC-level aic7xxx |
460 | NMI: 0 0 | 468 | NMI: 0 0 |
@@ -490,8 +498,8 @@ target address set as 0xfeexxxxx, as conformed to PCI | |||
490 | specification 2.3 or latest, then it should work. | 498 | specification 2.3 or latest, then it should work. |
491 | 499 | ||
492 | Q4. From the driver point of view, if the MSI is lost because | 500 | Q4. From the driver point of view, if the MSI is lost because |
493 | of the errors occur during inbound memory write, then it may | 501 | of errors occurring during inbound memory write, then it may |
494 | wait for ever. Is there a mechanism for it to recover? | 502 | wait forever. Is there a mechanism for it to recover? |
495 | 503 | ||
496 | A4. Since the target of the transaction is an inbound memory | 504 | A4. Since the target of the transaction is an inbound memory |
497 | write, all transaction termination conditions (Retry, | 505 | write, all transaction termination conditions (Retry, |
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 354d89c78377..15da16861fa3 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt | |||
@@ -772,8 +772,6 @@ RCU pointer/list traversal: | |||
772 | list_for_each_entry_rcu | 772 | list_for_each_entry_rcu |
773 | list_for_each_continue_rcu (to be deprecated in favor of new | 773 | list_for_each_continue_rcu (to be deprecated in favor of new |
774 | list_for_each_entry_continue_rcu) | 774 | list_for_each_entry_continue_rcu) |
775 | hlist_for_each_rcu (to be deprecated in favor of | ||
776 | hlist_for_each_entry_rcu) | ||
777 | hlist_for_each_entry_rcu | 775 | hlist_for_each_entry_rcu |
778 | 776 | ||
779 | RCU pointer update: | 777 | RCU pointer update: |
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt index dca274ff4005..a5009c8300f3 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.txt | |||
@@ -19,7 +19,6 @@ There are two dm targets available: snapshot and snapshot-origin. | |||
19 | *) snapshot-origin <origin> | 19 | *) snapshot-origin <origin> |
20 | 20 | ||
21 | which will normally have one or more snapshots based on it. | 21 | which will normally have one or more snapshots based on it. |
22 | You must create the snapshot-origin device before you can create snapshots. | ||
23 | Reads will be mapped directly to the backing device. For each write, the | 22 | Reads will be mapped directly to the backing device. For each write, the |
24 | original data will be saved in the <COW device> of each snapshot to keep | 23 | original data will be saved in the <COW device> of each snapshot to keep |
25 | its visible content unchanged, at least until the <COW device> fills up. | 24 | its visible content unchanged, at least until the <COW device> fills up. |
@@ -27,7 +26,7 @@ its visible content unchanged, at least until the <COW device> fills up. | |||
27 | 26 | ||
28 | *) snapshot <origin> <COW device> <persistent?> <chunksize> | 27 | *) snapshot <origin> <COW device> <persistent?> <chunksize> |
29 | 28 | ||
30 | A snapshot is created of the <origin> block device. Changed chunks of | 29 | A snapshot of the <origin> block device is created. Changed chunks of |
31 | <chunksize> sectors will be stored on the <COW device>. Writes will | 30 | <chunksize> sectors will be stored on the <COW device>. Writes will |
32 | only go to the <COW device>. Reads will come from the <COW device> or | 31 | only go to the <COW device>. Reads will come from the <COW device> or |
33 | from <origin> for unchanged data. <COW device> will often be | 32 | from <origin> for unchanged data. <COW device> will often be |
@@ -37,6 +36,8 @@ the amount of free space and expand the <COW device> before it fills up. | |||
37 | 36 | ||
38 | <persistent?> is P (Persistent) or N (Not persistent - will not survive | 37 | <persistent?> is P (Persistent) or N (Not persistent - will not survive |
39 | after reboot). | 38 | after reboot). |
39 | The difference is that for transient snapshots less metadata must be | ||
40 | saved on disk - they can be kept in memory by the kernel. | ||
40 | 41 | ||
41 | 42 | ||
42 | How this is used by LVM2 | 43 | How this is used by LVM2 |
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt index 62db6758d1c1..ee277dd204b0 100644 --- a/Documentation/fb/vesafb.txt +++ b/Documentation/fb/vesafb.txt | |||
@@ -146,10 +146,10 @@ pmipal Use the protected mode interface for palette changes. | |||
146 | 146 | ||
147 | mtrr:n setup memory type range registers for the vesafb framebuffer | 147 | mtrr:n setup memory type range registers for the vesafb framebuffer |
148 | where n: | 148 | where n: |
149 | 0 - disabled (equivalent to nomtrr) | 149 | 0 - disabled (equivalent to nomtrr) (default) |
150 | 1 - uncachable | 150 | 1 - uncachable |
151 | 2 - write-back | 151 | 2 - write-back |
152 | 3 - write-combining (default) | 152 | 3 - write-combining |
153 | 4 - write-through | 153 | 4 - write-through |
154 | 154 | ||
155 | If you see the following in dmesg, choose the type that matches the | 155 | If you see the following in dmesg, choose the type that matches the |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b67189a8d8d4..decdf9917e0d 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -69,6 +69,22 @@ Who: Grant Coady <gcoady@gmail.com> | |||
69 | 69 | ||
70 | --------------------------- | 70 | --------------------------- |
71 | 71 | ||
72 | What: remove EXPORT_SYMBOL(panic_timeout) | ||
73 | When: April 2006 | ||
74 | Files: kernel/panic.c | ||
75 | Why: No modular usage in the kernel. | ||
76 | Who: Adrian Bunk <bunk@stusta.de> | ||
77 | |||
78 | --------------------------- | ||
79 | |||
80 | What: remove EXPORT_SYMBOL(insert_resource) | ||
81 | When: April 2006 | ||
82 | Files: kernel/resource.c | ||
83 | Why: No modular usage in the kernel. | ||
84 | Who: Adrian Bunk <bunk@stusta.de> | ||
85 | |||
86 | --------------------------- | ||
87 | |||
72 | What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) | 88 | What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) |
73 | When: November 2005 | 89 | When: November 2005 |
74 | Files: drivers/pcmcia/: pcmcia_ioctl.c | 90 | Files: drivers/pcmcia/: pcmcia_ioctl.c |
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt new file mode 100644 index 000000000000..4c0c575a4012 --- /dev/null +++ b/Documentation/filesystems/dentry-locking.txt | |||
@@ -0,0 +1,173 @@ | |||
1 | RCU-based dcache locking model | ||
2 | ============================== | ||
3 | |||
4 | On many workloads, the most common operation on dcache is to look up a | ||
5 | dentry, given a parent dentry and the name of the child. Typically, | ||
6 | for every open(), stat() etc., the dentry corresponding to the | ||
7 | pathname will be looked up by walking the tree starting with the first | ||
8 | component of the pathname and using that dentry along with the next | ||
9 | component to look up the next level and so on. Since it is a frequent | ||
10 | operation for workloads like multiuser environments and web servers, | ||
11 | it is important to optimize this path. | ||
12 | |||
13 | Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in | ||
14 | every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
15 | algorithm changed this by holding the dcache_lock at the beginning and | ||
16 | walking as many cached path component dentries as possible. This | ||
17 | significantly decreases the number of acquisitions of | ||
18 | dcache_lock. However it also increases the lock hold time | ||
19 | significantly and affects performance in large SMP machines. Since | ||
20 | 2.5.62 kernel, dcache has been using a new locking model that uses RCU | ||
21 | to make dcache look-up lock-free. | ||
22 | |||
23 | The current dcache locking model is not very different from the | ||
24 | existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock | ||
25 | protected the hash chain, d_child, d_alias, d_lru lists as well as | ||
26 | d_inode and several other things like mount look-up. RCU-based changes | ||
27 | affect only the way the hash chain is protected. For everything else | ||
28 | the dcache_lock must be taken for both traversing as well as | ||
29 | updating. The hash chain updates too take the dcache_lock. The | ||
30 | significant change is the way d_lookup traverses the hash chain, it | ||
31 | doesn't acquire the dcache_lock for this and relies on RCU to ensure | ||
32 | that the dentry has not been *freed*. | ||
33 | |||
34 | |||
35 | Dcache locking details | ||
36 | ====================== | ||
37 | |||
38 | For many multi-user workloads, open() and stat() on files are very | ||
39 | frequently occurring operations. Both involve walking of path names to | ||
40 | find the dentry corresponding to the concerned file. In 2.4 kernel, | ||
41 | dcache_lock was held during look-up of each path component. Contention | ||
42 | and cache-line bouncing of this global lock caused significant | ||
43 | scalability problems. With the introduction of RCU in Linux kernel, | ||
44 | this was worked around by making the look-up of path components during | ||
45 | path walking lock-free. | ||
46 | |||
47 | |||
48 | Safe lock-free look-up of dcache hash table | ||
49 | =========================================== | ||
50 | |||
51 | Dcache is a complex data structure with the hash table entries also | ||
52 | linked together in other lists. In 2.4 kernel, dcache_lock protected | ||
53 | all the lists. We applied RCU only on hash chain walking. The rest of | ||
54 | the lists are still protected by dcache_lock. Some of the important | ||
55 | changes are : | ||
56 | |||
57 | 1. The deletion from hash chain is done using hlist_del_rcu() macro | ||
58 | which doesn't initialize next pointer of the deleted dentry and | ||
59 | this allows us to walk safely lock-free while a deletion is | ||
60 | happening. | ||
61 | |||
62 | 2. Insertion of a dentry into the hash table is done using | ||
63 | hlist_add_head_rcu() which takes care of ordering the writes - the | ||
64 | writes to the dentry must be visible before the dentry is | ||
65 | inserted. This works in conjunction with hlist_for_each_rcu() while | ||
66 | walking the hash chain. The only requirement is that all | ||
67 | initialization to the dentry must be done before | ||
68 | hlist_add_head_rcu() since we don't have dcache_lock protection | ||
69 | while traversing the hash chain. This isn't different from the | ||
70 | existing code. | ||
71 | |||
72 | 3. The dentry looked up without holding dcache_lock cannot be | ||
73 | returned for walking if it is unhashed. It then may have a NULL | ||
74 | d_inode or other bogosity since RCU doesn't protect the other | ||
75 | fields in the dentry. We therefore use a flag DCACHE_UNHASHED to | ||
76 | indicate unhashed dentries and use this in conjunction with a | ||
77 | per-dentry lock (d_lock). Once looked up without the dcache_lock, | ||
78 | we acquire the per-dentry lock (d_lock) and check if the dentry is | ||
79 | unhashed. If so, the look-up is failed. If not, the reference count | ||
80 | of the dentry is increased and the dentry is returned. | ||
81 | |||
82 | 4. Once a dentry is looked up, it must be ensured during the path walk | ||
83 | for that component it doesn't go away. In pre-2.5.10 code, this was | ||
84 | done holding a reference to the dentry. dcache_rcu does the same. | ||
85 | In some sense, dcache_rcu path walking looks like the pre-2.5.10 | ||
86 | version. | ||
87 | |||
88 | 5. All dentry hash chain updates must take the dcache_lock as well as | ||
89 | the per-dentry lock in that order. dput() does this to ensure that | ||
90 | a dentry that has just been looked up in another CPU doesn't get | ||
91 | deleted before dget() can be done on it. | ||
92 | |||
93 | 6. There are several ways to do reference counting of RCU protected | ||
94 | objects. One such example is in ipv4 route cache where deferred | ||
95 | freeing (using call_rcu()) is done as soon as the reference count | ||
96 | goes to zero. This cannot be done in the case of dentries because | ||
97 | tearing down of dentries requires blocking (dentry_iput()) which | ||
98 | isn't supported from RCU callbacks. Instead, tearing down of | ||
99 | dentries happens synchronously in dput(), but actual freeing happens | ||
100 | later when RCU grace period is over. This allows safe lock-free | ||
101 | walking of the hash chains, but a matched dentry may have been | ||
102 | partially torn down. The checking of DCACHE_UNHASHED flag with | ||
103 | d_lock held detects such dentries and prevents them from being | ||
104 | returned from look-up. | ||
105 | |||
106 | |||
107 | Maintaining POSIX rename semantics | ||
108 | ================================== | ||
109 | |||
110 | Since look-up of dentries is lock-free, it can race against a | ||
111 | concurrent rename operation. For example, during rename of file A to | ||
112 | B, look-up of either A or B must succeed. So, if look-up of B happens | ||
113 | after A has been removed from the hash chain but not added to the new | ||
114 | hash chain, it may fail. Also, a comparison while the name is being | ||
115 | written concurrently by a rename may result in false positive matches | ||
116 | violating rename semantics. Issues related to race with rename are | ||
117 | handled as described below : | ||
118 | |||
119 | 1. Look-up can be done in two ways - d_lookup() which is safe from | ||
120 | simultaneous renames and __d_lookup() which is not. If | ||
121 | __d_lookup() fails, it must be followed up by a d_lookup() to | ||
122 | correctly determine whether a dentry is in the hash table or | ||
123 | not. d_lookup() protects look-ups using a sequence lock | ||
124 | (rename_lock). | ||
125 | |||
126 | 2. The name associated with a dentry (d_name) may be changed if a | ||
127 | rename is allowed to happen simultaneously. To avoid memcmp() in | ||
128 | __d_lookup() going out of bounds due to a rename and false positive | ||
129 | comparison, the name comparison is done while holding the | ||
130 | per-dentry lock. This prevents concurrent renames during this | ||
131 | operation. | ||
132 | |||
133 | 3. Hash table walking during look-up may move to a different bucket as | ||
134 | the current dentry is moved to a different bucket due to rename. | ||
135 | But we use hlists in dcache hash table and they are | ||
136 | null-terminated. So, even if a dentry moves to a different bucket, | ||
137 | hash chain walk will terminate. [with a list_head list, it may not | ||
138 | since termination is when the list_head in the original bucket is | ||
139 | reached]. Since we redo the d_parent check and compare name while | ||
140 | holding d_lock, lock-free look-up will not race against d_move(). | ||
141 | |||
142 | 4. There can be a theoretical race when a dentry keeps coming back to | ||
143 | original bucket due to double moves. Due to this look-up may | ||
144 | consider that it has never moved and can end up in an infinite loop. | ||
145 | But this is not any worse than the theoretical livelocks we already | ||
146 | have in the kernel. | ||
147 | |||
148 | |||
149 | Important guidelines for filesystem developers related to dcache_rcu | ||
150 | ==================================================================== | ||
151 | |||
152 | 1. Existing dcache interfaces (pre-2.5.62) exported to filesystem | ||
153 | don't change. Only dcache internal implementation changes. However | ||
154 | filesystems *must not* delete from the dentry hash chains directly | ||
155 | using the list macros as was allowed earlier. They must use dcache | ||
156 | APIs like d_drop() or __d_drop() depending on the situation. | ||
157 | |||
158 | 2. d_flags is now protected by a per-dentry lock (d_lock). All access | ||
159 | to d_flags must be protected by it. | ||
160 | |||
161 | 3. For a hashed dentry, checking of d_count needs to be protected by | ||
162 | d_lock. | ||
163 | |||
164 | |||
165 | Papers and other documentation on dcache locking | ||
166 | ================================================ | ||
167 | |||
168 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
169 | |||
170 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
171 | |||
172 | |||
173 | |||
diff --git a/Documentation/filesystems/devfs/README b/Documentation/filesystems/devfs/README index 54366ecc241f..aabfba24bc2e 100644 --- a/Documentation/filesystems/devfs/README +++ b/Documentation/filesystems/devfs/README | |||
@@ -1812,11 +1812,6 @@ it may overflow the messages buffer, but try to get as much of it as | |||
1812 | you can | 1812 | you can |
1813 | 1813 | ||
1814 | 1814 | ||
1815 | if you get an Oops, run ksymoops to decode it so that the | ||
1816 | names of the offending functions are provided. A non-decoded Oops is | ||
1817 | pretty useless | ||
1818 | |||
1819 | |||
1820 | send a copy of your devfsd configuration file(s) | 1815 | send a copy of your devfsd configuration file(s) |
1821 | 1816 | ||
1822 | send the bug report to me first. | 1817 | send the bug report to me first. |
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt new file mode 100644 index 000000000000..b3404a032596 --- /dev/null +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt | |||
@@ -0,0 +1,195 @@ | |||
1 | ramfs, rootfs and initramfs | ||
2 | October 17, 2005 | ||
3 | Rob Landley <rob@landley.net> | ||
4 | ============================= | ||
5 | |||
6 | What is ramfs? | ||
7 | -------------- | ||
8 | |||
9 | Ramfs is a very simple filesystem that exports Linux's disk caching | ||
10 | mechanisms (the page cache and dentry cache) as a dynamically resizable | ||
11 | ram-based filesystem. | ||
12 | |||
13 | Normally all files are cached in memory by Linux. Pages of data read from | ||
14 | backing store (usually the block device the filesystem is mounted on) are kept | ||
15 | around in case they're needed again, but marked as clean (freeable) in case the | ||
16 | Virtual Memory system needs the memory for something else. Similarly, data | ||
17 | written to files is marked clean as soon as it has been written to backing | ||
18 | store, but kept around for caching purposes until the VM reallocates the | ||
19 | memory. A similar mechanism (the dentry cache) greatly speeds up access to | ||
20 | directories. | ||
21 | |||
22 | With ramfs, there is no backing store. Files written into ramfs allocate | ||
23 | dentries and page cache as usual, but there's nowhere to write them to. | ||
24 | This means the pages are never marked clean, so they can't be freed by the | ||
25 | VM when it's looking to recycle memory. | ||
26 | |||
27 | The amount of code required to implement ramfs is tiny, because all the | ||
28 | work is done by the existing Linux caching infrastructure. Basically, | ||
29 | you're mounting the disk cache as a filesystem. Because of this, ramfs is not | ||
30 | an optional component removable via menuconfig, since there would be negligible | ||
31 | space savings. | ||
32 | |||
33 | ramfs and ramdisk: | ||
34 | ------------------ | ||
35 | |||
36 | The older "ram disk" mechanism created a synthetic block device out of | ||
37 | an area of ram and used it as backing store for a filesystem. This block | ||
38 | device was of fixed size, so the filesystem mounted on it was of fixed | ||
39 | size. Using a ram disk also required unnecessarily copying memory from the | ||
40 | fake block device into the page cache (and copying changes back out), as well | ||
41 | as creating and destroying dentries. Plus it needed a filesystem driver | ||
42 | (such as ext2) to format and interpret this data. | ||
43 | |||
44 | Compared to ramfs, this wastes memory (and memory bus bandwidth), creates | ||
45 | unnecessary work for the CPU, and pollutes the CPU caches. (There are tricks | ||
46 | to avoid this copying by playing with the page tables, but they're unpleasantly | ||
47 | complicated and turn out to be about as expensive as the copying anyway.) | ||
48 | More to the point, all the work ramfs is doing has to happen _anyway_, | ||
49 | since all file access goes through the page and dentry caches. The ram | ||
50 | disk is simply unnecessary; ramfs is internally much simpler. | ||
51 | |||
52 | Another reason ramdisks are semi-obsolete is that the introduction of | ||
53 | loopback devices offered a more flexible and convenient way to create | ||
54 | synthetic block devices, now from files instead of from chunks of memory. | ||
55 | See losetup (8) for details. | ||
56 | |||
57 | ramfs and tmpfs: | ||
58 | ---------------- | ||
59 | |||
60 | One downside of ramfs is you can keep writing data into it until you fill | ||
61 | up all memory, and the VM can't free it because the VM thinks that files | ||
62 | should get written to backing store (rather than swap space), but ramfs hasn't | ||
63 | got any backing store. Because of this, only root (or a trusted user) should | ||
64 | be allowed write access to a ramfs mount. | ||
65 | |||
66 | A ramfs derivative called tmpfs was created to add size limits, and the ability | ||
67 | to write the data to swap space. Normal users can be allowed write access to | ||
68 | tmpfs mounts. See Documentation/filesystems/tmpfs.txt for more information. | ||
69 | |||
70 | What is rootfs? | ||
71 | --------------- | ||
72 | |||
73 | Rootfs is a special instance of ramfs, which is always present in 2.6 systems. | ||
74 | (It's used internally as the starting and stopping point for searches of the | ||
75 | kernel's doubly-linked list of mount points.) | ||
76 | |||
77 | Most systems just mount another filesystem over it and ignore it. The | ||
78 | amount of space an empty instance of ramfs takes up is tiny. | ||
79 | |||
80 | What is initramfs? | ||
81 | ------------------ | ||
82 | |||
83 | All 2.6 Linux kernels contain a gzipped "cpio" format archive, which is | ||
84 | extracted into rootfs when the kernel boots up. After extracting, the kernel | ||
85 | checks to see if rootfs contains a file "init", and if so it executes it as PID | ||
86 | 1. If found, this init process is responsible for bringing the system the | ||
87 | rest of the way up, including locating and mounting the real root device (if | ||
88 | any). If rootfs does not contain an init program after the embedded cpio | ||
89 | archive is extracted into it, the kernel will fall through to the older code | ||
90 | to locate and mount a root partition, then exec some variant of /sbin/init | ||
91 | out of that. | ||
92 | |||
93 | All this differs from the old initrd in several ways: | ||
94 | |||
95 | - The old initrd was a separate file, while the initramfs archive is linked | ||
96 | into the linux kernel image. (The directory linux-*/usr is devoted to | ||
97 | generating this archive during the build.) | ||
98 | |||
99 | - The old initrd file was a gzipped filesystem image (in some file format, | ||
100 | such as ext2, that had to be built into the kernel), while the new | ||
101 | initramfs archive is a gzipped cpio archive (like tar only simpler, | ||
102 | see cpio(1) and Documentation/early-userspace/buffer-format.txt). | ||
103 | |||
104 | - The program run by the old initrd (which was called /initrd, not /init) did | ||
105 | some setup and then returned to the kernel, while the init program from | ||
106 | initramfs is not expected to return to the kernel. (If /init needs to hand | ||
107 | off control it can overmount / with a new root device and exec another init | ||
108 | program. See the switch_root utility, below.) | ||
109 | |||
110 | - When switching to another root device, initrd would pivot_root and then | ||
111 | umount the ramdisk. But initramfs is rootfs: you can neither pivot_root | ||
112 | rootfs, nor unmount it. Instead delete everything out of rootfs to | ||
113 | free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs | ||
114 | with the new root (cd /newmount; mount --move . /; chroot .), attach | ||
115 | stdin/stdout/stderr to the new /dev/console, and exec the new init. | ||
116 | |||
117 | Since this is a remarkably persnickety process (and involves deleting | ||
118 | commands before you can run them), the klibc package introduced a helper | ||
119 | program (utils/run_init.c) to do all this for you. Most other packages | ||
120 | (such as busybox) have named this command "switch_root". | ||
121 | |||
122 | Populating initramfs: | ||
123 | --------------------- | ||
124 | |||
125 | The 2.6 kernel build process always creates a gzipped cpio format initramfs | ||
126 | archive and links it into the resulting kernel binary. By default, this | ||
127 | archive is empty (consuming 134 bytes on x86). The config option | ||
128 | CONFIG_INITRAMFS_SOURCE (for some reason buried under devices->block devices | ||
129 | in menuconfig, and living in usr/Kconfig) can be used to specify a source for | ||
130 | the initramfs archive, which will automatically be incorporated into the | ||
131 | resulting binary. This option can point to an existing gzipped cpio archive, a | ||
132 | directory containing files to be archived, or a text file specification such | ||
133 | as the following example: | ||
134 | |||
135 | dir /dev 755 0 0 | ||
136 | nod /dev/console 644 0 0 c 5 1 | ||
137 | nod /dev/loop0 644 0 0 b 7 0 | ||
138 | dir /bin 755 1000 1000 | ||
139 | slink /bin/sh busybox 777 0 0 | ||
140 | file /bin/busybox initramfs/busybox 755 0 0 | ||
141 | dir /proc 755 0 0 | ||
142 | dir /sys 755 0 0 | ||
143 | dir /mnt 755 0 0 | ||
144 | file /init initramfs/init.sh 755 0 0 | ||
145 | |||
146 | One advantage of the text file is that root access is not required to | ||
147 | set permissions or create device nodes in the new archive. (Note that those | ||
148 | two example "file" entries expect to find files named "init.sh" and "busybox" in | ||
149 | a directory called "initramfs", under the linux-2.6.* directory. See | ||
150 | Documentation/early-userspace/README for more details.) | ||
151 | |||
152 | If you don't already understand what shared libraries, devices, and paths | ||
153 | you need to get a minimal root filesystem up and running, here are some | ||
154 | references: | ||
155 | http://www.tldp.org/HOWTO/Bootdisk-HOWTO/ | ||
156 | http://www.tldp.org/HOWTO/From-PowerUp-To-Bash-Prompt-HOWTO.html | ||
157 | http://www.linuxfromscratch.org/lfs/view/stable/ | ||
158 | |||
159 | The "klibc" package (http://www.kernel.org/pub/linux/libs/klibc) is | ||
160 | designed to be a tiny C library to statically link early userspace | ||
161 | code against, along with some related utilities. It is BSD licensed. | ||
162 | |||
163 | I use uClibc (http://www.uclibc.org) and busybox (http://www.busybox.net) | ||
164 | myself. These are LGPL and GPL, respectively. | ||
165 | |||
166 | In theory you could use glibc, but that's not well suited for small embedded | ||
167 | uses like this. (A "hello world" program statically linked against glibc is | ||
168 | over 400k. With uClibc it's 7k. Also note that glibc dlopens libnss to do | ||
169 | name lookups, even when otherwise statically linked.) | ||
170 | |||
171 | Future directions: | ||
172 | ------------------ | ||
173 | |||
174 | Today (2.6.14), initramfs is always compiled in, but not always used. The | ||
175 | kernel falls back to legacy boot code that is reached only if initramfs does | ||
176 | not contain an /init program. The fallback is legacy code, there to ensure a | ||
177 | smooth transition and allow early boot functionality to gradually move to | ||
178 | "early userspace" (i.e. initramfs). | ||
179 | |||
180 | The move to early userspace is necessary because finding and mounting the real | ||
181 | root device is complex. Root partitions can span multiple devices (raid or | ||
182 | separate journal). They can be out on the network (requiring dhcp, setting a | ||
183 | specific mac address, logging into a server, etc). They can live on removable | ||
184 | media, with dynamically allocated major/minor numbers and persistent naming | ||
185 | issues requiring a full udev implementation to sort out. They can be | ||
186 | compressed, encrypted, copy-on-write, loopback mounted, strangely partitioned, | ||
187 | and so on. | ||
188 | |||
189 | This kind of complexity (which inevitably includes policy) is rightly handled | ||
190 | in userspace. Both klibc and busybox/uClibc are working on simple initramfs | ||
191 | packages to drop into a kernel build, and when standard solutions are ready | ||
192 | and widely deployed, the kernel's legacy early boot code will become obsolete | ||
193 | and a candidate for the feature removal schedule. | ||
194 | |||
195 | But that's a while off yet. | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index f042c12e0ed2..ee4c0a8b8db7 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -3,7 +3,7 @@ | |||
3 | 3 | ||
4 | Original author: Richard Gooch <rgooch@atnf.csiro.au> | 4 | Original author: Richard Gooch <rgooch@atnf.csiro.au> |
5 | 5 | ||
6 | Last updated on August 25, 2005 | 6 | Last updated on October 28, 2005 |
7 | 7 | ||
8 | Copyright (C) 1999 Richard Gooch | 8 | Copyright (C) 1999 Richard Gooch |
9 | Copyright (C) 2005 Pekka Enberg | 9 | Copyright (C) 2005 Pekka Enberg |
@@ -11,62 +11,61 @@ | |||
11 | This file is released under the GPLv2. | 11 | This file is released under the GPLv2. |
12 | 12 | ||
13 | 13 | ||
14 | What is it? | 14 | Introduction |
15 | =========== | 15 | ============ |
16 | 16 | ||
17 | The Virtual File System (otherwise known as the Virtual Filesystem | 17 | The Virtual File System (also known as the Virtual Filesystem Switch) |
18 | Switch) is the software layer in the kernel that provides the | 18 | is the software layer in the kernel that provides the filesystem |
19 | filesystem interface to userspace programs. It also provides an | 19 | interface to userspace programs. It also provides an abstraction |
20 | abstraction within the kernel which allows different filesystem | 20 | within the kernel which allows different filesystem implementations to |
21 | implementations to coexist. | 21 | coexist. |
22 | 22 | ||
23 | VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so | ||
24 | on are called from a process context. Filesystem locking is described | ||
25 | in the document Documentation/filesystems/Locking. | ||
23 | 26 | ||
24 | A Quick Look At How It Works | ||
25 | ============================ | ||
26 | 27 | ||
27 | In this section I'll briefly describe how things work, before | 28 | Directory Entry Cache (dcache) |
28 | launching into the details. I'll start with describing what happens | 29 | ------------------------------ |
29 | when user programs open and manipulate files, and then look from the | ||
30 | other view which is how a filesystem is supported and subsequently | ||
31 | mounted. | ||
32 | |||
33 | |||
34 | Opening a File | ||
35 | -------------- | ||
36 | |||
37 | The VFS implements the open(2), stat(2), chmod(2) and similar system | ||
38 | calls. The pathname argument is used by the VFS to search through the | ||
39 | directory entry cache (dentry cache or "dcache"). This provides a very | ||
40 | fast look-up mechanism to translate a pathname (filename) into a | ||
41 | specific dentry. | ||
42 | |||
43 | An individual dentry usually has a pointer to an inode. Inodes are the | ||
44 | things that live on disc drives, and can be regular files (you know: | ||
45 | those things that you write data into), directories, FIFOs and other | ||
46 | beasts. Dentries live in RAM and are never saved to disc: they exist | ||
47 | only for performance. Inodes live on disc and are copied into memory | ||
48 | when required. Later any changes are written back to disc. The inode | ||
49 | that lives in RAM is a VFS inode, and it is this which the dentry | ||
50 | points to. A single inode can be pointed to by multiple dentries | ||
51 | (think about hardlinks). | ||
52 | |||
53 | The dcache is meant to be a view into your entire filespace. Unlike | ||
54 | Linus, most of us losers can't fit enough dentries into RAM to cover | ||
55 | all of our filespace, so the dcache has bits missing. In order to | ||
56 | resolve your pathname into a dentry, the VFS may have to resort to | ||
57 | creating dentries along the way, and then loading the inode. This is | ||
58 | done by looking up the inode. | ||
59 | |||
60 | To look up an inode (usually read from disc) requires that the VFS | ||
61 | calls the lookup() method of the parent directory inode. This method | ||
62 | is installed by the specific filesystem implementation that the inode | ||
63 | lives in. There will be more on this later. | ||
64 | 30 | ||
65 | Once the VFS has the required dentry (and hence the inode), we can do | 31 | The VFS implements the open(2), stat(2), chmod(2), and similar system |
66 | all those boring things like open(2) the file, or stat(2) it to peek | 32 | calls. The pathname argument that is passed to them is used by the VFS |
67 | at the inode data. The stat(2) operation is fairly simple: once the | 33 | to search through the directory entry cache (also known as the dentry |
68 | VFS has the dentry, it peeks at the inode data and passes some of it | 34 | cache or dcache). This provides a very fast look-up mechanism to |
69 | back to userspace. | 35 | translate a pathname (filename) into a specific dentry. Dentries live |
36 | in RAM and are never saved to disc: they exist only for performance. | ||
37 | |||
38 | The dentry cache is meant to be a view into your entire filespace. As | ||
39 | most computers cannot fit all dentries in the RAM at the same time, | ||
40 | some bits of the cache are missing. In order to resolve your pathname | ||
41 | into a dentry, the VFS may have to resort to creating dentries along | ||
42 | the way, and then loading the inode. This is done by looking up the | ||
43 | inode. | ||
44 | |||
45 | |||
46 | The Inode Object | ||
47 | ---------------- | ||
48 | |||
49 | An individual dentry usually has a pointer to an inode. Inodes are | ||
50 | filesystem objects such as regular files, directories, FIFOs and other | ||
51 | beasts. They live either on the disc (for block device filesystems) | ||
52 | or in the memory (for pseudo filesystems). Inodes that live on the | ||
53 | disc are copied into the memory when required and changes to the inode | ||
54 | are written back to disc. A single inode can be pointed to by multiple | ||
55 | dentries (hard links, for example, do this). | ||
56 | |||
57 | To look up an inode requires that the VFS calls the lookup() method of | ||
58 | the parent directory inode. This method is installed by the specific | ||
59 | filesystem implementation that the inode lives in. Once the VFS has | ||
60 | the required dentry (and hence the inode), we can do all those boring | ||
61 | things like open(2) the file, or stat(2) it to peek at the inode | ||
62 | data. The stat(2) operation is fairly simple: once the VFS has the | ||
63 | dentry, it peeks at the inode data and passes some of it back to | ||
64 | userspace. | ||
65 | |||
66 | |||
67 | The File Object | ||
68 | --------------- | ||
70 | 69 | ||
71 | Opening a file requires another operation: allocation of a file | 70 | Opening a file requires another operation: allocation of a file |
72 | structure (this is the kernel-side implementation of file | 71 | structure (this is the kernel-side implementation of file |
@@ -74,51 +73,39 @@ descriptors). The freshly allocated file structure is initialized with | |||
74 | a pointer to the dentry and a set of file operation member functions. | 73 | a pointer to the dentry and a set of file operation member functions. |
75 | These are taken from the inode data. The open() file method is then | 74 | These are taken from the inode data. The open() file method is then |
76 | called so the specific filesystem implementation can do its work. You | 75 | called so the specific filesystem implementation can do its work. You |
77 | can see that this is another switch performed by the VFS. | 76 | can see that this is another switch performed by the VFS. The file |
78 | 77 | structure is placed into the file descriptor table for the process. | |
79 | The file structure is placed into the file descriptor table for the | ||
80 | process. | ||
81 | 78 | ||
82 | Reading, writing and closing files (and other assorted VFS operations) | 79 | Reading, writing and closing files (and other assorted VFS operations) |
83 | is done by using the userspace file descriptor to grab the appropriate | 80 | is done by using the userspace file descriptor to grab the appropriate |
84 | file structure, and then calling the required file structure method | 81 | file structure, and then calling the required file structure method to |
85 | function to do whatever is required. | 82 | do whatever is required. For as long as the file is open, it keeps the |
86 | 83 | dentry in use, which in turn means that the VFS inode is still in use. | |
87 | For as long as the file is open, it keeps the dentry "open" (in use), | ||
88 | which in turn means that the VFS inode is still in use. | ||
89 | |||
90 | All VFS system calls (i.e. open(2), stat(2), read(2), write(2), | ||
91 | chmod(2) and so on) are called from a process context. You should | ||
92 | assume that these calls are made without any kernel locks being | ||
93 | held. This means that the processes may be executing the same piece of | ||
94 | filesystem or driver code at the same time, on different | ||
95 | processors. You should ensure that access to shared resources is | ||
96 | protected by appropriate locks. | ||
97 | 84 | ||
98 | 85 | ||
99 | Registering and Mounting a Filesystem | 86 | Registering and Mounting a Filesystem |
100 | ------------------------------------- | 87 | ===================================== |
101 | 88 | ||
102 | If you want to support a new kind of filesystem in the kernel, all you | 89 | To register and unregister a filesystem, use the following API |
103 | need to do is call register_filesystem(). You pass a structure | 90 | functions: |
104 | describing the filesystem implementation (struct file_system_type) | ||
105 | which is then added to an internal table of supported filesystems. You | ||
106 | can do: | ||
107 | 91 | ||
108 | % cat /proc/filesystems | 92 | #include <linux/fs.h> |
109 | 93 | ||
110 | to see what filesystems are currently available on your system. | 94 | extern int register_filesystem(struct file_system_type *); |
95 | extern int unregister_filesystem(struct file_system_type *); | ||
111 | 96 | ||
112 | When a request is made to mount a block device onto a directory in | 97 | The passed struct file_system_type describes your filesystem. When a |
113 | your filespace the VFS will call the appropriate method for the | 98 | request is made to mount a device onto a directory in your filespace, |
114 | specific filesystem. The dentry for the mount point will then be | 99 | the VFS will call the appropriate get_sb() method for the specific |
115 | updated to point to the root inode for the new filesystem. | 100 | filesystem. The dentry for the mount point will then be updated to |
101 | point to the root inode for the new filesystem. | ||
116 | 102 | ||
117 | It's now time to look at things in more detail. | 103 | You can see all filesystems that are registered to the kernel in the |
104 | file /proc/filesystems. | ||
118 | 105 | ||
119 | 106 | ||
120 | struct file_system_type | 107 | struct file_system_type |
121 | ======================= | 108 | ----------------------- |
122 | 109 | ||
123 | This describes the filesystem. As of kernel 2.6.13, the following | 110 | This describes the filesystem. As of kernel 2.6.13, the following |
124 | members are defined: | 111 | members are defined: |
@@ -197,8 +184,14 @@ A fill_super() method implementation has the following arguments: | |||
197 | int silent: whether or not to be silent on error | 184 | int silent: whether or not to be silent on error |
198 | 185 | ||
199 | 186 | ||
187 | The Superblock Object | ||
188 | ===================== | ||
189 | |||
190 | A superblock object represents a mounted filesystem. | ||
191 | |||
192 | |||
200 | struct super_operations | 193 | struct super_operations |
201 | ======================= | 194 | ----------------------- |
202 | 195 | ||
203 | This describes how the VFS can manipulate the superblock of your | 196 | This describes how the VFS can manipulate the superblock of your |
204 | filesystem. As of kernel 2.6.13, the following members are defined: | 197 | filesystem. As of kernel 2.6.13, the following members are defined: |
@@ -286,9 +279,9 @@ or bottom half). | |||
286 | a superblock. The second parameter indicates whether the method | 279 | a superblock. The second parameter indicates whether the method |
287 | should wait until the write out has been completed. Optional. | 280 | should wait until the write out has been completed. Optional. |
288 | 281 | ||
289 | write_super_lockfs: called when VFS is locking a filesystem and forcing | 282 | write_super_lockfs: called when VFS is locking a filesystem and |
290 | it into a consistent state. This function is currently used by the | 283 | forcing it into a consistent state. This method is currently |
291 | Logical Volume Manager (LVM). | 284 | used by the Logical Volume Manager (LVM). |
292 | 285 | ||
293 | unlockfs: called when VFS is unlocking a filesystem and making it writable | 286 | unlockfs: called when VFS is unlocking a filesystem and making it writable |
294 | again. | 287 | again. |
@@ -317,8 +310,14 @@ field. This is a pointer to a "struct inode_operations" which | |||
317 | describes the methods that can be performed on individual inodes. | 310 | describes the methods that can be performed on individual inodes. |
318 | 311 | ||
319 | 312 | ||
313 | The Inode Object | ||
314 | ================ | ||
315 | |||
316 | An inode object represents an object within the filesystem. | ||
317 | |||
318 | |||
320 | struct inode_operations | 319 | struct inode_operations |
321 | ======================= | 320 | ----------------------- |
322 | 321 | ||
323 | This describes how the VFS can manipulate an inode in your | 322 | This describes how the VFS can manipulate an inode in your |
324 | filesystem. As of kernel 2.6.13, the following members are defined: | 323 | filesystem. As of kernel 2.6.13, the following members are defined: |
@@ -394,51 +393,62 @@ otherwise noted. | |||
394 | will probably need to call d_instantiate() just as you would | 393 | will probably need to call d_instantiate() just as you would |
395 | in the create() method | 394 | in the create() method |
396 | 395 | ||
396 | rename: called by the rename(2) system call to rename the object to | ||
397 | have the parent and name given by the second inode and dentry. | ||
398 | |||
397 | readlink: called by the readlink(2) system call. Only required if | 399 | readlink: called by the readlink(2) system call. Only required if |
398 | you want to support reading symbolic links | 400 | you want to support reading symbolic links |
399 | 401 | ||
400 | follow_link: called by the VFS to follow a symbolic link to the | 402 | follow_link: called by the VFS to follow a symbolic link to the |
401 | inode it points to. Only required if you want to support | 403 | inode it points to. Only required if you want to support |
402 | symbolic links. This function returns a void pointer cookie | 404 | symbolic links. This method returns a void pointer cookie |
403 | that is passed to put_link(). | 405 | that is passed to put_link(). |
404 | 406 | ||
405 | put_link: called by the VFS to release resources allocated by | 407 | put_link: called by the VFS to release resources allocated by |
406 | follow_link(). The cookie returned by follow_link() is passed to | 408 | follow_link(). The cookie returned by follow_link() is passed |
407 | to this function as the last parameter. It is used by filesystems | 409 | to this method as the last parameter. It is used by |
408 | such as NFS where page cache is not stable (i.e. page that was | 410 | filesystems such as NFS where page cache is not stable |
409 | installed when the symbolic link walk started might not be in the | 411 | (i.e. page that was installed when the symbolic link walk |
410 | page cache at the end of the walk). | 412 | started might not be in the page cache at the end of the |
411 | 413 | walk). | |
412 | truncate: called by the VFS to change the size of a file. The i_size | 414 | |
413 | field of the inode is set to the desired size by the VFS before | 415 | truncate: called by the VFS to change the size of a file. The |
414 | this function is called. This function is called by the truncate(2) | 416 | i_size field of the inode is set to the desired size by the |
415 | system call and related functionality. | 417 | VFS before this method is called. This method is called by |
418 | the truncate(2) system call and related functionality. | ||
416 | 419 | ||
417 | permission: called by the VFS to check for access rights on a POSIX-like | 420 | permission: called by the VFS to check for access rights on a POSIX-like |
418 | filesystem. | 421 | filesystem. |
419 | 422 | ||
420 | setattr: called by the VFS to set attributes for a file. This function is | 423 | setattr: called by the VFS to set attributes for a file. This method |
421 | called by chmod(2) and related system calls. | 424 | is called by chmod(2) and related system calls. |
422 | 425 | ||
423 | getattr: called by the VFS to get attributes of a file. This function is | 426 | getattr: called by the VFS to get attributes of a file. This method |
424 | called by stat(2) and related system calls. | 427 | is called by stat(2) and related system calls. |
425 | 428 | ||
426 | setxattr: called by the VFS to set an extended attribute for a file. | 429 | setxattr: called by the VFS to set an extended attribute for a file. |
427 | Extended attribute is a name:value pair associated with an inode. This | 430 | Extended attribute is a name:value pair associated with an |
428 | function is called by setxattr(2) system call. | 431 | inode. This method is called by setxattr(2) system call. |
432 | |||
433 | getxattr: called by the VFS to retrieve the value of an extended | ||
434 | attribute name. This method is called by getxattr(2) function | ||
435 | call. | ||
429 | 436 | ||
430 | getxattr: called by the VFS to retrieve the value of an extended attribute | 437 | listxattr: called by the VFS to list all extended attributes for a |
431 | name. This function is called by getxattr(2) function call. | 438 | given file. This method is called by listxattr(2) system call. |
432 | 439 | ||
433 | listxattr: called by the VFS to list all extended attributes for a given | 440 | removexattr: called by the VFS to remove an extended attribute from |
434 | file. This function is called by listxattr(2) system call. | 441 | a file. This method is called by removexattr(2) system call. |
435 | 442 | ||
436 | removexattr: called by the VFS to remove an extended attribute from a file. | 443 | |
437 | This function is called by removexattr(2) system call. | 444 | The Address Space Object |
445 | ======================== | ||
446 | |||
447 | The address space object is used to identify pages in the page cache. | ||
438 | 448 | ||
439 | 449 | ||
440 | struct address_space_operations | 450 | struct address_space_operations |
441 | =============================== | 451 | ------------------------------- |
442 | 452 | ||
443 | This describes how the VFS can manipulate mapping of a file to page cache in | 453 | This describes how the VFS can manipulate mapping of a file to page cache in |
444 | your filesystem. As of kernel 2.6.13, the following members are defined: | 454 | your filesystem. As of kernel 2.6.13, the following members are defined: |
@@ -502,8 +512,14 @@ struct address_space_operations { | |||
502 | it. An example implementation can be found in fs/ext2/xip.c. | 512 | it. An example implementation can be found in fs/ext2/xip.c. |
503 | 513 | ||
504 | 514 | ||
515 | The File Object | ||
516 | =============== | ||
517 | |||
518 | A file object represents a file opened by a process. | ||
519 | |||
520 | |||
505 | struct file_operations | 521 | struct file_operations |
506 | ====================== | 522 | ---------------------- |
507 | 523 | ||
508 | This describes how the VFS can manipulate an open file. As of kernel | 524 | This describes how the VFS can manipulate an open file. As of kernel |
509 | 2.6.13, the following members are defined: | 525 | 2.6.13, the following members are defined: |
@@ -661,7 +677,7 @@ of child dentries. Child dentries are basically like files in a | |||
661 | directory. | 677 | directory. |
662 | 678 | ||
663 | 679 | ||
664 | Directory Entry Cache APIs | 680 | Directory Entry Cache API |
665 | -------------------------- | 681 | -------------------------- |
666 | 682 | ||
667 | There are a number of functions defined which permit a filesystem to | 683 | There are a number of functions defined which permit a filesystem to |
@@ -705,178 +721,24 @@ manipulate dentries: | |||
705 | and the dentry is returned. The caller must use dput() | 721 | and the dentry is returned. The caller must use dput() |
706 | to free the dentry when it finishes using it. | 722 | to free the dentry when it finishes using it. |
707 | 723 | ||
724 | For further information on dentry locking, please refer to the document | ||
725 | Documentation/filesystems/dentry-locking.txt. | ||
708 | 726 | ||
709 | RCU-based dcache locking model | ||
710 | ------------------------------ | ||
711 | 727 | ||
712 | On many workloads, the most common operation on dcache is | 728 | Resources |
713 | to look up a dentry, given a parent dentry and the name | 729 | ========= |
714 | of the child. Typically, for every open(), stat() etc., | 730 | |
715 | the dentry corresponding to the pathname will be looked | 731 | (Note some of these resources are not up-to-date with the latest kernel |
716 | up by walking the tree starting with the first component | 732 | version.) |
717 | of the pathname and using that dentry along with the next | 733 | |
718 | component to look up the next level and so on. Since it | 734 | Creating Linux virtual filesystems. 2002 |
719 | is a frequent operation for workloads like multiuser | 735 | <http://lwn.net/Articles/13325/> |
720 | environments and web servers, it is important to optimize | 736 | |
721 | this path. | 737 | The Linux Virtual File-system Layer by Neil Brown. 1999 |
722 | 738 | <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html> | |
723 | Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus | 739 | |
724 | in every component during path look-up. Since 2.5.10 onwards, | 740 | A tour of the Linux VFS by Michael K. Johnson. 1996 |
725 | fast-walk algorithm changed this by holding the dcache_lock | 741 | <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html> |
726 | at the beginning and walking as many cached path component | ||
727 | dentries as possible. This significantly decreases the number | ||
728 | of acquisition of dcache_lock. However it also increases the | ||
729 | lock hold time significantly and affects performance in large | ||
730 | SMP machines. Since 2.5.62 kernel, dcache has been using | ||
731 | a new locking model that uses RCU to make dcache look-up | ||
732 | lock-free. | ||
733 | |||
734 | The current dcache locking model is not very different from the existing | ||
735 | dcache locking model. Prior to 2.5.62 kernel, dcache_lock | ||
736 | protected the hash chain, d_child, d_alias, d_lru lists as well | ||
737 | as d_inode and several other things like mount look-up. RCU-based | ||
738 | changes affect only the way the hash chain is protected. For everything | ||
739 | else the dcache_lock must be taken for both traversing as well as | ||
740 | updating. The hash chain updates too take the dcache_lock. | ||
741 | The significant change is the way d_lookup traverses the hash chain, | ||
742 | it doesn't acquire the dcache_lock for this and rely on RCU to | ||
743 | ensure that the dentry has not been *freed*. | ||
744 | |||
745 | |||
746 | Dcache locking details | ||
747 | ---------------------- | ||
748 | 742 | ||
749 | For many multi-user workloads, open() and stat() on files are | 743 | A small trail through the Linux kernel by Andries Brouwer. 2001 |
750 | very frequently occurring operations. Both involve walking | 744 | <http://www.win.tue.nl/~aeb/linux/vfs/trail.html> |
751 | of path names to find the dentry corresponding to the | ||
752 | concerned file. In 2.4 kernel, dcache_lock was held | ||
753 | during look-up of each path component. Contention and | ||
754 | cache-line bouncing of this global lock caused significant | ||
755 | scalability problems. With the introduction of RCU | ||
756 | in Linux kernel, this was worked around by making | ||
757 | the look-up of path components during path walking lock-free. | ||
758 | |||
759 | |||
760 | Safe lock-free look-up of dcache hash table | ||
761 | =========================================== | ||
762 | |||
763 | Dcache is a complex data structure with the hash table entries | ||
764 | also linked together in other lists. In 2.4 kernel, dcache_lock | ||
765 | protected all the lists. We applied RCU only on hash chain | ||
766 | walking. The rest of the lists are still protected by dcache_lock. | ||
767 | Some of the important changes are : | ||
768 | |||
769 | 1. The deletion from hash chain is done using hlist_del_rcu() macro which | ||
770 | doesn't initialize next pointer of the deleted dentry and this | ||
771 | allows us to walk safely lock-free while a deletion is happening. | ||
772 | |||
773 | 2. Insertion of a dentry into the hash table is done using | ||
774 | hlist_add_head_rcu() which take care of ordering the writes - | ||
775 | the writes to the dentry must be visible before the dentry | ||
776 | is inserted. This works in conjunction with hlist_for_each_rcu() | ||
777 | while walking the hash chain. The only requirement is that | ||
778 | all initialization to the dentry must be done before hlist_add_head_rcu() | ||
779 | since we don't have dcache_lock protection while traversing | ||
780 | the hash chain. This isn't different from the existing code. | ||
781 | |||
782 | 3. The dentry looked up without holding dcache_lock by cannot be | ||
783 | returned for walking if it is unhashed. It then may have a NULL | ||
784 | d_inode or other bogosity since RCU doesn't protect the other | ||
785 | fields in the dentry. We therefore use a flag DCACHE_UNHASHED to | ||
786 | indicate unhashed dentries and use this in conjunction with a | ||
787 | per-dentry lock (d_lock). Once looked up without the dcache_lock, | ||
788 | we acquire the per-dentry lock (d_lock) and check if the | ||
789 | dentry is unhashed. If so, the look-up is failed. If not, the | ||
790 | reference count of the dentry is increased and the dentry is returned. | ||
791 | |||
792 | 4. Once a dentry is looked up, it must be ensured during the path | ||
793 | walk for that component it doesn't go away. In pre-2.5.10 code, | ||
794 | this was done holding a reference to the dentry. dcache_rcu does | ||
795 | the same. In some sense, dcache_rcu path walking looks like | ||
796 | the pre-2.5.10 version. | ||
797 | |||
798 | 5. All dentry hash chain updates must take the dcache_lock as well as | ||
799 | the per-dentry lock in that order. dput() does this to ensure | ||
800 | that a dentry that has just been looked up in another CPU | ||
801 | doesn't get deleted before dget() can be done on it. | ||
802 | |||
803 | 6. There are several ways to do reference counting of RCU protected | ||
804 | objects. One such example is in ipv4 route cache where | ||
805 | deferred freeing (using call_rcu()) is done as soon as | ||
806 | the reference count goes to zero. This cannot be done in | ||
807 | the case of dentries because tearing down of dentries | ||
808 | require blocking (dentry_iput()) which isn't supported from | ||
809 | RCU callbacks. Instead, tearing down of dentries happen | ||
810 | synchronously in dput(), but actual freeing happens later | ||
811 | when RCU grace period is over. This allows safe lock-free | ||
812 | walking of the hash chains, but a matched dentry may have | ||
813 | been partially torn down. The checking of DCACHE_UNHASHED | ||
814 | flag with d_lock held detects such dentries and prevents | ||
815 | them from being returned from look-up. | ||
816 | |||
817 | |||
818 | Maintaining POSIX rename semantics | ||
819 | ================================== | ||
820 | |||
821 | Since look-up of dentries is lock-free, it can race against | ||
822 | a concurrent rename operation. For example, during rename | ||
823 | of file A to B, look-up of either A or B must succeed. | ||
824 | So, if look-up of B happens after A has been removed from the | ||
825 | hash chain but not added to the new hash chain, it may fail. | ||
826 | Also, a comparison while the name is being written concurrently | ||
827 | by a rename may result in false positive matches violating | ||
828 | rename semantics. Issues related to race with rename are | ||
829 | handled as described below : | ||
830 | |||
831 | 1. Look-up can be done in two ways - d_lookup() which is safe | ||
832 | from simultaneous renames and __d_lookup() which is not. | ||
833 | If __d_lookup() fails, it must be followed up by a d_lookup() | ||
834 | to correctly determine whether a dentry is in the hash table | ||
835 | or not. d_lookup() protects look-ups using a sequence | ||
836 | lock (rename_lock). | ||
837 | |||
838 | 2. The name associated with a dentry (d_name) may be changed if | ||
839 | a rename is allowed to happen simultaneously. To avoid memcmp() | ||
840 | in __d_lookup() go out of bounds due to a rename and false | ||
841 | positive comparison, the name comparison is done while holding the | ||
842 | per-dentry lock. This prevents concurrent renames during this | ||
843 | operation. | ||
844 | |||
845 | 3. Hash table walking during look-up may move to a different bucket as | ||
846 | the current dentry is moved to a different bucket due to rename. | ||
847 | But we use hlists in dcache hash table and they are null-terminated. | ||
848 | So, even if a dentry moves to a different bucket, hash chain | ||
849 | walk will terminate. [with a list_head list, it may not since | ||
850 | termination is when the list_head in the original bucket is reached]. | ||
851 | Since we redo the d_parent check and compare name while holding | ||
852 | d_lock, lock-free look-up will not race against d_move(). | ||
853 | |||
854 | 4. There can be a theoretical race when a dentry keeps coming back | ||
855 | to original bucket due to double moves. Due to this look-up may | ||
856 | consider that it has never moved and can end up in a infinite loop. | ||
857 | But this is not any worse that theoretical livelocks we already | ||
858 | have in the kernel. | ||
859 | |||
860 | |||
861 | Important guidelines for filesystem developers related to dcache_rcu | ||
862 | ==================================================================== | ||
863 | |||
864 | 1. Existing dcache interfaces (pre-2.5.62) exported to filesystem | ||
865 | don't change. Only dcache internal implementation changes. However | ||
866 | filesystems *must not* delete from the dentry hash chains directly | ||
867 | using the list macros like allowed earlier. They must use dcache | ||
868 | APIs like d_drop() or __d_drop() depending on the situation. | ||
869 | |||
870 | 2. d_flags is now protected by a per-dentry lock (d_lock). All | ||
871 | access to d_flags must be protected by it. | ||
872 | |||
873 | 3. For a hashed dentry, checking of d_count needs to be protected | ||
874 | by d_lock. | ||
875 | |||
876 | |||
877 | Papers and other documentation on dcache locking | ||
878 | ================================================ | ||
879 | |||
880 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
881 | |||
882 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
diff --git a/Documentation/hpet.txt b/Documentation/hpet.txt index 4e7cc8d3359b..e52457581f47 100644 --- a/Documentation/hpet.txt +++ b/Documentation/hpet.txt | |||
@@ -1,18 +1,21 @@ | |||
1 | High Precision Event Timer Driver for Linux | 1 | High Precision Event Timer Driver for Linux |
2 | 2 | ||
3 | The High Precision Event Timer (HPET) hardware is the future replacement for the 8254 and Real | 3 | The High Precision Event Timer (HPET) hardware is the future replacement |
4 | Time Clock (RTC) periodic timer functionality. Each HPET can have up two 32 timers. It is possible | 4 | for the 8254 and Real Time Clock (RTC) periodic timer functionality. |
5 | to configure the first two timers as legacy replacements for 8254 and RTC periodic. A specification | 5 | Each HPET can have up two 32 timers. It is possible to configure the |
6 | done by INTEL and Microsoft can be found at http://www.intel.com/labs/platcomp/hpet/hpetspec.htm. | 6 | first two timers as legacy replacements for 8254 and RTC periodic timers. |
7 | 7 | A specification done by Intel and Microsoft can be found at | |
8 | The driver supports detection of HPET driver allocation and initialization of the HPET before the | 8 | <http://www.intel.com/hardwaredesign/hpetspec.htm>. |
9 | driver module_init routine is called. This enables platform code which uses timer 0 or 1 as the | 9 | |
10 | main timer to intercept HPET initialization. An example of this initialization can be found in | 10 | The driver supports detection of HPET driver allocation and initialization |
11 | of the HPET before the driver module_init routine is called. This enables | ||
12 | platform code which uses timer 0 or 1 as the main timer to intercept HPET | ||
13 | initialization. An example of this initialization can be found in | ||
11 | arch/i386/kernel/time_hpet.c. | 14 | arch/i386/kernel/time_hpet.c. |
12 | 15 | ||
13 | The driver provides two APIs which are very similar to the API found in the rtc.c driver. | 16 | The driver provides two APIs which are very similar to the API found in |
14 | There is a user space API and a kernel space API. An example user space program is provided | 17 | the rtc.c driver. There is a user space API and a kernel space API. |
15 | below. | 18 | An example user space program is provided below. |
16 | 19 | ||
17 | #include <stdio.h> | 20 | #include <stdio.h> |
18 | #include <stdlib.h> | 21 | #include <stdlib.h> |
@@ -290,9 +293,8 @@ The kernel API has three interfaces exported from the driver: | |||
290 | hpet_unregister(struct hpet_task *tp) | 293 | hpet_unregister(struct hpet_task *tp) |
291 | hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg) | 294 | hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg) |
292 | 295 | ||
293 | The kernel module using this interface fills in the ht_func and ht_data members of the | 296 | The kernel module using this interface fills in the ht_func and ht_data |
294 | hpet_task structure before calling hpet_register. hpet_control simply vectors to the hpet_ioctl | 297 | members of the hpet_task structure before calling hpet_register. |
295 | routine and has the same commands and respective arguments as the user API. hpet_unregister | 298 | hpet_control simply vectors to the hpet_ioctl routine and has the same |
299 | commands and respective arguments as the user API. hpet_unregister | ||
296 | is used to terminate usage of the HPET timer reserved by hpet_register. | 300 | is used to terminate usage of the HPET timer reserved by hpet_register. |
297 | |||
298 | |||
diff --git a/Documentation/magic-number.txt b/Documentation/magic-number.txt index bd8eefa17587..af67faccf4de 100644 --- a/Documentation/magic-number.txt +++ b/Documentation/magic-number.txt | |||
@@ -120,7 +120,7 @@ ISDN_NET_MAGIC 0x49344C02 isdn_net_local_s drivers/isdn/i4l/isdn_net_li | |||
120 | SAVEKMSG_MAGIC2 0x4B4D5347 savekmsg arch/*/amiga/config.c | 120 | SAVEKMSG_MAGIC2 0x4B4D5347 savekmsg arch/*/amiga/config.c |
121 | STLI_BOARDMAGIC 0x4bc6c825 stlibrd include/linux/istallion.h | 121 | STLI_BOARDMAGIC 0x4bc6c825 stlibrd include/linux/istallion.h |
122 | CS_STATE_MAGIC 0x4c4f4749 cs_state sound/oss/cs46xx.c | 122 | CS_STATE_MAGIC 0x4c4f4749 cs_state sound/oss/cs46xx.c |
123 | SLAB_C_MAGIC 0x4f17a36d kmem_cache_s mm/slab.c | 123 | SLAB_C_MAGIC 0x4f17a36d kmem_cache mm/slab.c |
124 | COW_MAGIC 0x4f4f4f4d cow_header_v1 arch/um/drivers/ubd_user.c | 124 | COW_MAGIC 0x4f4f4f4d cow_header_v1 arch/um/drivers/ubd_user.c |
125 | I810_CARD_MAGIC 0x5072696E i810_card sound/oss/i810_audio.c | 125 | I810_CARD_MAGIC 0x5072696E i810_card sound/oss/i810_audio.c |
126 | TRIDENT_CARD_MAGIC 0x5072696E trident_card sound/oss/trident.c | 126 | TRIDENT_CARD_MAGIC 0x5072696E trident_card sound/oss/trident.c |
diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt index c6bd25f5d61d..e6c39c5831f5 100644 --- a/Documentation/networking/decnet.txt +++ b/Documentation/networking/decnet.txt | |||
@@ -176,8 +176,6 @@ information (_most_ of which _is_ _essential_) includes: | |||
176 | - Which client caused the problem ? | 176 | - Which client caused the problem ? |
177 | - How much data was being transferred ? | 177 | - How much data was being transferred ? |
178 | - Was the network congested ? | 178 | - Was the network congested ? |
179 | - If there was a kernel panic, please run the output through ksymoops | ||
180 | before sending it to me, otherwise its _useless_. | ||
181 | - How can the problem be reproduced ? | 179 | - How can the problem be reproduced ? |
182 | - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of | 180 | - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of |
183 | tcpdump don't understand how to dump DECnet properly, so including | 181 | tcpdump don't understand how to dump DECnet properly, so including |
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 66eaaab7773d..c563842ed805 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | NOTE: ksymoops is useless on 2.6. Please use the Oops in its original format | 1 | NOTE: ksymoops is useless on 2.6. Please use the Oops in its original format |
2 | (from dmesg, etc). Ignore any references in this or other docs to "decoding | 2 | (from dmesg, etc). Ignore any references in this or other docs to "decoding |
3 | the Oops" or "running it through ksymoops". If you post an Oops fron 2.6 that | 3 | the Oops" or "running it through ksymoops". If you post an Oops from 2.6 that |
4 | has been run through ksymoops, people will just tell you to repost it. | 4 | has been run through ksymoops, people will just tell you to repost it. |
5 | 5 | ||
6 | Quick Summary | 6 | Quick Summary |
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt index 526d6dd267ea..912bed87c758 100644 --- a/Documentation/power/video.txt +++ b/Documentation/power/video.txt | |||
@@ -11,9 +11,9 @@ boot video card. (Kernel usually does not even contain video card | |||
11 | driver -- vesafb and vgacon are widely used). | 11 | driver -- vesafb and vgacon are widely used). |
12 | 12 | ||
13 | This is not a problem for swsusp, because during swsusp resume, BIOS is | 13 | This is not a problem for swsusp, because during swsusp resume, BIOS is |
14 | run normally so video card is normally initialized. S3 has absolutely | 14 | run normally so video card is normally initialized. It should not be |
15 | no chance of working with SMP/HT. Be sure it to turn it off before | 15 | a problem for S1 standby, because hardware should retain its state over |
16 | testing (swsusp should work ok, OTOH). | 16 | that. |
17 | 17 | ||
18 | There are a few types of systems where video works after S3 resume: | 18 | There are a few types of systems where video works after S3 resume: |
19 | 19 | ||
@@ -64,7 +64,7 @@ your video card (good luck getting docs :-(). Maybe suspending from X | |||
64 | (proper X, knowing your hardware, not XF68_FBcon) might have better | 64 | (proper X, knowing your hardware, not XF68_FBcon) might have better |
65 | chance of working. | 65 | chance of working. |
66 | 66 | ||
67 | Table of known working systems: | 67 | Table of known working notebooks: |
68 | 68 | ||
69 | Model hack (or "how to do it") | 69 | Model hack (or "how to do it") |
70 | ------------------------------------------------------------------------------ | 70 | ------------------------------------------------------------------------------ |
@@ -73,7 +73,7 @@ Acer TM 242FX vbetool (6) | |||
73 | Acer TM C110 video_post (8) | 73 | Acer TM C110 video_post (8) |
74 | Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8) | 74 | Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8) |
75 | Acer TM 4052LCi s3_bios (2) | 75 | Acer TM 4052LCi s3_bios (2) |
76 | Acer TM 636Lci s3_bios vga=normal (2) | 76 | Acer TM 636Lci s3_bios,s3_mode (4) |
77 | Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back | 77 | Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back |
78 | Acer TM 660 ??? (*) | 78 | Acer TM 660 ??? (*) |
79 | Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6) | 79 | Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6) |
@@ -137,6 +137,13 @@ Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****) | |||
137 | Toshiba M30 (2) xor X with nvidia driver using internal AGP | 137 | Toshiba M30 (2) xor X with nvidia driver using internal AGP |
138 | Uniwill 244IIO ??? (*) | 138 | Uniwill 244IIO ??? (*) |
139 | 139 | ||
140 | Known working desktop systems | ||
141 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
142 | |||
143 | Mainboard Graphics card hack (or "how to do it") | ||
144 | ------------------------------------------------------------------------------ | ||
145 | Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) | ||
146 | |||
140 | 147 | ||
141 | (*) from http://www.ubuntulinux.org/wiki/HoaryPMResults, not sure | 148 | (*) from http://www.ubuntulinux.org/wiki/HoaryPMResults, not sure |
142 | which options to use. If you know, please tell me. | 149 | which options to use. If you know, please tell me. |
diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt index 19461958e2bd..df09758bf3fe 100644 --- a/Documentation/s390/driver-model.txt +++ b/Documentation/s390/driver-model.txt | |||
@@ -8,11 +8,10 @@ All devices which can be addressed by means of ccws are called 'CCW devices' - | |||
8 | even if they aren't actually driven by ccws. | 8 | even if they aren't actually driven by ccws. |
9 | 9 | ||
10 | All ccw devices are accessed via a subchannel, this is reflected in the | 10 | All ccw devices are accessed via a subchannel, this is reflected in the |
11 | structures under root/: | 11 | structures under devices/: |
12 | 12 | ||
13 | root/ | 13 | devices/ |
14 | - sys | 14 | - system/ |
15 | - legacy | ||
16 | - css0/ | 15 | - css0/ |
17 | - 0.0.0000/0.0.0815/ | 16 | - 0.0.0000/0.0.0815/ |
18 | - 0.0.0001/0.0.4711/ | 17 | - 0.0.0001/0.0.4711/ |
@@ -36,7 +35,7 @@ availability: Can be 'good' or 'boxed'; 'no path' or 'no device' for | |||
36 | 35 | ||
37 | online: An interface to set the device online and offline. | 36 | online: An interface to set the device online and offline. |
38 | In the special case of the device being disconnected (see the | 37 | In the special case of the device being disconnected (see the |
39 | notify function under 1.2), piping 0 to online will focibly delete | 38 | notify function under 1.2), piping 0 to online will forcibly delete |
40 | the device. | 39 | the device. |
41 | 40 | ||
42 | The device drivers can add entries to export per-device data and interfaces. | 41 | The device drivers can add entries to export per-device data and interfaces. |
@@ -222,7 +221,7 @@ and are called 'chp0.<chpid>'. They have no driver and do not belong to any bus. | |||
222 | Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect | 221 | Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect |
223 | only the logical state and not the physical state, since we cannot track the | 222 | only the logical state and not the physical state, since we cannot track the |
224 | latter consistently due to lacking machine support (we don't need to be aware | 223 | latter consistently due to lacking machine support (we don't need to be aware |
225 | of anyway). | 224 | of it anyway). |
226 | 225 | ||
227 | status - Can be 'online' or 'offline'. | 226 | status - Can be 'online' or 'offline'. |
228 | Piping 'on' or 'off' sets the chpid logically online/offline. | 227 | Piping 'on' or 'off' sets the chpid logically online/offline. |
@@ -235,12 +234,16 @@ status - Can be 'online' or 'offline'. | |||
235 | 3. System devices | 234 | 3. System devices |
236 | ----------------- | 235 | ----------------- |
237 | 236 | ||
238 | Note: cpus may yet be added here. | ||
239 | |||
240 | 3.1 xpram | 237 | 3.1 xpram |
241 | --------- | 238 | --------- |
242 | 239 | ||
243 | xpram shows up under sys/ as 'xpram'. | 240 | xpram shows up under devices/system/ as 'xpram'. |
241 | |||
242 | 3.2 cpus | ||
243 | -------- | ||
244 | |||
245 | For each cpu, a directory is created under devices/system/cpu/. Each cpu has an | ||
246 | attribute 'online' which can be 0 or 1. | ||
244 | 247 | ||
245 | 248 | ||
246 | 4. Other devices | 249 | 4. Other devices |
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt index 1829009db771..3f1c5464b1c9 100644 --- a/Documentation/sparse.txt +++ b/Documentation/sparse.txt | |||
@@ -41,9 +41,9 @@ sure that bitwise types don't get mixed up (little-endian vs big-endian | |||
41 | vs cpu-endian vs whatever), and there the constant "0" really _is_ | 41 | vs cpu-endian vs whatever), and there the constant "0" really _is_ |
42 | special. | 42 | special. |
43 | 43 | ||
44 | Modify top-level Makefile to say | 44 | Use |
45 | 45 | ||
46 | CHECK = sparse -Wbitwise | 46 | make C=[12] CF=-Wbitwise |
47 | 47 | ||
48 | or you don't get any checking at all. | 48 | or you don't get any checking at all. |
49 | 49 | ||
diff --git a/Documentation/video4linux/bttv/README.freeze b/Documentation/video4linux/bttv/README.freeze index 51f8d4379a94..4259dccc8287 100644 --- a/Documentation/video4linux/bttv/README.freeze +++ b/Documentation/video4linux/bttv/README.freeze | |||
@@ -27,9 +27,9 @@ information out of a register+stack dump printed by the kernel on | |||
27 | protection faults (so-called "kernel oops"). | 27 | protection faults (so-called "kernel oops"). |
28 | 28 | ||
29 | If you run into some kind of deadlock, you can try to dump a call trace | 29 | If you run into some kind of deadlock, you can try to dump a call trace |
30 | for each process using sysrq-t (see Documentation/sysrq.txt). ksymoops | 30 | for each process using sysrq-t (see Documentation/sysrq.txt). |
31 | will translate these dumps into kernel symbols too. This way it is | 31 | This way it is possible to figure out where *exactly* some process in "D" |
32 | possible to figure where *exactly* some process in "D" state is stuck. | 32 | state is stuck. |
33 | 33 | ||
34 | I've seen reports that bttv 0.7.x crashes whereas 0.8.x works rock solid | 34 | I've seen reports that bttv 0.7.x crashes whereas 0.8.x works rock solid |
35 | for some people. Thus probably a small buglet left somewhere in bttv | 35 | for some people. Thus probably a small buglet left somewhere in bttv |
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 1b9bcd1fe98b..1ad9af1ca4d0 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt | |||
@@ -13,12 +13,13 @@ This optimization is more critical now as bigger and bigger physical memories | |||
13 | Users can use the huge page support in Linux kernel by either using the mmap | 13 | Users can use the huge page support in Linux kernel by either using the mmap |
14 | system call or standard SYSv shared memory system calls (shmget, shmat). | 14 | system call or standard SYSv shared memory system calls (shmget, shmat). |
15 | 15 | ||
16 | First the Linux kernel needs to be built with CONFIG_HUGETLB_PAGE (present | 16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS |
17 | under Processor types and feature) and CONFIG_HUGETLBFS (present under file | 17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected |
18 | system option on config menu) config options. | 18 | automatically when CONFIG_HUGETLBFS is selected) configuration |
19 | options. | ||
19 | 20 | ||
20 | The kernel built with hugepage support should show the number of configured | 21 | The kernel built with hugepage support should show the number of configured |
21 | hugepages in the system by running the "cat /proc/meminfo" command. | 22 | hugepages in the system by running the "cat /proc/meminfo" command. |
22 | 23 | ||
23 | /proc/meminfo also provides information about the total number of hugetlb | 24 | /proc/meminfo also provides information about the total number of hugetlb |
24 | pages configured in the kernel. It also displays information about the | 25 | pages configured in the kernel. It also displays information about the |
@@ -38,19 +39,19 @@ in the kernel. | |||
38 | 39 | ||
39 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb | 40 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb |
40 | pages in the kernel. Super user can dynamically request more (or free some | 41 | pages in the kernel. Super user can dynamically request more (or free some |
41 | pre-configured) hugepages. | 42 | pre-configured) hugepages. |
42 | The allocation( or deallocation) of hugetlb pages is posible only if there are | 43 | The allocation (or deallocation) of hugetlb pages is possible only if there are |
43 | enough physically contiguous free pages in system (freeing of hugepages is | 44 | enough physically contiguous free pages in system (freeing of hugepages is |
44 | possible only if there are enough hugetlb pages free that can be transferred | 45 | possible only if there are enough hugetlb pages free that can be transferred |
45 | back to regular memory pool). | 46 | back to regular memory pool). |
46 | 47 | ||
47 | Pages that are used as hugetlb pages are reserved inside the kernel and can | 48 | Pages that are used as hugetlb pages are reserved inside the kernel and can |
48 | not be used for other purposes. | 49 | not be used for other purposes. |
49 | 50 | ||
50 | Once the kernel with Hugetlb page support is built and running, a user can | 51 | Once the kernel with Hugetlb page support is built and running, a user can |
51 | use either the mmap system call or shared memory system calls to start using | 52 | use either the mmap system call or shared memory system calls to start using |
52 | the huge pages. It is required that the system administrator preallocate | 53 | the huge pages. It is required that the system administrator preallocate |
53 | enough memory for huge page purposes. | 54 | enough memory for huge page purposes. |
54 | 55 | ||
55 | Use the following command to dynamically allocate/deallocate hugepages: | 56 | Use the following command to dynamically allocate/deallocate hugepages: |
56 | 57 | ||
@@ -80,9 +81,9 @@ memory (huge pages) allowed for that filesystem (/mnt/huge). The size is | |||
80 | rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of | 81 | rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of |
81 | inodes that /mnt/huge can use. If the size or nr_inodes options are not | 82 | inodes that /mnt/huge can use. If the size or nr_inodes options are not |
82 | provided on command line then no limits are set. For size and nr_inodes | 83 | provided on command line then no limits are set. For size and nr_inodes |
83 | options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For | 84 | options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For |
84 | example, size=2K has the same meaning as size=2048. An example is given at | 85 | example, size=2K has the same meaning as size=2048. An example is given at |
85 | the end of this document. | 86 | the end of this document. |
86 | 87 | ||
87 | read and write system calls are not supported on files that reside on hugetlb | 88 | read and write system calls are not supported on files that reside on hugetlb |
88 | file systems. | 89 | file systems. |