about summary refs log tree commit diff stats
path: root/Documentation
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2011-09-18 08:01:26 -0400
committer Ingo Molnar <mingo@elte.hu> 2011-09-18 08:01:39 -0400
commit bfa322c48dc69bfdaee10faf3bd8dbc23b39a21c (patch)
tree 95360c5d253115003080264d878f3c0f907f2978 /Documentation
parent 88ebc08ea9f721d1345d5414288a308ea42ac458 (diff)
parent 003f6c9df54970d8b19578d195b3e2b398cdbde2 (diff)
Merge branch 'linus' into sched/core
Merge reason: We are queueing up a dependent patch. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'Documentation')
-rw-r--r-- Documentation/00-INDEX 2
-rw-r--r-- Documentation/DocBook/media/v4l/controls.xml 38
-rw-r--r-- Documentation/PCI/MSI-HOWTO.txt 89
-rw-r--r-- Documentation/SubmittingDrivers 2
-rw-r--r-- Documentation/SubmittingPatches 2
-rw-r--r-- Documentation/block/cfq-iosched.txt 71
-rw-r--r-- Documentation/email-clients.txt 12
-rw-r--r-- Documentation/feature-removal-schedule.txt 8
-rw-r--r-- Documentation/filesystems/befs.txt 2
-rw-r--r-- Documentation/hwmon/max16065 7
-rw-r--r-- Documentation/ioctl/ioctl-number.txt 2
-rw-r--r-- Documentation/kernel-docs.txt 11
-rw-r--r-- Documentation/kernel-parameters.txt 91
-rw-r--r-- Documentation/networking/00-INDEX 116
-rw-r--r-- Documentation/networking/bonding.txt 29
-rw-r--r-- Documentation/networking/ip-sysctl.txt 2
-rw-r--r-- Documentation/networking/scaling.txt 378
-rw-r--r-- Documentation/power/runtime_pm.txt 3
-rw-r--r-- Documentation/ramoops.txt 76
-rw-r--r-- Documentation/virtual/00-INDEX 3
-rw-r--r-- Documentation/virtual/lguest/lguest.c 3
-rw-r--r-- Documentation/virtual/virtio-spec.txt 2200
22 files changed, 3033 insertions, 114 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 1f89424c36a6..65bbd2622396 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -272,6 +272,8 @@ printk-formats.txt
272 - how to get printk format specifiers right 272 - how to get printk format specifiers right
273prio_tree.txt 273prio_tree.txt
274 - info on radix-priority-search-tree use for indexing vmas. 274 - info on radix-priority-search-tree use for indexing vmas.
275ramoops.txt
276 - documentation of the ramoops oops/panic logging module.
275rbtree.txt 277rbtree.txt
276 - info on what red-black trees are and what they are for. 278 - info on what red-black trees are and what they are for.
277robust-futex-ABI.txt 279robust-futex-ABI.txt
diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml
index 85164016ed26..23fdf79f8cf3 100644
--- a/Documentation/DocBook/media/v4l/controls.xml
+++ b/Documentation/DocBook/media/v4l/controls.xml
@@ -1455,7 +1455,7 @@ Applicable to the H264 encoder.</entry>
1455 </row> 1455 </row>
1456 1456
1457 <row><entry></entry></row> 1457 <row><entry></entry></row>
1458 <row> 1458 <row id="v4l2-mpeg-video-h264-vui-sar-idc">
1459 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC</constant>&nbsp;</entry> 1459 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC</constant>&nbsp;</entry>
1460 <entry>enum&nbsp;v4l2_mpeg_video_h264_vui_sar_idc</entry> 1460 <entry>enum&nbsp;v4l2_mpeg_video_h264_vui_sar_idc</entry>
1461 </row> 1461 </row>
@@ -1561,7 +1561,7 @@ Applicable to the H264 encoder.</entry>
1561 </row> 1561 </row>
1562 1562
1563 <row><entry></entry></row> 1563 <row><entry></entry></row>
1564 <row> 1564 <row id="v4l2-mpeg-video-h264-level">
1565 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LEVEL</constant>&nbsp;</entry> 1565 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LEVEL</constant>&nbsp;</entry>
1566 <entry>enum&nbsp;v4l2_mpeg_video_h264_level</entry> 1566 <entry>enum&nbsp;v4l2_mpeg_video_h264_level</entry>
1567 </row> 1567 </row>
@@ -1641,7 +1641,7 @@ Possible values are:</entry>
1641 </row> 1641 </row>
1642 1642
1643 <row><entry></entry></row> 1643 <row><entry></entry></row>
1644 <row> 1644 <row id="v4l2-mpeg-video-mpeg4-level">
1645 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL</constant>&nbsp;</entry> 1645 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL</constant>&nbsp;</entry>
1646 <entry>enum&nbsp;v4l2_mpeg_video_mpeg4_level</entry> 1646 <entry>enum&nbsp;v4l2_mpeg_video_mpeg4_level</entry>
1647 </row> 1647 </row>
@@ -1689,9 +1689,9 @@ Possible values are:</entry>
1689 </row> 1689 </row>
1690 1690
1691 <row><entry></entry></row> 1691 <row><entry></entry></row>
1692 <row> 1692 <row id="v4l2-mpeg-video-h264-profile">
1693 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_PROFILE</constant>&nbsp;</entry> 1693 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_PROFILE</constant>&nbsp;</entry>
1694 <entry>enum&nbsp;v4l2_mpeg_h264_profile</entry> 1694 <entry>enum&nbsp;v4l2_mpeg_video_h264_profile</entry>
1695 </row> 1695 </row>
1696 <row><entry spanname="descr">The profile information for H264. 1696 <row><entry spanname="descr">The profile information for H264.
1697Applicable to the H264 encoder. 1697Applicable to the H264 encoder.
@@ -1774,9 +1774,9 @@ Possible values are:</entry>
1774 </row> 1774 </row>
1775 1775
1776 <row><entry></entry></row> 1776 <row><entry></entry></row>
1777 <row> 1777 <row id="v4l2-mpeg-video-mpeg4-profile">
1778 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE</constant>&nbsp;</entry> 1778 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE</constant>&nbsp;</entry>
1779 <entry>enum&nbsp;v4l2_mpeg_mpeg4_profile</entry> 1779 <entry>enum&nbsp;v4l2_mpeg_video_mpeg4_profile</entry>
1780 </row> 1780 </row>
1781 <row><entry spanname="descr">The profile information for MPEG4. 1781 <row><entry spanname="descr">The profile information for MPEG4.
1782Applicable to the MPEG4 encoder. 1782Applicable to the MPEG4 encoder.
@@ -1820,9 +1820,9 @@ Applicable to the encoder.
1820 </row> 1820 </row>
1821 1821
1822 <row><entry></entry></row> 1822 <row><entry></entry></row>
1823 <row> 1823 <row id="v4l2-mpeg-video-multi-slice-mode">
1824 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE</constant>&nbsp;</entry> 1824 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE</constant>&nbsp;</entry>
1825 <entry>enum&nbsp;v4l2_mpeg_multi_slice_mode</entry> 1825 <entry>enum&nbsp;v4l2_mpeg_video_multi_slice_mode</entry>
1826 </row> 1826 </row>
1827 <row><entry spanname="descr">Determines how the encoder should handle division of frame into slices. 1827 <row><entry spanname="descr">Determines how the encoder should handle division of frame into slices.
1828Applicable to the encoder. 1828Applicable to the encoder.
@@ -1868,9 +1868,9 @@ Applicable to the encoder.</entry>
1868 </row> 1868 </row>
1869 1869
1870 <row><entry></entry></row> 1870 <row><entry></entry></row>
1871 <row> 1871 <row id="v4l2-mpeg-video-h264-loop-filter-mode">
1872 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE</constant>&nbsp;</entry> 1872 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE</constant>&nbsp;</entry>
1873 <entry>enum&nbsp;v4l2_mpeg_h264_loop_filter_mode</entry> 1873 <entry>enum&nbsp;v4l2_mpeg_video_h264_loop_filter_mode</entry>
1874 </row> 1874 </row>
1875 <row><entry spanname="descr">Loop filter mode for H264 encoder. 1875 <row><entry spanname="descr">Loop filter mode for H264 encoder.
1876Possible values are:</entry> 1876Possible values are:</entry>
@@ -1913,9 +1913,9 @@ Applicable to the H264 encoder.</entry>
1913 </row> 1913 </row>
1914 1914
1915 <row><entry></entry></row> 1915 <row><entry></entry></row>
1916 <row> 1916 <row id="v4l2-mpeg-video-h264-entropy-mode">
1917 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE</constant>&nbsp;</entry> 1917 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE</constant>&nbsp;</entry>
1918 <entry>enum&nbsp;v4l2_mpeg_h264_symbol_mode</entry> 1918 <entry>enum&nbsp;v4l2_mpeg_video_h264_entropy_mode</entry>
1919 </row> 1919 </row>
1920 <row><entry spanname="descr">Entropy coding mode for H264 - CABAC/CAVLC. 1920 <row><entry spanname="descr">Entropy coding mode for H264 - CABAC/CAVLC.
1921Applicable to the H264 encoder. 1921Applicable to the H264 encoder.
@@ -2140,9 +2140,9 @@ previous frames. Applicable to the H264 encoder.</entry>
2140 </row> 2140 </row>
2141 2141
2142 <row><entry></entry></row> 2142 <row><entry></entry></row>
2143 <row> 2143 <row id="v4l2-mpeg-video-header-mode">
2144 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_HEADER_MODE</constant>&nbsp;</entry> 2144 <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_HEADER_MODE</constant>&nbsp;</entry>
2145 <entry>enum&nbsp;v4l2_mpeg_header_mode</entry> 2145 <entry>enum&nbsp;v4l2_mpeg_video_header_mode</entry>
2146 </row> 2146 </row>
2147 <row><entry spanname="descr">Determines whether the header is returned as the first buffer or is 2147 <row><entry spanname="descr">Determines whether the header is returned as the first buffer or is
2148it returned together with the first frame. Applicable to encoders. 2148it returned together with the first frame. Applicable to encoders.
@@ -2320,9 +2320,9 @@ Valid only when H.264 and macroblock level RC is enabled (<constant>V4L2_CID_MPE
2320Applicable to the H264 encoder.</entry> 2320Applicable to the H264 encoder.</entry>
2321 </row> 2321 </row>
2322 <row><entry></entry></row> 2322 <row><entry></entry></row>
2323 <row> 2323 <row id="v4l2-mpeg-mfc51-video-frame-skip-mode">
2324 <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FRAME_SKIP_MODE</constant>&nbsp;</entry> 2324 <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FRAME_SKIP_MODE</constant>&nbsp;</entry>
2325 <entry>enum&nbsp;v4l2_mpeg_mfc51_frame_skip_mode</entry> 2325 <entry>enum&nbsp;v4l2_mpeg_mfc51_video_frame_skip_mode</entry>
2326 </row> 2326 </row>
2327 <row><entry spanname="descr"> 2327 <row><entry spanname="descr">
2328Indicates in what conditions the encoder should skip frames. If encoding a frame would cause the encoded stream to be larger than 2328Indicates in what conditions the encoder should skip frames. If encoding a frame would cause the encoded stream to be larger than
@@ -2361,9 +2361,9 @@ the stream will meet tight bandwidth constraints. Applicable to encoders.
2361</entry> 2361</entry>
2362 </row> 2362 </row>
2363 <row><entry></entry></row> 2363 <row><entry></entry></row>
2364 <row> 2364 <row id="v4l2-mpeg-mfc51-video-force-frame-type">
2365 <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FORCE_FRAME_TYPE</constant>&nbsp;</entry> 2365 <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FORCE_FRAME_TYPE</constant>&nbsp;</entry>
2366 <entry>enum&nbsp;v4l2_mpeg_mfc51_force_frame_type</entry> 2366 <entry>enum&nbsp;v4l2_mpeg_mfc51_video_force_frame_type</entry>
2367 </row> 2367 </row>
2368 <row><entry spanname="descr">Force a frame type for the next queued buffer. Applicable to encoders. 2368 <row><entry spanname="descr">Force a frame type for the next queued buffer. Applicable to encoders.
2369Possible values are:</entry> 2369Possible values are:</entry>
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 3f5e0b09bed5..53e6fca146d7 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -45,7 +45,7 @@ arrived in memory (this becomes more likely with devices behind PCI-PCI
45bridges). In order to ensure that all the data has arrived in memory, 45bridges). In order to ensure that all the data has arrived in memory,
46the interrupt handler must read a register on the device which raised 46the interrupt handler must read a register on the device which raised
47the interrupt. PCI transaction ordering rules require that all the data 47the interrupt. PCI transaction ordering rules require that all the data
48arrives in memory before the value can be returned from the register. 48arrive in memory before the value may be returned from the register.
49Using MSIs avoids this problem as the interrupt-generating write cannot 49Using MSIs avoids this problem as the interrupt-generating write cannot
50pass the data writes, so by the time the interrupt is raised, the driver 50pass the data writes, so by the time the interrupt is raised, the driver
51knows that all the data has arrived in memory. 51knows that all the data has arrived in memory.
@@ -86,13 +86,13 @@ device.
86 86
87int pci_enable_msi(struct pci_dev *dev) 87int pci_enable_msi(struct pci_dev *dev)
88 88
89A successful call will allocate ONE interrupt to the device, regardless 89A successful call allocates ONE interrupt to the device, regardless
90of how many MSIs the device supports. The device will be switched from 90of how many MSIs the device supports. The device is switched from
91pin-based interrupt mode to MSI mode. The dev->irq number is changed 91pin-based interrupt mode to MSI mode. The dev->irq number is changed
92to a new number which represents the message signaled interrupt. 92to a new number which represents the message signaled interrupt;
93This function should be called before the driver calls request_irq() 93consequently, this function should be called before the driver calls
94since enabling MSIs disables the pin-based IRQ and the driver will not 94request_irq(), because an MSI is delivered via a vector that is
95receive interrupts on the old interrupt. 95different from the vector of a pin-based interrupt.
96 96
974.2.2 pci_enable_msi_block 974.2.2 pci_enable_msi_block
98 98
@@ -111,20 +111,20 @@ the device are in the range dev->irq to dev->irq + count - 1.
111 111
112If this function returns a negative number, it indicates an error and 112If this function returns a negative number, it indicates an error and
113the driver should not attempt to request any more MSI interrupts for 113the driver should not attempt to request any more MSI interrupts for
114this device. If this function returns a positive number, it will be 114this device. If this function returns a positive number, it is
115less than 'count' and indicate the number of interrupts that could have 115less than 'count' and indicates the number of interrupts that could have
116been allocated. In neither case will the irq value have been 116been allocated. In neither case is the irq value updated or the device
117updated, nor will the device have been switched into MSI mode. 117switched into MSI mode.
118 118
119The device driver must decide what action to take if 119The device driver must decide what action to take if
120pci_enable_msi_block() returns a value less than the number asked for. 120pci_enable_msi_block() returns a value less than the number requested.
121Some devices can make use of fewer interrupts than the maximum they 121For instance, the driver could still make use of fewer interrupts;
122request; in this case the driver should call pci_enable_msi_block() 122in this case the driver should call pci_enable_msi_block()
123again. Note that it is not guaranteed to succeed, even when the 123again. Note that it is not guaranteed to succeed, even when the
124'count' has been reduced to the value returned from a previous call to 124'count' has been reduced to the value returned from a previous call to
125pci_enable_msi_block(). This is because there are multiple constraints 125pci_enable_msi_block(). This is because there are multiple constraints
126on the number of vectors that can be allocated; pci_enable_msi_block() 126on the number of vectors that can be allocated; pci_enable_msi_block()
127will return as soon as it finds any constraint that doesn't allow the 127returns as soon as it finds any constraint that doesn't allow the
128call to succeed. 128call to succeed.
129 129
1304.2.3 pci_disable_msi 1304.2.3 pci_disable_msi
@@ -137,10 +137,10 @@ interrupt number and frees the previously allocated message signaled
137interrupt(s). The interrupt may subsequently be assigned to another 137interrupt(s). The interrupt may subsequently be assigned to another
138device, so drivers should not cache the value of dev->irq. 138device, so drivers should not cache the value of dev->irq.
139 139
140A device driver must always call free_irq() on the interrupt(s) 140Before calling this function, a device driver must always call free_irq()
141for which it has called request_irq() before calling this function. 141on any interrupt for which it previously called request_irq().
142Failure to do so will result in a BUG_ON(), the device will be left with 142Failure to do so results in a BUG_ON(), leaving the device with
143MSI enabled and will leak its vector. 143MSI enabled and thus leaking its vector.
144 144
1454.3 Using MSI-X 1454.3 Using MSI-X
146 146
@@ -155,10 +155,10 @@ struct msix_entry {
155}; 155};
156 156
157This allows for the device to use these interrupts in a sparse fashion; 157This allows for the device to use these interrupts in a sparse fashion;
158for example it could use interrupts 3 and 1027 and allocate only a 158for example, it could use interrupts 3 and 1027 and yet allocate only a
159two-element array. The driver is expected to fill in the 'entry' value 159two-element array. The driver is expected to fill in the 'entry' value
160in each element of the array to indicate which entries it wants the kernel 160in each element of the array to indicate for which entries the kernel
161to assign interrupts for. It is invalid to fill in two entries with the 161should assign interrupts; it is invalid to fill in two entries with the
162same number. 162same number.
163 163
1644.3.1 pci_enable_msix 1644.3.1 pci_enable_msix
@@ -168,10 +168,11 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
168Calling this function asks the PCI subsystem to allocate 'nvec' MSIs. 168Calling this function asks the PCI subsystem to allocate 'nvec' MSIs.
169The 'entries' argument is a pointer to an array of msix_entry structs 169The 'entries' argument is a pointer to an array of msix_entry structs
170which should be at least 'nvec' entries in size. On success, the 170which should be at least 'nvec' entries in size. On success, the
171function will return 0 and the device will have been switched into 171device is switched into MSI-X mode and the function returns 0.
172MSI-X interrupt mode. The 'vector' elements in each entry will have 172The 'vector' member in each entry is populated with the interrupt number;
173been filled in with the interrupt number. The driver should then call 173the driver should then call request_irq() for each 'vector' that it
174request_irq() for each 'vector' that it decides to use. 174decides to use. The device driver is responsible for keeping track of the
175interrupts assigned to the MSI-X vectors so it can free them again later.
175 176
176If this function returns a negative number, it indicates an error and 177If this function returns a negative number, it indicates an error and
177the driver should not attempt to allocate any more MSI-X interrupts for 178the driver should not attempt to allocate any more MSI-X interrupts for
@@ -181,16 +182,14 @@ below.
181 182
182This function, in contrast with pci_enable_msi(), does not adjust 183This function, in contrast with pci_enable_msi(), does not adjust
183dev->irq. The device will not generate interrupts for this interrupt 184dev->irq. The device will not generate interrupts for this interrupt
184number once MSI-X is enabled. The device driver is responsible for 185number once MSI-X is enabled.
185keeping track of the interrupts assigned to the MSI-X vectors so it can
186free them again later.
187 186
188Device drivers should normally call this function once per device 187Device drivers should normally call this function once per device
189during the initialization phase. 188during the initialization phase.
190 189
191It is ideal if drivers can cope with a variable number of MSI-X interrupts, 190It is ideal if drivers can cope with a variable number of MSI-X interrupts;
192there are many reasons why the platform may not be able to provide the 191there are many reasons why the platform may not be able to provide the
193exact number a driver asks for. 192exact number that a driver asks for.
194 193
195A request loop to achieve that might look like: 194A request loop to achieve that might look like:
196 195
@@ -212,15 +211,15 @@ static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
212 211
213void pci_disable_msix(struct pci_dev *dev) 212void pci_disable_msix(struct pci_dev *dev)
214 213
215This API should be used to undo the effect of pci_enable_msix(). It frees 214This function should be used to undo the effect of pci_enable_msix(). It frees
216the previously allocated message signaled interrupts. The interrupts may 215the previously allocated message signaled interrupts. The interrupts may
217subsequently be assigned to another device, so drivers should not cache 216subsequently be assigned to another device, so drivers should not cache
218the value of the 'vector' elements over a call to pci_disable_msix(). 217the value of the 'vector' elements over a call to pci_disable_msix().
219 218
220A device driver must always call free_irq() on the interrupt(s) 219Before calling this function, a device driver must always call free_irq()
221for which it has called request_irq() before calling this function. 220on any interrupt for which it previously called request_irq().
222Failure to do so will result in a BUG_ON(), the device will be left with 221Failure to do so results in a BUG_ON(), leaving the device with
223MSI enabled and will leak its vector. 222MSI-X enabled and thus leaking its vector.
224 223
2254.3.3 The MSI-X Table 2244.3.3 The MSI-X Table
226 225
@@ -232,10 +231,10 @@ mask or unmask an interrupt, it should call disable_irq() / enable_irq().
2324.4 Handling devices implementing both MSI and MSI-X capabilities 2314.4 Handling devices implementing both MSI and MSI-X capabilities
233 232
234If a device implements both MSI and MSI-X capabilities, it can 233If a device implements both MSI and MSI-X capabilities, it can
235run in either MSI mode or MSI-X mode but not both simultaneously. 234run in either MSI mode or MSI-X mode, but not both simultaneously.
236This is a requirement of the PCI spec, and it is enforced by the 235This is a requirement of the PCI spec, and it is enforced by the
237PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or 236PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or
238pci_enable_msix() when MSI is already enabled will result in an error. 237pci_enable_msix() when MSI is already enabled results in an error.
239If a device driver wishes to switch between MSI and MSI-X at runtime, 238If a device driver wishes to switch between MSI and MSI-X at runtime,
240it must first quiesce the device, then switch it back to pin-interrupt 239it must first quiesce the device, then switch it back to pin-interrupt
241mode, before calling pci_enable_msi() or pci_enable_msix() and resuming 240mode, before calling pci_enable_msi() or pci_enable_msix() and resuming
@@ -251,7 +250,7 @@ the MSI-X facilities in preference to the MSI facilities. As mentioned
251above, MSI-X supports any number of interrupts between 1 and 2048. 250above, MSI-X supports any number of interrupts between 1 and 2048.
252In contrast, MSI is restricted to a maximum of 32 interrupts (and 251In contrast, MSI is restricted to a maximum of 32 interrupts (and
253must be a power of two). In addition, the MSI interrupt vectors must 252must be a power of two). In addition, the MSI interrupt vectors must
254be allocated consecutively, so the system may not be able to allocate 253be allocated consecutively, so the system might not be able to allocate
255as many vectors for MSI as it could for MSI-X. On some platforms, MSI 254as many vectors for MSI as it could for MSI-X. On some platforms, MSI
256interrupts must all be targeted at the same set of CPUs whereas MSI-X 255interrupts must all be targeted at the same set of CPUs whereas MSI-X
257interrupts can all be targeted at different CPUs. 256interrupts can all be targeted at different CPUs.
@@ -281,7 +280,7 @@ disabled to enabled and back again.
281 280
282Using 'lspci -v' (as root) may show some devices with "MSI", "Message 281Using 'lspci -v' (as root) may show some devices with "MSI", "Message
283Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities 282Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities
284has an 'Enable' flag which will be followed with either "+" (enabled) 283has an 'Enable' flag which is followed with either "+" (enabled)
285or "-" (disabled). 284or "-" (disabled).
286 285
287 286
@@ -298,7 +297,7 @@ The PCI stack provides three ways to disable MSIs:
298 297
299Some host chipsets simply don't support MSIs properly. If we're 298Some host chipsets simply don't support MSIs properly. If we're
300lucky, the manufacturer knows this and has indicated it in the ACPI 299lucky, the manufacturer knows this and has indicated it in the ACPI
301FADT table. In this case, Linux will automatically disable MSIs. 300FADT table. In this case, Linux automatically disables MSIs.
302Some boards don't include this information in the table and so we have 301Some boards don't include this information in the table and so we have
303to detect them ourselves. The complete list of these is found near the 302to detect them ourselves. The complete list of these is found near the
304quirk_disable_all_msi() function in drivers/pci/quirks.c. 303quirk_disable_all_msi() function in drivers/pci/quirks.c.
@@ -317,7 +316,7 @@ Some bridges allow you to enable MSIs by changing some bits in their
317PCI configuration space (especially the Hypertransport chipsets such 316PCI configuration space (especially the Hypertransport chipsets such
318as the nVidia nForce and Serverworks HT2000). As with host chipsets, 317as the nVidia nForce and Serverworks HT2000). As with host chipsets,
319Linux mostly knows about them and automatically enables MSIs if it can. 318Linux mostly knows about them and automatically enables MSIs if it can.
320If you have a bridge which Linux doesn't yet know about, you can enable 319If you have a bridge unknown to Linux, you can enable
321MSIs in configuration space using whatever method you know works, then 320MSIs in configuration space using whatever method you know works, then
322enable MSIs on that bridge by doing: 321enable MSIs on that bridge by doing:
323 322
@@ -327,7 +326,7 @@ where $bridge is the PCI address of the bridge you've enabled (eg
3270000:00:0e.0). 3260000:00:0e.0).
328 327
329To disable MSIs, echo 0 instead of 1. Changing this value should be 328To disable MSIs, echo 0 instead of 1. Changing this value should be
330done with caution as it can break interrupt handling for all devices 329done with caution as it could break interrupt handling for all devices
331below this bridge. 330below this bridge.
332 331
333Again, please notify linux-pci@vger.kernel.org of any bridges that need 332Again, please notify linux-pci@vger.kernel.org of any bridges that need
@@ -336,7 +335,7 @@ special handling.
3365.3. Disabling MSIs on a single device 3355.3. Disabling MSIs on a single device
337 336
338Some devices are known to have faulty MSI implementations. Usually this 337Some devices are known to have faulty MSI implementations. Usually this
339is handled in the individual device driver but occasionally it's necessary 338is handled in the individual device driver, but occasionally it's necessary
340to handle this with a quirk. Some drivers have an option to disable use 339to handle this with a quirk. Some drivers have an option to disable use
341of MSI. While this is a convenient workaround for the driver author, 340of MSI. While this is a convenient workaround for the driver author,
342it is not good practice, and should not be emulated. 341it is not good practice, and should not be emulated.
@@ -350,7 +349,7 @@ for your machine. You should also check your .config to be sure you
350have enabled CONFIG_PCI_MSI. 349have enabled CONFIG_PCI_MSI.
351 350
352Then, 'lspci -t' gives the list of bridges above a device. Reading 351Then, 'lspci -t' gives the list of bridges above a device. Reading
353/sys/bus/pci/devices/*/msi_bus will tell you whether MSI are enabled (1) 352/sys/bus/pci/devices/*/msi_bus will tell you whether MSIs are enabled (1)
354or disabled (0). If 0 is found in any of the msi_bus files belonging 353or disabled (0). If 0 is found in any of the msi_bus files belonging
355to bridges between the PCI root and the device, MSIs are disabled. 354to bridges between the PCI root and the device, MSIs are disabled.
356 355
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index 319baa8b60dd..36d16bbf72c6 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -130,7 +130,7 @@ Linux kernel master tree:
130 ftp.??.kernel.org:/pub/linux/kernel/... 130 ftp.??.kernel.org:/pub/linux/kernel/...
131 ?? == your country code, such as "us", "uk", "fr", etc. 131 ?? == your country code, such as "us", "uk", "fr", etc.
132 132
133 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git 133 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git
134 134
135Linux kernel mailing list: 135Linux kernel mailing list:
136 linux-kernel@vger.kernel.org 136 linux-kernel@vger.kernel.org
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 569f3532e138..4468ce24427c 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -303,7 +303,7 @@ patches that are being emailed around.
303 303
304The sign-off is a simple line at the end of the explanation for the 304The sign-off is a simple line at the end of the explanation for the
305patch, which certifies that you wrote it or otherwise have the right to 305patch, which certifies that you wrote it or otherwise have the right to
306pass it on as a open-source patch. The rules are pretty simple: if you 306pass it on as an open-source patch. The rules are pretty simple: if you
307can certify the below: 307can certify the below:
308 308
309 Developer's Certificate of Origin 1.1 309 Developer's Certificate of Origin 1.1
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index e578feed6d81..6d670f570451 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -43,3 +43,74 @@ If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
43to IOPS mode and starts providing fairness in terms of number of requests 43to IOPS mode and starts providing fairness in terms of number of requests
44dispatched. Note that this mode switching takes effect only for group 44dispatched. Note that this mode switching takes effect only for group
45scheduling. For non-cgroup users nothing should change. 45scheduling. For non-cgroup users nothing should change.
46
47CFQ IO scheduler Idling Theory
48===============================
49Idling on a queue is primarily about waiting for the next request to come
50on same queue after completion of a request. In this process CFQ will not
51dispatch requests from other cfq queues even if requests are pending there.
52
53The rationale behind idling is that it can cut down on number of seeks
54on rotational media. For example, if a process is doing dependent
55sequential reads (next read will come on only after completion of previous
56one), then not dispatching request from other queue should help as we
57did not move the disk head and kept on dispatching sequential IO from
58one queue.
59
60CFQ has following service trees and various queues are put on these trees.
61
62 sync-idle sync-noidle async
63
64All cfq queues doing synchronous sequential IO go on to sync-idle tree.
65On this tree we idle on each queue individually.
66
67All synchronous non-sequential queues go on sync-noidle tree. Also any
68requests which are marked with REQ_NOIDLE go on this service tree. On this
69tree we do not idle on individual queues instead idle on the whole group
70of queues or the tree. So if there are 4 queues waiting for IO to dispatch
71we will idle only once last queue has dispatched the IO and there is
72no more IO on this service tree.
73
74All async writes go on async service tree. There is no idling on async
75queues.
76
77CFQ has some optimizations for SSDs and if it detects a non-rotational
78media which can support higher queue depth (multiple requests in
79flight at a time), then it cuts down on idling of individual queues and
80all the queues move to sync-noidle tree and only tree idle remains. This
81tree idling provides isolation with buffered write queues on async tree.
82
83FAQ
84===
85Q1. Why to idle at all on queues marked with REQ_NOIDLE.
86
87A1. We only do tree idle (all queues on sync-noidle tree) on queues marked
88 with REQ_NOIDLE. This helps in providing isolation with all the sync-idle
89 queues. Otherwise in presence of many sequential readers, other
90 synchronous IO might not get fair share of disk.
91
92 For example, if there are 10 sequential readers doing IO and they get
93 100ms each. If a REQ_NOIDLE request comes in, it will be scheduled
94 roughly after 1 second. If after completion of REQ_NOIDLE request we
95 do not idle, and after a couple of milliseconds another REQ_NOIDLE
96 request comes in, again it will be scheduled after 1 second. Repeat it
97 and notice how a workload can lose its disk share and suffer due to
98 multiple sequential readers.
99
100 fsync can generate dependent IO where bunch of data is written in the
101 context of fsync, and later some journaling data is written. Journaling
102 data comes in only after fsync has finished its IO (at least for ext4
103 that seemed to be the case). Now if one decides not to idle on fsync
104 thread due to REQ_NOIDLE, then next journaling write will not get
105 scheduled for another second. A process doing small fsync, will suffer
106 badly in presence of multiple sequential readers.
107
108 Hence doing tree idling on threads using REQ_NOIDLE flag on requests
109 provides isolation from multiple sequential readers and at the same
110 time we do not idle on individual threads.
111
112Q2. When to specify REQ_NOIDLE
113A2. I would think whenever one is doing synchronous write and not expecting
114 more writes to be dispatched from same context soon, should be able
115 to specify REQ_NOIDLE on writes and that probably should work well for
116 most of the cases.
diff --git a/Documentation/email-clients.txt b/Documentation/email-clients.txt
index a0b58e29f911..860c29a472ad 100644
--- a/Documentation/email-clients.txt
+++ b/Documentation/email-clients.txt
@@ -199,18 +199,16 @@ to coerce it into behaving.
199 199
200To beat some sense out of the internal editor, do this: 200To beat some sense out of the internal editor, do this:
201 201
202- Under account settings, composition and addressing, uncheck "Compose
203 messages in HTML format".
204
205- Edit your Thunderbird config settings so that it won't use format=flowed. 202- Edit your Thunderbird config settings so that it won't use format=flowed.
206 Go to "edit->preferences->advanced->config editor" to bring up the 203 Go to "edit->preferences->advanced->config editor" to bring up the
207 thunderbird's registry editor, and set "mailnews.send_plaintext_flowed" to 204 thunderbird's registry editor, and set "mailnews.send_plaintext_flowed" to
208 "false". 205 "false".
209 206
210- Enable "preformat" mode: Shft-click on the Write icon to bring up the HTML 207- Disable HTML Format: Set "mail.identity.id1.compose_html" to "false".
211 composer, select "Preformat" from the drop-down box just under the subject 208
212 line, then close the message without saving. (This setting also applies to 209- Enable "preformat" mode: Set "editor.quotesPreformatted" to "true".
213 the text composer, but the only control for it is in the HTML composer.) 210
211- Enable UTF8: Set "prefs.converted-to-utf8" to "true".
214 212
215- Install the "toggle wordwrap" extension. Download the file from: 213- Install the "toggle wordwrap" extension. Download the file from:
216 https://addons.mozilla.org/thunderbird/addon/2351/ 214 https://addons.mozilla.org/thunderbird/addon/2351/
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index c4a6e148732a..4dc465477665 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -592,3 +592,11 @@ Why: In 3.0, we can now autodetect internal 3G device and already have
592 interface that was used by acer-wmi driver. It will replaced by 592 interface that was used by acer-wmi driver. It will replaced by
593 information log when acer-wmi initial. 593 information log when acer-wmi initial.
594Who: Lee, Chun-Yi <jlee@novell.com> 594Who: Lee, Chun-Yi <jlee@novell.com>
595
596----------------------------
597What: The XFS nodelaylog mount option
598When: 3.3
599Why: The delaylog mode that has been the default since 2.6.39 has proven
600 stable, and the old code is in the way of additional improvements in
601 the log code.
602Who: Christoph Hellwig <hch@lst.de>
diff --git a/Documentation/filesystems/befs.txt b/Documentation/filesystems/befs.txt
index 6e49c363938e..da45e6c842b8 100644
--- a/Documentation/filesystems/befs.txt
+++ b/Documentation/filesystems/befs.txt
@@ -27,7 +27,7 @@ His original code can still be found at:
27Does anyone know of a more current email address for Makoto? He doesn't 27Does anyone know of a more current email address for Makoto? He doesn't
28respond to the address given above... 28respond to the address given above...
29 29
30Current maintainer: Sergey S. Kostyliov <rathamahata@php4.ru> 30This filesystem doesn't have a maintainer.
31 31
32WHAT IS THIS DRIVER? 32WHAT IS THIS DRIVER?
33================== 33==================
diff --git a/Documentation/hwmon/max16065 b/Documentation/hwmon/max16065
index 44b4f61e04f9..c11f64a1f2ad 100644
--- a/Documentation/hwmon/max16065
+++ b/Documentation/hwmon/max16065
@@ -62,6 +62,13 @@ can be safely used to identify the chip. You will have to instantiate
62the devices explicitly. Please see Documentation/i2c/instantiating-devices for 62the devices explicitly. Please see Documentation/i2c/instantiating-devices for
63details. 63details.
64 64
65WARNING: Do not access chip registers using the i2cdump command, and do not use
66any of the i2ctools commands on a command register (0xa5 to 0xac). The chips
67supported by this driver interpret any access to a command register (including
68read commands) as request to execute the command in question. This may result in
69power loss, board resets, and/or Flash corruption. Worst case, your board may
70turn into a brick.
71
65 72
66Sysfs entries 73Sysfs entries
67------------- 74-------------
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 845a191004b1..54078ed96b37 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -319,4 +319,6 @@ Code Seq#(hex) Include File Comments
319 <mailto:thomas@winischhofer.net> 319 <mailto:thomas@winischhofer.net>
3200xF4 00-1F video/mbxfb.h mbxfb 3200xF4 00-1F video/mbxfb.h mbxfb
321 <mailto:raph@8d.com> 321 <mailto:raph@8d.com>
3220xF6 all LTTng Linux Trace Toolkit Next Generation
323 <mailto:mathieu.desnoyers@efficios.com>
3220xFD all linux/dm-ioctl.h 3240xFD all linux/dm-ioctl.h
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index 9a8674629a07..0e0734b509d8 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -620,17 +620,6 @@
620 (including this document itself) have been moved there, and might 620 (including this document itself) have been moved there, and might
621 be more up to date than the web version. 621 be more up to date than the web version.
622 622
623 * Name: "Linux Source Driver"
624 URL: http://lsd.linux.cz
625 Keywords: Browsing source code.
626 Description: "Linux Source Driver (LSD) is an application, which
627 can make browsing source codes of Linux kernel easier than you can
628 imagine. You can select between multiple versions of kernel (e.g.
629 0.01, 1.0.0, 2.0.33, 2.0.34pre13, 2.0.0, 2.1.101 etc.). With LSD
630 you can search Linux kernel (fulltext, macros, types, functions
631 and variables) and LSD can generate patches for you on the fly
632 (files, directories or kernel)".
633
634 * Name: "Linux Kernel Source Reference" 623 * Name: "Linux Kernel Source Reference"
635 Author: Thomas Graichen. 624 Author: Thomas Graichen.
636 URL: http://marc.info/?l=linux-kernel&m=96446640102205&w=4 625 URL: http://marc.info/?l=linux-kernel&m=96446640102205&w=4
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e279b7242912..854ed5ca7e3f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -40,6 +40,7 @@ parameter is applicable:
40 ALSA ALSA sound support is enabled. 40 ALSA ALSA sound support is enabled.
41 APIC APIC support is enabled. 41 APIC APIC support is enabled.
42 APM Advanced Power Management support is enabled. 42 APM Advanced Power Management support is enabled.
43 ARM ARM architecture is enabled.
43 AVR32 AVR32 architecture is enabled. 44 AVR32 AVR32 architecture is enabled.
44 AX25 Appropriate AX.25 support is enabled. 45 AX25 Appropriate AX.25 support is enabled.
45 BLACKFIN Blackfin architecture is enabled. 46 BLACKFIN Blackfin architecture is enabled.
@@ -49,6 +50,7 @@ parameter is applicable:
49 EFI EFI Partitioning (GPT) is enabled 50 EFI EFI Partitioning (GPT) is enabled
50 EIDE EIDE/ATAPI support is enabled. 51 EIDE EIDE/ATAPI support is enabled.
51 FB The frame buffer device is enabled. 52 FB The frame buffer device is enabled.
53 FTRACE Function tracing enabled.
52 GCOV GCOV profiling is enabled. 54 GCOV GCOV profiling is enabled.
53 HW Appropriate hardware is enabled. 55 HW Appropriate hardware is enabled.
54 IA-64 IA-64 architecture is enabled. 56 IA-64 IA-64 architecture is enabled.
@@ -69,6 +71,7 @@ parameter is applicable:
69 Documentation/m68k/kernel-options.txt. 71 Documentation/m68k/kernel-options.txt.
70 MCA MCA bus support is enabled. 72 MCA MCA bus support is enabled.
71 MDA MDA console support is enabled. 73 MDA MDA console support is enabled.
74 MIPS MIPS architecture is enabled.
72 MOUSE Appropriate mouse support is enabled. 75 MOUSE Appropriate mouse support is enabled.
73 MSI Message Signaled Interrupts (PCI). 76 MSI Message Signaled Interrupts (PCI).
74 MTD MTD (Memory Technology Device) support is enabled. 77 MTD MTD (Memory Technology Device) support is enabled.
@@ -100,7 +103,6 @@ parameter is applicable:
100 SPARC Sparc architecture is enabled. 103 SPARC Sparc architecture is enabled.
101 SWSUSP Software suspend (hibernation) is enabled. 104 SWSUSP Software suspend (hibernation) is enabled.
102 SUSPEND System suspend states are enabled. 105 SUSPEND System suspend states are enabled.
103 FTRACE Function tracing enabled.
104 TPM TPM drivers are enabled. 106 TPM TPM drivers are enabled.
105 TS Appropriate touchscreen support is enabled. 107 TS Appropriate touchscreen support is enabled.
106 UMS USB Mass Storage support is enabled. 108 UMS USB Mass Storage support is enabled.
@@ -115,7 +117,7 @@ parameter is applicable:
115 X86-64 X86-64 architecture is enabled. 117 X86-64 X86-64 architecture is enabled.
116 More X86-64 boot options can be found in 118 More X86-64 boot options can be found in
117 Documentation/x86/x86_64/boot-options.txt . 119 Documentation/x86/x86_64/boot-options.txt .
118 X86 Either 32bit or 64bit x86 (same as X86-32+X86-64) 120 X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
119 XEN Xen support is enabled 121 XEN Xen support is enabled
120 122
121In addition, the following text indicates that the option: 123In addition, the following text indicates that the option:
@@ -376,7 +378,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
376 atkbd.softrepeat= [HW] 378 atkbd.softrepeat= [HW]
377 Use software keyboard repeat 379 Use software keyboard repeat
378 380
379 autotest [IA64] 381 autotest [IA-64]
380 382
381 baycom_epp= [HW,AX25] 383 baycom_epp= [HW,AX25]
382 Format: <io>,<mode> 384 Format: <io>,<mode>
@@ -681,8 +683,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
681 uart[8250],mmio32,<addr>[,options] 683 uart[8250],mmio32,<addr>[,options]
682 Start an early, polled-mode console on the 8250/16550 684 Start an early, polled-mode console on the 8250/16550
683 UART at the specified I/O port or MMIO address. 685 UART at the specified I/O port or MMIO address.
684 MMIO inter-register address stride is either 8bit (mmio) 686 MMIO inter-register address stride is either 8-bit
685 or 32bit (mmio32). 687 (mmio) or 32-bit (mmio32).
686 The options are the same as for ttyS, above. 688 The options are the same as for ttyS, above.
687 689
688 earlyprintk= [X86,SH,BLACKFIN] 690 earlyprintk= [X86,SH,BLACKFIN]
@@ -725,7 +727,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
725 See Documentation/block/as-iosched.txt and 727 See Documentation/block/as-iosched.txt and
726 Documentation/block/deadline-iosched.txt for details. 728 Documentation/block/deadline-iosched.txt for details.
727 729
728 elfcorehdr= [IA64,PPC,SH,X86] 730 elfcorehdr= [IA-64,PPC,SH,X86]
729 Specifies physical address of start of kernel core 731 Specifies physical address of start of kernel core
730 image elf header. Generally kexec loader will 732 image elf header. Generally kexec loader will
731 pass this option to capture kernel. 733 pass this option to capture kernel.
@@ -791,7 +793,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
791 tracer at boot up. function-list is a comma separated 793 tracer at boot up. function-list is a comma separated
792 list of functions. This list can be changed at run 794 list of functions. This list can be changed at run
793 time by the set_ftrace_filter file in the debugfs 795 time by the set_ftrace_filter file in the debugfs
794 tracing directory. 796 tracing directory.
795 797
796 ftrace_notrace=[function-list] 798 ftrace_notrace=[function-list]
797 [FTRACE] Do not trace the functions specified in 799 [FTRACE] Do not trace the functions specified in
@@ -829,7 +831,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
829 831
830 hashdist= [KNL,NUMA] Large hashes allocated during boot 832 hashdist= [KNL,NUMA] Large hashes allocated during boot
831 are distributed across NUMA nodes. Defaults on 833 are distributed across NUMA nodes. Defaults on
832 for 64bit NUMA, off otherwise. 834 for 64-bit NUMA, off otherwise.
833 Format: 0 | 1 (for off | on) 835 Format: 0 | 1 (for off | on)
834 836
835 hcl= [IA-64] SGI's Hardware Graph compatibility layer 837 hcl= [IA-64] SGI's Hardware Graph compatibility layer
@@ -998,10 +1000,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
998 DMA. 1000 DMA.
999 forcedac [x86_64] 1001 forcedac [x86_64]
1000 With this option iommu will not optimize to look 1002 With this option iommu will not optimize to look
1001 for io virtual address below 32 bit forcing dual 1003 for io virtual address below 32-bit forcing dual
1002 address cycle on pci bus for cards supporting greater 1004 address cycle on pci bus for cards supporting greater
1003 than 32 bit addressing. The default is to look 1005 than 32-bit addressing. The default is to look
1004 for translation below 32 bit and if not available 1006 for translation below 32-bit and if not available
1005 then look in the higher range. 1007 then look in the higher range.
1006 strict [Default Off] 1008 strict [Default Off]
1007 With this option on every unmap_single operation will 1009 With this option on every unmap_single operation will
@@ -1017,7 +1019,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1017 off disable Interrupt Remapping 1019 off disable Interrupt Remapping
1018 nosid disable Source ID checking 1020 nosid disable Source ID checking
1019 1021
1020 inttest= [IA64] 1022 inttest= [IA-64]
1021 1023
1022 iomem= Disable strict checking of access to MMIO memory 1024 iomem= Disable strict checking of access to MMIO memory
1023 strict regions from userspace. 1025 strict regions from userspace.
@@ -1034,7 +1036,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1034 nomerge 1036 nomerge
1035 forcesac 1037 forcesac
1036 soft 1038 soft
1037 pt [x86, IA64] 1039 pt [x86, IA-64]
1038 1040
1039 io7= [HW] IO7 for Marvel based alpha systems 1041 io7= [HW] IO7 for Marvel based alpha systems
1040 See comment before marvel_specify_io7 in 1042 See comment before marvel_specify_io7 in
@@ -1165,7 +1167,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1165 1167
1166 kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU) 1168 kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU)
1167 for all guests. 1169 for all guests.
1168 Default is 1 (enabled) if in 64bit or 32bit-PAE mode 1170 Default is 1 (enabled) if in 64-bit or 32-bit PAE mode.
1169 1171
1170 kvm-intel.ept= [KVM,Intel] Disable extended page tables 1172 kvm-intel.ept= [KVM,Intel] Disable extended page tables
1171 (virtualized MMU) support on capable Intel chips. 1173 (virtualized MMU) support on capable Intel chips.
@@ -1202,10 +1204,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1202 libata.dma=0 Disable all PATA and SATA DMA 1204 libata.dma=0 Disable all PATA and SATA DMA
1203 libata.dma=1 PATA and SATA Disk DMA only 1205 libata.dma=1 PATA and SATA Disk DMA only
1204 libata.dma=2 ATAPI (CDROM) DMA only 1206 libata.dma=2 ATAPI (CDROM) DMA only
1205 libata.dma=4 Compact Flash DMA only 1207 libata.dma=4 Compact Flash DMA only
1206 Combinations also work, so libata.dma=3 enables DMA 1208 Combinations also work, so libata.dma=3 enables DMA
1207 for disks and CDROMs, but not CFs. 1209 for disks and CDROMs, but not CFs.
1208 1210
1209 libata.ignore_hpa= [LIBATA] Ignore HPA limit 1211 libata.ignore_hpa= [LIBATA] Ignore HPA limit
1210 libata.ignore_hpa=0 keep BIOS limits (default) 1212 libata.ignore_hpa=0 keep BIOS limits (default)
1211 libata.ignore_hpa=1 ignore limits, using full disk 1213 libata.ignore_hpa=1 ignore limits, using full disk
@@ -1331,7 +1333,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1331 ltpc= [NET] 1333 ltpc= [NET]
1332 Format: <io>,<irq>,<dma> 1334 Format: <io>,<irq>,<dma>
1333 1335
1334 machvec= [IA64] Force the use of a particular machine-vector 1336 machvec= [IA-64] Force the use of a particular machine-vector
1335 (machvec) in a generic kernel. 1337 (machvec) in a generic kernel.
1336 Example: machvec=hpzx1_swiotlb 1338 Example: machvec=hpzx1_swiotlb
1337 1339
@@ -1348,9 +1350,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1348 it is equivalent to "nosmp", which also disables 1350 it is equivalent to "nosmp", which also disables
1349 the IO APIC. 1351 the IO APIC.
1350 1352
1351 max_loop= [LOOP] Maximum number of loopback devices that can 1353 max_loop= [LOOP] The number of loop block devices that get
1352 be mounted 1354 (loop.max_loop) unconditionally pre-created at init time. The default
1353 Format: <1-256> 1355 number is configured by BLK_DEV_LOOP_MIN_COUNT. Instead
1356 of statically allocating a predefined number, loop
1357 devices can be requested on-demand with the
1358 /dev/loop-control interface.
1354 1359
1355 mcatest= [IA-64] 1360 mcatest= [IA-64]
1356 1361
@@ -1734,7 +1739,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1734 1739
1735 nointroute [IA-64] 1740 nointroute [IA-64]
1736 1741
1737 nojitter [IA64] Disables jitter checking for ITC timers. 1742 nojitter [IA-64] Disables jitter checking for ITC timers.
1738 1743
1739 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver 1744 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
1740 1745
@@ -1800,7 +1805,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1800 1805
1801 nox2apic [X86-64,APIC] Do not enable x2APIC mode. 1806 nox2apic [X86-64,APIC] Do not enable x2APIC mode.
1802 1807
1803 nptcg= [IA64] Override max number of concurrent global TLB 1808 nptcg= [IA-64] Override max number of concurrent global TLB
1804 purges which is reported from either PAL_VM_SUMMARY or 1809 purges which is reported from either PAL_VM_SUMMARY or
1805 SAL PALO. 1810 SAL PALO.
1806 1811
@@ -2077,13 +2082,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2077 Format: { parport<nr> | timid | 0 } 2082 Format: { parport<nr> | timid | 0 }
2078 See also Documentation/parport.txt. 2083 See also Documentation/parport.txt.
2079 2084
2080 pmtmr= [X86] Manual setup of pmtmr I/O Port. 2085 pmtmr= [X86] Manual setup of pmtmr I/O Port.
2081 Override pmtimer IOPort with a hex value. 2086 Override pmtimer IOPort with a hex value.
2082 e.g. pmtmr=0x508 2087 e.g. pmtmr=0x508
2083 2088
2084 pnp.debug [PNP] 2089 pnp.debug=1 [PNP]
2085 Enable PNP debug messages. This depends on the 2090 Enable PNP debug messages (depends on the
2086 CONFIG_PNP_DEBUG_MESSAGES option. 2091 CONFIG_PNP_DEBUG_MESSAGES option). Change at run-time
2092 via /sys/module/pnp/parameters/debug. We always show
2093 current resource usage; turning this on also shows
2094 possible settings and some assignment information.
2087 2095
2088 pnpacpi= [ACPI] 2096 pnpacpi= [ACPI]
2089 { off } 2097 { off }
@@ -2635,6 +2643,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2635 medium is write-protected). 2643 medium is write-protected).
2636 Example: quirks=0419:aaf5:rl,0421:0433:rc 2644 Example: quirks=0419:aaf5:rl,0421:0433:rc
2637 2645
2646 user_debug= [KNL,ARM]
2647 Format: <int>
2648 See arch/arm/Kconfig.debug help text.
2649 1 - undefined instruction events
2650 2 - system calls
2651 4 - invalid data aborts
2652 8 - SIGSEGV faults
2653 16 - SIGBUS faults
2654 Example: user_debug=31
2655
2638 userpte= 2656 userpte=
2639 [X86] Flags controlling user PTE allocations. 2657 [X86] Flags controlling user PTE allocations.
2640 2658
@@ -2680,6 +2698,27 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2680 vmpoff= [KNL,S390] Perform z/VM CP command after power off. 2698 vmpoff= [KNL,S390] Perform z/VM CP command after power off.
2681 Format: <command> 2699 Format: <command>
2682 2700
2701 vsyscall= [X86-64]
2702 Controls the behavior of vsyscalls (i.e. calls to
2703 fixed addresses of 0xffffffffff600x00 from legacy
2704 code). Most statically-linked binaries and older
2705 versions of glibc use these calls. Because these
2706 functions are at fixed addresses, they make nice
2707 targets for exploits that can control RIP.
2708
2709 emulate [default] Vsyscalls turn into traps and are
2710 emulated reasonably safely.
2711
2712 native Vsyscalls are native syscall instructions.
2713 This is a little bit faster than trapping
2714 and makes a few dynamic recompilers work
2715 better than they would in emulation mode.
2716 It also makes exploits much easier to write.
2717
2718 none Vsyscalls don't work at all. This makes
2719 them quite hard to use for exploits but
2720 might break your system.
2721
2683 vt.cur_default= [VT] Default cursor shape. 2722 vt.cur_default= [VT] Default cursor shape.
2684 Format: 0xCCBBAA, where AA, BB, and CC are the same as 2723 Format: 0xCCBBAA, where AA, BB, and CC are the same as
2685 the parameters of the <Esc>[?A;B;Cc escape sequence; 2724 the parameters of the <Esc>[?A;B;Cc escape sequence;
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index 4edd78dfb362..bbce1215434a 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -1,13 +1,21 @@
100-INDEX 100-INDEX
2 - this file 2 - this file
33c359.txt
4 - information on the 3Com TokenLink Velocity XL (3c5359) driver.
33c505.txt 53c505.txt
4 - information on the 3Com EtherLink Plus (3c505) driver. 6 - information on the 3Com EtherLink Plus (3c505) driver.
73c509.txt
8 - information on the 3Com Etherlink III Series Ethernet cards.
56pack.txt 96pack.txt
6 - info on the 6pack protocol, an alternative to KISS for AX.25 10 - info on the 6pack protocol, an alternative to KISS for AX.25
7DLINK.txt 11DLINK.txt
8 - info on the D-Link DE-600/DE-620 parallel port pocket adapters 12 - info on the D-Link DE-600/DE-620 parallel port pocket adapters
9PLIP.txt 13PLIP.txt
10 - PLIP: The Parallel Line Internet Protocol device driver 14 - PLIP: The Parallel Line Internet Protocol device driver
15README.ipw2100
16 - README for the Intel PRO/Wireless 2100 driver.
17README.ipw2200
18 - README for the Intel PRO/Wireless 2915ABG and 2200BG driver.
11README.sb1000 19README.sb1000
12 - info on General Instrument/NextLevel SURFboard1000 cable modem. 20 - info on General Instrument/NextLevel SURFboard1000 cable modem.
13alias.txt 21alias.txt
@@ -20,8 +28,12 @@ atm.txt
20 - info on where to get ATM programs and support for Linux. 28 - info on where to get ATM programs and support for Linux.
21ax25.txt 29ax25.txt
22 - info on using AX.25 and NET/ROM code for Linux 30 - info on using AX.25 and NET/ROM code for Linux
31batman-adv.txt
32 - B.A.T.M.A.N routing protocol on top of layer 2 Ethernet Frames.
23baycom.txt 33baycom.txt
24 - info on the driver for Baycom style amateur radio modems 34 - info on the driver for Baycom style amateur radio modems
35bonding.txt
36 - Linux Ethernet Bonding Driver HOWTO: link aggregation in Linux.
25bridge.txt 37bridge.txt
26 - where to get user space programs for ethernet bridging with Linux. 38 - where to get user space programs for ethernet bridging with Linux.
27can.txt 39can.txt
@@ -34,32 +46,60 @@ cxacru.txt
34 - Conexant AccessRunner USB ADSL Modem 46 - Conexant AccessRunner USB ADSL Modem
35cxacru-cf.py 47cxacru-cf.py
36 - Conexant AccessRunner USB ADSL Modem configuration file parser 48 - Conexant AccessRunner USB ADSL Modem configuration file parser
49cxgb.txt
50 - Release Notes for the Chelsio N210 Linux device driver.
51dccp.txt
52 - the Datagram Congestion Control Protocol (DCCP) (RFC 4340..42).
37de4x5.txt 53de4x5.txt
38 - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver 54 - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver
39decnet.txt 55decnet.txt
40 - info on using the DECnet networking layer in Linux. 56 - info on using the DECnet networking layer in Linux.
41depca.txt 57depca.txt
42 - the Digital DEPCA/EtherWORKS DE1?? and DE2?? LANCE Ethernet driver 58 - the Digital DEPCA/EtherWORKS DE1?? and DE2?? LANCE Ethernet driver
59dl2k.txt
60 - README for D-Link DL2000-based Gigabit Ethernet Adapters (dl2k.ko).
61dm9000.txt
62 - README for the Simtec DM9000 Network driver.
43dmfe.txt 63dmfe.txt
44 - info on the Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver. 64 - info on the Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver.
65dns_resolver.txt
66 - The DNS resolver module allows kernel services to make DNS queries.
67driver.txt
68 - Softnet driver issues.
45e100.txt 69e100.txt
46 - info on Intel's EtherExpress PRO/100 line of 10/100 boards 70 - info on Intel's EtherExpress PRO/100 line of 10/100 boards
47e1000.txt 71e1000.txt
48 - info on Intel's E1000 line of gigabit ethernet boards 72 - info on Intel's E1000 line of gigabit ethernet boards
73e1000e.txt
74 - README for the Intel Gigabit Ethernet Driver (e1000e).
49eql.txt 75eql.txt
50 - serial IP load balancing 76 - serial IP load balancing
51ewrk3.txt 77ewrk3.txt
52 - the Digital EtherWORKS 3 DE203/4/5 Ethernet driver 78 - the Digital EtherWORKS 3 DE203/4/5 Ethernet driver
79fib_trie.txt
80 - Level Compressed Trie (LC-trie) notes: a structure for routing.
53filter.txt 81filter.txt
54 - Linux Socket Filtering 82 - Linux Socket Filtering
55fore200e.txt 83fore200e.txt
56 - FORE Systems PCA-200E/SBA-200E ATM NIC driver info. 84 - FORE Systems PCA-200E/SBA-200E ATM NIC driver info.
57framerelay.txt 85framerelay.txt
58 - info on using Frame Relay/Data Link Connection Identifier (DLCI). 86 - info on using Frame Relay/Data Link Connection Identifier (DLCI).
87gen_stats.txt
88 - Generic networking statistics for netlink users.
89generic_hdlc.txt
90 - The generic High Level Data Link Control (HDLC) layer.
59generic_netlink.txt 91generic_netlink.txt
60 - info on Generic Netlink 92 - info on Generic Netlink
93gianfar.txt
94 - Gianfar Ethernet Driver.
61ieee802154.txt 95ieee802154.txt
62 - Linux IEEE 802.15.4 implementation, API and drivers 96 - Linux IEEE 802.15.4 implementation, API and drivers
97ifenslave.c
98 - Configure network interfaces for parallel routing (bonding).
99igb.txt
100 - README for the Intel Gigabit Ethernet Driver (igb).
101igbvf.txt
102 - README for the Intel Gigabit Ethernet Driver (igbvf).
63ip-sysctl.txt 103ip-sysctl.txt
64 - /proc/sys/net/ipv4/* variables 104 - /proc/sys/net/ipv4/* variables
65ip_dynaddr.txt 105ip_dynaddr.txt
@@ -68,41 +108,117 @@ ipddp.txt
68 - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation 108 - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation
69iphase.txt 109iphase.txt
70 - Interphase PCI ATM (i)Chip IA Linux driver info. 110 - Interphase PCI ATM (i)Chip IA Linux driver info.
111ipv6.txt
112 - Options to the ipv6 kernel module.
113ipvs-sysctl.txt
114 - Per-inode explanation of the /proc/sys/net/ipv4/vs interface.
71irda.txt 115irda.txt
72 - where to get IrDA (infrared) utilities and info for Linux. 116 - where to get IrDA (infrared) utilities and info for Linux.
117ixgb.txt
118 - README for the Intel 10 Gigabit Ethernet Driver (ixgb).
119ixgbe.txt
120 - README for the Intel 10 Gigabit Ethernet Driver (ixgbe).
121ixgbevf.txt
122 - README for the Intel Virtual Function (VF) Driver (ixgbevf).
123l2tp.txt
124 - User guide to the L2TP tunnel protocol.
73lapb-module.txt 125lapb-module.txt
74 - programming information of the LAPB module. 126 - programming information of the LAPB module.
75ltpc.txt 127ltpc.txt
76 - the Apple or Farallon LocalTalk PC card driver 128 - the Apple or Farallon LocalTalk PC card driver
129mac80211-injection.txt
130 - HOWTO use packet injection with mac80211
77multicast.txt 131multicast.txt
78 - Behaviour of cards under Multicast 132 - Behaviour of cards under Multicast
133multiqueue.txt
134 - HOWTO for multiqueue network device support.
135netconsole.txt
136 - The network console module netconsole.ko: configuration and notes.
137netdev-features.txt
138 - Network interface features API description.
79netdevices.txt 139netdevices.txt
80 - info on network device driver functions exported to the kernel. 140 - info on network device driver functions exported to the kernel.
141netif-msg.txt
142 - Design of the network interface message level setting (NETIF_MSG_*).
143nfc.txt
144 - The Linux Near Field Communication (NFC) subsystem.
81olympic.txt 145olympic.txt
82 - IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info. 146 - IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info.
147operstates.txt
148 - Overview of network interface operational states.
149packet_mmap.txt
150 - User guide to memory mapped packet socket rings (PACKET_[RT]X_RING).
151phonet.txt
152 - The Phonet packet protocol used in Nokia cellular modems.
153phy.txt
154 - The PHY abstraction layer.
155pktgen.txt
156 - User guide to the kernel packet generator (pktgen.ko).
83policy-routing.txt 157policy-routing.txt
84 - IP policy-based routing 158 - IP policy-based routing
159ppp_generic.txt
160 - Information about the generic PPP driver.
161proc_net_tcp.txt
162 - Per inode overview of the /proc/net/tcp and /proc/net/tcp6 interfaces.
163radiotap-headers.txt
164 - Background on radiotap headers.
85ray_cs.txt 165ray_cs.txt
86 - Raylink Wireless LAN card driver info. 166 - Raylink Wireless LAN card driver info.
167rds.txt
168 - Background on the reliable, ordered datagram delivery method RDS.
169regulatory.txt
170 - Overview of the Linux wireless regulatory infrastructure.
171rxrpc.txt
172 - Guide to the RxRPC protocol.
173s2io.txt
174 - Release notes for Neterion Xframe I/II 10GbE driver.
175scaling.txt
176 - Explanation of network scaling techniques: RSS, RPS, RFS, aRFS, XPS.
177sctp.txt
178 - Notes on the Linux kernel implementation of the SCTP protocol.
179secid.txt
180 - Explanation of the secid member in flow structures.
87skfp.txt 181skfp.txt
88 - SysKonnect FDDI (SK-5xxx, Compaq Netelligent) driver info. 182 - SysKonnect FDDI (SK-5xxx, Compaq Netelligent) driver info.
89smc9.txt 183smc9.txt
90 - the driver for SMC's 9000 series of Ethernet cards 184 - the driver for SMC's 9000 series of Ethernet cards
91smctr.txt 185smctr.txt
92 - SMC TokenCard TokenRing Linux driver info. 186 - SMC TokenCard TokenRing Linux driver info.
187spider-net.txt
188 - README for the Spidernet Driver (as found in PS3 / Cell BE).
189stmmac.txt
190 - README for the STMicro Synopsys Ethernet driver.
191tc-actions-env-rules.txt
192 - rules for traffic control (tc) actions.
193timestamping.txt
194 - overview of network packet timestamping variants.
93tcp.txt 195tcp.txt
94 - short blurb on how TCP output takes place. 196 - short blurb on how TCP output takes place.
197tcp-thin.txt
198 - kernel tuning options for low rate 'thin' TCP streams.
95tlan.txt 199tlan.txt
96 - ThunderLAN (Compaq Netelligent 10/100, Olicom OC-2xxx) driver info. 200 - ThunderLAN (Compaq Netelligent 10/100, Olicom OC-2xxx) driver info.
97tms380tr.txt 201tms380tr.txt
98 - SysKonnect Token Ring ISA/PCI adapter driver info. 202 - SysKonnect Token Ring ISA/PCI adapter driver info.
203tproxy.txt
204 - Transparent proxy support user guide.
99tuntap.txt 205tuntap.txt
100 - TUN/TAP device driver, allowing user space Rx/Tx of packets. 206 - TUN/TAP device driver, allowing user space Rx/Tx of packets.
207udplite.txt
208 - UDP-Lite protocol (RFC 3828) introduction.
101vortex.txt 209vortex.txt
102 - info on using 3Com Vortex (3c590, 3c592, 3c595, 3c597) Ethernet cards. 210 - info on using 3Com Vortex (3c590, 3c592, 3c595, 3c597) Ethernet cards.
211vxge.txt
212 - README for the Neterion X3100 PCIe Server Adapter.
103x25.txt 213x25.txt
104 - general info on X.25 development. 214 - general info on X.25 development.
105x25-iface.txt 215x25-iface.txt
106 - description of the X.25 Packet Layer to LAPB device interface. 216 - description of the X.25 Packet Layer to LAPB device interface.
217xfrm_proc.txt
218 - description of the statistics package for XFRM.
219xfrm_sync.txt
220 - sync patches for XFRM enable migration of an SA between hosts.
221xfrm_sysctl.txt
222 - description of the XFRM configuration options.
107z8530drv.txt 223z8530drv.txt
108 - info about Linux driver for Z8530 based HDLC cards for AX.25 224 - info about Linux driver for Z8530 based HDLC cards for AX.25
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 5dd960d75174..91df678fb7f8 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -238,6 +238,18 @@ ad_select
238 238
239 This option was added in bonding version 3.4.0. 239 This option was added in bonding version 3.4.0.
240 240
241all_slaves_active
242
243 Specifies that duplicate frames (received on inactive ports) should be
244 dropped (0) or delivered (1).
245
246 Normally, bonding will drop duplicate frames (received on inactive
247 ports), which is desirable for most users. But there are times when
248 it is useful to allow duplicate frames to be delivered.
249
250 The default value is 0 (drop duplicate frames received on inactive
251 ports).
252
241arp_interval 253arp_interval
242 254
243 Specifies the ARP link monitoring frequency in milliseconds. 255 Specifies the ARP link monitoring frequency in milliseconds.
@@ -433,6 +445,23 @@ miimon
433 determined. See the High Availability section for additional 445 determined. See the High Availability section for additional
434 information. The default value is 0. 446 information. The default value is 0.
435 447
448min_links
449
450 Specifies the minimum number of links that must be active before
451 asserting carrier. It is similar to the Cisco EtherChannel min-links
452 feature. This allows setting the minimum number of member ports that
453 must be up (link-up state) before marking the bond device as up
454 (carrier on). This is useful for situations where higher level services
455 such as clustering want to ensure a minimum number of low bandwidth
456 links are active before switchover. This option only affects 802.3ad
457 mode.
458
459 The default value is 0. This will cause carrier to be asserted (for
460 802.3ad mode) whenever there is an active aggregator, regardless of the
461 number of available links in that aggregator. Note that, because an
462 aggregator cannot be active without at least one available link,
463 setting this option to 0 or to 1 has the exact same effect.
464
436mode 465mode
437 466
438 Specifies one of the bonding policies. The default is 467 Specifies one of the bonding policies. The default is
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index db2a4067013c..81546990f41c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -992,7 +992,7 @@ bindv6only - BOOLEAN
992 TRUE: disable IPv4-mapped address feature 992 TRUE: disable IPv4-mapped address feature
993 FALSE: enable IPv4-mapped address feature 993 FALSE: enable IPv4-mapped address feature
994 994
995 Default: FALSE (as specified in RFC2553bis) 995 Default: FALSE (as specified in RFC3493)
996 996
997IPv6 Fragmentation: 997IPv6 Fragmentation:
998 998
diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt
new file mode 100644
index 000000000000..58fd7414e6c0
--- /dev/null
+++ b/Documentation/networking/scaling.txt
@@ -0,0 +1,378 @@
1Scaling in the Linux Networking Stack
2
3
4Introduction
5============
6
7This document describes a set of complementary techniques in the Linux
8networking stack to increase parallelism and improve performance for
9multi-processor systems.
10
11The following technologies are described:
12
13 RSS: Receive Side Scaling
14 RPS: Receive Packet Steering
15 RFS: Receive Flow Steering
16 Accelerated Receive Flow Steering
17 XPS: Transmit Packet Steering
18
19
20RSS: Receive Side Scaling
21=========================
22
23Contemporary NICs support multiple receive and transmit descriptor queues
24(multi-queue). On reception, a NIC can send different packets to different
25queues to distribute processing among CPUs. The NIC distributes packets by
26applying a filter to each packet that assigns it to one of a small number
27of logical flows. Packets for each flow are steered to a separate receive
28queue, which in turn can be processed by separate CPUs. This mechanism is
29generally known as “Receive-side Scaling” (RSS). The goal of RSS and
30the other scaling techniques is to increase performance uniformly.
31Multi-queue distribution can also be used for traffic prioritization, but
32that is not the focus of these techniques.
33
34The filter used in RSS is typically a hash function over the network
35and/or transport layer headers-- for example, a 4-tuple hash over
36IP addresses and TCP ports of a packet. The most common hardware
37implementation of RSS uses a 128-entry indirection table where each entry
38stores a queue number. The receive queue for a packet is determined
39by masking out the low order seven bits of the computed hash for the
40packet (usually a Toeplitz hash), taking this number as a key into the
41indirection table and reading the corresponding value.
42
43Some advanced NICs allow steering packets to queues based on
44programmable filters. For example, webserver bound TCP port 80 packets
45can be directed to their own receive queue. Such “n-tuple” filters can
46be configured from ethtool (--config-ntuple).
47
48==== RSS Configuration
49
50The driver for a multi-queue capable NIC typically provides a kernel
51module parameter for specifying the number of hardware queues to
52configure. In the bnx2x driver, for instance, this parameter is called
53num_queues. A typical RSS configuration would be to have one receive queue
54for each CPU if the device supports enough queues, or otherwise at least
55one for each memory domain, where a memory domain is a set of CPUs that
56share a particular memory level (L1, L2, NUMA node, etc.).
57
58The indirection table of an RSS device, which resolves a queue by masked
59hash, is usually programmed by the driver at initialization. The
60default mapping is to distribute the queues evenly in the table, but the
61indirection table can be retrieved and modified at runtime using ethtool
62commands (--show-rxfh-indir and --set-rxfh-indir). Modifying the
63indirection table could be done to give different queues different
64relative weights.
65
66== RSS IRQ Configuration
67
68Each receive queue has a separate IRQ associated with it. The NIC triggers
69this to notify a CPU when new packets arrive on the given queue. The
70signaling path for PCIe devices uses message signaled interrupts (MSI-X),
71that can route each interrupt to a particular CPU. The active mapping
72of queues to IRQs can be determined from /proc/interrupts. By default,
73an IRQ may be handled on any CPU. Because a non-negligible part of packet
74processing takes place in receive interrupt handling, it is advantageous
75to spread receive interrupts between CPUs. To manually adjust the IRQ
76affinity of each interrupt see Documentation/IRQ-affinity.txt. Some systems
77will be running irqbalance, a daemon that dynamically optimizes IRQ
78assignments and as a result may override any manual settings.
79
80== Suggested Configuration
81
82RSS should be enabled when latency is a concern or whenever receive
83interrupt processing forms a bottleneck. Spreading load between CPUs
84decreases queue length. For low latency networking, the optimal setting
85is to allocate as many queues as there are CPUs in the system (or the
86NIC maximum, if lower). The most efficient high-rate configuration
87is likely the one with the smallest number of receive queues where no
88receive queue overflows due to a saturated CPU, because in default
89mode with interrupt coalescing enabled, the aggregate number of
90interrupts (and thus work) grows with each additional queue.
91
92Per-cpu load can be observed using the mpstat utility, but note that on
93processors with hyperthreading (HT), each hyperthread is represented as
94a separate CPU. For interrupt handling, HT has shown no benefit in
95initial tests, so limit the number of queues to the number of CPU cores
96in the system.
97
98
99RPS: Receive Packet Steering
100============================
101
102Receive Packet Steering (RPS) is logically a software implementation of
103RSS. Being in software, it is necessarily called later in the datapath.
104Whereas RSS selects the queue and hence CPU that will run the hardware
105interrupt handler, RPS selects the CPU to perform protocol processing
106above the interrupt handler. This is accomplished by placing the packet
107on the desired CPU’s backlog queue and waking up the CPU for processing.
108RPS has some advantages over RSS: 1) it can be used with any NIC,
1092) software filters can easily be added to hash over new protocols,
1103) it does not increase hardware device interrupt rate (although it does
111introduce inter-processor interrupts (IPIs)).
112
113RPS is called during bottom half of the receive interrupt handler, when
114a driver sends a packet up the network stack with netif_rx() or
115netif_receive_skb(). These call the get_rps_cpu() function, which
116selects the queue that should process a packet.
117
118The first step in determining the target CPU for RPS is to calculate a
119flow hash over the packet’s addresses or ports (2-tuple or 4-tuple hash
120depending on the protocol). This serves as a consistent hash of the
121associated flow of the packet. The hash is either provided by hardware
122or will be computed in the stack. Capable hardware can pass the hash in
123the receive descriptor for the packet; this would usually be the same
124hash used for RSS (e.g. computed Toeplitz hash). The hash is saved in
125skb->rx_hash and can be used elsewhere in the stack as a hash of the
126packet’s flow.
127
128Each receive hardware queue has an associated list of CPUs to which
129RPS may enqueue packets for processing. For each received packet,
130an index into the list is computed from the flow hash modulo the size
131of the list. The indexed CPU is the target for processing the packet,
132and the packet is queued to the tail of that CPU’s backlog queue. At
133the end of the bottom half routine, IPIs are sent to any CPUs for which
134packets have been queued to their backlog queue. The IPI wakes backlog
135processing on the remote CPU, and any queued packets are then processed
136up the networking stack.
137
138==== RPS Configuration
139
140RPS requires a kernel compiled with the CONFIG_RPS kconfig symbol (on
141by default for SMP). Even when compiled in, RPS remains disabled until
142explicitly configured. The list of CPUs to which RPS may forward traffic
143can be configured for each receive queue using a sysfs file entry:
144
145 /sys/class/net/<dev>/queues/rx-<n>/rps_cpus
146
147This file implements a bitmap of CPUs. RPS is disabled when it is zero
148(the default), in which case packets are processed on the interrupting
149CPU. Documentation/IRQ-affinity.txt explains how CPUs are assigned to
150the bitmap.
151
152== Suggested Configuration
153
154For a single queue device, a typical RPS configuration would be to set
155the rps_cpus to the CPUs in the same memory domain of the interrupting
156CPU. If NUMA locality is not an issue, this could also be all CPUs in
157the system. At high interrupt rate, it might be wise to exclude the
158interrupting CPU from the map since that already performs much work.
159
160For a multi-queue system, if RSS is configured so that a hardware
161receive queue is mapped to each CPU, then RPS is probably redundant
162and unnecessary. If there are fewer hardware queues than CPUs, then
163RPS might be beneficial if the rps_cpus for each queue are the ones that
164share the same memory domain as the interrupting CPU for that queue.
165
166
167RFS: Receive Flow Steering
168==========================
169
170While RPS steers packets solely based on hash, and thus generally
171provides good load distribution, it does not take into account
172application locality. This is accomplished by Receive Flow Steering
173(RFS). The goal of RFS is to increase datacache hitrate by steering
174kernel processing of packets to the CPU where the application thread
175consuming the packet is running. RFS relies on the same RPS mechanisms
176to enqueue packets onto the backlog of another CPU and to wake up that
177CPU.
178
179In RFS, packets are not forwarded directly by the value of their hash,
180but the hash is used as index into a flow lookup table. This table maps
181flows to the CPUs where those flows are being processed. The flow hash
182(see RPS section above) is used to calculate the index into this table.
183The CPU recorded in each entry is the one which last processed the flow.
184If an entry does not hold a valid CPU, then packets mapped to that entry
185are steered using plain RPS. Multiple table entries may point to the
186same CPU. Indeed, with many flows and few CPUs, it is very likely that
187a single application thread handles flows with many different flow hashes.
188
189rps_sock_flow_table is a global flow table that contains the *desired* CPU for
190flows: the CPU that is currently processing the flow in userspace. Each
191table value is a CPU index that is updated during calls to recvmsg and
192sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage()
193and tcp_splice_read()).
194
195When the scheduler moves a thread to a new CPU while it has outstanding
196receive packets on the old CPU, packets may arrive out of order. To
197avoid this, RFS uses a second flow table to track outstanding packets
198for each flow: rps_dev_flow_table is a table specific to each hardware
199receive queue of each device. Each table value stores a CPU index and a
200counter. The CPU index represents the *current* CPU onto which packets
201for this flow are enqueued for further kernel processing. Ideally, kernel
202and userspace processing occur on the same CPU, and hence the CPU index
203in both tables is identical. This is likely false if the scheduler has
204recently migrated a userspace thread while the kernel still has packets
205enqueued for kernel processing on the old CPU.
206
207The counter in rps_dev_flow_table values records the length of the current
208CPU's backlog when a packet in this flow was last enqueued. Each backlog
209queue has a head counter that is incremented on dequeue. A tail counter
210is computed as head counter + queue length. In other words, the counter
211in rps_dev_flow_table[i] records the last element in flow i that has
212been enqueued onto the currently designated CPU for flow i (of course,
213entry i is actually selected by hash and multiple flows may hash to the
214same entry i).
215
216And now the trick for avoiding out of order packets: when selecting the
217CPU for packet processing (from get_rps_cpu()) the rps_sock_flow table
218and the rps_dev_flow table of the queue that the packet was received on
219are compared. If the desired CPU for the flow (found in the
220rps_sock_flow table) matches the current CPU (found in the rps_dev_flow
221table), the packet is enqueued onto that CPU’s backlog. If they differ,
222the current CPU is updated to match the desired CPU if one of the
223following is true:
224
225- The current CPU's queue head counter >= the recorded tail counter
226 value in rps_dev_flow[i]
227- The current CPU is unset (equal to NR_CPUS)
228- The current CPU is offline
229
230After this check, the packet is sent to the (possibly updated) current
231CPU. These rules aim to ensure that a flow only moves to a new CPU when
232there are no packets outstanding on the old CPU, as the outstanding
233packets could arrive later than those about to be processed on the new
234CPU.
235
236==== RFS Configuration
237
238RFS is only available if the kconfig symbol CONFIG_RPS is enabled (on
239by default for SMP). The functionality remains disabled until explicitly
240configured. The number of entries in the global flow table is set through:
241
242 /proc/sys/net/core/rps_sock_flow_entries
243
244The number of entries in the per-queue flow table is set through:
245
246 /sys/class/net/<dev>/queues/rx-<n>/rps_flow_cnt
247
248== Suggested Configuration
249
250Both of these need to be set before RFS is enabled for a receive queue.
251Values for both are rounded up to the nearest power of two. The
252suggested flow count depends on the expected number of active connections
253at any given time, which may be significantly less than the number of open
254connections. We have found that a value of 32768 for rps_sock_flow_entries
255works fairly well on a moderately loaded server.
256
257For a single queue device, the rps_flow_cnt value for the single queue
258would normally be configured to the same value as rps_sock_flow_entries.
259For a multi-queue device, the rps_flow_cnt for each queue might be
260configured as rps_sock_flow_entries / N, where N is the number of
261queues. So for instance, if rps_sock_flow_entries is set to 32768 and there
262are 16 configured receive queues, rps_flow_cnt for each queue might be
263configured as 2048.
264
265
266Accelerated RFS
267===============
268
269Accelerated RFS is to RFS what RSS is to RPS: a hardware-accelerated load
270balancing mechanism that uses soft state to steer flows based on where
271the application thread consuming the packets of each flow is running.
272Accelerated RFS should perform better than RFS since packets are sent
273directly to a CPU local to the thread consuming the data. The target CPU
274will either be the same CPU where the application runs, or at least a CPU
275which is local to the application thread’s CPU in the cache hierarchy.
276
277To enable accelerated RFS, the networking stack calls the
278ndo_rx_flow_steer driver function to communicate the desired hardware
279queue for packets matching a particular flow. The network stack
280automatically calls this function every time a flow entry in
281rps_dev_flow_table is updated. The driver in turn uses a device specific
282method to program the NIC to steer the packets.
283
284The hardware queue for a flow is derived from the CPU recorded in
285rps_dev_flow_table. The stack consults a CPU to hardware queue map which
286is maintained by the NIC driver. This is an auto-generated reverse map of
287the IRQ affinity table shown by /proc/interrupts. Drivers can use
288functions in the cpu_rmap (“CPU affinity reverse map”) kernel library
289to populate the map. For each CPU, the corresponding queue in the map is
290set to be one whose processing CPU is closest in cache locality.
291
292==== Accelerated RFS Configuration
293
294Accelerated RFS is only available if the kernel is compiled with
295CONFIG_RFS_ACCEL and support is provided by the NIC device and driver.
296It also requires that ntuple filtering is enabled via ethtool. The map
297of CPU to queues is automatically deduced from the IRQ affinities
298configured for each receive queue by the driver, so no additional
299configuration should be necessary.
300
301== Suggested Configuration
302
303This technique should be enabled whenever one wants to use RFS and the
304NIC supports hardware acceleration.
305
306XPS: Transmit Packet Steering
307=============================
308
309Transmit Packet Steering is a mechanism for intelligently selecting
310which transmit queue to use when transmitting a packet on a multi-queue
311device. To accomplish this, a mapping from CPU to hardware queue(s) is
312recorded. The goal of this mapping is usually to assign queues
313exclusively to a subset of CPUs, where the transmit completions for
314these queues are processed on a CPU within this set. This choice
315provides two benefits. First, contention on the device queue lock is
316significantly reduced since fewer CPUs contend for the same queue
317(contention can be eliminated completely if each CPU has its own
318transmit queue). Secondly, cache miss rate on transmit completion is
319reduced, in particular for data cache lines that hold the sk_buff
320structures.
321
322XPS is configured per transmit queue by setting a bitmap of CPUs that
323may use that queue to transmit. The reverse mapping, from CPUs to
324transmit queues, is computed and maintained for each network device.
325When transmitting the first packet in a flow, the function
326get_xps_queue() is called to select a queue. This function uses the ID
327of the running CPU as a key into the CPU-to-queue lookup table. If the
328ID matches a single queue, that is used for transmission. If multiple
329queues match, one is selected by using the flow hash to compute an index
330into the set.
331
332The queue chosen for transmitting a particular flow is saved in the
333corresponding socket structure for the flow (e.g. a TCP connection).
334This transmit queue is used for subsequent packets sent on the flow to
335prevent out of order (ooo) packets. The choice also amortizes the cost
336of calling get_xps_queue() over all packets in the flow. To avoid
337ooo packets, the queue for a flow can subsequently only be changed if
338skb->ooo_okay is set for a packet in the flow. This flag indicates that
339there are no outstanding packets in the flow, so the transmit queue can
340change without the risk of generating out of order packets. The
341transport layer is responsible for setting ooo_okay appropriately. TCP,
342for instance, sets the flag when all data for a connection has been
343acknowledged.
344
345==== XPS Configuration
346
347XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by
348default for SMP). The functionality remains disabled until explicitly
349configured. To enable XPS, the bitmap of CPUs that may use a transmit
350queue is configured using the sysfs file entry:
351
352/sys/class/net/<dev>/queues/tx-<n>/xps_cpus
353
354== Suggested Configuration
355
356For a network device with a single transmission queue, XPS configuration
357has no effect, since there is no choice in this case. In a multi-queue
358system, XPS is preferably configured so that each CPU maps onto one queue.
359If there are as many queues as there are CPUs in the system, then each
360queue can also map onto one CPU, resulting in exclusive pairings that
361experience no contention. If there are fewer queues than CPUs, then the
362best CPUs to share a given queue are probably those that share the cache
363with the CPU that processes transmit completions for that queue
364(transmit interrupts).
365
366
367Further Information
368===================
369RPS and RFS were introduced in kernel 2.6.35. XPS was incorporated into
3702.6.38. Original patches were submitted by Tom Herbert
371(therbert@google.com)
372
373Accelerated RFS was introduced in 2.6.39. Original patches were
374submitted by Ben Hutchings (bhutchings@solarflare.com)
375
376Authors:
377Tom Herbert (therbert@google.com)
378Willem de Bruijn (willemb@google.com)
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 4ce5450ab6e8..6066e3a6b9a9 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -431,8 +431,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
431 431
432 void pm_runtime_irq_safe(struct device *dev); 432 void pm_runtime_irq_safe(struct device *dev);
433 - set the power.irq_safe flag for the device, causing the runtime-PM 433 - set the power.irq_safe flag for the device, causing the runtime-PM
434 suspend and resume callbacks (but not the idle callback) to be invoked 434 callbacks to be invoked with interrupts off
435 with interrupts disabled
436 435
437 void pm_runtime_mark_last_busy(struct device *dev); 436 void pm_runtime_mark_last_busy(struct device *dev);
438 - set the power.last_busy field to the current time 437 - set the power.last_busy field to the current time
diff --git a/Documentation/ramoops.txt b/Documentation/ramoops.txt
new file mode 100644
index 000000000000..8fb1ba7fe7bf
--- /dev/null
+++ b/Documentation/ramoops.txt
@@ -0,0 +1,76 @@
1Ramoops oops/panic logger
2=========================
3
4Sergiu Iordache <sergiu@chromium.org>
5
6Updated: 8 August 2011
7
80. Introduction
9
10Ramoops is an oops/panic logger that writes its logs to RAM before the system
11crashes. It works by logging oopses and panics in a circular buffer. Ramoops
12needs a system with persistent RAM so that the content of that area can
13survive after a restart.
14
151. Ramoops concepts
16
17Ramoops uses a predefined memory area to store the dump. The start and size of
18the memory area are set using two variables:
19 * "mem_address" for the start
20 * "mem_size" for the size. The memory size will be rounded down to a
21 power of two.
22
23The memory area is divided into "record_size" chunks (also rounded down to
24power of two) and each oops/panic writes a "record_size" chunk of
25information.
26
27Dumping both oopses and panics can be done by setting 1 in the "dump_oops"
28variable while setting 0 in that variable dumps only the panics.
29
30The module uses a counter to record multiple dumps but the counter gets reset
31on restart (i.e. new dumps after the restart will overwrite old ones).
32
332. Setting the parameters
34
35Setting the ramoops parameters can be done in 2 different manners:
36 1. Use the module parameters (which have the same names as the variables
37 described above).
38 2. Use a platform device and set the platform data. The parameters can then
39 be set through that platform data. An example of doing that is:
40
41#include <linux/ramoops.h>
42[...]
43
44static struct ramoops_platform_data ramoops_data = {
45 .mem_size = <...>,
46 .mem_address = <...>,
47 .record_size = <...>,
48 .dump_oops = <...>,
49};
50
51static struct platform_device ramoops_dev = {
52 .name = "ramoops",
53 .dev = {
54 .platform_data = &ramoops_data,
55 },
56};
57
58[... inside a function ...]
59int ret;
60
61ret = platform_device_register(&ramoops_dev);
62if (ret) {
63 printk(KERN_ERR "unable to register platform device\n");
64 return ret;
65}
66
673. Dump format
68
69The data dump begins with a header, currently defined as "====" followed by a
70timestamp and a new line. The dump then continues with the actual data.
71
724. Reading the data
73
74The dump data can be read from memory (through /dev/mem or other means).
75Getting the module parameters, which are needed in order to parse the data, can
76be done through /sys/module/ramoops/parameters/* .
diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX
index fe0251c4cfb7..8e601991d91c 100644
--- a/Documentation/virtual/00-INDEX
+++ b/Documentation/virtual/00-INDEX
@@ -8,3 +8,6 @@ lguest/
8 - Extremely simple hypervisor for experimental/educational use. 8 - Extremely simple hypervisor for experimental/educational use.
9uml/ 9uml/
10 - User Mode Linux, builds/runs Linux kernel as a userspace program. 10 - User Mode Linux, builds/runs Linux kernel as a userspace program.
11virtio.txt
12 - Text version of draft virtio spec.
13 See http://ozlabs.org/~rusty/virtio-spec
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c
index 043bd7df3139..d928c134dee6 100644
--- a/Documentation/virtual/lguest/lguest.c
+++ b/Documentation/virtual/lguest/lguest.c
@@ -1996,6 +1996,9 @@ int main(int argc, char *argv[])
1996 /* We use a simple helper to copy the arguments separated by spaces. */ 1996 /* We use a simple helper to copy the arguments separated by spaces. */
1997 concat((char *)(boot + 1), argv+optind+2); 1997 concat((char *)(boot + 1), argv+optind+2);
1998 1998
1999 /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
2000 boot->hdr.kernel_alignment = 0x1000000;
2001
1999 /* Boot protocol version: 2.07 supports the fields for lguest. */ 2002 /* Boot protocol version: 2.07 supports the fields for lguest. */
2000 boot->hdr.version = 0x207; 2003 boot->hdr.version = 0x207;
2001 2004
diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt
new file mode 100644
index 000000000000..a350ae135b8c
--- /dev/null
+++ b/Documentation/virtual/virtio-spec.txt
@@ -0,0 +1,2200 @@
1[Generated file: see http://ozlabs.org/~rusty/virtio-spec/]
2Virtio PCI Card Specification
3v0.9.1 DRAFT
4-
5
6Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor)
7
82011 August 1.
9
10Purpose and Description
11
12This document describes the specifications of the “virtio” family
13of PCI[LaTeX Command: nomenclature] devices. These are devices
14are found in virtual environments[LaTeX Command: nomenclature],
15yet by design they are not all that different from physical PCI
16devices, and this document treats them as such. This allows the
17guest to use standard PCI drivers and discovery mechanisms.
18
19The purpose of virtio and this specification is that virtual
20environments and guests should have a straightforward, efficient,
21standard and extensible mechanism for virtual devices, rather
22than boutique per-environment or per-OS mechanisms.
23
24 Straightforward: Virtio PCI devices use normal PCI mechanisms
25 of interrupts and DMA which should be familiar to any device
26 driver author. There is no exotic page-flipping or COW
27 mechanism: it's just a PCI device.[footnote:
28This lack of page-sharing implies that the implementation of the
29device (e.g. the hypervisor or host) needs full access to the
30guest memory. Communication with untrusted parties (i.e.
31inter-guest communication) requires copying.
32]
33
34 Efficient: Virtio PCI devices consist of rings of descriptors
35 for input and output, which are neatly separated to avoid cache
36 effects from both guest and device writing to the same cache
37 lines.
38
39 Standard: Virtio PCI makes no assumptions about the environment
40 in which it operates, beyond supporting PCI. In fact the virtio
41 devices specified in the appendices do not require PCI at all:
42 they have been implemented on non-PCI buses.[footnote:
43The Linux implementation further separates the PCI virtio code
44from the specific virtio drivers: these drivers are shared with
45the non-PCI implementations (currently lguest and S/390).
46]
47
48 Extensible: Virtio PCI devices contain feature bits which are
49 acknowledged by the guest operating system during device setup.
50 This allows forwards and backwards compatibility: the device
51 offers all the features it knows about, and the driver
52 acknowledges those it understands and wishes to use.
53
54 Virtqueues
55
56The mechanism for bulk data transport on virtio PCI devices is
57pretentiously called a virtqueue. Each device can have zero or
58more virtqueues: for example, the network device has one for
59transmit and one for receive.
60
61Each virtqueue occupies two or more physically-contiguous pages
62(defined, for the purposes of this specification, as 4096 bytes),
63and consists of three parts:
64
65
66+-------------------+-----------------------------------+-----------+
67| Descriptor Table | Available Ring (padding) | Used Ring |
68+-------------------+-----------------------------------+-----------+
69
70
71When the driver wants to send buffers to the device, it puts them
72in one or more slots in the descriptor table, and writes the
73descriptor indices into the available ring. It then notifies the
74device. When the device has finished with the buffers, it writes
75the descriptors into the used ring, and sends an interrupt.
76
77Specification
78
79 PCI Discovery
80
81Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000
82through 0x103F inclusive is a virtio device[footnote:
83The actual value within this range is ignored
84]. The device must also have a Revision ID of 0 to match this
85specification.
86
87The Subsystem Device ID indicates which virtio device is
88supported by the device. The Subsystem Vendor ID should reflect
89the PCI Vendor ID of the environment (it's currently only used
90for informational purposes by the guest).
91
92
93+----------------------+--------------------+---------------+
94| Subsystem Device ID | Virtio Device | Specification |
95+----------------------+--------------------+---------------+
96+----------------------+--------------------+---------------+
97| 1 | network card | Appendix C |
98+----------------------+--------------------+---------------+
99| 2 | block device | Appendix D |
100+----------------------+--------------------+---------------+
101| 3 | console | Appendix E |
102+----------------------+--------------------+---------------+
103| 4 | entropy source | Appendix F |
104+----------------------+--------------------+---------------+
105| 5 | memory ballooning | Appendix G |
106+----------------------+--------------------+---------------+
107| 6 | ioMemory | - |
108+----------------------+--------------------+---------------+
109| 9 | 9P transport | - |
110+----------------------+--------------------+---------------+
111
112
113 Device Configuration
114
115To configure the device, we use the first I/O region of the PCI
116device. This contains a virtio header followed by a
117device-specific region.
118
119There may be different widths of accesses to the I/O region; the “
120natural” access method for each field in the virtio header must
121be used (i.e. 32-bit accesses for 32-bit fields, etc), but the
122device-specific region can be accessed using any width accesses,
123and should obtain the same results.
124
125Note that this is possible because while the virtio header is PCI
126(i.e. little) endian, the device-specific region is encoded in
127the native endian of the guest (where such distinction is
128applicable).
129
130 Device Initialization Sequence
131
132We start with an overview of device initialization, then expand
133on the details of the device and how each step is performed.
134
135 Reset the device. This is not required on initial start up.
136
137 The ACKNOWLEDGE status bit is set: we have noticed the device.
138
139 The DRIVER status bit is set: we know how to drive the device.
140
141 Device-specific setup, including reading the Device Feature
142 Bits, discovery of virtqueues for the device, optional MSI-X
143 setup, and reading and possibly writing the virtio
144 configuration space.
145
146 The subset of Device Feature Bits understood by the driver is
147 written to the device.
148
149 The DRIVER_OK status bit is set.
150
151 The device can now be used (i.e. buffers added to the
152 virtqueues)[footnote:
153Historically, drivers have used the device before steps 5 and 6.
154This is only allowed if the driver does not use any features
155which would alter this early use of the device.
156]
157
158If any of these steps go irrecoverably wrong, the guest should
159set the FAILED status bit to indicate that it has given up on the
160device (it can reset the device later to restart if desired).
161
162We now cover the fields required for general setup in detail.
163
164 Virtio Header
165
166The virtio header looks as follows:
167
168
169+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
170| Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 |
171+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
172| Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R |
173+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
174| Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR |
175| || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status |
176+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
177
178
179If MSI-X is enabled for the device, two additional fields
180immediately follow this header:
181
182
183+------------++----------------+--------+
184| Bits || 16 | 16 |
185 +----------------+--------+
186+------------++----------------+--------+
187| Read/Write || R+W | R+W |
188+------------++----------------+--------+
189| Purpose || Configuration | Queue |
190| (MSI-X) || Vector | Vector |
191+------------++----------------+--------+
192
193
194Finally, if the VIRTIO_F_FEATURES_HI feature bit is negotiated, this is
195immediately followed by two additional fields:
196
197
198+------------++----------------------+----------------------
199| Bits || 32 | 32
200+------------++----------------------+----------------------
201| Read/Write || R | R+W
202+------------++----------------------+----------------------
203| Purpose || Device | Guest
204| || Features bits 32:63 | Features bits 32:63
205+------------++----------------------+----------------------
206
207
208Immediately following these general headers, there may be
209device-specific headers:
210
211
212+------------++--------------------+
213| Bits || Device Specific |
214 +--------------------+
215+------------++--------------------+
216| Read/Write || Device Specific |
217+------------++--------------------+
218| Purpose || Device Specific... |
219| || |
220+------------++--------------------+
221
222
223 Device Status
224
225The Device Status field is updated by the guest to indicate its
226progress. This provides a simple low-level diagnostic: it's most
227useful to imagine them hooked up to traffic lights on the console
228indicating the status of each device.
229
230The device can be reset by writing a 0 to this field, otherwise
231at least one bit should be set:
232
233 ACKNOWLEDGE (1) Indicates that the guest OS has found the
234 device and recognized it as a valid virtio device.
235
236 DRIVER (2) Indicates that the guest OS knows how to drive the
237 device. Under Linux, drivers can be loadable modules so there
238 may be a significant (or infinite) delay before setting this
239 bit.
240
241 DRIVER_OK (3) Indicates that the driver is set up and ready to
242 drive the device.
243
244 FAILED (8) Indicates that something went wrong in the guest,
245 and it has given up on the device. This could be an internal
246 error, or the driver didn't like the device for some reason, or
247 even a fatal error during device operation. The device must be
248 reset before attempting to re-initialize.
249
250 Feature Bits
251
252The least significant 31 bits of the first configuration field
253indicate the features that the device supports (the high bit is
254reserved, and will be used to indicate the presence of future
255feature bits elsewhere). If more than 31 feature bits are
256supported, the device indicates so by setting feature bit 31 (see
257[cha:Reserved-Feature-Bits]). The bits are allocated as follows:
258
259 0 to 23 Feature bits for the specific device type
260
261 24 to 40 Feature bits reserved for extensions to the queue and
262 feature negotiation mechanisms
263
264 41 to 63 Feature bits reserved for future extensions
265
266For example, feature bit 0 for a network device (i.e. Subsystem
267Device ID 1) indicates that the device supports checksumming of
268packets.
269
270The feature bits are negotiated: the device lists all the
271features it understands in the Device Features field, and the
272guest writes the subset that it understands into the Guest
273Features field. The only way to renegotiate is to reset the
274device.
275
276In particular, new fields in the device configuration header are
277indicated by offering a feature bit, so the guest can check
278before accessing that part of the configuration space.
279
280This allows for forwards and backwards compatibility: if the
281device is enhanced with a new feature bit, older guests will not
282write that feature bit back to the Guest Features field and it
283can go into backwards compatibility mode. Similarly, if a guest
284is enhanced with a feature that the device doesn't support, it
285will not see that feature bit in the Device Features field and
286can go into backwards compatibility mode (or, for poor
287implementations, set the FAILED Device Status bit).
288
289Access to feature bits 32 to 63 is enabled by Guest by setting
290feature bit 31. If this bit is unset, Device must assume that all
291feature bits > 31 are unset.
292
293 Configuration/Queue Vectors
294
295When MSI-X capability is present and enabled in the device
296(through standard PCI configuration space) 4 bytes at byte offset
29720 are used to map configuration change and queue interrupts to
298MSI-X vectors. In this case, the ISR Status field is unused, and
299device specific configuration starts at byte offset 24 in virtio
300header structure. When MSI-X capability is not enabled, device
301specific configuration starts at byte offset 20 in virtio header.
302
303Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of
304Configuration/Queue Vector registers, maps interrupts triggered
305by the configuration change/selected queue events respectively to
306the corresponding MSI-X vector. To disable interrupts for a
307specific event type, unmap it by writing a special NO_VECTOR
308value:
309
310/* Vector value used to disable MSI for queue */
311
312#define VIRTIO_MSI_NO_VECTOR 0xffff
313
314Reading these registers returns vector mapped to a given event,
315or NO_VECTOR if unmapped. All queue and configuration change
316events are unmapped by default.
317
318Note that mapping an event to vector might require allocating
319internal device resources, and might fail. Devices report such
320failures by returning the NO_VECTOR value when the relevant
321Vector field is read. After mapping an event to vector, the
322driver must verify success by reading the Vector field value: on
323success, the previously written value is returned, and on
324failure, NO_VECTOR is returned. If a mapping failure is detected,
325the driver can retry mapping with fewer vectors, or disable MSI-X.
326
327 Virtqueue Configuration
328
329As a device can have zero or more virtqueues for bulk data
330transport (for example, the network driver has two), the driver
331needs to configure them as part of the device-specific
332configuration.
333
334This is done as follows, for each virtqueue a device has:
335
336 Write the virtqueue index (first queue is 0) to the Queue
337 Select field.
338
339 Read the virtqueue size from the Queue Size field, which is
340 always a power of 2. This controls how big the virtqueue is
341 (see below). If this field is 0, the virtqueue does not exist.
342
343 Allocate and zero virtqueue in contiguous physical memory, on a
344 4096 byte alignment. Write the physical address, divided by
345 4096 to the Queue Address field.[footnote:
346The 4096 is based on the x86 page size, but it's also large
347enough to ensure that the separate parts of the virtqueue are on
348separate cache lines.
349]
350
351 Optionally, if MSI-X capability is present and enabled on the
352 device, select a vector to use to request interrupts triggered
353 by virtqueue events. Write the MSI-X Table entry number
354 corresponding to this vector in Queue Vector field. Read the
355 Queue Vector field: on success, previously written value is
356 returned; on failure, NO_VECTOR value is returned.
357
358The Queue Size field controls the total number of bytes required
359for the virtqueue according to the following formula:
360
361#define ALIGN(x) (((x) + 4095) & ~4095)
362
363static inline unsigned vring_size(unsigned int qsz)
364
365{
366
367 return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2
368+ qsz))
369
370 + ALIGN(sizeof(struct vring_used_elem)*qsz);
371
372}
373
374This currently wastes some space with padding, but also allows
375future extensions. The virtqueue layout structure looks like this
376(qsz is the Queue Size field, which is a variable, so this code
377won't compile):
378
379struct vring {
380
381 /* The actual descriptors (16 bytes each) */
382
383 struct vring_desc desc[qsz];
384
385
386
387 /* A ring of available descriptor heads with free-running
388index. */
389
390 struct vring_avail avail;
391
392
393
394 // Padding to the next 4096 boundary.
395
396 char pad[];
397
398
399
400 // A ring of used descriptor heads with free-running index.
401
402 struct vring_used used;
403
404};
405
406 A Note on Virtqueue Endianness
407
408Note that the endian of these fields and everything else in the
409virtqueue is the native endian of the guest, not little-endian as
410PCI normally is. This makes for simpler guest code, and it is
411assumed that the host already has to be deeply aware of the guest
412endian so such an “endian-aware” device is not a significant
413issue.
414
415 Descriptor Table
416
417The descriptor table refers to the buffers the guest is using for
418the device. The addresses are physical addresses, and the buffers
419can be chained via the next field. Each descriptor describes a
420buffer which is read-only or write-only, but a chain of
421descriptors can contain both read-only and write-only buffers.
422
423No descriptor chain may be more than 2^32 bytes long in total.

struct vring_desc {
424
425 /* Address (guest-physical). */
426
427 u64 addr;
428
429 /* Length. */
430
431 u32 len;
432
433/* This marks a buffer as continuing via the next field. */
434
435#define VRING_DESC_F_NEXT 1
436
437/* This marks a buffer as write-only (otherwise read-only). */
438
439#define VRING_DESC_F_WRITE 2
440
441/* This means the buffer contains a list of buffer descriptors.
442*/
443
444#define VRING_DESC_F_INDIRECT 4
445
446 /* The flags as indicated above. */
447
448 u16 flags;
449
450 /* Next field if flags & NEXT */
451
452 u16 next;
453
454};
455
456The number of descriptors in the table is specified by the Queue
457Size field for this virtqueue.
458
459 <sub:Indirect-Descriptors>Indirect Descriptors
460
461Some devices benefit by concurrently dispatching a large number
462of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be
463used to allow this (see [cha:Reserved-Feature-Bits]). To increase
464ring capacity it is possible to store a table of indirect
465descriptors anywhere in memory, and insert a descriptor in main
466virtqueue (with flags&INDIRECT on) that refers to memory buffer
467containing this indirect descriptor table; fields addr and len
468refer to the indirect table address and length in bytes,
469respectively. The indirect table layout structure looks like this
470(len is the length of the descriptor that refers to this table,
471which is a variable, so this code won't compile):
472
473struct indirect_descriptor_table {
474
475 /* The actual descriptors (16 bytes each) */
476
477 struct vring_desc desc[len / 16];
478
479};
480
481The first indirect descriptor is located at start of the indirect
482descriptor table (index 0), additional indirect descriptors are
483chained by next field. An indirect descriptor without next field
484(with flags&NEXT off) signals the end of the indirect descriptor
485table, and transfers control back to the main virtqueue. An
486indirect descriptor can not refer to another indirect descriptor
487table (flags&INDIRECT must be off). A single indirect descriptor
488table can include both read-only and write-only descriptors;
489write-only flag (flags&WRITE) in the descriptor that refers to it
490is ignored.
491
492 Available Ring
493
494The available ring refers to what descriptors we are offering the
495device: it refers to the head of a descriptor chain. The “flags”
496field is currently 0 or 1: 1 indicating that we do not need an
497interrupt when the device consumes a descriptor from the
498available ring. Alternatively, the guest can ask the device to
499delay interrupts until an entry with an index specified by the “
500used_event” field is written in the used ring (equivalently,
501until the idx field in the used ring will reach the value
502used_event + 1). The method employed by the device is controlled
503by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
504). This interrupt suppression is merely an optimization; it may
505not suppress interrupts entirely.
506
507The “idx” field indicates where we would put the next descriptor
508entry (modulo the ring size). This starts at 0, and increases.
509
510struct vring_avail {
511
512#define VRING_AVAIL_F_NO_INTERRUPT 1
513
514 u16 flags;
515
516 u16 idx;
517
518 u16 ring[qsz]; /* qsz is the Queue Size field read from device
519*/
520
521 u16 used_event;
522
523};
524
525 Used Ring
526
527The used ring is where the device returns buffers once it is done
528with them. The flags field can be used by the device to hint that
529no notification is necessary when the guest adds to the available
530ring. Alternatively, the “avail_event” field can be used by the
531device to hint that no notification is necessary until an entry
532with an index specified by the “avail_event” is written in the
533available ring (equivalently, until the idx field in the
534available ring will reach the value avail_event + 1). The method
535employed by the device is controlled by the guest through the
536VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
537). [footnote:
538These fields are kept here because this is the only part of the
539virtqueue written by the device
540].
541
542Each entry in the ring is a pair: the head entry of the
543descriptor chain describing the buffer (this matches an entry
544placed in the available ring by the guest earlier), and the total
545number of bytes written into the buffer. The latter is extremely useful
546for guests using untrusted buffers: if you do not know exactly
547how much has been written by the device, you usually have to zero
548the buffer to ensure no data leakage occurs.
549
550/* u32 is used here for ids for padding reasons. */
551
552struct vring_used_elem {
553
554 /* Index of start of used descriptor chain. */
555
556 u32 id;
557
558 /* Total length of the descriptor chain which was used
559(written to) */
560
561 u32 len;
562
563};
564
565
566
567struct vring_used {
568
569#define VRING_USED_F_NO_NOTIFY 1
570
571 u16 flags;
572
573 u16 idx;
574
575 struct vring_used_elem ring[qsz];
576
577 u16 avail_event;
578
579};
580
581 Helpers for Managing Virtqueues
582
583The Linux Kernel Source code contains the definitions above and
584helper routines in a more usable form, in
585include/linux/virtio_ring.h. This was explicitly licensed by IBM
586and Red Hat under the (3-clause) BSD license so that it can be
587freely used by all other projects, and is reproduced (with slight
588variation to remove Linux assumptions) in Appendix A.
589
590 Device Operation
591
592There are two parts to device operation: supplying new buffers to
593the device, and processing used buffers from the device. As an
594example, the virtio network device has two virtqueues: the
595transmit virtqueue and the receive virtqueue. The driver adds
596outgoing (read-only) packets to the transmit virtqueue, and then
597frees them after they are used. Similarly, incoming (write-only)
598buffers are added to the receive virtqueue, and processed after
599they are used.
600
601 Supplying Buffers to The Device
602
603Actual transfer of buffers from the guest OS to the device
604operates as follows:
605
606 Place the buffer(s) into free descriptor(s).
607
608 If there are no free descriptors, the guest may choose to
609 notify the device even if notifications are suppressed (to
610 reduce latency).[footnote:
611The Linux drivers do this only for read-only buffers: for
612write-only buffers, it is assumed that the driver is merely
613trying to keep the receive buffer ring full, and no notification
614of this expected condition is necessary.
615]
616
617 Place the id of the buffer in the next ring entry of the
618 available ring.
619
620 The steps (1) and (2) may be performed repeatedly if batching
621 is possible.
622
623 A memory barrier should be executed to ensure the device sees
624 the updated descriptor table and available ring before the next
625 step.
626
627 The available “idx” field should be increased by the number of
628 entries added to the available ring.
629
630 A memory barrier should be executed to ensure that we update
631 the idx field before checking for notification suppression.
632
633 If notifications are not suppressed, the device should be
634 notified of the new buffers.
635
636Note that the above code does not take precautions against the
637available ring buffer wrapping around: this is not possible since
638the ring buffer is the same size as the descriptor table, so step
639(1) will prevent such a condition.
640
641In addition, the maximum queue size is 32768 (it must be a power
642of 2 which fits in 16 bits), so the 16-bit “idx” value can always
643distinguish between a full and empty buffer.
644
645Here is a description of each stage in more detail.
646
647 Placing Buffers Into The Descriptor Table
648
649A buffer consists of zero or more read-only physically-contiguous
650elements followed by zero or more physically-contiguous
651write-only elements (it must have at least one element). This
652algorithm maps it into the descriptor table:
653
654 for each buffer element, b:
655
656 Get the next free descriptor table entry, d
657
658 Set d.addr to the physical address of the start of b
659
660 Set d.len to the length of b.
661
662 If b is write-only, set d.flags to VRING_DESC_F_WRITE,
663 otherwise 0.
664
665 If there is a buffer element after this:
666
667 Set d.next to the index of the next free descriptor element.
668
669 Set the VRING_DESC_F_NEXT bit in d.flags.
670
671In practice, the d.next fields are usually used to chain free
672descriptors, and a separate count kept to check there are enough
673free descriptors before beginning the mappings.
674
675 Updating The Available Ring
676
677The head of the buffer we mapped is the first d in the algorithm
678above. A naive implementation would do the following:
679
680avail->ring[avail->idx % qsz] = head;
681
682However, in general we can add many descriptors before we update
683the “idx” field (at which point they become visible to the
684device), so we keep a counter of how many we've added:
685
686avail->ring[(avail->idx + added++) % qsz] = head;
687
688 Updating The Index Field
689
690Once the idx field of the virtqueue is updated, the device will
691be able to access the descriptor entries we've created and the
692memory they refer to. This is why a memory barrier is generally
693used before the idx update, to ensure it sees the most up-to-date
694copy.
695
696The idx field always increments, and we let it wrap naturally at
69765536:
698
699avail->idx += added;
700
701 <sub:Notifying-The-Device>Notifying The Device
702
703Device notification occurs by writing the 16-bit virtqueue index
704of this virtqueue to the Queue Notify field of the virtio header
705in the first I/O region of the PCI device. This can be expensive,
706however, so the device can suppress such notifications if it
707doesn't need them. We have to be careful to expose the new idx
708value before checking the suppression flag: it's OK to notify
709gratuitously, but not to omit a required notification. So again,
710we use a memory barrier here before reading the flags or the
711avail_event field.
712
713If the VIRTIO_RING_F_EVENT_IDX feature is not negotiated, and if
714the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write to
715the PCI configuration space.
716
717If the VIRTIO_RING_F_EVENT_IDX feature is negotiated, we read the
718avail_event field in the used ring structure. If the
719available index crossed the avail_event field value since the
720last notification, we go ahead and write to the PCI configuration
721space. The avail_event field wraps naturally at 65536 as well:
722
723(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx)
724
725 <sub:Receiving-Used-Buffers>Receiving Used Buffers From The
726 Device
727
728Once the device has used a buffer (read from or written to it, or
729parts of both, depending on the nature of the virtqueue and the
730device), it sends an interrupt, following an algorithm very
731similar to the algorithm used for the driver to send the device a
732buffer:
733
734 Write the head descriptor number to the next field in the used
735 ring.
736
737 Update the used ring idx.
738
739 Determine whether an interrupt is necessary:
740
741 If the VIRTIO_RING_F_EVENT_IDX feature is not negotiated: check
742 if the VRING_AVAIL_F_NO_INTERRUPT flag is not set in
743 avail->flags
744
745 If the VIRTIO_RING_F_EVENT_IDX feature is negotiated: check
746 whether the used index crossed the used_event field value
747 since the last update. The used_event field wraps naturally
748 at 65536 as well:

(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
749
750 If an interrupt is necessary:
751
752 If MSI-X capability is disabled:
753
754 Set the lower bit of the ISR Status field for the device.
755
756 Send the appropriate PCI interrupt for the device.
757
758 If MSI-X capability is enabled:
759
760 Request the appropriate MSI-X interrupt message for the
761 device, Queue Vector field sets the MSI-X Table entry
762 number.
763
764 If Queue Vector field value is NO_VECTOR, no interrupt
765 message is requested for this event.
766
767The guest interrupt handler should:
768
769 If MSI-X capability is disabled: read the ISR Status field,
770 which will reset it to zero. If the lower bit is zero, the
771 interrupt was not for this device. Otherwise, the guest driver
772 should look through the used rings of each virtqueue for the
773 device, to see if any progress has been made by the device
774 which requires servicing.
775
776 If MSI-X capability is enabled: look through the used rings of
777 each virtqueue mapped to the specific MSI-X vector for the
778 device, to see if any progress has been made by the device
779 which requires servicing.
780
781For each ring, guest should then disable interrupts by writing
782VRING_AVAIL_F_NO_INTERRUPT flag in avail structure, if required.
783It can then process used ring entries, finally enabling interrupts
784by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the
785EVENT_IDX field in the available structure. The guest should then
786execute a memory barrier, and then recheck the ring empty
787condition. This is necessary to handle the case where, after the
788last check and before enabling interrupts, an interrupt has been
789suppressed by the device:
790
791vring_disable_interrupts(vq);
792
793for (;;) {
794
795 if (vq->last_seen_used != vring->used.idx) {
796
797 vring_enable_interrupts(vq);
798
799 mb();
800
801 if (vq->last_seen_used != vring->used.idx)
802
803 break;
804
805 }
806
807 struct vring_used_elem *e =
808vring.used->ring[vq->last_seen_used%vsz];
809
810 process_buffer(e);
811
812 vq->last_seen_used++;
813
814}
815
816 Dealing With Configuration Changes
817
818Some virtio PCI devices can change the device configuration
819state, as reflected in the virtio header in the PCI configuration
820space. In this case:
821
822 If MSI-X capability is disabled: an interrupt is delivered and
823 the second lowest bit is set in the ISR Status field to
824 indicate that the driver should re-examine the configuration
825 space. Note that a single interrupt can indicate both that one
826 or more virtqueue has been used and that the configuration
827 space has changed: even if the config bit is set, virtqueues
828 must be scanned.
829
830 If MSI-X capability is enabled: an interrupt message is
831 requested. The Configuration Vector field sets the MSI-X Table
832 entry number to use. If Configuration Vector field value is
833 NO_VECTOR, no interrupt message is requested for this event.
834
835Creating New Device Types
836
837Various considerations are necessary when creating a new device
838type:
839
840 How Many Virtqueues?
841
842It is possible that a very simple device will operate entirely
843through its configuration space, but most will need at least one
844virtqueue in which it will place requests. A device with both
845input and output (eg. console and network devices described here)
846needs two queues: one which the driver fills with buffers to
847receive input, and one in which the driver places buffers to
848transmit output.
849
850 What Configuration Space Layout?
851
852Configuration space is generally used for rarely-changing or
853initialization-time parameters. But it is a limited resource, so
854it might be better to use a virtqueue to update configuration
855information (the network device does this for filtering,
856otherwise the table in the config space could potentially be very
857large).
858
859Note that this space is generally the guest's native endian,
860rather than PCI's little-endian.
861
862 What Device Number?
863
864Currently device numbers are assigned quite freely: a simple
865request mail to the author of this document or the Linux
866virtualization mailing list[footnote:
867
868https://lists.linux-foundation.org/mailman/listinfo/virtualization
869] will be sufficient to secure a unique one.
870
871Meanwhile for experimental drivers, use 65535 and work backwards.
872
873 How many MSI-X vectors?
874
875Using the optional MSI-X capability devices can speed up
876interrupt processing by removing the need to read ISR Status
877register by guest driver (which might be an expensive operation),
878reducing interrupt sharing between devices and queues within the
879device, and handling interrupts from multiple CPUs. However, some
880systems impose a limit (which might be as low as 256) on the
881total number of MSI-X vectors that can be allocated to all
882devices. Devices and/or device drivers should take this into
883account, limiting the number of vectors used unless the device is
884expected to cause a high volume of interrupts. Devices can
885control the number of vectors used by limiting the MSI-X Table
886Size or not presenting MSI-X capability in PCI configuration
887space. Drivers can control this by mapping events to as small a
888number of vectors as possible, or disabling MSI-X capability
889altogether.
890
891 Message Framing
892
893The descriptors used for a buffer should not affect the semantics
894of the message, except for the total length of the buffer. For
895example, a network buffer consists of a 10 byte header followed
896by the network packet. Whether this is presented in the ring
897descriptor chain as (say) a 10 byte buffer and a 1514 byte
898buffer, or a single 1524 byte buffer, or even three buffers,
899should have no effect.
900
901In particular, no implementation should use the descriptor
902boundaries to determine the size of any header in a request.[footnote:
903The current qemu device implementations mistakenly insist that
904the first descriptor cover the header in these cases exactly, so
905a cautious driver should arrange it so.
906]
907
908 Device Improvements
909
910Any change to configuration space, or new virtqueues, or
911behavioural changes, should be indicated by negotiation of a new
912feature bit. This establishes clarity[footnote:
913Even if it does mean documenting design or implementation
914mistakes!
915] and avoids future expansion problems.
916
917Clusters of functionality which are always implemented together
918can use a single bit, but if one feature makes sense without the
919others they should not be gratuitously grouped together to
920conserve feature bits. We can always extend the spec when the
921first person needs more than 24 feature bits for their device.
922
923[LaTeX Command: printnomenclature]
924
925Appendix A: virtio_ring.h
926
927#ifndef VIRTIO_RING_H
928
929#define VIRTIO_RING_H
930
931/* An interface for efficient virtio implementation.
932
933 *
934
935 * This header is BSD licensed so anyone can use the definitions
936
937 * to implement compatible drivers/servers.
938
939 *
940
941 * Copyright 2007, 2009, IBM Corporation
942
943 * Copyright 2011, Red Hat, Inc
944
945 * All rights reserved.
946
947 *
948
949 * Redistribution and use in source and binary forms, with or
950without
951
952 * modification, are permitted provided that the following
953conditions
954
955 * are met:
956
957 * 1. Redistributions of source code must retain the above
958copyright
959
960 * notice, this list of conditions and the following
961disclaimer.
962
963 * 2. Redistributions in binary form must reproduce the above
964copyright
965
966 * notice, this list of conditions and the following
967disclaimer in the
968
969 * documentation and/or other materials provided with the
970distribution.
971
972 * 3. Neither the name of IBM nor the names of its contributors
973
974 * may be used to endorse or promote products derived from
975this software
976
977 * without specific prior written permission.
978
979 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
980CONTRIBUTORS ``AS IS'' AND
981
982 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
983TO, THE
984
985 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
986PARTICULAR PURPOSE
987
988 * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE
989LIABLE
990
991 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
992CONSEQUENTIAL
993
994 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
995SUBSTITUTE GOODS
996
997 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
998INTERRUPTION)
999
1000 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
1001CONTRACT, STRICT
1002
1003 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
1004IN ANY WAY
1005
1006 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1007POSSIBILITY OF
1008
1009 * SUCH DAMAGE.
1010
1011 */
1012
1013
1014
1015/* This marks a buffer as continuing via the next field. */
1016
1017#define VRING_DESC_F_NEXT 1
1018
1019/* This marks a buffer as write-only (otherwise read-only). */
1020
1021#define VRING_DESC_F_WRITE 2
1022
1023
1024
1025/* The Host uses this in used->flags to advise the Guest: don't
1026kick me
1027
1028 * when you add a buffer. It's unreliable, so it's simply an
1029
1030 * optimization. Guest will still kick if it's out of buffers.
1031*/
1032
1033#define VRING_USED_F_NO_NOTIFY 1
1034
1035/* The Guest uses this in avail->flags to advise the Host: don't
1036
1037 * interrupt me when you consume a buffer. It's unreliable, so
1038it's
1039
1040 * simply an optimization. */
1041
1042#define VRING_AVAIL_F_NO_INTERRUPT 1
1043
1044
1045
1046/* Virtio ring descriptors: 16 bytes.
1047
1048 * These can chain together via "next". */
1049
1050struct vring_desc {
1051
1052 /* Address (guest-physical). */
1053
1054 uint64_t addr;
1055
1056 /* Length. */
1057
1058 uint32_t len;
1059
1060 /* The flags as indicated above. */
1061
1062 uint16_t flags;
1063
1064 /* We chain unused descriptors via this, too */
1065
1066 uint16_t next;
1067
1068};
1069
1070
1071
1072struct vring_avail {
1073
1074 uint16_t flags;
1075
1076 uint16_t idx;
1077
1078 uint16_t ring[];
1079
1080 uint16_t used_event;
1081
1082};
1083
1084
1085
1086/* u32 is used here for ids for padding reasons. */
1087
1088struct vring_used_elem {
1089
1090 /* Index of start of used descriptor chain. */
1091
1092 uint32_t id;
1093
1094 /* Total length of the descriptor chain which was written
1095to. */
1096
1097 uint32_t len;
1098
1099};
1100
1101
1102
1103struct vring_used {
1104
1105 uint16_t flags;
1106
1107 uint16_t idx;
1108
1109 struct vring_used_elem ring[];
1110
1111 uint16_t avail_event;
1112
1113};
1114
1115
1116
1117struct vring {
1118
1119 unsigned int num;
1120
1121
1122
1123 struct vring_desc *desc;
1124
1125 struct vring_avail *avail;
1126
1127 struct vring_used *used;
1128
1129};
1130
1131
1132
1133/* The standard layout for the ring is a continuous chunk of
1134memory which
1135
1136 * looks like this. We assume num is a power of 2.
1137
1138 *
1139
1140 * struct vring {
1141
1142 * // The actual descriptors (16 bytes each)
1143
1144 * struct vring_desc desc[num];
1145
1146 *
1147
1148 * // A ring of available descriptor heads with free-running
1149index.
1150
1151 * __u16 avail_flags;
1152
1153 * __u16 avail_idx;
1154
1155 * __u16 available[num];
1156
1157 *
1158
1159 * // Padding to the next align boundary.
1160
1161 * char pad[];
1162
1163 *
1164
1165 * // A ring of used descriptor heads with free-running
1166index.
1167
1168 * __u16 used_flags;
1169
1170 * __u16 EVENT_IDX;
1171
1172 * struct vring_used_elem used[num];
1173
1174 * };
1175
1176 * Note: for virtio PCI, align is 4096.
1177
1178 */
1179
1180static inline void vring_init(struct vring *vr, unsigned int num,
1181void *p,
1182
1183 unsigned long align)
1184
1185{
1186
1187 vr->num = num;
1188
1189 vr->desc = p;
1190
1191 vr->avail = p + num*sizeof(struct vring_desc);
1192
1193 vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
1194
1195 + align-1)
1196
1197 & ~(align - 1));
1198
1199}
1200
1201
1202
1203static inline unsigned vring_size(unsigned int num, unsigned long
1204align)
1205
1206{
1207
1208 return ((sizeof(struct vring_desc)*num +
1209sizeof(uint16_t)*(2+num)
1210
1211 + align - 1) & ~(align - 1))
1212
1213 + sizeof(uint16_t)*3 + sizeof(struct
1214vring_used_elem)*num;
1215
1216}
1217
1218
1219
1220static inline int vring_need_event(uint16_t event_idx, uint16_t
1221new_idx, uint16_t old_idx)
1222
1223{
1224
1225 return (uint16_t)(new_idx - event_idx - 1) <
1226(uint16_t)(new_idx - old_idx);
1227
1228}
1229
1230#endif /* VIRTIO_RING_H */
1231
1232<cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits
1233
1234Currently there are five device-independent feature bits defined:
1235
1236 VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature
1237 indicates that the driver wants an interrupt if the device runs
1238 out of available descriptors on a virtqueue, even though
1239 interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT
1240 flag or the used_event field. An example of this is the
1241 networking driver: it doesn't need to know every time a packet
1242 is transmitted, but it does need to free the transmitted
1243 packets a finite time after they are transmitted. It can avoid
1244 using a timer if the device interrupts it when all the packets
1245 are transmitted.
1246
1247 VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature
1248 indicates that the driver can use descriptors with the
1249 VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors]
1250 .
1251
1252 VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event
1253 and the avail_event fields. If set, it indicates that the
1254 device should ignore the flags field in the available ring
1255 structure. Instead, the used_event field in this structure is
1256 used by guest to suppress device interrupts. Further, the
1257 driver should ignore the flags field in the used ring
1258 structure. Instead, the avail_event field in this structure is
1259 used by the device to suppress notifications. If unset, the
1260 driver should ignore the used_event field; the device should
1261 ignore the avail_event field; the flags field is used
1262
1263 VIRTIO_F_BAD_FEATURE(30) This feature should never be
1264 negotiated by the guest; doing so is an indication that the
1265 guest is faulty[footnote:
1266An experimental virtio PCI driver contained in Linux version
12672.6.25 had this problem, and this feature bit can be used to
1268detect it.
1269]
1270
1271 VIRTIO_F_FEATURES_HIGH(31) This feature indicates that the
1272 device supports feature bits 32:63. If unset, feature bits
1273 32:63 are unset.
1274
1275Appendix C: Network Device
1276
1277The virtio network device is a virtual ethernet card, and is the
1278most complex of the devices supported so far by virtio. It has
1279enhanced rapidly and demonstrates clearly how support for new
1280features should be added to an existing device. Empty buffers are
1281placed in one virtqueue for receiving packets, and outgoing
1282packets are enqueued into another for transmission in that order.
1283A third command queue is used to control advanced filtering
1284features.
1285
1286 Configuration
1287
1288 Subsystem Device ID 1
1289
1290 Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote:
1291Only if VIRTIO_NET_F_CTRL_VQ set
1292]
1293
1294 Feature bits
1295
1296 VIRTIO_NET_F_CSUM (0) Device handles packets with partial
1297 checksum
1298
1299 VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial
1300 checksum
1301
1302 VIRTIO_NET_F_MAC (5) Device has given MAC address.
1303
1304 VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with
1305 any GSO type.[footnote:
1306It was supposed to indicate segmentation offload support, but
1307upon further investigation it became clear that multiple bits
1308were required.
1309]
1310
1311 VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4.
1312
1313 VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6.
1314
1315 VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN.
1316
1317 VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO.
1318
1319 VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4.
1320
1321 VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6.
1322
1323 VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN.
1324
1325 VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO.
1326
1327 VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers.
1328
1329 VIRTIO_NET_F_STATUS (16) Configuration status field is
1330 available.
1331
1332 VIRTIO_NET_F_CTRL_VQ (17) Control channel is available.
1333
1334 VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support.
1335
1336 VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering.
1337
1338 Device configuration layout Two configuration fields are
1339 currently defined. The mac address field always exists (though
1340 is only valid if VIRTIO_NET_F_MAC is set), and the status field
1341 only exists if VIRTIO_NET_F_STATUS is set. Only one bit is
1342 currently defined for the status field: VIRTIO_NET_S_LINK_UP. #define VIRTIO_NET_S_LINK_UP 1
1343
1344
1345
1346struct virtio_net_config {
1347
1348 u8 mac[6];
1349
1350 u16 status;
1351
1352};
1353
1354 Device Initialization
1355
1356 The initialization routine should identify the receive and
1357 transmission virtqueues.
1358
1359 If the VIRTIO_NET_F_MAC feature bit is set, the configuration
1360 space “mac” entry indicates the “physical” address of the
1361 network card, otherwise a private MAC address should be
1362 assigned. All guests are expected to negotiate this feature if
1363 it is set.
1364
1365 If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify
1366 the control virtqueue.
1367
1368 If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link
1369 status can be read from the bottom bit of the “status” config
1370 field. Otherwise, the link should be assumed active.
1371
1372 The receive virtqueue should be filled with receive buffers.
1373 This is described in detail below in “Setting Up Receive
1374 Buffers”.
1375
1376 A driver can indicate that it will generate checksumless
1377 packets by negotiating the VIRTIO_NET_F_CSUM feature. This “
1378 checksum offload” is a common feature on modern network cards.
1379
1380 If that feature is negotiated, a driver can use TCP or UDP
1381 segmentation offload by negotiating the VIRTIO_NET_F_HOST_TSO4
1382 (IPv4 TCP), VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and
1383 VIRTIO_NET_F_HOST_UFO (UDP fragmentation) features. It should
1384 not send TCP packets requiring segmentation offload which have
1385 the Explicit Congestion Notification bit set, unless the
1386 VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote:
1387This is a common restriction in real, older network cards.
1388]
1389
1390 The converse features are also available: a driver can save the
1391 virtual device some work by negotiating these features.[footnote:
1392For example, a network packet transported between two guests on
1393the same system may not require checksumming at all, nor
1394segmentation, if both guests are amenable.
1395] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially
1396 checksummed packets can be received, and if it can do that then
1397 the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
1398 VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input
1399 equivalents of the features described above. See “Receiving
1400 Packets” below.
1401
1402 Device Operation
1403
1404Packets are transmitted by placing them in the transmitq, and
1405buffers for incoming packets are placed in the receiveq. In each
1406case, the packet itself is preceded by a header:
1407
1408struct virtio_net_hdr {
1409
1410#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
1411
1412 u8 flags;
1413
1414#define VIRTIO_NET_HDR_GSO_NONE 0
1415
1416#define VIRTIO_NET_HDR_GSO_TCPV4 1
1417
1418#define VIRTIO_NET_HDR_GSO_UDP 3
1419
1420#define VIRTIO_NET_HDR_GSO_TCPV6 4
1421
1422#define VIRTIO_NET_HDR_GSO_ECN 0x80
1423
1424 u8 gso_type;
1425
1426 u16 hdr_len;
1427
1428 u16 gso_size;
1429
1430 u16 csum_start;
1431
1432 u16 csum_offset;
1433
1434/* Only if VIRTIO_NET_F_MRG_RXBUF: */
1435
1436 u16 num_buffers
1437
1438};
1439
1440The controlq is used to control device features such as
1441filtering.
1442
1443 Packet Transmission
1444
1445Transmitting a single packet is simple, but varies depending on
1446the different features the driver negotiated.
1447
1448 If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has
1449 not been fully checksummed, then the virtio_net_hdr's fields
1450 are set as follows. Otherwise, the packet must be fully
1451 checksummed, and flags is zero.
1452
1453 flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
1454
1455 <ite:csum_start-is-set>csum_start is set to the offset within
1456 the packet to begin checksumming, and
1457
1458 csum_offset indicates how many bytes after the csum_start the
1459 new (16 bit ones' complement) checksum should be placed.[footnote:
1460For example, consider a partially checksummed TCP (IPv4) packet.
1461It will have a 14 byte ethernet header and 20 byte IP header
1462followed by the TCP header (with the TCP checksum field 16 bytes
1463into that header). csum_start will be 14+20 = 34 (the TCP
1464checksum includes the header), and csum_offset will be 16. The
1465value in the TCP checksum field will be the sum of the TCP pseudo
1466header, so that replacing it by the ones' complement checksum of
1467the TCP header and body will give the correct result.
1468]
1469
1470 <enu:If-the-driver>If the driver negotiated
1471 VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires
1472 TCP segmentation or UDP fragmentation, then the “gso_type”
1473 field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
1474 (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this
1475 case, packets larger than 1514 bytes can be transmitted: the
1476 metadata indicates how to replicate the packet header to cut it
1477 into smaller packets. The other gso fields are set:
1478
1479 hdr_len is a hint to the device as to how much of the header
1480 needs to be kept to copy into each packet, usually set to the
1481 length of the headers, including the transport header.[footnote:
1482Due to various bugs in implementations, this field is not useful
1483as a guarantee of the transport header size.
1484]
1485
1486 gso_size is the size of the packet beyond that header (ie.
1487 MSS).
1488
1489 If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the
1490 VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well,
1491 indicating that the TCP packet has the ECN bit set.[footnote:
1492This case is not handled by some older hardware, so is called out
1493specifically in the protocol.
1494]
1495
1496 If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
1497 the num_buffers field is set to zero.
1498
1499 The header and packet are added as one output buffer to the
1500 transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device]
1501 ).[footnote:
1502Note that the header will be two bytes longer for the
1503VIRTIO_NET_F_MRG_RXBUF case.
1504]
1505
1506 Packet Transmission Interrupt
1507
1508Often a driver will suppress transmission interrupts using the
1509VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers]
1510) and check for used packets in the transmit path of following
1511packets. However, it will still receive interrupts if the
1512VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that
1513the transmission queue is completely emptied.
1514
1515The normal behavior in this interrupt handler is to retrieve any
1516new descriptors from the used ring and free the corresponding
1517headers and packets.
1518
1519 Setting Up Receive Buffers
1520
1521It is generally a good idea to keep the receive virtqueue as
1522fully populated as possible: if it runs out, network performance
1523will suffer.
1524
1525If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or
1526VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to
1527accept packets of up to 65550 bytes long (the maximum size of a
1528TCP or UDP packet, plus the 14 byte ethernet header), otherwise
15291514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every
1530buffer in the receive queue needs to be at least this length [footnote:
1531Obviously each one can be split across multiple descriptor
1532elements.
1533].
1534
1535If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at
1536least the size of the struct virtio_net_hdr.
1537
1538 Packet Receive Interrupt
1539
1540When a packet is copied into a buffer in the receiveq, the
1541optimal path is to disable further interrupts for the receiveq
1542(see [sub:Receiving-Used-Buffers]) and process packets until no
1543more are found, then re-enable them.
1544
1545Processing packet involves:
1546
1547 If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
1548 then the “num_buffers” field indicates how many descriptors
1549 this packet is spread over (including this one). This allows
1550 receipt of large packets without having to allocate large
1551 buffers. In this case, there will be at least “num_buffers” in
1552 the used ring, and they should be chained together to form a
1553 single packet. The other buffers will not begin with a struct
1554 virtio_net_hdr.
1555
1556 If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or
1557 the “num_buffers” field is one, then the entire packet will be
1558 contained within this buffer, immediately following the struct
1559 virtio_net_hdr.
1560
1561 If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the
1562 VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be
1563 set: if so, the checksum on the packet is incomplete and the “
1564 csum_start” and “csum_offset” fields indicate how to calculate
1565 it (see [ite:csum_start-is-set]).
1566
1567 If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were
1568 negotiated, then the “gso_type” may be something other than
1569 VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the
1570 desired MSS (see [enu:If-the-driver]).Control Virtqueue
1571
1572The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is
1573negotiated) to send commands to manipulate various features of
1574the device which would not easily map into the configuration
1575space.
1576
1577All commands are of the following form:
1578
1579struct virtio_net_ctrl {
1580
1581 u8 class;
1582
1583 u8 command;
1584
1585 u8 command-specific-data[];
1586
1587 u8 ack;
1588
1589};
1590
1591
1592
1593/* ack values */
1594
1595#define VIRTIO_NET_OK 0
1596
1597#define VIRTIO_NET_ERR 1
1598
1599The class, command and command-specific-data are set by the
1600driver, and the device sets the ack byte. There is little it can
1601do except issue a diagnostic if the ack byte is not
1602VIRTIO_NET_OK.
1603
1604 Packet Receive Filtering
1605
1606If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can
1607send control commands for promiscuous mode, multicast receiving,
1608and filtering of MAC addresses.
1609
1610Note that in general, these commands are best-effort: unwanted
1611packets may still arrive.
1612
1613 Setting Promiscuous Mode
1614
1615#define VIRTIO_NET_CTRL_RX 0
1616
1617 #define VIRTIO_NET_CTRL_RX_PROMISC 0
1618
1619 #define VIRTIO_NET_CTRL_RX_ALLMULTI 1
1620
1621The class VIRTIO_NET_CTRL_RX has two commands:
1622VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and
1623VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and
1624off. The command-specific-data is one byte containing 0 (off) or
16251 (on).
1626
1627 Setting MAC Address Filtering
1628
1629struct virtio_net_ctrl_mac {
1630
1631 u32 entries;
1632
1633 u8 macs[entries][ETH_ALEN];
1634
1635};
1636
1637
1638
1639#define VIRTIO_NET_CTRL_MAC 1
1640
1641 #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
1642
1643The device can filter incoming packets by any number of
1644destination MAC addresses.[footnote:
1645Since there are no guarantees, it can use a hash filter
1646or silently switch to allmulti or promiscuous mode if it is given
1647too many addresses.
1648] This table is set using the class VIRTIO_NET_CTRL_MAC and the
1649command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data
1650is two variable length tables of 6-byte MAC addresses. The first
1651table contains unicast addresses, and the second contains
1652multicast addresses.
1653
1654 VLAN Filtering
1655
1656If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it
1657can control a VLAN filter table in the device.
1658
1659#define VIRTIO_NET_CTRL_VLAN 2
1660
1661 #define VIRTIO_NET_CTRL_VLAN_ADD 0
1662
1663 #define VIRTIO_NET_CTRL_VLAN_DEL 1
1664
1665Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL
1666commands take a 16-bit VLAN id as the command-specific-data.
1667
1668Appendix D: Block Device
1669
1670The virtio block device is a simple virtual block device (ie.
1671disk). Read and write requests (and other exotic requests) are
1672placed in the queue, and serviced (probably out of order) by the
1673device except where noted.
1674
1675 Configuration
1676
1677 Subsystem Device ID 2
1678
1679 Virtqueues 0:requestq.
1680
1681 Feature bits
1682
1683 VIRTIO_BLK_F_BARRIER (0) Host supports request barriers.
1684
1685 VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is
1686 in “size_max”.
1687
1688 VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a
1689 request is in “seg_max”.
1690
1691 VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “
1692 geometry”.
1693
1694 VIRTIO_BLK_F_RO (5) Device is read-only.
1695
1696 VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”.
1697
1698 VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands.
1699
1700 VIRTIO_BLK_F_FLUSH (9) Cache flush command support.
1701
1702
1703
1704 Device configuration layout The capacity of the device
1705 (expressed in 512-byte sectors) is always present. The
1706 availability of the others all depend on various feature bits
1707 as indicated above. struct virtio_blk_config {
1708
1709 u64 capacity;
1710
1711 u32 size_max;
1712
1713 u32 seg_max;
1714
1715 struct virtio_blk_geometry {
1716
1717 u16 cylinders;
1718
1719 u8 heads;
1720
1721 u8 sectors;
1722
1723 } geometry;
1724
1725 u32 blk_size;
1726
1727
1728
1729};
1730
1731 Device Initialization
1732
1733 The device size should be read from the “capacity”
1734 configuration field. No requests should be submitted which go
1735 beyond this limit.
1736
1737 If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the
1738 blk_size field can be read to determine the optimal sector size
1739 for the driver to use. This does not affect the units used in
1740 the protocol (always 512 bytes), but awareness of the correct
1741 value can affect performance.
1742
1743 If the VIRTIO_BLK_F_RO feature is set by the device, any write
1744 requests will fail.
1745
1746
1747
1748 Device Operation
1749
1750The driver queues requests to the virtqueue, and they are used by
1751the device (not necessarily in order). Each request is of form:
1752
1753struct virtio_blk_req {
1754
1755
1756
1757 u32 type;
1758
1759 u32 ioprio;
1760
1761 u64 sector;
1762
1763 char data[][512];
1764
1765 u8 status;
1766
1767};
1768
1769If the device has VIRTIO_BLK_F_SCSI feature, it can also support
1770scsi packet command requests, each of these requests is of form:struct virtio_scsi_pc_req {
1771
1772 u32 type;
1773
1774 u32 ioprio;
1775
1776 u64 sector;
1777
1778 char cmd[];
1779
1780 char data[][512];
1781
1782#define SCSI_SENSE_BUFFERSIZE 96
1783
1784 u8 sense[SCSI_SENSE_BUFFERSIZE];
1785
1786 u32 errors;
1787
1788 u32 data_len;
1789
1790 u32 sense_len;
1791
1792 u32 residual;
1793
1794 u8 status;
1795
1796};
1797
1798The type of the request is either a read (VIRTIO_BLK_T_IN), a
1799write (VIRTIO_BLK_T_OUT), a scsi packet command
1800(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote:
1801the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device
1802does not distinguish between them
1803]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote:
1804the FLUSH and FLUSH_OUT types are equivalent, the device does not
1805distinguish between them
1806]). If the device has VIRTIO_BLK_F_BARRIER feature the high bit
1807(VIRTIO_BLK_T_BARRIER) indicates that this request acts as a
1808barrier and that all preceding requests must be complete before
1809this one, and all following requests must not be started until
1810this is complete. Note that a barrier does not flush caches in
1811the underlying backend device in host, and thus does not serve as
1812data consistency guarantee. Driver must use FLUSH request to
1813flush the host cache.
1814
1815#define VIRTIO_BLK_T_IN 0
1816
1817#define VIRTIO_BLK_T_OUT 1
1818
1819#define VIRTIO_BLK_T_SCSI_CMD 2
1820
1821#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
1822
1823#define VIRTIO_BLK_T_FLUSH 4
1824
1825#define VIRTIO_BLK_T_FLUSH_OUT 5
1826
1827#define VIRTIO_BLK_T_BARRIER 0x80000000
1828
1829The ioprio field is a hint about the relative priorities of
1830requests to the device: higher numbers indicate more important
1831requests.
1832
1833The sector number indicates the offset (multiplied by 512) where
1834the read or write is to occur. This field is unused and set to 0
1835for scsi packet commands and for flush commands.
1836
1837The cmd field is only present for scsi packet command requests,
1838and indicates the command to perform. This field must reside in a
1839single, separate read-only buffer; command length can be derived
1840from the length of this buffer.
1841
1842Note that these first three (four for scsi packet commands)
1843fields are always read-only: the data field is either read-only
1844or write-only, depending on the request. The size of the read or
1845write can be derived from the total size of the request buffers.
1846
1847The sense field is only present for scsi packet command requests,
1848and indicates the buffer for scsi sense data.
1849
1850The data_len field is only present for scsi packet command
1851requests, this field is deprecated, and should be ignored by the
1852driver. Historically, devices copied data length there.
1853
1854The sense_len field is only present for scsi packet command
1855requests and indicates the number of bytes actually written to
1856the sense buffer.
1857
1858The residual field is only present for scsi packet command
1859requests and indicates the residual size, calculated as data
1860length - number of bytes actually transferred.
1861
1862The final status byte is written by the device: either
1863VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest
1864error or VIRTIO_BLK_S_UNSUPP for a request unsupported by host:#define VIRTIO_BLK_S_OK 0
1865
1866#define VIRTIO_BLK_S_IOERR 1
1867
1868#define VIRTIO_BLK_S_UNSUPP 2
1869
1870Historically, devices assumed that the fields type, ioprio and
1871sector reside in a single, separate read-only buffer; the fields
1872errors, data_len, sense_len and residual reside in a single,
1873separate write-only buffer; the sense field in a separate
1874write-only buffer of size 96 bytes, by itself; the fields errors,
1875data_len, sense_len and residual in a single write-only buffer;
1876and the status field is a separate write-only buffer of size 1
1877byte, by itself.
1878
1879Appendix E: Console Device
1880
1881The virtio console device is a simple device for data input and
1882output. A device may have one or more ports. Each port has a pair
1883of input and output virtqueues. Moreover, a device has a pair of
1884control IO virtqueues. The control virtqueues are used to
1885communicate information between the device and the driver about
1886ports being opened and closed on either side of the connection,
1887indication from the host about whether a particular port is a
1888console port, adding new ports, port hot-plug/unplug, etc., and
1889indication from the guest about whether a port or a device was
1890successfully added, port open/close, etc. For data IO, one or
1891more empty buffers are placed in the receive queue for incoming
1892data and outgoing characters are placed in the transmit queue.
1893
1894 Configuration
1895
1896 Subsystem Device ID 3
1897
1898 Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control
1899 receiveq[footnote:
1900Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
1901], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1),
1902 ...
1903
1904 Feature bits
1905
1906 VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields
1907 are valid.
1908
1909 VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple
1910 ports; the configuration field max_nr_ports is
1911 valid and control virtqueues will be used.
1912
1913 Device configuration layout The size of the console is supplied
1914 in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature
1915 is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature
1916 is set, the maximum number of ports supported by the device can
1917 be fetched.struct virtio_console_config {
1918
1919 u16 cols;
1920
1921 u16 rows;
1922
1923
1924
1925 u32 max_nr_ports;
1926
1927};
1928
1929 Device Initialization
1930
1931 If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver
1932 can read the console dimensions from the configuration fields.
1933
1934 If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the
1935 driver can spawn multiple ports, not all of which may be
1936 attached to a console. Some could be generic ports. In this
1937 case, the control virtqueues are enabled and according to the
1938 max_nr_ports configuration-space value, the appropriate number
1939 of virtqueues are created. A control message indicating the
1940 driver is ready is sent to the host. The host can then send
1941 control messages for adding new ports to the device. After
1942 creating and initializing each port, a
1943 VIRTIO_CONSOLE_PORT_READY control message is sent to the host
1944 for that port so the host can let us know of any additional
1945 configuration options set for that port.
1946
1947 The receiveq for each port is populated with one or more
1948 receive buffers.
1949
1950 Device Operation
1951
1952 For output, a buffer containing the characters is placed in the
1953 port's transmitq.[footnote:
1954Because this is high importance and low bandwidth, the current
1955Linux implementation polls for the buffer to be used, rather than
1956waiting for an interrupt, simplifying the implementation
1957significantly. However, for generic serial ports with the
1958O_NONBLOCK flag set, the polling limitation is relaxed and the
1959consumed buffers are freed upon the next write or poll call or
1960when a port is closed or hot-unplugged.
1961]
1962
1963 When a buffer is used in the receiveq (signalled by an
1964 interrupt), the contents are the input to the port associated
1965 with the virtqueue for which the notification was received.
1966
1967 If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a
1968 configuration change interrupt may occur. The updated size can
1969 be read from the configuration fields.
1970
1971 If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT
1972 feature, active ports are announced by the host using the
1973 VIRTIO_CONSOLE_PORT_ADD control message. The same message is
1974 used for port hot-plug as well.
1975
1976 If the host specified a port `name', a sysfs attribute is
1977 created with the name filled in, so that udev rules can be
1978 written that can create a symlink from the port's name to the
1979 char device for port discovery by applications in the guest.
1980
1981 Changes to ports' state are effected by control messages.
1982 Appropriate action is taken on the port indicated in the
1983 control message. The layout of the structure of the control
1984 buffer and the events associated are:struct virtio_console_control {
1985
1986 uint32_t id; /* Port number */
1987
1988 uint16_t event; /* The kind of control event */
1989
1990 uint16_t value; /* Extra information for the event */
1991
1992};
1993
1994
1995
1996/* Some events for the internal messages (control packets) */
1997
1998
1999
2000#define VIRTIO_CONSOLE_DEVICE_READY 0
2001
2002#define VIRTIO_CONSOLE_PORT_ADD 1
2003
2004#define VIRTIO_CONSOLE_PORT_REMOVE 2
2005
2006#define VIRTIO_CONSOLE_PORT_READY 3
2007
2008#define VIRTIO_CONSOLE_CONSOLE_PORT 4
2009
2010#define VIRTIO_CONSOLE_RESIZE 5
2011
2012#define VIRTIO_CONSOLE_PORT_OPEN 6
2013
2014#define VIRTIO_CONSOLE_PORT_NAME 7
2015
2016Appendix F: Entropy Device
2017
2018The virtio entropy device supplies high-quality randomness for
2019guest use.
2020
2021 Configuration
2022
2023 Subsystem Device ID 4
2024
2025 Virtqueues 0:requestq.
2026
2027 Feature bits None currently defined
2028
2029 Device configuration layout None currently defined.
2030
2031 Device Initialization
2032
2033 The virtqueue is initialized
2034
2035 Device Operation
2036
2037When the driver requires random bytes, it places the descriptor
2038of one or more buffers in the queue. They will be completely filled
2039with random data by the device.
2040
2041Appendix G: Memory Balloon Device
2042
2043The virtio memory balloon device is a primitive device for
2044managing guest memory: the device asks for a certain amount of
2045memory, and the guest supplies it (or withdraws it, if the device
2046has more than it asks for). This allows the guest to adapt to
2047changes in allowance of underlying physical memory. If the
2048feature is negotiated, the device can also be used to communicate
2049guest memory statistics to the host.
2050
2051 Configuration
2052
2053 Subsystem Device ID 5
2054
2055 Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote:
2056Only if VIRTIO_BALLOON_F_STATS_VQ set
2057]
2058
2059 Feature bits
2060
2061 VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before
2062 pages from the balloon are used.
2063
2064 VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest
2065 memory statistics is present.
2066
2067 Device configuration layout Both fields of this configuration
2068 are always available. Note that they are little endian, despite
2069 convention that device fields are guest endian:struct virtio_balloon_config {
2070
2071 u32 num_pages;
2072
2073 u32 actual;
2074
2075};
2076
2077 Device Initialization
2078
2079 The inflate and deflate virtqueues are identified.
2080
2081 If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
2082
2083 Identify the stats virtqueue.
2084
2085 Add one empty buffer to the stats virtqueue and notify the
2086 host.
2087
2088Device operation begins immediately.
2089
2090 Device Operation
2091
2092 Memory Ballooning The device is driven by the receipt of a
2093 configuration change interrupt.
2094
2095 The “num_pages” configuration field is examined. If this is
2096 greater than the “actual” number of pages, memory must be given
2097 to the balloon. If it is less than the “actual” number of
2098 pages, memory may be taken back from the balloon for general
2099 use.
2100
2101 To supply memory to the balloon (aka. inflate):
2102
2103 The driver constructs an array of addresses of unused memory
2104 pages. These addresses are divided by 4096[footnote:
2105This is historical, and independent of the guest page size
2106] and the descriptor describing the resulting 32-bit array is
2107 added to the inflateq.
2108
2109 To remove memory from the balloon (aka. deflate):
2110
2111 The driver constructs an array of addresses of memory pages it
2112 has previously given to the balloon, as described above. This
2113 descriptor is added to the deflateq.
2114
2115 If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the
2116 guest may not use these requested pages until that descriptor
2117 in the deflateq has been used by the device.
2118
2119 Otherwise, the guest may begin to re-use pages previously given
2120 to the balloon before the device has acknowledged their
2121 withdrawal. [footnote:
2122In this case, deflation advice is merely a courtesy
2123]
2124
2125 In either case, once the device has completed the inflation or
2126 deflation, the “actual” field of the configuration should be
2127 updated to reflect the new number of pages in the balloon.[footnote:
2128As updates to configuration space are not atomic, this field
2129isn't particularly reliable, but can be used to diagnose buggy
2130guests.
2131]
2132
2133 Memory Statistics
2134
2135The stats virtqueue is atypical because communication is driven
2136by the device (not the driver). The channel becomes active at
2137driver initialization time when the driver adds an empty buffer
2138and notifies the device. A request for memory statistics proceeds
2139as follows:
2140
2141 The device pushes the buffer onto the used ring and sends an
2142 interrupt.
2143
2144 The driver pops the used buffer and discards it.
2145
2146 The driver collects memory statistics and writes them into a
2147 new buffer.
2148
2149 The driver adds the buffer to the virtqueue and notifies the
2150 device.
2151
2152 The device pops the buffer (retaining it to initiate a
2153 subsequent request) and consumes the statistics.
2154
2155 Memory Statistics Format Each statistic consists of a 16 bit
2156 tag and a 64 bit value. Both quantities are represented in the
2157 native endian of the guest. All statistics are optional and the
2158 driver may choose which ones to supply. To guarantee backwards
2159 compatibility, unsupported statistics should be omitted.
2160
2161 struct virtio_balloon_stat {
2162
2163#define VIRTIO_BALLOON_S_SWAP_IN 0
2164
2165#define VIRTIO_BALLOON_S_SWAP_OUT 1
2166
2167#define VIRTIO_BALLOON_S_MAJFLT 2
2168
2169#define VIRTIO_BALLOON_S_MINFLT 3
2170
2171#define VIRTIO_BALLOON_S_MEMFREE 4
2172
2173#define VIRTIO_BALLOON_S_MEMTOT 5
2174
2175 u16 tag;
2176
2177 u64 val;
2178
2179} __attribute__((packed));
2180
2181 Tags
2182
2183 VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been
2184 swapped in (in bytes).
2185
2186 VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been
2187 swapped out to disk (in bytes).
2188
2189 VIRTIO_BALLOON_S_MAJFLT The number of major page faults that
2190 have occurred.
2191
2192 VIRTIO_BALLOON_S_MINFLT The number of minor page faults that
2193 have occurred.
2194
2195 VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used
2196 for any purpose (in bytes).
2197
2198 VIRTIO_BALLOON_S_MEMTOT The total amount of memory available
2199 (in bytes).
2200