diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/ABI/testing/sysfs-class-scsi_host | 13 | ||||
-rw-r--r-- | Documentation/DocBook/media/v4l/controls.xml | 38 | ||||
-rw-r--r-- | Documentation/PCI/MSI-HOWTO.txt | 89 | ||||
-rw-r--r-- | Documentation/block/cfq-iosched.txt | 71 | ||||
-rw-r--r-- | Documentation/cgroups/memory.txt | 85 | ||||
-rw-r--r-- | Documentation/feature-removal-schedule.txt | 8 | ||||
-rw-r--r-- | Documentation/hwmon/coretemp | 14 | ||||
-rw-r--r-- | Documentation/hwmon/max16065 | 7 | ||||
-rw-r--r-- | Documentation/ioctl/ioctl-number.txt | 2 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 25 | ||||
-rw-r--r-- | Documentation/networking/00-INDEX | 116 | ||||
-rw-r--r-- | Documentation/networking/dmfe.txt | 3 | ||||
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 6 | ||||
-rw-r--r-- | Documentation/networking/scaling.txt | 35 | ||||
-rw-r--r-- | Documentation/power/runtime_pm.txt | 3 | ||||
-rw-r--r-- | Documentation/virtual/00-INDEX | 3 | ||||
-rw-r--r-- | Documentation/virtual/lguest/lguest.c | 3 | ||||
-rw-r--r-- | Documentation/virtual/virtio-spec.txt | 2200 | ||||
-rw-r--r-- | Documentation/vm/transhuge.txt | 7 |
19 files changed, 2538 insertions, 190 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-scsi_host b/Documentation/ABI/testing/sysfs-class-scsi_host new file mode 100644 index 000000000000..29a4f892e433 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-scsi_host | |||
@@ -0,0 +1,13 @@ | |||
1 | What: /sys/class/scsi_host/hostX/isci_id | ||
2 | Date: June 2011 | ||
3 | Contact: Dave Jiang <dave.jiang@intel.com> | ||
4 | Description: | ||
5 | This file contains the enumerated host ID for the Intel | ||
6 | SCU controller. The Intel(R) C600 Series Chipset SATA/SAS | ||
7 | Storage Control Unit embeds up to two 4-port controllers in | ||
8 | a single PCI device. The controllers are enumerated in order | ||
9 | which usually means the lowest number scsi_host corresponds | ||
10 | with the first controller, but this association is not | ||
11 | guaranteed. The 'isci_id' attribute unambiguously identifies | ||
12 | the controller index: '0' for the first controller, | ||
13 | '1' for the second. | ||
diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml index 85164016ed26..23fdf79f8cf3 100644 --- a/Documentation/DocBook/media/v4l/controls.xml +++ b/Documentation/DocBook/media/v4l/controls.xml | |||
@@ -1455,7 +1455,7 @@ Applicable to the H264 encoder.</entry> | |||
1455 | </row> | 1455 | </row> |
1456 | 1456 | ||
1457 | <row><entry></entry></row> | 1457 | <row><entry></entry></row> |
1458 | <row> | 1458 | <row id="v4l2-mpeg-video-h264-vui-sar-idc"> |
1459 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC</constant> </entry> | 1459 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC</constant> </entry> |
1460 | <entry>enum v4l2_mpeg_video_h264_vui_sar_idc</entry> | 1460 | <entry>enum v4l2_mpeg_video_h264_vui_sar_idc</entry> |
1461 | </row> | 1461 | </row> |
@@ -1561,7 +1561,7 @@ Applicable to the H264 encoder.</entry> | |||
1561 | </row> | 1561 | </row> |
1562 | 1562 | ||
1563 | <row><entry></entry></row> | 1563 | <row><entry></entry></row> |
1564 | <row> | 1564 | <row id="v4l2-mpeg-video-h264-level"> |
1565 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LEVEL</constant> </entry> | 1565 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LEVEL</constant> </entry> |
1566 | <entry>enum v4l2_mpeg_video_h264_level</entry> | 1566 | <entry>enum v4l2_mpeg_video_h264_level</entry> |
1567 | </row> | 1567 | </row> |
@@ -1641,7 +1641,7 @@ Possible values are:</entry> | |||
1641 | </row> | 1641 | </row> |
1642 | 1642 | ||
1643 | <row><entry></entry></row> | 1643 | <row><entry></entry></row> |
1644 | <row> | 1644 | <row id="v4l2-mpeg-video-mpeg4-level"> |
1645 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL</constant> </entry> | 1645 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL</constant> </entry> |
1646 | <entry>enum v4l2_mpeg_video_mpeg4_level</entry> | 1646 | <entry>enum v4l2_mpeg_video_mpeg4_level</entry> |
1647 | </row> | 1647 | </row> |
@@ -1689,9 +1689,9 @@ Possible values are:</entry> | |||
1689 | </row> | 1689 | </row> |
1690 | 1690 | ||
1691 | <row><entry></entry></row> | 1691 | <row><entry></entry></row> |
1692 | <row> | 1692 | <row id="v4l2-mpeg-video-h264-profile"> |
1693 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_PROFILE</constant> </entry> | 1693 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_PROFILE</constant> </entry> |
1694 | <entry>enum v4l2_mpeg_h264_profile</entry> | 1694 | <entry>enum v4l2_mpeg_video_h264_profile</entry> |
1695 | </row> | 1695 | </row> |
1696 | <row><entry spanname="descr">The profile information for H264. | 1696 | <row><entry spanname="descr">The profile information for H264. |
1697 | Applicable to the H264 encoder. | 1697 | Applicable to the H264 encoder. |
@@ -1774,9 +1774,9 @@ Possible values are:</entry> | |||
1774 | </row> | 1774 | </row> |
1775 | 1775 | ||
1776 | <row><entry></entry></row> | 1776 | <row><entry></entry></row> |
1777 | <row> | 1777 | <row id="v4l2-mpeg-video-mpeg4-profile"> |
1778 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE</constant> </entry> | 1778 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE</constant> </entry> |
1779 | <entry>enum v4l2_mpeg_mpeg4_profile</entry> | 1779 | <entry>enum v4l2_mpeg_video_mpeg4_profile</entry> |
1780 | </row> | 1780 | </row> |
1781 | <row><entry spanname="descr">The profile information for MPEG4. | 1781 | <row><entry spanname="descr">The profile information for MPEG4. |
1782 | Applicable to the MPEG4 encoder. | 1782 | Applicable to the MPEG4 encoder. |
@@ -1820,9 +1820,9 @@ Applicable to the encoder. | |||
1820 | </row> | 1820 | </row> |
1821 | 1821 | ||
1822 | <row><entry></entry></row> | 1822 | <row><entry></entry></row> |
1823 | <row> | 1823 | <row id="v4l2-mpeg-video-multi-slice-mode"> |
1824 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE</constant> </entry> | 1824 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE</constant> </entry> |
1825 | <entry>enum v4l2_mpeg_multi_slice_mode</entry> | 1825 | <entry>enum v4l2_mpeg_video_multi_slice_mode</entry> |
1826 | </row> | 1826 | </row> |
1827 | <row><entry spanname="descr">Determines how the encoder should handle division of frame into slices. | 1827 | <row><entry spanname="descr">Determines how the encoder should handle division of frame into slices. |
1828 | Applicable to the encoder. | 1828 | Applicable to the encoder. |
@@ -1868,9 +1868,9 @@ Applicable to the encoder.</entry> | |||
1868 | </row> | 1868 | </row> |
1869 | 1869 | ||
1870 | <row><entry></entry></row> | 1870 | <row><entry></entry></row> |
1871 | <row> | 1871 | <row id="v4l2-mpeg-video-h264-loop-filter-mode"> |
1872 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE</constant> </entry> | 1872 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE</constant> </entry> |
1873 | <entry>enum v4l2_mpeg_h264_loop_filter_mode</entry> | 1873 | <entry>enum v4l2_mpeg_video_h264_loop_filter_mode</entry> |
1874 | </row> | 1874 | </row> |
1875 | <row><entry spanname="descr">Loop filter mode for H264 encoder. | 1875 | <row><entry spanname="descr">Loop filter mode for H264 encoder. |
1876 | Possible values are:</entry> | 1876 | Possible values are:</entry> |
@@ -1913,9 +1913,9 @@ Applicable to the H264 encoder.</entry> | |||
1913 | </row> | 1913 | </row> |
1914 | 1914 | ||
1915 | <row><entry></entry></row> | 1915 | <row><entry></entry></row> |
1916 | <row> | 1916 | <row id="v4l2-mpeg-video-h264-entropy-mode"> |
1917 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE</constant> </entry> | 1917 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE</constant> </entry> |
1918 | <entry>enum v4l2_mpeg_h264_symbol_mode</entry> | 1918 | <entry>enum v4l2_mpeg_video_h264_entropy_mode</entry> |
1919 | </row> | 1919 | </row> |
1920 | <row><entry spanname="descr">Entropy coding mode for H264 - CABAC/CAVALC. | 1920 | <row><entry spanname="descr">Entropy coding mode for H264 - CABAC/CAVALC. |
1921 | Applicable to the H264 encoder. | 1921 | Applicable to the H264 encoder. |
@@ -2140,9 +2140,9 @@ previous frames. Applicable to the H264 encoder.</entry> | |||
2140 | </row> | 2140 | </row> |
2141 | 2141 | ||
2142 | <row><entry></entry></row> | 2142 | <row><entry></entry></row> |
2143 | <row> | 2143 | <row id="v4l2-mpeg-video-header-mode"> |
2144 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_HEADER_MODE</constant> </entry> | 2144 | <entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_HEADER_MODE</constant> </entry> |
2145 | <entry>enum v4l2_mpeg_header_mode</entry> | 2145 | <entry>enum v4l2_mpeg_video_header_mode</entry> |
2146 | </row> | 2146 | </row> |
2147 | <row><entry spanname="descr">Determines whether the header is returned as the first buffer or is | 2147 | <row><entry spanname="descr">Determines whether the header is returned as the first buffer or is |
2148 | it returned together with the first frame. Applicable to encoders. | 2148 | it returned together with the first frame. Applicable to encoders. |
@@ -2320,9 +2320,9 @@ Valid only when H.264 and macroblock level RC is enabled (<constant>V4L2_CID_MPE | |||
2320 | Applicable to the H264 encoder.</entry> | 2320 | Applicable to the H264 encoder.</entry> |
2321 | </row> | 2321 | </row> |
2322 | <row><entry></entry></row> | 2322 | <row><entry></entry></row> |
2323 | <row> | 2323 | <row id="v4l2-mpeg-mfc51-video-frame-skip-mode"> |
2324 | <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FRAME_SKIP_MODE</constant> </entry> | 2324 | <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FRAME_SKIP_MODE</constant> </entry> |
2325 | <entry>enum v4l2_mpeg_mfc51_frame_skip_mode</entry> | 2325 | <entry>enum v4l2_mpeg_mfc51_video_frame_skip_mode</entry> |
2326 | </row> | 2326 | </row> |
2327 | <row><entry spanname="descr"> | 2327 | <row><entry spanname="descr"> |
2328 | Indicates in what conditions the encoder should skip frames. If encoding a frame would cause the encoded stream to be larger then | 2328 | Indicates in what conditions the encoder should skip frames. If encoding a frame would cause the encoded stream to be larger then |
@@ -2361,9 +2361,9 @@ the stream will meet tight bandwidth contraints. Applicable to encoders. | |||
2361 | </entry> | 2361 | </entry> |
2362 | </row> | 2362 | </row> |
2363 | <row><entry></entry></row> | 2363 | <row><entry></entry></row> |
2364 | <row> | 2364 | <row id="v4l2-mpeg-mfc51-video-force-frame-type"> |
2365 | <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FORCE_FRAME_TYPE</constant> </entry> | 2365 | <entry spanname="id"><constant>V4L2_CID_MPEG_MFC51_VIDEO_FORCE_FRAME_TYPE</constant> </entry> |
2366 | <entry>enum v4l2_mpeg_mfc51_force_frame_type</entry> | 2366 | <entry>enum v4l2_mpeg_mfc51_video_force_frame_type</entry> |
2367 | </row> | 2367 | </row> |
2368 | <row><entry spanname="descr">Force a frame type for the next queued buffer. Applicable to encoders. | 2368 | <row><entry spanname="descr">Force a frame type for the next queued buffer. Applicable to encoders. |
2369 | Possible values are:</entry> | 2369 | Possible values are:</entry> |
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 3f5e0b09bed5..53e6fca146d7 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt | |||
@@ -45,7 +45,7 @@ arrived in memory (this becomes more likely with devices behind PCI-PCI | |||
45 | bridges). In order to ensure that all the data has arrived in memory, | 45 | bridges). In order to ensure that all the data has arrived in memory, |
46 | the interrupt handler must read a register on the device which raised | 46 | the interrupt handler must read a register on the device which raised |
47 | the interrupt. PCI transaction ordering rules require that all the data | 47 | the interrupt. PCI transaction ordering rules require that all the data |
48 | arrives in memory before the value can be returned from the register. | 48 | arrive in memory before the value may be returned from the register. |
49 | Using MSIs avoids this problem as the interrupt-generating write cannot | 49 | Using MSIs avoids this problem as the interrupt-generating write cannot |
50 | pass the data writes, so by the time the interrupt is raised, the driver | 50 | pass the data writes, so by the time the interrupt is raised, the driver |
51 | knows that all the data has arrived in memory. | 51 | knows that all the data has arrived in memory. |
@@ -86,13 +86,13 @@ device. | |||
86 | 86 | ||
87 | int pci_enable_msi(struct pci_dev *dev) | 87 | int pci_enable_msi(struct pci_dev *dev) |
88 | 88 | ||
89 | A successful call will allocate ONE interrupt to the device, regardless | 89 | A successful call allocates ONE interrupt to the device, regardless |
90 | of how many MSIs the device supports. The device will be switched from | 90 | of how many MSIs the device supports. The device is switched from |
91 | pin-based interrupt mode to MSI mode. The dev->irq number is changed | 91 | pin-based interrupt mode to MSI mode. The dev->irq number is changed |
92 | to a new number which represents the message signaled interrupt. | 92 | to a new number which represents the message signaled interrupt; |
93 | This function should be called before the driver calls request_irq() | 93 | consequently, this function should be called before the driver calls |
94 | since enabling MSIs disables the pin-based IRQ and the driver will not | 94 | request_irq(), because an MSI is delivered via a vector that is |
95 | receive interrupts on the old interrupt. | 95 | different from the vector of a pin-based interrupt. |
96 | 96 | ||
97 | 4.2.2 pci_enable_msi_block | 97 | 4.2.2 pci_enable_msi_block |
98 | 98 | ||
@@ -111,20 +111,20 @@ the device are in the range dev->irq to dev->irq + count - 1. | |||
111 | 111 | ||
112 | If this function returns a negative number, it indicates an error and | 112 | If this function returns a negative number, it indicates an error and |
113 | the driver should not attempt to request any more MSI interrupts for | 113 | the driver should not attempt to request any more MSI interrupts for |
114 | this device. If this function returns a positive number, it will be | 114 | this device. If this function returns a positive number, it is |
115 | less than 'count' and indicate the number of interrupts that could have | 115 | less than 'count' and indicates the number of interrupts that could have |
116 | been allocated. In neither case will the irq value have been | 116 | been allocated. In neither case is the irq value updated or the device |
117 | updated, nor will the device have been switched into MSI mode. | 117 | switched into MSI mode. |
118 | 118 | ||
119 | The device driver must decide what action to take if | 119 | The device driver must decide what action to take if |
120 | pci_enable_msi_block() returns a value less than the number asked for. | 120 | pci_enable_msi_block() returns a value less than the number requested. |
121 | Some devices can make use of fewer interrupts than the maximum they | 121 | For instance, the driver could still make use of fewer interrupts; |
122 | request; in this case the driver should call pci_enable_msi_block() | 122 | in this case the driver should call pci_enable_msi_block() |
123 | again. Note that it is not guaranteed to succeed, even when the | 123 | again. Note that it is not guaranteed to succeed, even when the |
124 | 'count' has been reduced to the value returned from a previous call to | 124 | 'count' has been reduced to the value returned from a previous call to |
125 | pci_enable_msi_block(). This is because there are multiple constraints | 125 | pci_enable_msi_block(). This is because there are multiple constraints |
126 | on the number of vectors that can be allocated; pci_enable_msi_block() | 126 | on the number of vectors that can be allocated; pci_enable_msi_block() |
127 | will return as soon as it finds any constraint that doesn't allow the | 127 | returns as soon as it finds any constraint that doesn't allow the |
128 | call to succeed. | 128 | call to succeed. |
129 | 129 | ||
130 | 4.2.3 pci_disable_msi | 130 | 4.2.3 pci_disable_msi |
@@ -137,10 +137,10 @@ interrupt number and frees the previously allocated message signaled | |||
137 | interrupt(s). The interrupt may subsequently be assigned to another | 137 | interrupt(s). The interrupt may subsequently be assigned to another |
138 | device, so drivers should not cache the value of dev->irq. | 138 | device, so drivers should not cache the value of dev->irq. |
139 | 139 | ||
140 | A device driver must always call free_irq() on the interrupt(s) | 140 | Before calling this function, a device driver must always call free_irq() |
141 | for which it has called request_irq() before calling this function. | 141 | on any interrupt for which it previously called request_irq(). |
142 | Failure to do so will result in a BUG_ON(), the device will be left with | 142 | Failure to do so results in a BUG_ON(), leaving the device with |
143 | MSI enabled and will leak its vector. | 143 | MSI enabled and thus leaking its vector. |
144 | 144 | ||
145 | 4.3 Using MSI-X | 145 | 4.3 Using MSI-X |
146 | 146 | ||
@@ -155,10 +155,10 @@ struct msix_entry { | |||
155 | }; | 155 | }; |
156 | 156 | ||
157 | This allows for the device to use these interrupts in a sparse fashion; | 157 | This allows for the device to use these interrupts in a sparse fashion; |
158 | for example it could use interrupts 3 and 1027 and allocate only a | 158 | for example, it could use interrupts 3 and 1027 and yet allocate only a |
159 | two-element array. The driver is expected to fill in the 'entry' value | 159 | two-element array. The driver is expected to fill in the 'entry' value |
160 | in each element of the array to indicate which entries it wants the kernel | 160 | in each element of the array to indicate for which entries the kernel |
161 | to assign interrupts for. It is invalid to fill in two entries with the | 161 | should assign interrupts; it is invalid to fill in two entries with the |
162 | same number. | 162 | same number. |
163 | 163 | ||
164 | 4.3.1 pci_enable_msix | 164 | 4.3.1 pci_enable_msix |
@@ -168,10 +168,11 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) | |||
168 | Calling this function asks the PCI subsystem to allocate 'nvec' MSIs. | 168 | Calling this function asks the PCI subsystem to allocate 'nvec' MSIs. |
169 | The 'entries' argument is a pointer to an array of msix_entry structs | 169 | The 'entries' argument is a pointer to an array of msix_entry structs |
170 | which should be at least 'nvec' entries in size. On success, the | 170 | which should be at least 'nvec' entries in size. On success, the |
171 | function will return 0 and the device will have been switched into | 171 | device is switched into MSI-X mode and the function returns 0. |
172 | MSI-X interrupt mode. The 'vector' elements in each entry will have | 172 | The 'vector' member in each entry is populated with the interrupt number; |
173 | been filled in with the interrupt number. The driver should then call | 173 | the driver should then call request_irq() for each 'vector' that it |
174 | request_irq() for each 'vector' that it decides to use. | 174 | decides to use. The device driver is responsible for keeping track of the |
175 | interrupts assigned to the MSI-X vectors so it can free them again later. | ||
175 | 176 | ||
176 | If this function returns a negative number, it indicates an error and | 177 | If this function returns a negative number, it indicates an error and |
177 | the driver should not attempt to allocate any more MSI-X interrupts for | 178 | the driver should not attempt to allocate any more MSI-X interrupts for |
@@ -181,16 +182,14 @@ below. | |||
181 | 182 | ||
182 | This function, in contrast with pci_enable_msi(), does not adjust | 183 | This function, in contrast with pci_enable_msi(), does not adjust |
183 | dev->irq. The device will not generate interrupts for this interrupt | 184 | dev->irq. The device will not generate interrupts for this interrupt |
184 | number once MSI-X is enabled. The device driver is responsible for | 185 | number once MSI-X is enabled. |
185 | keeping track of the interrupts assigned to the MSI-X vectors so it can | ||
186 | free them again later. | ||
187 | 186 | ||
188 | Device drivers should normally call this function once per device | 187 | Device drivers should normally call this function once per device |
189 | during the initialization phase. | 188 | during the initialization phase. |
190 | 189 | ||
191 | It is ideal if drivers can cope with a variable number of MSI-X interrupts, | 190 | It is ideal if drivers can cope with a variable number of MSI-X interrupts; |
192 | there are many reasons why the platform may not be able to provide the | 191 | there are many reasons why the platform may not be able to provide the |
193 | exact number a driver asks for. | 192 | exact number that a driver asks for. |
194 | 193 | ||
195 | A request loop to achieve that might look like: | 194 | A request loop to achieve that might look like: |
196 | 195 | ||
@@ -212,15 +211,15 @@ static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) | |||
212 | 211 | ||
213 | void pci_disable_msix(struct pci_dev *dev) | 212 | void pci_disable_msix(struct pci_dev *dev) |
214 | 213 | ||
215 | This API should be used to undo the effect of pci_enable_msix(). It frees | 214 | This function should be used to undo the effect of pci_enable_msix(). It frees |
216 | the previously allocated message signaled interrupts. The interrupts may | 215 | the previously allocated message signaled interrupts. The interrupts may |
217 | subsequently be assigned to another device, so drivers should not cache | 216 | subsequently be assigned to another device, so drivers should not cache |
218 | the value of the 'vector' elements over a call to pci_disable_msix(). | 217 | the value of the 'vector' elements over a call to pci_disable_msix(). |
219 | 218 | ||
220 | A device driver must always call free_irq() on the interrupt(s) | 219 | Before calling this function, a device driver must always call free_irq() |
221 | for which it has called request_irq() before calling this function. | 220 | on any interrupt for which it previously called request_irq(). |
222 | Failure to do so will result in a BUG_ON(), the device will be left with | 221 | Failure to do so results in a BUG_ON(), leaving the device with |
223 | MSI enabled and will leak its vector. | 222 | MSI-X enabled and thus leaking its vector. |
224 | 223 | ||
225 | 4.3.3 The MSI-X Table | 224 | 4.3.3 The MSI-X Table |
226 | 225 | ||
@@ -232,10 +231,10 @@ mask or unmask an interrupt, it should call disable_irq() / enable_irq(). | |||
232 | 4.4 Handling devices implementing both MSI and MSI-X capabilities | 231 | 4.4 Handling devices implementing both MSI and MSI-X capabilities |
233 | 232 | ||
234 | If a device implements both MSI and MSI-X capabilities, it can | 233 | If a device implements both MSI and MSI-X capabilities, it can |
235 | run in either MSI mode or MSI-X mode but not both simultaneously. | 234 | run in either MSI mode or MSI-X mode, but not both simultaneously. |
236 | This is a requirement of the PCI spec, and it is enforced by the | 235 | This is a requirement of the PCI spec, and it is enforced by the |
237 | PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or | 236 | PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or |
238 | pci_enable_msix() when MSI is already enabled will result in an error. | 237 | pci_enable_msix() when MSI is already enabled results in an error. |
239 | If a device driver wishes to switch between MSI and MSI-X at runtime, | 238 | If a device driver wishes to switch between MSI and MSI-X at runtime, |
240 | it must first quiesce the device, then switch it back to pin-interrupt | 239 | it must first quiesce the device, then switch it back to pin-interrupt |
241 | mode, before calling pci_enable_msi() or pci_enable_msix() and resuming | 240 | mode, before calling pci_enable_msi() or pci_enable_msix() and resuming |
@@ -251,7 +250,7 @@ the MSI-X facilities in preference to the MSI facilities. As mentioned | |||
251 | above, MSI-X supports any number of interrupts between 1 and 2048. | 250 | above, MSI-X supports any number of interrupts between 1 and 2048. |
252 | In constrast, MSI is restricted to a maximum of 32 interrupts (and | 251 | In constrast, MSI is restricted to a maximum of 32 interrupts (and |
253 | must be a power of two). In addition, the MSI interrupt vectors must | 252 | must be a power of two). In addition, the MSI interrupt vectors must |
254 | be allocated consecutively, so the system may not be able to allocate | 253 | be allocated consecutively, so the system might not be able to allocate |
255 | as many vectors for MSI as it could for MSI-X. On some platforms, MSI | 254 | as many vectors for MSI as it could for MSI-X. On some platforms, MSI |
256 | interrupts must all be targeted at the same set of CPUs whereas MSI-X | 255 | interrupts must all be targeted at the same set of CPUs whereas MSI-X |
257 | interrupts can all be targeted at different CPUs. | 256 | interrupts can all be targeted at different CPUs. |
@@ -281,7 +280,7 @@ disabled to enabled and back again. | |||
281 | 280 | ||
282 | Using 'lspci -v' (as root) may show some devices with "MSI", "Message | 281 | Using 'lspci -v' (as root) may show some devices with "MSI", "Message |
283 | Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities | 282 | Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities |
284 | has an 'Enable' flag which will be followed with either "+" (enabled) | 283 | has an 'Enable' flag which is followed with either "+" (enabled) |
285 | or "-" (disabled). | 284 | or "-" (disabled). |
286 | 285 | ||
287 | 286 | ||
@@ -298,7 +297,7 @@ The PCI stack provides three ways to disable MSIs: | |||
298 | 297 | ||
299 | Some host chipsets simply don't support MSIs properly. If we're | 298 | Some host chipsets simply don't support MSIs properly. If we're |
300 | lucky, the manufacturer knows this and has indicated it in the ACPI | 299 | lucky, the manufacturer knows this and has indicated it in the ACPI |
301 | FADT table. In this case, Linux will automatically disable MSIs. | 300 | FADT table. In this case, Linux automatically disables MSIs. |
302 | Some boards don't include this information in the table and so we have | 301 | Some boards don't include this information in the table and so we have |
303 | to detect them ourselves. The complete list of these is found near the | 302 | to detect them ourselves. The complete list of these is found near the |
304 | quirk_disable_all_msi() function in drivers/pci/quirks.c. | 303 | quirk_disable_all_msi() function in drivers/pci/quirks.c. |
@@ -317,7 +316,7 @@ Some bridges allow you to enable MSIs by changing some bits in their | |||
317 | PCI configuration space (especially the Hypertransport chipsets such | 316 | PCI configuration space (especially the Hypertransport chipsets such |
318 | as the nVidia nForce and Serverworks HT2000). As with host chipsets, | 317 | as the nVidia nForce and Serverworks HT2000). As with host chipsets, |
319 | Linux mostly knows about them and automatically enables MSIs if it can. | 318 | Linux mostly knows about them and automatically enables MSIs if it can. |
320 | If you have a bridge which Linux doesn't yet know about, you can enable | 319 | If you have a bridge unknown to Linux, you can enable |
321 | MSIs in configuration space using whatever method you know works, then | 320 | MSIs in configuration space using whatever method you know works, then |
322 | enable MSIs on that bridge by doing: | 321 | enable MSIs on that bridge by doing: |
323 | 322 | ||
@@ -327,7 +326,7 @@ where $bridge is the PCI address of the bridge you've enabled (eg | |||
327 | 0000:00:0e.0). | 326 | 0000:00:0e.0). |
328 | 327 | ||
329 | To disable MSIs, echo 0 instead of 1. Changing this value should be | 328 | To disable MSIs, echo 0 instead of 1. Changing this value should be |
330 | done with caution as it can break interrupt handling for all devices | 329 | done with caution as it could break interrupt handling for all devices |
331 | below this bridge. | 330 | below this bridge. |
332 | 331 | ||
333 | Again, please notify linux-pci@vger.kernel.org of any bridges that need | 332 | Again, please notify linux-pci@vger.kernel.org of any bridges that need |
@@ -336,7 +335,7 @@ special handling. | |||
336 | 5.3. Disabling MSIs on a single device | 335 | 5.3. Disabling MSIs on a single device |
337 | 336 | ||
338 | Some devices are known to have faulty MSI implementations. Usually this | 337 | Some devices are known to have faulty MSI implementations. Usually this |
339 | is handled in the individual device driver but occasionally it's necessary | 338 | is handled in the individual device driver, but occasionally it's necessary |
340 | to handle this with a quirk. Some drivers have an option to disable use | 339 | to handle this with a quirk. Some drivers have an option to disable use |
341 | of MSI. While this is a convenient workaround for the driver author, | 340 | of MSI. While this is a convenient workaround for the driver author, |
342 | it is not good practise, and should not be emulated. | 341 | it is not good practise, and should not be emulated. |
@@ -350,7 +349,7 @@ for your machine. You should also check your .config to be sure you | |||
350 | have enabled CONFIG_PCI_MSI. | 349 | have enabled CONFIG_PCI_MSI. |
351 | 350 | ||
352 | Then, 'lspci -t' gives the list of bridges above a device. Reading | 351 | Then, 'lspci -t' gives the list of bridges above a device. Reading |
353 | /sys/bus/pci/devices/*/msi_bus will tell you whether MSI are enabled (1) | 352 | /sys/bus/pci/devices/*/msi_bus will tell you whether MSIs are enabled (1) |
354 | or disabled (0). If 0 is found in any of the msi_bus files belonging | 353 | or disabled (0). If 0 is found in any of the msi_bus files belonging |
355 | to bridges between the PCI root and the device, MSIs are disabled. | 354 | to bridges between the PCI root and the device, MSIs are disabled. |
356 | 355 | ||
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt index e578feed6d81..6d670f570451 100644 --- a/Documentation/block/cfq-iosched.txt +++ b/Documentation/block/cfq-iosched.txt | |||
@@ -43,3 +43,74 @@ If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches | |||
43 | to IOPS mode and starts providing fairness in terms of number of requests | 43 | to IOPS mode and starts providing fairness in terms of number of requests |
44 | dispatched. Note that this mode switching takes effect only for group | 44 | dispatched. Note that this mode switching takes effect only for group |
45 | scheduling. For non-cgroup users nothing should change. | 45 | scheduling. For non-cgroup users nothing should change. |
46 | |||
47 | CFQ IO scheduler Idling Theory | ||
48 | =============================== | ||
49 | Idling on a queue is primarily about waiting for the next request to come | ||
50 | on same queue after completion of a request. In this process CFQ will not | ||
51 | dispatch requests from other cfq queues even if requests are pending there. | ||
52 | |||
53 | The rationale behind idling is that it can cut down on number of seeks | ||
54 | on rotational media. For example, if a process is doing dependent | ||
55 | sequential reads (next read will come on only after completion of previous | ||
56 | one), then not dispatching request from other queue should help as we | ||
57 | did not move the disk head and kept on dispatching sequential IO from | ||
58 | one queue. | ||
59 | |||
60 | CFQ has following service trees and various queues are put on these trees. | ||
61 | |||
62 | sync-idle sync-noidle async | ||
63 | |||
64 | All cfq queues doing synchronous sequential IO go on to sync-idle tree. | ||
65 | On this tree we idle on each queue individually. | ||
66 | |||
67 | All synchronous non-sequential queues go on sync-noidle tree. Also any | ||
68 | request which are marked with REQ_NOIDLE go on this service tree. On this | ||
69 | tree we do not idle on individual queues instead idle on the whole group | ||
70 | of queues or the tree. So if there are 4 queues waiting for IO to dispatch | ||
71 | we will idle only once last queue has dispatched the IO and there is | ||
72 | no more IO on this service tree. | ||
73 | |||
74 | All async writes go on async service tree. There is no idling on async | ||
75 | queues. | ||
76 | |||
77 | CFQ has some optimizations for SSDs and if it detects a non-rotational | ||
78 | media which can support higher queue depth (multiple requests at in | ||
79 | flight at a time), then it cuts down on idling of individual queues and | ||
80 | all the queues move to sync-noidle tree and only tree idle remains. This | ||
81 | tree idling provides isolation with buffered write queues on async tree. | ||
82 | |||
83 | FAQ | ||
84 | === | ||
85 | Q1. Why to idle at all on queues marked with REQ_NOIDLE. | ||
86 | |||
87 | A1. We only do tree idle (all queues on sync-noidle tree) on queues marked | ||
88 | with REQ_NOIDLE. This helps in providing isolation with all the sync-idle | ||
89 | queues. Otherwise in presence of many sequential readers, other | ||
90 | synchronous IO might not get fair share of disk. | ||
91 | |||
92 | For example, if there are 10 sequential readers doing IO and they get | ||
93 | 100ms each. If a REQ_NOIDLE request comes in, it will be scheduled | ||
94 | roughly after 1 second. If after completion of REQ_NOIDLE request we | ||
95 | do not idle, and after a couple of milli seconds a another REQ_NOIDLE | ||
96 | request comes in, again it will be scheduled after 1second. Repeat it | ||
97 | and notice how a workload can lose its disk share and suffer due to | ||
98 | multiple sequential readers. | ||
99 | |||
100 | fsync can generate dependent IO where bunch of data is written in the | ||
101 | context of fsync, and later some journaling data is written. Journaling | ||
102 | data comes in only after fsync has finished its IO (atleast for ext4 | ||
103 | that seemed to be the case). Now if one decides not to idle on fsync | ||
104 | thread due to REQ_NOIDLE, then next journaling write will not get | ||
105 | scheduled for another second. A process doing small fsync, will suffer | ||
106 | badly in presence of multiple sequential readers. | ||
107 | |||
108 | Hence doing tree idling on threads using REQ_NOIDLE flag on requests | ||
109 | provides isolation from multiple sequential readers and at the same | ||
110 | time we do not idle on individual threads. | ||
111 | |||
112 | Q2. When to specify REQ_NOIDLE | ||
113 | A2. I would think whenever one is doing synchronous write and not expecting | ||
114 | more writes to be dispatched from same context soon, should be able | ||
115 | to specify REQ_NOIDLE on writes and that probably should work well for | ||
116 | most of the cases. | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6f3c598971fc..06eb6d957c83 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -380,7 +380,7 @@ will be charged as a new owner of it. | |||
380 | 380 | ||
381 | 5.2 stat file | 381 | 5.2 stat file |
382 | 382 | ||
383 | 5.2.1 memory.stat file includes following statistics | 383 | memory.stat file includes following statistics |
384 | 384 | ||
385 | # per-memory cgroup local status | 385 | # per-memory cgroup local status |
386 | cache - # of bytes of page cache memory. | 386 | cache - # of bytes of page cache memory. |
@@ -438,89 +438,6 @@ Note: | |||
438 | file_mapped is accounted only when the memory cgroup is owner of page | 438 | file_mapped is accounted only when the memory cgroup is owner of page |
439 | cache.) | 439 | cache.) |
440 | 440 | ||
441 | 5.2.2 memory.vmscan_stat | ||
442 | |||
443 | memory.vmscan_stat includes statistics information for memory scanning and | ||
444 | freeing, reclaiming. The statistics shows memory scanning information since | ||
445 | memory cgroup creation and can be reset to 0 by writing 0 as | ||
446 | |||
447 | #echo 0 > ../memory.vmscan_stat | ||
448 | |||
449 | This file contains following statistics. | ||
450 | |||
451 | [param]_[file_or_anon]_pages_by_[reason]_[under_heararchy] | ||
452 | [param]_elapsed_ns_by_[reason]_[under_hierarchy] | ||
453 | |||
454 | For example, | ||
455 | |||
456 | scanned_file_pages_by_limit indicates the number of scanned | ||
457 | file pages at vmscan. | ||
458 | |||
459 | Now, 3 parameters are supported | ||
460 | |||
461 | scanned - the number of pages scanned by vmscan | ||
462 | rotated - the number of pages activated at vmscan | ||
463 | freed - the number of pages freed by vmscan | ||
464 | |||
465 | If "rotated" is high against scanned/freed, the memcg seems busy. | ||
466 | |||
467 | Now, 2 reason are supported | ||
468 | |||
469 | limit - the memory cgroup's limit | ||
470 | system - global memory pressure + softlimit | ||
471 | (global memory pressure not under softlimit is not handled now) | ||
472 | |||
473 | When under_hierarchy is added in the tail, the number indicates the | ||
474 | total memcg scan of its children and itself. | ||
475 | |||
476 | elapsed_ns is a elapsed time in nanosecond. This may include sleep time | ||
477 | and not indicates CPU usage. So, please take this as just showing | ||
478 | latency. | ||
479 | |||
480 | Here is an example. | ||
481 | |||
482 | # cat /cgroup/memory/A/memory.vmscan_stat | ||
483 | scanned_pages_by_limit 9471864 | ||
484 | scanned_anon_pages_by_limit 6640629 | ||
485 | scanned_file_pages_by_limit 2831235 | ||
486 | rotated_pages_by_limit 4243974 | ||
487 | rotated_anon_pages_by_limit 3971968 | ||
488 | rotated_file_pages_by_limit 272006 | ||
489 | freed_pages_by_limit 2318492 | ||
490 | freed_anon_pages_by_limit 962052 | ||
491 | freed_file_pages_by_limit 1356440 | ||
492 | elapsed_ns_by_limit 351386416101 | ||
493 | scanned_pages_by_system 0 | ||
494 | scanned_anon_pages_by_system 0 | ||
495 | scanned_file_pages_by_system 0 | ||
496 | rotated_pages_by_system 0 | ||
497 | rotated_anon_pages_by_system 0 | ||
498 | rotated_file_pages_by_system 0 | ||
499 | freed_pages_by_system 0 | ||
500 | freed_anon_pages_by_system 0 | ||
501 | freed_file_pages_by_system 0 | ||
502 | elapsed_ns_by_system 0 | ||
503 | scanned_pages_by_limit_under_hierarchy 9471864 | ||
504 | scanned_anon_pages_by_limit_under_hierarchy 6640629 | ||
505 | scanned_file_pages_by_limit_under_hierarchy 2831235 | ||
506 | rotated_pages_by_limit_under_hierarchy 4243974 | ||
507 | rotated_anon_pages_by_limit_under_hierarchy 3971968 | ||
508 | rotated_file_pages_by_limit_under_hierarchy 272006 | ||
509 | freed_pages_by_limit_under_hierarchy 2318492 | ||
510 | freed_anon_pages_by_limit_under_hierarchy 962052 | ||
511 | freed_file_pages_by_limit_under_hierarchy 1356440 | ||
512 | elapsed_ns_by_limit_under_hierarchy 351386416101 | ||
513 | scanned_pages_by_system_under_hierarchy 0 | ||
514 | scanned_anon_pages_by_system_under_hierarchy 0 | ||
515 | scanned_file_pages_by_system_under_hierarchy 0 | ||
516 | rotated_pages_by_system_under_hierarchy 0 | ||
517 | rotated_anon_pages_by_system_under_hierarchy 0 | ||
518 | rotated_file_pages_by_system_under_hierarchy 0 | ||
519 | freed_pages_by_system_under_hierarchy 0 | ||
520 | freed_anon_pages_by_system_under_hierarchy 0 | ||
521 | freed_file_pages_by_system_under_hierarchy 0 | ||
522 | elapsed_ns_by_system_under_hierarchy 0 | ||
523 | |||
524 | 5.3 swappiness | 441 | 5.3 swappiness |
525 | 442 | ||
526 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. | 443 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index c4a6e148732a..4dc465477665 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -592,3 +592,11 @@ Why: In 3.0, we can now autodetect internal 3G device and already have | |||
592 | interface that was used by acer-wmi driver. It will replaced by | 592 | interface that was used by acer-wmi driver. It will replaced by |
593 | information log when acer-wmi initial. | 593 | information log when acer-wmi initial. |
594 | Who: Lee, Chun-Yi <jlee@novell.com> | 594 | Who: Lee, Chun-Yi <jlee@novell.com> |
595 | |||
596 | ---------------------------- | ||
597 | What: The XFS nodelaylog mount option | ||
598 | When: 3.3 | ||
599 | Why: The delaylog mode that has been the default since 2.6.39 has proven | ||
600 | stable, and the old code is in the way of additional improvements in | ||
601 | the log code. | ||
602 | Who: Christoph Hellwig <hch@lst.de> | ||
diff --git a/Documentation/hwmon/coretemp b/Documentation/hwmon/coretemp index fa8776ab9b18..84d46c0c71a3 100644 --- a/Documentation/hwmon/coretemp +++ b/Documentation/hwmon/coretemp | |||
@@ -35,13 +35,6 @@ the Out-Of-Spec bit. Following table summarizes the exported sysfs files: | |||
35 | All Sysfs entries are named with their core_id (represented here by 'X'). | 35 | All Sysfs entries are named with their core_id (represented here by 'X'). |
36 | tempX_input - Core temperature (in millidegrees Celsius). | 36 | tempX_input - Core temperature (in millidegrees Celsius). |
37 | tempX_max - All cooling devices should be turned on (on Core2). | 37 | tempX_max - All cooling devices should be turned on (on Core2). |
38 | Initialized with IA32_THERM_INTERRUPT. When the CPU | ||
39 | temperature reaches this temperature, an interrupt is | ||
40 | generated and tempX_max_alarm is set. | ||
41 | tempX_max_hyst - If the CPU temperature falls below than temperature, | ||
42 | an interrupt is generated and tempX_max_alarm is reset. | ||
43 | tempX_max_alarm - Set if the temperature reaches or exceeds tempX_max. | ||
44 | Reset if the temperature drops to or below tempX_max_hyst. | ||
45 | tempX_crit - Maximum junction temperature (in millidegrees Celsius). | 38 | tempX_crit - Maximum junction temperature (in millidegrees Celsius). |
46 | tempX_crit_alarm - Set when Out-of-spec bit is set, never clears. | 39 | tempX_crit_alarm - Set when Out-of-spec bit is set, never clears. |
47 | Correct CPU operation is no longer guaranteed. | 40 | Correct CPU operation is no longer guaranteed. |
@@ -49,9 +42,10 @@ tempX_label - Contains string "Core X", where X is processor | |||
49 | number. For Package temp, this will be "Physical id Y", | 42 | number. For Package temp, this will be "Physical id Y", |
50 | where Y is the package number. | 43 | where Y is the package number. |
51 | 44 | ||
52 | The TjMax temperature is set to 85 degrees C if undocumented model specific | 45 | On CPU models which support it, TjMax is read from a model-specific register. |
53 | register (UMSR) 0xee has bit 30 set. If not the TjMax is 100 degrees C as | 46 | On other models, it is set to an arbitrary value based on weak heuristics. |
54 | (sometimes) documented in processor datasheet. | 47 | If these heuristics don't work for you, you can pass the correct TjMax value |
48 | as a module parameter (tjmax). | ||
55 | 49 | ||
56 | Appendix A. Known TjMax lists (TBD): | 50 | Appendix A. Known TjMax lists (TBD): |
57 | Some information comes from ark.intel.com | 51 | Some information comes from ark.intel.com |
diff --git a/Documentation/hwmon/max16065 b/Documentation/hwmon/max16065 index 44b4f61e04f9..c11f64a1f2ad 100644 --- a/Documentation/hwmon/max16065 +++ b/Documentation/hwmon/max16065 | |||
@@ -62,6 +62,13 @@ can be safely used to identify the chip. You will have to instantiate | |||
62 | the devices explicitly. Please see Documentation/i2c/instantiating-devices for | 62 | the devices explicitly. Please see Documentation/i2c/instantiating-devices for |
63 | details. | 63 | details. |
64 | 64 | ||
65 | WARNING: Do not access chip registers using the i2cdump command, and do not use | ||
66 | any of the i2ctools commands on a command register (0xa5 to 0xac). The chips | ||
67 | supported by this driver interpret any access to a command register (including | ||
68 | read commands) as request to execute the command in question. This may result in | ||
69 | power loss, board resets, and/or Flash corruption. Worst case, your board may | ||
70 | turn into a brick. | ||
71 | |||
65 | 72 | ||
66 | Sysfs entries | 73 | Sysfs entries |
67 | ------------- | 74 | ------------- |
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 845a191004b1..54078ed96b37 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt | |||
@@ -319,4 +319,6 @@ Code Seq#(hex) Include File Comments | |||
319 | <mailto:thomas@winischhofer.net> | 319 | <mailto:thomas@winischhofer.net> |
320 | 0xF4 00-1F video/mbxfb.h mbxfb | 320 | 0xF4 00-1F video/mbxfb.h mbxfb |
321 | <mailto:raph@8d.com> | 321 | <mailto:raph@8d.com> |
322 | 0xF6 all LTTng Linux Trace Toolkit Next Generation | ||
323 | <mailto:mathieu.desnoyers@efficios.com> | ||
322 | 0xFD all linux/dm-ioctl.h | 324 | 0xFD all linux/dm-ioctl.h |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6ca1f5cb71e0..d6e6724446c8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1350,9 +1350,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1350 | it is equivalent to "nosmp", which also disables | 1350 | it is equivalent to "nosmp", which also disables |
1351 | the IO APIC. | 1351 | the IO APIC. |
1352 | 1352 | ||
1353 | max_loop= [LOOP] Maximum number of loopback devices that can | 1353 | max_loop= [LOOP] The number of loop block devices that get |
1354 | be mounted | 1354 | (loop.max_loop) unconditionally pre-created at init time. The default |
1355 | Format: <1-256> | 1355 | number is configured by BLK_DEV_LOOP_MIN_COUNT. Instead |
1356 | of statically allocating a predefined number, loop | ||
1357 | devices can be requested on-demand with the | ||
1358 | /dev/loop-control interface. | ||
1356 | 1359 | ||
1357 | mcatest= [IA-64] | 1360 | mcatest= [IA-64] |
1358 | 1361 | ||
@@ -2083,9 +2086,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2083 | Override pmtimer IOPort with a hex value. | 2086 | Override pmtimer IOPort with a hex value. |
2084 | e.g. pmtmr=0x508 | 2087 | e.g. pmtmr=0x508 |
2085 | 2088 | ||
2086 | pnp.debug [PNP] | 2089 | pnp.debug=1 [PNP] |
2087 | Enable PNP debug messages. This depends on the | 2090 | Enable PNP debug messages (depends on the |
2088 | CONFIG_PNP_DEBUG_MESSAGES option. | 2091 | CONFIG_PNP_DEBUG_MESSAGES option). Change at run-time |
2092 | via /sys/module/pnp/parameters/debug. We always show | ||
2093 | current resource usage; turning this on also shows | ||
2094 | possible settings and some assignment information. | ||
2089 | 2095 | ||
2090 | pnpacpi= [ACPI] | 2096 | pnpacpi= [ACPI] |
2091 | { off } | 2097 | { off } |
@@ -2700,10 +2706,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2700 | functions are at fixed addresses, they make nice | 2706 | functions are at fixed addresses, they make nice |
2701 | targets for exploits that can control RIP. | 2707 | targets for exploits that can control RIP. |
2702 | 2708 | ||
2703 | emulate [default] Vsyscalls turn into traps and are | 2709 | emulate Vsyscalls turn into traps and are emulated |
2704 | emulated reasonably safely. | 2710 | reasonably safely. |
2705 | 2711 | ||
2706 | native Vsyscalls are native syscall instructions. | 2712 | native [default] Vsyscalls are native syscall |
2713 | instructions. | ||
2707 | This is a little bit faster than trapping | 2714 | This is a little bit faster than trapping |
2708 | and makes a few dynamic recompilers work | 2715 | and makes a few dynamic recompilers work |
2709 | better than they would in emulation mode. | 2716 | better than they would in emulation mode. |
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index 4edd78dfb362..bbce1215434a 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX | |||
@@ -1,13 +1,21 @@ | |||
1 | 00-INDEX | 1 | 00-INDEX |
2 | - this file | 2 | - this file |
3 | 3c359.txt | ||
4 | - information on the 3Com TokenLink Velocity XL (3c5359) driver. | ||
3 | 3c505.txt | 5 | 3c505.txt |
4 | - information on the 3Com EtherLink Plus (3c505) driver. | 6 | - information on the 3Com EtherLink Plus (3c505) driver. |
7 | 3c509.txt | ||
8 | - information on the 3Com Etherlink III Series Ethernet cards. | ||
5 | 6pack.txt | 9 | 6pack.txt |
6 | - info on the 6pack protocol, an alternative to KISS for AX.25 | 10 | - info on the 6pack protocol, an alternative to KISS for AX.25 |
7 | DLINK.txt | 11 | DLINK.txt |
8 | - info on the D-Link DE-600/DE-620 parallel port pocket adapters | 12 | - info on the D-Link DE-600/DE-620 parallel port pocket adapters |
9 | PLIP.txt | 13 | PLIP.txt |
10 | - PLIP: The Parallel Line Internet Protocol device driver | 14 | - PLIP: The Parallel Line Internet Protocol device driver |
15 | README.ipw2100 | ||
16 | - README for the Intel PRO/Wireless 2100 driver. | ||
17 | README.ipw2200 | ||
18 | - README for the Intel PRO/Wireless 2915ABG and 2200BG driver. | ||
11 | README.sb1000 | 19 | README.sb1000 |
12 | - info on General Instrument/NextLevel SURFboard1000 cable modem. | 20 | - info on General Instrument/NextLevel SURFboard1000 cable modem. |
13 | alias.txt | 21 | alias.txt |
@@ -20,8 +28,12 @@ atm.txt | |||
20 | - info on where to get ATM programs and support for Linux. | 28 | - info on where to get ATM programs and support for Linux. |
21 | ax25.txt | 29 | ax25.txt |
22 | - info on using AX.25 and NET/ROM code for Linux | 30 | - info on using AX.25 and NET/ROM code for Linux |
31 | batman-adv.txt | ||
32 | - B.A.T.M.A.N routing protocol on top of layer 2 Ethernet Frames. | ||
23 | baycom.txt | 33 | baycom.txt |
24 | - info on the driver for Baycom style amateur radio modems | 34 | - info on the driver for Baycom style amateur radio modems |
35 | bonding.txt | ||
36 | - Linux Ethernet Bonding Driver HOWTO: link aggregation in Linux. | ||
25 | bridge.txt | 37 | bridge.txt |
26 | - where to get user space programs for ethernet bridging with Linux. | 38 | - where to get user space programs for ethernet bridging with Linux. |
27 | can.txt | 39 | can.txt |
@@ -34,32 +46,60 @@ cxacru.txt | |||
34 | - Conexant AccessRunner USB ADSL Modem | 46 | - Conexant AccessRunner USB ADSL Modem |
35 | cxacru-cf.py | 47 | cxacru-cf.py |
36 | - Conexant AccessRunner USB ADSL Modem configuration file parser | 48 | - Conexant AccessRunner USB ADSL Modem configuration file parser |
49 | cxgb.txt | ||
50 | - Release Notes for the Chelsio N210 Linux device driver. | ||
51 | dccp.txt | ||
52 | - the Datagram Congestion Control Protocol (DCCP) (RFC 4340..42). | ||
37 | de4x5.txt | 53 | de4x5.txt |
38 | - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver | 54 | - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver |
39 | decnet.txt | 55 | decnet.txt |
40 | - info on using the DECnet networking layer in Linux. | 56 | - info on using the DECnet networking layer in Linux. |
41 | depca.txt | 57 | depca.txt |
42 | - the Digital DEPCA/EtherWORKS DE1?? and DE2?? LANCE Ethernet driver | 58 | - the Digital DEPCA/EtherWORKS DE1?? and DE2?? LANCE Ethernet driver |
59 | dl2k.txt | ||
60 | - README for D-Link DL2000-based Gigabit Ethernet Adapters (dl2k.ko). | ||
61 | dm9000.txt | ||
62 | - README for the Simtec DM9000 Network driver. | ||
43 | dmfe.txt | 63 | dmfe.txt |
44 | - info on the Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver. | 64 | - info on the Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver. |
65 | dns_resolver.txt | ||
66 | - The DNS resolver module allows kernel servies to make DNS queries. | ||
67 | driver.txt | ||
68 | - Softnet driver issues. | ||
45 | e100.txt | 69 | e100.txt |
46 | - info on Intel's EtherExpress PRO/100 line of 10/100 boards | 70 | - info on Intel's EtherExpress PRO/100 line of 10/100 boards |
47 | e1000.txt | 71 | e1000.txt |
48 | - info on Intel's E1000 line of gigabit ethernet boards | 72 | - info on Intel's E1000 line of gigabit ethernet boards |
73 | e1000e.txt | ||
74 | - README for the Intel Gigabit Ethernet Driver (e1000e). | ||
49 | eql.txt | 75 | eql.txt |
50 | - serial IP load balancing | 76 | - serial IP load balancing |
51 | ewrk3.txt | 77 | ewrk3.txt |
52 | - the Digital EtherWORKS 3 DE203/4/5 Ethernet driver | 78 | - the Digital EtherWORKS 3 DE203/4/5 Ethernet driver |
79 | fib_trie.txt | ||
80 | - Level Compressed Trie (LC-trie) notes: a structure for routing. | ||
53 | filter.txt | 81 | filter.txt |
54 | - Linux Socket Filtering | 82 | - Linux Socket Filtering |
55 | fore200e.txt | 83 | fore200e.txt |
56 | - FORE Systems PCA-200E/SBA-200E ATM NIC driver info. | 84 | - FORE Systems PCA-200E/SBA-200E ATM NIC driver info. |
57 | framerelay.txt | 85 | framerelay.txt |
58 | - info on using Frame Relay/Data Link Connection Identifier (DLCI). | 86 | - info on using Frame Relay/Data Link Connection Identifier (DLCI). |
87 | gen_stats.txt | ||
88 | - Generic networking statistics for netlink users. | ||
89 | generic_hdlc.txt | ||
90 | - The generic High Level Data Link Control (HDLC) layer. | ||
59 | generic_netlink.txt | 91 | generic_netlink.txt |
60 | - info on Generic Netlink | 92 | - info on Generic Netlink |
93 | gianfar.txt | ||
94 | - Gianfar Ethernet Driver. | ||
61 | ieee802154.txt | 95 | ieee802154.txt |
62 | - Linux IEEE 802.15.4 implementation, API and drivers | 96 | - Linux IEEE 802.15.4 implementation, API and drivers |
97 | ifenslave.c | ||
98 | - Configure network interfaces for parallel routing (bonding). | ||
99 | igb.txt | ||
100 | - README for the Intel Gigabit Ethernet Driver (igb). | ||
101 | igbvf.txt | ||
102 | - README for the Intel Gigabit Ethernet Driver (igbvf). | ||
63 | ip-sysctl.txt | 103 | ip-sysctl.txt |
64 | - /proc/sys/net/ipv4/* variables | 104 | - /proc/sys/net/ipv4/* variables |
65 | ip_dynaddr.txt | 105 | ip_dynaddr.txt |
@@ -68,41 +108,117 @@ ipddp.txt | |||
68 | - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation | 108 | - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation |
69 | iphase.txt | 109 | iphase.txt |
70 | - Interphase PCI ATM (i)Chip IA Linux driver info. | 110 | - Interphase PCI ATM (i)Chip IA Linux driver info. |
111 | ipv6.txt | ||
112 | - Options to the ipv6 kernel module. | ||
113 | ipvs-sysctl.txt | ||
114 | - Per-inode explanation of the /proc/sys/net/ipv4/vs interface. | ||
71 | irda.txt | 115 | irda.txt |
72 | - where to get IrDA (infrared) utilities and info for Linux. | 116 | - where to get IrDA (infrared) utilities and info for Linux. |
117 | ixgb.txt | ||
118 | - README for the Intel 10 Gigabit Ethernet Driver (ixgb). | ||
119 | ixgbe.txt | ||
120 | - README for the Intel 10 Gigabit Ethernet Driver (ixgbe). | ||
121 | ixgbevf.txt | ||
122 | - README for the Intel Virtual Function (VF) Driver (ixgbevf). | ||
123 | l2tp.txt | ||
124 | - User guide to the L2TP tunnel protocol. | ||
73 | lapb-module.txt | 125 | lapb-module.txt |
74 | - programming information of the LAPB module. | 126 | - programming information of the LAPB module. |
75 | ltpc.txt | 127 | ltpc.txt |
76 | - the Apple or Farallon LocalTalk PC card driver | 128 | - the Apple or Farallon LocalTalk PC card driver |
129 | mac80211-injection.txt | ||
130 | - HOWTO use packet injection with mac80211 | ||
77 | multicast.txt | 131 | multicast.txt |
78 | - Behaviour of cards under Multicast | 132 | - Behaviour of cards under Multicast |
133 | multiqueue.txt | ||
134 | - HOWTO for multiqueue network device support. | ||
135 | netconsole.txt | ||
136 | - The network console module netconsole.ko: configuration and notes. | ||
137 | netdev-features.txt | ||
138 | - Network interface features API description. | ||
79 | netdevices.txt | 139 | netdevices.txt |
80 | - info on network device driver functions exported to the kernel. | 140 | - info on network device driver functions exported to the kernel. |
141 | netif-msg.txt | ||
142 | - Design of the network interface message level setting (NETIF_MSG_*). | ||
143 | nfc.txt | ||
144 | - The Linux Near Field Communication (NFS) subsystem. | ||
81 | olympic.txt | 145 | olympic.txt |
82 | - IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info. | 146 | - IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info. |
147 | operstates.txt | ||
148 | - Overview of network interface operational states. | ||
149 | packet_mmap.txt | ||
150 | - User guide to memory mapped packet socket rings (PACKET_[RT]X_RING). | ||
151 | phonet.txt | ||
152 | - The Phonet packet protocol used in Nokia cellular modems. | ||
153 | phy.txt | ||
154 | - The PHY abstraction layer. | ||
155 | pktgen.txt | ||
156 | - User guide to the kernel packet generator (pktgen.ko). | ||
83 | policy-routing.txt | 157 | policy-routing.txt |
84 | - IP policy-based routing | 158 | - IP policy-based routing |
159 | ppp_generic.txt | ||
160 | - Information about the generic PPP driver. | ||
161 | proc_net_tcp.txt | ||
162 | - Per inode overview of the /proc/net/tcp and /proc/net/tcp6 interfaces. | ||
163 | radiotap-headers.txt | ||
164 | - Background on radiotap headers. | ||
85 | ray_cs.txt | 165 | ray_cs.txt |
86 | - Raylink Wireless LAN card driver info. | 166 | - Raylink Wireless LAN card driver info. |
167 | rds.txt | ||
168 | - Background on the reliable, ordered datagram delivery method RDS. | ||
169 | regulatory.txt | ||
170 | - Overview of the Linux wireless regulatory infrastructure. | ||
171 | rxrpc.txt | ||
172 | - Guide to the RxRPC protocol. | ||
173 | s2io.txt | ||
174 | - Release notes for Neterion Xframe I/II 10GbE driver. | ||
175 | scaling.txt | ||
176 | - Explanation of network scaling techniques: RSS, RPS, RFS, aRFS, XPS. | ||
177 | sctp.txt | ||
178 | - Notes on the Linux kernel implementation of the SCTP protocol. | ||
179 | secid.txt | ||
180 | - Explanation of the secid member in flow structures. | ||
87 | skfp.txt | 181 | skfp.txt |
88 | - SysKonnect FDDI (SK-5xxx, Compaq Netelligent) driver info. | 182 | - SysKonnect FDDI (SK-5xxx, Compaq Netelligent) driver info. |
89 | smc9.txt | 183 | smc9.txt |
90 | - the driver for SMC's 9000 series of Ethernet cards | 184 | - the driver for SMC's 9000 series of Ethernet cards |
91 | smctr.txt | 185 | smctr.txt |
92 | - SMC TokenCard TokenRing Linux driver info. | 186 | - SMC TokenCard TokenRing Linux driver info. |
187 | spider-net.txt | ||
188 | - README for the Spidernet Driver (as found in PS3 / Cell BE). | ||
189 | stmmac.txt | ||
190 | - README for the STMicro Synopsys Ethernet driver. | ||
191 | tc-actions-env-rules.txt | ||
192 | - rules for traffic control (tc) actions. | ||
193 | timestamping.txt | ||
194 | - overview of network packet timestamping variants. | ||
93 | tcp.txt | 195 | tcp.txt |
94 | - short blurb on how TCP output takes place. | 196 | - short blurb on how TCP output takes place. |
197 | tcp-thin.txt | ||
198 | - kernel tuning options for low rate 'thin' TCP streams. | ||
95 | tlan.txt | 199 | tlan.txt |
96 | - ThunderLAN (Compaq Netelligent 10/100, Olicom OC-2xxx) driver info. | 200 | - ThunderLAN (Compaq Netelligent 10/100, Olicom OC-2xxx) driver info. |
97 | tms380tr.txt | 201 | tms380tr.txt |
98 | - SysKonnect Token Ring ISA/PCI adapter driver info. | 202 | - SysKonnect Token Ring ISA/PCI adapter driver info. |
203 | tproxy.txt | ||
204 | - Transparent proxy support user guide. | ||
99 | tuntap.txt | 205 | tuntap.txt |
100 | - TUN/TAP device driver, allowing user space Rx/Tx of packets. | 206 | - TUN/TAP device driver, allowing user space Rx/Tx of packets. |
207 | udplite.txt | ||
208 | - UDP-Lite protocol (RFC 3828) introduction. | ||
101 | vortex.txt | 209 | vortex.txt |
102 | - info on using 3Com Vortex (3c590, 3c592, 3c595, 3c597) Ethernet cards. | 210 | - info on using 3Com Vortex (3c590, 3c592, 3c595, 3c597) Ethernet cards. |
211 | vxge.txt | ||
212 | - README for the Neterion X3100 PCIe Server Adapter. | ||
103 | x25.txt | 213 | x25.txt |
104 | - general info on X.25 development. | 214 | - general info on X.25 development. |
105 | x25-iface.txt | 215 | x25-iface.txt |
106 | - description of the X.25 Packet Layer to LAPB device interface. | 216 | - description of the X.25 Packet Layer to LAPB device interface. |
217 | xfrm_proc.txt | ||
218 | - description of the statistics package for XFRM. | ||
219 | xfrm_sync.txt | ||
220 | - sync patches for XFRM enable migration of an SA between hosts. | ||
221 | xfrm_sysctl.txt | ||
222 | - description of the XFRM configuration options. | ||
107 | z8530drv.txt | 223 | z8530drv.txt |
108 | - info about Linux driver for Z8530 based HDLC cards for AX.25 | 224 | - info about Linux driver for Z8530 based HDLC cards for AX.25 |
diff --git a/Documentation/networking/dmfe.txt b/Documentation/networking/dmfe.txt index 8006c227fda2..25320bf19c86 100644 --- a/Documentation/networking/dmfe.txt +++ b/Documentation/networking/dmfe.txt | |||
@@ -1,3 +1,5 @@ | |||
1 | Note: This driver doesn't have a maintainer. | ||
2 | |||
1 | Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver for Linux. | 3 | Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver for Linux. |
2 | 4 | ||
3 | This program is free software; you can redistribute it and/or | 5 | This program is free software; you can redistribute it and/or |
@@ -55,7 +57,6 @@ Test and make sure PCI latency is now correct for all cases. | |||
55 | Authors: | 57 | Authors: |
56 | 58 | ||
57 | Sten Wang <sten_wang@davicom.com.tw > : Original Author | 59 | Sten Wang <sten_wang@davicom.com.tw > : Original Author |
58 | Tobias Ringstrom <tori@unhappy.mine.nu> : Current Maintainer | ||
59 | 60 | ||
60 | Contributors: | 61 | Contributors: |
61 | 62 | ||
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index db2a4067013c..ca5cdcd0f0e3 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -992,7 +992,7 @@ bindv6only - BOOLEAN | |||
992 | TRUE: disable IPv4-mapped address feature | 992 | TRUE: disable IPv4-mapped address feature |
993 | FALSE: enable IPv4-mapped address feature | 993 | FALSE: enable IPv4-mapped address feature |
994 | 994 | ||
995 | Default: FALSE (as specified in RFC2553bis) | 995 | Default: FALSE (as specified in RFC3493) |
996 | 996 | ||
997 | IPv6 Fragmentation: | 997 | IPv6 Fragmentation: |
998 | 998 | ||
@@ -1042,7 +1042,7 @@ conf/interface/*: | |||
1042 | The functional behaviour for certain settings is different | 1042 | The functional behaviour for certain settings is different |
1043 | depending on whether local forwarding is enabled or not. | 1043 | depending on whether local forwarding is enabled or not. |
1044 | 1044 | ||
1045 | accept_ra - BOOLEAN | 1045 | accept_ra - INTEGER |
1046 | Accept Router Advertisements; autoconfigure using them. | 1046 | Accept Router Advertisements; autoconfigure using them. |
1047 | 1047 | ||
1048 | Possible values are: | 1048 | Possible values are: |
@@ -1106,7 +1106,7 @@ dad_transmits - INTEGER | |||
1106 | The amount of Duplicate Address Detection probes to send. | 1106 | The amount of Duplicate Address Detection probes to send. |
1107 | Default: 1 | 1107 | Default: 1 |
1108 | 1108 | ||
1109 | forwarding - BOOLEAN | 1109 | forwarding - INTEGER |
1110 | Configure interface-specific Host/Router behaviour. | 1110 | Configure interface-specific Host/Router behaviour. |
1111 | 1111 | ||
1112 | Note: It is recommended to have the same setting on all | 1112 | Note: It is recommended to have the same setting on all |
diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt index 7254b4b5910e..fe67b5c79f0f 100644 --- a/Documentation/networking/scaling.txt +++ b/Documentation/networking/scaling.txt | |||
@@ -27,7 +27,7 @@ applying a filter to each packet that assigns it to one of a small number | |||
27 | of logical flows. Packets for each flow are steered to a separate receive | 27 | of logical flows. Packets for each flow are steered to a separate receive |
28 | queue, which in turn can be processed by separate CPUs. This mechanism is | 28 | queue, which in turn can be processed by separate CPUs. This mechanism is |
29 | generally known as “Receive-side Scaling” (RSS). The goal of RSS and | 29 | generally known as “Receive-side Scaling” (RSS). The goal of RSS and |
30 | the other scaling techniques to increase performance uniformly. | 30 | the other scaling techniques is to increase performance uniformly. |
31 | Multi-queue distribution can also be used for traffic prioritization, but | 31 | Multi-queue distribution can also be used for traffic prioritization, but |
32 | that is not the focus of these techniques. | 32 | that is not the focus of these techniques. |
33 | 33 | ||
@@ -52,7 +52,8 @@ module parameter for specifying the number of hardware queues to | |||
52 | configure. In the bnx2x driver, for instance, this parameter is called | 52 | configure. In the bnx2x driver, for instance, this parameter is called |
53 | num_queues. A typical RSS configuration would be to have one receive queue | 53 | num_queues. A typical RSS configuration would be to have one receive queue |
54 | for each CPU if the device supports enough queues, or otherwise at least | 54 | for each CPU if the device supports enough queues, or otherwise at least |
55 | one for each cache domain at a particular cache level (L1, L2, etc.). | 55 | one for each memory domain, where a memory domain is a set of CPUs that |
56 | share a particular memory level (L1, L2, NUMA node, etc.). | ||
56 | 57 | ||
57 | The indirection table of an RSS device, which resolves a queue by masked | 58 | The indirection table of an RSS device, which resolves a queue by masked |
58 | hash, is usually programmed by the driver at initialization. The | 59 | hash, is usually programmed by the driver at initialization. The |
@@ -82,11 +83,17 @@ RSS should be enabled when latency is a concern or whenever receive | |||
82 | interrupt processing forms a bottleneck. Spreading load between CPUs | 83 | interrupt processing forms a bottleneck. Spreading load between CPUs |
83 | decreases queue length. For low latency networking, the optimal setting | 84 | decreases queue length. For low latency networking, the optimal setting |
84 | is to allocate as many queues as there are CPUs in the system (or the | 85 | is to allocate as many queues as there are CPUs in the system (or the |
85 | NIC maximum, if lower). Because the aggregate number of interrupts grows | 86 | NIC maximum, if lower). The most efficient high-rate configuration |
86 | with each additional queue, the most efficient high-rate configuration | ||
87 | is likely the one with the smallest number of receive queues where no | 87 | is likely the one with the smallest number of receive queues where no |
88 | CPU that processes receive interrupts reaches 100% utilization. Per-cpu | 88 | receive queue overflows due to a saturated CPU, because in default |
89 | load can be observed using the mpstat utility. | 89 | mode with interrupt coalescing enabled, the aggregate number of |
90 | interrupts (and thus work) grows with each additional queue. | ||
91 | |||
92 | Per-cpu load can be observed using the mpstat utility, but note that on | ||
93 | processors with hyperthreading (HT), each hyperthread is represented as | ||
94 | a separate CPU. For interrupt handling, HT has shown no benefit in | ||
95 | initial tests, so limit the number of queues to the number of CPU cores | ||
96 | in the system. | ||
90 | 97 | ||
91 | 98 | ||
92 | RPS: Receive Packet Steering | 99 | RPS: Receive Packet Steering |
@@ -145,7 +152,7 @@ the bitmap. | |||
145 | == Suggested Configuration | 152 | == Suggested Configuration |
146 | 153 | ||
147 | For a single queue device, a typical RPS configuration would be to set | 154 | For a single queue device, a typical RPS configuration would be to set |
148 | the rps_cpus to the CPUs in the same cache domain of the interrupting | 155 | the rps_cpus to the CPUs in the same memory domain of the interrupting |
149 | CPU. If NUMA locality is not an issue, this could also be all CPUs in | 156 | CPU. If NUMA locality is not an issue, this could also be all CPUs in |
150 | the system. At high interrupt rate, it might be wise to exclude the | 157 | the system. At high interrupt rate, it might be wise to exclude the |
151 | interrupting CPU from the map since that already performs much work. | 158 | interrupting CPU from the map since that already performs much work. |
@@ -154,7 +161,7 @@ For a multi-queue system, if RSS is configured so that a hardware | |||
154 | receive queue is mapped to each CPU, then RPS is probably redundant | 161 | receive queue is mapped to each CPU, then RPS is probably redundant |
155 | and unnecessary. If there are fewer hardware queues than CPUs, then | 162 | and unnecessary. If there are fewer hardware queues than CPUs, then |
156 | RPS might be beneficial if the rps_cpus for each queue are the ones that | 163 | RPS might be beneficial if the rps_cpus for each queue are the ones that |
157 | share the same cache domain as the interrupting CPU for that queue. | 164 | share the same memory domain as the interrupting CPU for that queue. |
158 | 165 | ||
159 | 166 | ||
160 | RFS: Receive Flow Steering | 167 | RFS: Receive Flow Steering |
@@ -179,10 +186,10 @@ are steered using plain RPS. Multiple table entries may point to the | |||
179 | same CPU. Indeed, with many flows and few CPUs, it is very likely that | 186 | same CPU. Indeed, with many flows and few CPUs, it is very likely that |
180 | a single application thread handles flows with many different flow hashes. | 187 | a single application thread handles flows with many different flow hashes. |
181 | 188 | ||
182 | rps_sock_table is a global flow table that contains the *desired* CPU for | 189 | rps_sock_flow_table is a global flow table that contains the *desired* CPU |
183 | flows: the CPU that is currently processing the flow in userspace. Each | 190 | for flows: the CPU that is currently processing the flow in userspace. |
184 | table value is a CPU index that is updated during calls to recvmsg and | 191 | Each table value is a CPU index that is updated during calls to recvmsg |
185 | sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage() | 192 | and sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage() |
186 | and tcp_splice_read()). | 193 | and tcp_splice_read()). |
187 | 194 | ||
188 | When the scheduler moves a thread to a new CPU while it has outstanding | 195 | When the scheduler moves a thread to a new CPU while it has outstanding |
@@ -236,7 +243,7 @@ configured. The number of entries in the global flow table is set through: | |||
236 | 243 | ||
237 | The number of entries in the per-queue flow table are set through: | 244 | The number of entries in the per-queue flow table are set through: |
238 | 245 | ||
239 | /sys/class/net/<dev>/queues/tx-<n>/rps_flow_cnt | 246 | /sys/class/net/<dev>/queues/rx-<n>/rps_flow_cnt |
240 | 247 | ||
241 | == Suggested Configuration | 248 | == Suggested Configuration |
242 | 249 | ||
@@ -326,7 +333,7 @@ The queue chosen for transmitting a particular flow is saved in the | |||
326 | corresponding socket structure for the flow (e.g. a TCP connection). | 333 | corresponding socket structure for the flow (e.g. a TCP connection). |
327 | This transmit queue is used for subsequent packets sent on the flow to | 334 | This transmit queue is used for subsequent packets sent on the flow to |
328 | prevent out of order (ooo) packets. The choice also amortizes the cost | 335 | prevent out of order (ooo) packets. The choice also amortizes the cost |
329 | of calling get_xps_queues() over all packets in the connection. To avoid | 336 | of calling get_xps_queues() over all packets in the flow. To avoid |
330 | ooo packets, the queue for a flow can subsequently only be changed if | 337 | ooo packets, the queue for a flow can subsequently only be changed if |
331 | skb->ooo_okay is set for a packet in the flow. This flag indicates that | 338 | skb->ooo_okay is set for a packet in the flow. This flag indicates that |
332 | there are no outstanding packets in the flow, so the transmit queue can | 339 | there are no outstanding packets in the flow, so the transmit queue can |
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 4ce5450ab6e8..6066e3a6b9a9 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt | |||
@@ -431,8 +431,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: | |||
431 | 431 | ||
432 | void pm_runtime_irq_safe(struct device *dev); | 432 | void pm_runtime_irq_safe(struct device *dev); |
433 | - set the power.irq_safe flag for the device, causing the runtime-PM | 433 | - set the power.irq_safe flag for the device, causing the runtime-PM |
434 | suspend and resume callbacks (but not the idle callback) to be invoked | 434 | callbacks to be invoked with interrupts off |
435 | with interrupts disabled | ||
436 | 435 | ||
437 | void pm_runtime_mark_last_busy(struct device *dev); | 436 | void pm_runtime_mark_last_busy(struct device *dev); |
438 | - set the power.last_busy field to the current time | 437 | - set the power.last_busy field to the current time |
diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX index fe0251c4cfb7..8e601991d91c 100644 --- a/Documentation/virtual/00-INDEX +++ b/Documentation/virtual/00-INDEX | |||
@@ -8,3 +8,6 @@ lguest/ | |||
8 | - Extremely simple hypervisor for experimental/educational use. | 8 | - Extremely simple hypervisor for experimental/educational use. |
9 | uml/ | 9 | uml/ |
10 | - User Mode Linux, builds/runs Linux kernel as a userspace program. | 10 | - User Mode Linux, builds/runs Linux kernel as a userspace program. |
11 | virtio.txt | ||
12 | - Text version of draft virtio spec. | ||
13 | See http://ozlabs.org/~rusty/virtio-spec | ||
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index 043bd7df3139..d928c134dee6 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c | |||
@@ -1996,6 +1996,9 @@ int main(int argc, char *argv[]) | |||
1996 | /* We use a simple helper to copy the arguments separated by spaces. */ | 1996 | /* We use a simple helper to copy the arguments separated by spaces. */ |
1997 | concat((char *)(boot + 1), argv+optind+2); | 1997 | concat((char *)(boot + 1), argv+optind+2); |
1998 | 1998 | ||
1999 | /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */ | ||
2000 | boot->hdr.kernel_alignment = 0x1000000; | ||
2001 | |||
1999 | /* Boot protocol version: 2.07 supports the fields for lguest. */ | 2002 | /* Boot protocol version: 2.07 supports the fields for lguest. */ |
2000 | boot->hdr.version = 0x207; | 2003 | boot->hdr.version = 0x207; |
2001 | 2004 | ||
diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt new file mode 100644 index 000000000000..a350ae135b8c --- /dev/null +++ b/Documentation/virtual/virtio-spec.txt | |||
@@ -0,0 +1,2200 @@ | |||
1 | [Generated file: see http://ozlabs.org/~rusty/virtio-spec/] | ||
2 | Virtio PCI Card Specification | ||
3 | v0.9.1 DRAFT | ||
4 | - | ||
5 | |||
6 | Rusty Russell <rusty@rustcorp.com.au>IBM Corporation (Editor) | ||
7 | |||
8 | 2011 August 1. | ||
9 | |||
10 | Purpose and Description | ||
11 | |||
12 | This document describes the specifications of the “virtio” family | ||
13 | of PCI[LaTeX Command: nomenclature] devices. These are devices | ||
14 | are found in virtual environments[LaTeX Command: nomenclature], | ||
15 | yet by design they are not all that different from physical PCI | ||
16 | devices, and this document treats them as such. This allows the | ||
17 | guest to use standard PCI drivers and discovery mechanisms. | ||
18 | |||
19 | The purpose of virtio and this specification is that virtual | ||
20 | environments and guests should have a straightforward, efficient, | ||
21 | standard and extensible mechanism for virtual devices, rather | ||
22 | than boutique per-environment or per-OS mechanisms. | ||
23 | |||
24 | Straightforward: Virtio PCI devices use normal PCI mechanisms | ||
25 | of interrupts and DMA which should be familiar to any device | ||
26 | driver author. There is no exotic page-flipping or COW | ||
27 | mechanism: it's just a PCI device.[footnote: | ||
28 | This lack of page-sharing implies that the implementation of the | ||
29 | device (e.g. the hypervisor or host) needs full access to the | ||
30 | guest memory. Communication with untrusted parties (i.e. | ||
31 | inter-guest communication) requires copying. | ||
32 | ] | ||
33 | |||
34 | Efficient: Virtio PCI devices consist of rings of descriptors | ||
35 | for input and output, which are neatly separated to avoid cache | ||
36 | effects from both guest and device writing to the same cache | ||
37 | lines. | ||
38 | |||
39 | Standard: Virtio PCI makes no assumptions about the environment | ||
40 | in which it operates, beyond supporting PCI. In fact the virtio | ||
41 | devices specified in the appendices do not require PCI at all: | ||
42 | they have been implemented on non-PCI buses.[footnote: | ||
43 | The Linux implementation further separates the PCI virtio code | ||
44 | from the specific virtio drivers: these drivers are shared with | ||
45 | the non-PCI implementations (currently lguest and S/390). | ||
46 | ] | ||
47 | |||
48 | Extensible: Virtio PCI devices contain feature bits which are | ||
49 | acknowledged by the guest operating system during device setup. | ||
50 | This allows forwards and backwards compatibility: the device | ||
51 | offers all the features it knows about, and the driver | ||
52 | acknowledges those it understands and wishes to use. | ||
53 | |||
54 | Virtqueues | ||
55 | |||
56 | The mechanism for bulk data transport on virtio PCI devices is | ||
57 | pretentiously called a virtqueue. Each device can have zero or | ||
58 | more virtqueues: for example, the network device has one for | ||
59 | transmit and one for receive. | ||
60 | |||
61 | Each virtqueue occupies two or more physically-contiguous pages | ||
62 | (defined, for the purposes of this specification, as 4096 bytes), | ||
63 | and consists of three parts: | ||
64 | |||
65 | |||
66 | +-------------------+-----------------------------------+-----------+ | ||
67 | | Descriptor Table | Available Ring (padding) | Used Ring | | ||
68 | +-------------------+-----------------------------------+-----------+ | ||
69 | |||
70 | |||
71 | When the driver wants to send buffers to the device, it puts them | ||
72 | in one or more slots in the descriptor table, and writes the | ||
73 | descriptor indices into the available ring. It then notifies the | ||
74 | device. When the device has finished with the buffers, it writes | ||
75 | the descriptors into the used ring, and sends an interrupt. | ||
76 | |||
77 | Specification | ||
78 | |||
79 | PCI Discovery | ||
80 | |||
81 | Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 | ||
82 | through 0x103F inclusive is a virtio device[footnote: | ||
83 | The actual value within this range is ignored | ||
84 | ]. The device must also have a Revision ID of 0 to match this | ||
85 | specification. | ||
86 | |||
87 | The Subsystem Device ID indicates which virtio device is | ||
88 | supported by the device. The Subsystem Vendor ID should reflect | ||
89 | the PCI Vendor ID of the environment (it's currently only used | ||
90 | for informational purposes by the guest). | ||
91 | |||
92 | |||
93 | +----------------------+--------------------+---------------+ | ||
94 | | Subsystem Device ID | Virtio Device | Specification | | ||
95 | +----------------------+--------------------+---------------+ | ||
96 | +----------------------+--------------------+---------------+ | ||
97 | | 1 | network card | Appendix C | | ||
98 | +----------------------+--------------------+---------------+ | ||
99 | | 2 | block device | Appendix D | | ||
100 | +----------------------+--------------------+---------------+ | ||
101 | | 3 | console | Appendix E | | ||
102 | +----------------------+--------------------+---------------+ | ||
103 | | 4 | entropy source | Appendix F | | ||
104 | +----------------------+--------------------+---------------+ | ||
105 | | 5 | memory ballooning | Appendix G | | ||
106 | +----------------------+--------------------+---------------+ | ||
107 | | 6 | ioMemory | - | | ||
108 | +----------------------+--------------------+---------------+ | ||
109 | | 9 | 9P transport | - | | ||
110 | +----------------------+--------------------+---------------+ | ||
111 | |||
112 | |||
113 | Device Configuration | ||
114 | |||
115 | To configure the device, we use the first I/O region of the PCI | ||
116 | device. This contains a virtio header followed by a | ||
117 | device-specific region. | ||
118 | |||
119 | There may be different widths of accesses to the I/O region; the “ | ||
120 | natural” access method for each field in the virtio header must | ||
121 | be used (i.e. 32-bit accesses for 32-bit fields, etc), but the | ||
122 | device-specific region can be accessed using any width accesses, | ||
123 | and should obtain the same results. | ||
124 | |||
125 | Note that this is possible because while the virtio header is PCI | ||
126 | (i.e. little) endian, the device-specific region is encoded in | ||
127 | the native endian of the guest (where such distinction is | ||
128 | applicable). | ||
129 | |||
130 | Device Initialization Sequence | ||
131 | |||
132 | We start with an overview of device initialization, then expand | ||
133 | on the details of the device and how each step is preformed. | ||
134 | |||
135 | Reset the device. This is not required on initial start up. | ||
136 | |||
137 | The ACKNOWLEDGE status bit is set: we have noticed the device. | ||
138 | |||
139 | The DRIVER status bit is set: we know how to drive the device. | ||
140 | |||
141 | Device-specific setup, including reading the Device Feature | ||
142 | Bits, discovery of virtqueues for the device, optional MSI-X | ||
143 | setup, and reading and possibly writing the virtio | ||
144 | configuration space. | ||
145 | |||
146 | The subset of Device Feature Bits understood by the driver is | ||
147 | written to the device. | ||
148 | |||
149 | The DRIVER_OK status bit is set. | ||
150 | |||
151 | The device can now be used (ie. buffers added to the | ||
152 | virtqueues)[footnote: | ||
153 | Historically, drivers have used the device before steps 5 and 6. | ||
154 | This is only allowed if the driver does not use any features | ||
155 | which would alter this early use of the device. | ||
156 | ] | ||
157 | |||
158 | If any of these steps go irrecoverably wrong, the guest should | ||
159 | set the FAILED status bit to indicate that it has given up on the | ||
160 | device (it can reset the device later to restart if desired). | ||
161 | |||
162 | We now cover the fields required for general setup in detail. | ||
163 | |||
164 | Virtio Header | ||
165 | |||
166 | The virtio header looks as follows: | ||
167 | |||
168 | |||
169 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
170 | | Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 | | ||
171 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
172 | | Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R | | ||
173 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
174 | | Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR | | ||
175 | | || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status | | ||
176 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
177 | |||
178 | |||
179 | If MSI-X is enabled for the device, two additional fields | ||
180 | immediately follow this header: | ||
181 | |||
182 | |||
183 | +------------++----------------+--------+ | ||
184 | | Bits || 16 | 16 | | ||
185 | +----------------+--------+ | ||
186 | +------------++----------------+--------+ | ||
187 | | Read/Write || R+W | R+W | | ||
188 | +------------++----------------+--------+ | ||
189 | | Purpose || Configuration | Queue | | ||
190 | | (MSI-X) || Vector | Vector | | ||
191 | +------------++----------------+--------+ | ||
192 | |||
193 | |||
194 | Finally, if feature bits (VIRTIO_F_FEATURES_HI) this is | ||
195 | immediately followed by two additional fields: | ||
196 | |||
197 | |||
198 | +------------++----------------------+---------------------- | ||
199 | | Bits || 32 | 32 | ||
200 | +------------++----------------------+---------------------- | ||
201 | | Read/Write || R | R+W | ||
202 | +------------++----------------------+---------------------- | ||
203 | | Purpose || Device | Guest | ||
204 | | || Features bits 32:63 | Features bits 32:63 | ||
205 | +------------++----------------------+---------------------- | ||
206 | |||
207 | |||
208 | Immediately following these general headers, there may be | ||
209 | device-specific headers: | ||
210 | |||
211 | |||
212 | +------------++--------------------+ | ||
213 | | Bits || Device Specific | | ||
214 | +--------------------+ | ||
215 | +------------++--------------------+ | ||
216 | | Read/Write || Device Specific | | ||
217 | +------------++--------------------+ | ||
218 | | Purpose || Device Specific... | | ||
219 | | || | | ||
220 | +------------++--------------------+ | ||
221 | |||
222 | |||
223 | Device Status | ||
224 | |||
225 | The Device Status field is updated by the guest to indicate its | ||
226 | progress. This provides a simple low-level diagnostic: it's most | ||
227 | useful to imagine them hooked up to traffic lights on the console | ||
228 | indicating the status of each device. | ||
229 | |||
230 | The device can be reset by writing a 0 to this field, otherwise | ||
231 | at least one bit should be set: | ||
232 | |||
233 | ACKNOWLEDGE (1) Indicates that the guest OS has found the | ||
234 | device and recognized it as a valid virtio device. | ||
235 | |||
236 | DRIVER (2) Indicates that the guest OS knows how to drive the | ||
237 | device. Under Linux, drivers can be loadable modules so there | ||
238 | may be a significant (or infinite) delay before setting this | ||
239 | bit. | ||
240 | |||
241 | DRIVER_OK (3) Indicates that the driver is set up and ready to | ||
242 | drive the device. | ||
243 | |||
244 | FAILED (8) Indicates that something went wrong in the guest, | ||
245 | and it has given up on the device. This could be an internal | ||
246 | error, or the driver didn't like the device for some reason, or | ||
247 | even a fatal error during device operation. The device must be | ||
248 | reset before attempting to re-initialize. | ||
249 | |||
250 | Feature Bits | ||
251 | |||
252 | The least significant 31 bits of the first configuration field | ||
253 | indicates the features that the device supports (the high bit is | ||
254 | reserved, and will be used to indicate the presence of future | ||
255 | feature bits elsewhere). If more than 31 feature bits are | ||
256 | supported, the device indicates so by setting feature bit 31 (see | ||
257 | [cha:Reserved-Feature-Bits]). The bits are allocated as follows: | ||
258 | |||
259 | 0 to 23 Feature bits for the specific device type | ||
260 | |||
261 | 24 to 40 Feature bits reserved for extensions to the queue and | ||
262 | feature negotiation mechanisms | ||
263 | |||
264 | 41 to 63 Feature bits reserved for future extensions | ||
265 | |||
266 | For example, feature bit 0 for a network device (i.e. Subsystem | ||
267 | Device ID 1) indicates that the device supports checksumming of | ||
268 | packets. | ||
269 | |||
270 | The feature bits are negotiated: the device lists all the | ||
271 | features it understands in the Device Features field, and the | ||
272 | guest writes the subset that it understands into the Guest | ||
273 | Features field. The only way to renegotiate is to reset the | ||
274 | device. | ||
275 | |||
276 | In particular, new fields in the device configuration header are | ||
277 | indicated by offering a feature bit, so the guest can check | ||
278 | before accessing that part of the configuration space. | ||
279 | |||
280 | This allows for forwards and backwards compatibility: if the | ||
281 | device is enhanced with a new feature bit, older guests will not | ||
282 | write that feature bit back to the Guest Features field and it | ||
283 | can go into backwards compatibility mode. Similarly, if a guest | ||
284 | is enhanced with a feature that the device doesn't support, it | ||
285 | will not see that feature bit in the Device Features field and | ||
286 | can go into backwards compatibility mode (or, for poor | ||
287 | implementations, set the FAILED Device Status bit). | ||
288 | |||
289 | Access to feature bits 32 to 63 is enabled by Guest by setting | ||
290 | feature bit 31. If this bit is unset, Device must assume that all | ||
291 | feature bits > 31 are unset. | ||
292 | |||
293 | Configuration/Queue Vectors | ||
294 | |||
295 | When MSI-X capability is present and enabled in the device | ||
296 | (through standard PCI configuration space) 4 bytes at byte offset | ||
297 | 20 are used to map configuration change and queue interrupts to | ||
298 | MSI-X vectors. In this case, the ISR Status field is unused, and | ||
299 | device specific configuration starts at byte offset 24 in virtio | ||
300 | header structure. When MSI-X capability is not enabled, device | ||
301 | specific configuration starts at byte offset 20 in virtio header. | ||
302 | |||
303 | Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of | ||
304 | Configuration/Queue Vector registers, maps interrupts triggered | ||
305 | by the configuration change/selected queue events respectively to | ||
306 | the corresponding MSI-X vector. To disable interrupts for a | ||
307 | specific event type, unmap it by writing a special NO_VECTOR | ||
308 | value: | ||
309 | |||
310 | /* Vector value used to disable MSI for queue */ | ||
311 | |||
312 | #define VIRTIO_MSI_NO_VECTOR 0xffff | ||
313 | |||
314 | Reading these registers returns vector mapped to a given event, | ||
315 | or NO_VECTOR if unmapped. All queue and configuration change | ||
316 | events are unmapped by default. | ||
317 | |||
318 | Note that mapping an event to vector might require allocating | ||
319 | internal device resources, and might fail. Devices report such | ||
320 | failures by returning the NO_VECTOR value when the relevant | ||
321 | Vector field is read. After mapping an event to vector, the | ||
322 | driver must verify success by reading the Vector field value: on | ||
323 | success, the previously written value is returned, and on | ||
324 | failure, NO_VECTOR is returned. If a mapping failure is detected, | ||
325 | the driver can retry mapping with fewervectors, or disable MSI-X. | ||
326 | |||
327 | Virtqueue Configuration | ||
328 | |||
329 | As a device can have zero or more virtqueues for bulk data | ||
330 | transport (for example, the network driver has two), the driver | ||
331 | needs to configure them as part of the device-specific | ||
332 | configuration. | ||
333 | |||
334 | This is done as follows, for each virtqueue a device has: | ||
335 | |||
336 | Write the virtqueue index (first queue is 0) to the Queue | ||
337 | Select field. | ||
338 | |||
339 | Read the virtqueue size from the Queue Size field, which is | ||
340 | always a power of 2. This controls how big the virtqueue is | ||
341 | (see below). If this field is 0, the virtqueue does not exist. | ||
342 | |||
343 | Allocate and zero virtqueue in contiguous physical memory, on a | ||
344 | 4096 byte alignment. Write the physical address, divided by | ||
345 | 4096 to the Queue Address field.[footnote: | ||
346 | The 4096 is based on the x86 page size, but it's also large | ||
347 | enough to ensure that the separate parts of the virtqueue are on | ||
348 | separate cache lines. | ||
349 | ] | ||
350 | |||
351 | Optionally, if MSI-X capability is present and enabled on the | ||
352 | device, select a vector to use to request interrupts triggered | ||
353 | by virtqueue events. Write the MSI-X Table entry number | ||
354 | corresponding to this vector in Queue Vector field. Read the | ||
355 | Queue Vector field: on success, previously written value is | ||
356 | returned; on failure, NO_VECTOR value is returned. | ||
357 | |||
358 | The Queue Size field controls the total number of bytes required | ||
359 | for the virtqueue according to the following formula: | ||
360 | |||
361 | #define ALIGN(x) (((x) + 4095) & ~4095) | ||
362 | |||
363 | static inline unsigned vring_size(unsigned int qsz) | ||
364 | |||
365 | { | ||
366 | |||
367 | return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2 | ||
368 | + qsz)) | ||
369 | |||
370 | + ALIGN(sizeof(struct vring_used_elem)*qsz); | ||
371 | |||
372 | } | ||
373 | |||
374 | This currently wastes some space with padding, but also allows | ||
375 | future extensions. The virtqueue layout structure looks like this | ||
376 | (qsz is the Queue Size field, which is a variable, so this code | ||
377 | won't compile): | ||
378 | |||
379 | struct vring { | ||
380 | |||
381 | /* The actual descriptors (16 bytes each) */ | ||
382 | |||
383 | struct vring_desc desc[qsz]; | ||
384 | |||
385 | |||
386 | |||
387 | /* A ring of available descriptor heads with free-running | ||
388 | index. */ | ||
389 | |||
390 | struct vring_avail avail; | ||
391 | |||
392 | |||
393 | |||
394 | // Padding to the next 4096 boundary. | ||
395 | |||
396 | char pad[]; | ||
397 | |||
398 | |||
399 | |||
400 | // A ring of used descriptor heads with free-running index. | ||
401 | |||
402 | struct vring_used used; | ||
403 | |||
404 | }; | ||
405 | |||
406 | A Note on Virtqueue Endianness | ||
407 | |||
408 | Note that the endian of these fields and everything else in the | ||
409 | virtqueue is the native endian of the guest, not little-endian as | ||
410 | PCI normally is. This makes for simpler guest code, and it is | ||
411 | assumed that the host already has to be deeply aware of the guest | ||
412 | endian so such an “endian-aware” device is not a significant | ||
413 | issue. | ||
414 | |||
415 | Descriptor Table | ||
416 | |||
417 | The descriptor table refers to the buffers the guest is using for | ||
418 | the device. The addresses are physical addresses, and the buffers | ||
419 | can be chained via the next field. Each descriptor describes a | ||
420 | buffer which is read-only or write-only, but a chain of | ||
421 | descriptors can contain both read-only and write-only buffers. | ||
422 | |||
423 | No descriptor chain may be more than 2^32 bytes long in total.struct vring_desc { | ||
424 | |||
425 | /* Address (guest-physical). */ | ||
426 | |||
427 | u64 addr; | ||
428 | |||
429 | /* Length. */ | ||
430 | |||
431 | u32 len; | ||
432 | |||
433 | /* This marks a buffer as continuing via the next field. */ | ||
434 | |||
435 | #define VRING_DESC_F_NEXT 1 | ||
436 | |||
437 | /* This marks a buffer as write-only (otherwise read-only). */ | ||
438 | |||
439 | #define VRING_DESC_F_WRITE 2 | ||
440 | |||
441 | /* This means the buffer contains a list of buffer descriptors. | ||
442 | */ | ||
443 | |||
444 | #define VRING_DESC_F_INDIRECT 4 | ||
445 | |||
446 | /* The flags as indicated above. */ | ||
447 | |||
448 | u16 flags; | ||
449 | |||
450 | /* Next field if flags & NEXT */ | ||
451 | |||
452 | u16 next; | ||
453 | |||
454 | }; | ||
455 | |||
456 | The number of descriptors in the table is specified by the Queue | ||
457 | Size field for this virtqueue. | ||
458 | |||
459 | <sub:Indirect-Descriptors>Indirect Descriptors | ||
460 | |||
461 | Some devices benefit by concurrently dispatching a large number | ||
462 | of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be | ||
463 | used to allow this (see [cha:Reserved-Feature-Bits]). To increase | ||
464 | ring capacity it is possible to store a table of indirect | ||
465 | descriptors anywhere in memory, and insert a descriptor in main | ||
466 | virtqueue (with flags&INDIRECT on) that refers to memory buffer | ||
467 | containing this indirect descriptor table; fields addr and len | ||
468 | refer to the indirect table address and length in bytes, | ||
469 | respectively. The indirect table layout structure looks like this | ||
470 | (len is the length of the descriptor that refers to this table, | ||
471 | which is a variable, so this code won't compile): | ||
472 | |||
473 | struct indirect_descriptor_table { | ||
474 | |||
475 | /* The actual descriptors (16 bytes each) */ | ||
476 | |||
477 | struct vring_desc desc[len / 16]; | ||
478 | |||
479 | }; | ||
480 | |||
481 | The first indirect descriptor is located at start of the indirect | ||
482 | descriptor table (index 0), additional indirect descriptors are | ||
483 | chained by next field. An indirect descriptor without next field | ||
484 | (with flags&NEXT off) signals the end of the indirect descriptor | ||
485 | table, and transfers control back to the main virtqueue. An | ||
486 | indirect descriptor can not refer to another indirect descriptor | ||
487 | table (flags&INDIRECT must be off). A single indirect descriptor | ||
488 | table can include both read-only and write-only descriptors; | ||
489 | write-only flag (flags&WRITE) in the descriptor that refers to it | ||
490 | is ignored. | ||
491 | |||
492 | Available Ring | ||
493 | |||
494 | The available ring refers to what descriptors we are offering the | ||
495 | device: it refers to the head of a descriptor chain. The “flags” | ||
496 | field is currently 0 or 1: 1 indicating that we do not need an | ||
497 | interrupt when the device consumes a descriptor from the | ||
498 | available ring. Alternatively, the guest can ask the device to | ||
499 | delay interrupts until an entry with an index specified by the “ | ||
500 | used_event” field is written in the used ring (equivalently, | ||
501 | until the idx field in the used ring will reach the value | ||
502 | used_event + 1). The method employed by the device is controlled | ||
503 | by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] | ||
504 | ). This interrupt suppression is merely an optimization; it may | ||
505 | not suppress interrupts entirely. | ||
506 | |||
507 | The “idx” field indicates where we would put the next descriptor | ||
508 | entry (modulo the ring size). This starts at 0, and increases. | ||
509 | |||
510 | struct vring_avail { | ||
511 | |||
512 | #define VRING_AVAIL_F_NO_INTERRUPT 1 | ||
513 | |||
514 | u16 flags; | ||
515 | |||
516 | u16 idx; | ||
517 | |||
518 | u16 ring[qsz]; /* qsz is the Queue Size field read from device | ||
519 | */ | ||
520 | |||
521 | u16 used_event; | ||
522 | |||
523 | }; | ||
524 | |||
525 | Used Ring | ||
526 | |||
527 | The used ring is where the device returns buffers once it is done | ||
528 | with them. The flags field can be used by the device to hint that | ||
529 | no notification is necessary when the guest adds to the available | ||
530 | ring. Alternatively, the “avail_event” field can be used by the | ||
531 | device to hint that no notification is necessary until an entry | ||
532 | with an index specified by the “avail_event” is written in the | ||
533 | available ring (equivalently, until the idx field in the | ||
534 | available ring will reach the value avail_event + 1). The method | ||
535 | employed by the device is controlled by the guest through the | ||
536 | VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] | ||
537 | ). [footnote: | ||
538 | These fields are kept here because this is the only part of the | ||
539 | virtqueue written by the device | ||
540 | ]. | ||
541 | |||
542 | Each entry in the ring is a pair: the head entry of the | ||
543 | descriptor chain describing the buffer (this matches an entry | ||
544 | placed in the available ring by the guest earlier), and the total | ||
545 | of bytes written into the buffer. The latter is extremely useful | ||
546 | for guests using untrusted buffers: if you do not know exactly | ||
547 | how much has been written by the device, you usually have to zero | ||
548 | the buffer to ensure no data leakage occurs. | ||
549 | |||
550 | /* u32 is used here for ids for padding reasons. */ | ||
551 | |||
552 | struct vring_used_elem { | ||
553 | |||
554 | /* Index of start of used descriptor chain. */ | ||
555 | |||
556 | u32 id; | ||
557 | |||
558 | /* Total length of the descriptor chain which was used | ||
559 | (written to) */ | ||
560 | |||
561 | u32 len; | ||
562 | |||
563 | }; | ||
564 | |||
565 | |||
566 | |||
567 | struct vring_used { | ||
568 | |||
569 | #define VRING_USED_F_NO_NOTIFY 1 | ||
570 | |||
571 | u16 flags; | ||
572 | |||
573 | u16 idx; | ||
574 | |||
575 | struct vring_used_elem ring[qsz]; | ||
576 | |||
577 | u16 avail_event; | ||
578 | |||
579 | }; | ||
580 | |||
581 | Helpers for Managing Virtqueues | ||
582 | |||
583 | The Linux Kernel Source code contains the definitions above and | ||
584 | helper routines in a more usable form, in | ||
585 | include/linux/virtio_ring.h. This was explicitly licensed by IBM | ||
586 | and Red Hat under the (3-clause) BSD license so that it can be | ||
587 | freely used by all other projects, and is reproduced (with slight | ||
588 | variation to remove Linux assumptions) in Appendix A. | ||
589 | |||
590 | Device Operation | ||
591 | |||
592 | There are two parts to device operation: supplying new buffers to | ||
593 | the device, and processing used buffers from the device. As an | ||
594 | example, the virtio network device has two virtqueues: the | ||
595 | transmit virtqueue and the receive virtqueue. The driver adds | ||
596 | outgoing (read-only) packets to the transmit virtqueue, and then | ||
597 | frees them after they are used. Similarly, incoming (write-only) | ||
598 | buffers are added to the receive virtqueue, and processed after | ||
599 | they are used. | ||
600 | |||
601 | Supplying Buffers to The Device | ||
602 | |||
603 | Actual transfer of buffers from the guest OS to the device | ||
604 | operates as follows: | ||
605 | |||
606 | Place the buffer(s) into free descriptor(s). | ||
607 | |||
608 | If there are no free descriptors, the guest may choose to | ||
609 | notify the device even if notifications are suppressed (to | ||
610 | reduce latency).[footnote: | ||
611 | The Linux drivers do this only for read-only buffers: for | ||
612 | write-only buffers, it is assumed that the driver is merely | ||
613 | trying to keep the receive buffer ring full, and no notification | ||
614 | of this expected condition is necessary. | ||
615 | ] | ||
616 | |||
617 | Place the id of the buffer in the next ring entry of the | ||
618 | available ring. | ||
619 | |||
620 | The steps (1) and (2) may be performed repeatedly if batching | ||
621 | is possible. | ||
622 | |||
623 | A memory barrier should be executed to ensure the device sees | ||
624 | the updated descriptor table and available ring before the next | ||
625 | step. | ||
626 | |||
627 | The available “idx” field should be increased by the number of | ||
628 | entries added to the available ring. | ||
629 | |||
630 | A memory barrier should be executed to ensure that we update | ||
631 | the idx field before checking for notification suppression. | ||
632 | |||
633 | If notifications are not suppressed, the device should be | ||
634 | notified of the new buffers. | ||
635 | |||
636 | Note that the above code does not take precautions against the | ||
637 | available ring buffer wrapping around: this is not possible since | ||
638 | the ring buffer is the same size as the descriptor table, so step | ||
639 | (1) will prevent such a condition. | ||
640 | |||
641 | In addition, the maximum queue size is 32768 (it must be a power | ||
642 | of 2 which fits in 16 bits), so the 16-bit “idx” value can always | ||
643 | distinguish between a full and empty buffer. | ||
644 | |||
645 | Here is a description of each stage in more detail. | ||
646 | |||
647 | Placing Buffers Into The Descriptor Table | ||
648 | |||
649 | A buffer consists of zero or more read-only physically-contiguous | ||
650 | elements followed by zero or more physically-contiguous | ||
651 | write-only elements (it must have at least one element). This | ||
652 | algorithm maps it into the descriptor table: | ||
653 | |||
654 | for each buffer element, b: | ||
655 | |||
656 | Get the next free descriptor table entry, d | ||
657 | |||
658 | Set d.addr to the physical address of the start of b | ||
659 | |||
660 | Set d.len to the length of b. | ||
661 | |||
662 | If b is write-only, set d.flags to VRING_DESC_F_WRITE, | ||
663 | otherwise 0. | ||
664 | |||
665 | If there is a buffer element after this: | ||
666 | |||
667 | Set d.next to the index of the next free descriptor element. | ||
668 | |||
669 | Set the VRING_DESC_F_NEXT bit in d.flags. | ||
670 | |||
671 | In practice, the d.next fields are usually used to chain free | ||
672 | descriptors, and a separate count kept to check there are enough | ||
673 | free descriptors before beginning the mappings. | ||
674 | |||
675 | Updating The Available Ring | ||
676 | |||
677 | The head of the buffer we mapped is the first d in the algorithm | ||
678 | above. A naive implementation would do the following: | ||
679 | |||
680 | avail->ring[avail->idx % qsz] = head; | ||
681 | |||
682 | However, in general we can add many descriptors before we update | ||
683 | the “idx” field (at which point they become visible to the | ||
684 | device), so we keep a counter of how many we've added: | ||
685 | |||
686 | avail->ring[(avail->idx + added++) % qsz] = head; | ||
687 | |||
688 | Updating The Index Field | ||
689 | |||
690 | Once the idx field of the virtqueue is updated, the device will | ||
691 | be able to access the descriptor entries we've created and the | ||
692 | memory they refer to. This is why a memory barrier is generally | ||
693 | used before the idx update, to ensure it sees the most up-to-date | ||
694 | copy. | ||
695 | |||
696 | The idx field always increments, and we let it wrap naturally at | ||
697 | 65536: | ||
698 | |||
699 | avail->idx += added; | ||
700 | |||
701 | <sub:Notifying-The-Device>Notifying The Device | ||
702 | |||
703 | Device notification occurs by writing the 16-bit virtqueue index | ||
704 | of this virtqueue to the Queue Notify field of the virtio header | ||
705 | in the first I/O region of the PCI device. This can be expensive, | ||
706 | however, so the device can suppress such notifications if it | ||
707 | doesn't need them. We have to be careful to expose the new idx | ||
708 | value before checking the suppression flag: it's OK to notify | ||
709 | gratuitously, but not to omit a required notification. So again, | ||
710 | we use a memory barrier here before reading the flags or the | ||
711 | avail_event field. | ||
712 | |||
713 | If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if | ||
714 | the VRING_USED_F_NOTIFY flag is not set, we go ahead and write to | ||
715 | the PCI configuration space. | ||
716 | |||
717 | If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the | ||
718 | avail_event field in the available ring structure. If the | ||
719 | available index crossed_the avail_event field value since the | ||
720 | last notification, we go ahead and write to the PCI configuration | ||
721 | space. The avail_event field wraps naturally at 65536 as well: | ||
722 | |||
723 | (u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx) | ||
724 | |||
725 | <sub:Receiving-Used-Buffers>Receiving Used Buffers From The | ||
726 | Device | ||
727 | |||
728 | Once the device has used a buffer (read from or written to it, or | ||
729 | parts of both, depending on the nature of the virtqueue and the | ||
730 | device), it sends an interrupt, following an algorithm very | ||
731 | similar to the algorithm used for the driver to send the device a | ||
732 | buffer: | ||
733 | |||
734 | Write the head descriptor number to the next field in the used | ||
735 | ring. | ||
736 | |||
737 | Update the used ring idx. | ||
738 | |||
739 | Determine whether an interrupt is necessary: | ||
740 | |||
741 | If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check | ||
742 | if f the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail- | ||
743 | >flags | ||
744 | |||
745 | If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check | ||
746 | whether the used index crossed the used_event field value | ||
747 | since the last update. The used_event field wraps naturally | ||
748 | at 65536 as well:(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx) | ||
749 | |||
750 | If an interrupt is necessary: | ||
751 | |||
752 | If MSI-X capability is disabled: | ||
753 | |||
754 | Set the lower bit of the ISR Status field for the device. | ||
755 | |||
756 | Send the appropriate PCI interrupt for the device. | ||
757 | |||
758 | If MSI-X capability is enabled: | ||
759 | |||
760 | Request the appropriate MSI-X interrupt message for the | ||
761 | device, Queue Vector field sets the MSI-X Table entry | ||
762 | number. | ||
763 | |||
764 | If Queue Vector field value is NO_VECTOR, no interrupt | ||
765 | message is requested for this event. | ||
766 | |||
767 | The guest interrupt handler should: | ||
768 | |||
769 | If MSI-X capability is disabled: read the ISR Status field, | ||
770 | which will reset it to zero. If the lower bit is zero, the | ||
771 | interrupt was not for this device. Otherwise, the guest driver | ||
772 | should look through the used rings of each virtqueue for the | ||
773 | device, to see if any progress has been made by the device | ||
774 | which requires servicing. | ||
775 | |||
776 | If MSI-X capability is enabled: look through the used rings of | ||
777 | each virtqueue mapped to the specific MSI-X vector for the | ||
778 | device, to see if any progress has been made by the device | ||
779 | which requires servicing. | ||
780 | |||
781 | For each ring, guest should then disable interrupts by writing | ||
782 | VRING_AVAIL_F_NO_INTERRUPT flag in avail structure, if required. | ||
783 | It can then process used ring entries finally enabling interrupts | ||
784 | by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the | ||
785 | EVENT_IDX field in the available structure, Guest should then | ||
786 | execute a memory barrier, and then recheck the ring empty | ||
787 | condition. This is necessary to handle the case where, after the | ||
788 | last check and before enabling interrupts, an interrupt has been | ||
789 | suppressed by the device: | ||
790 | |||
791 | vring_disable_interrupts(vq); | ||
792 | |||
793 | for (;;) { | ||
794 | |||
795 | if (vq->last_seen_used != vring->used.idx) { | ||
796 | |||
797 | vring_enable_interrupts(vq); | ||
798 | |||
799 | mb(); | ||
800 | |||
801 | if (vq->last_seen_used != vring->used.idx) | ||
802 | |||
803 | break; | ||
804 | |||
805 | } | ||
806 | |||
807 | struct vring_used_elem *e = | ||
808 | vring.used->ring[vq->last_seen_used%vsz]; | ||
809 | |||
810 | process_buffer(e); | ||
811 | |||
812 | vq->last_seen_used++; | ||
813 | |||
814 | } | ||
815 | |||
816 | Dealing With Configuration Changes | ||
817 | |||
818 | Some virtio PCI devices can change the device configuration | ||
819 | state, as reflected in the virtio header in the PCI configuration | ||
820 | space. In this case: | ||
821 | |||
822 | If MSI-X capability is disabled: an interrupt is delivered and | ||
823 | the second highest bit is set in the ISR Status field to | ||
824 | indicate that the driver should re-examine the configuration | ||
825 | space.Note that a single interrupt can indicate both that one | ||
826 | or more virtqueue has been used and that the configuration | ||
827 | space has changed: even if the config bit is set, virtqueues | ||
828 | must be scanned. | ||
829 | |||
830 | If MSI-X capability is enabled: an interrupt message is | ||
831 | requested. The Configuration Vector field sets the MSI-X Table | ||
832 | entry number to use. If Configuration Vector field value is | ||
833 | NO_VECTOR, no interrupt message is requested for this event. | ||
834 | |||
835 | Creating New Device Types | ||
836 | |||
837 | Various considerations are necessary when creating a new device | ||
838 | type: | ||
839 | |||
840 | How Many Virtqueues? | ||
841 | |||
842 | It is possible that a very simple device will operate entirely | ||
843 | through its configuration space, but most will need at least one | ||
844 | virtqueue in which it will place requests. A device with both | ||
845 | input and output (eg. console and network devices described here) | ||
846 | need two queues: one which the driver fills with buffers to | ||
847 | receive input, and one which the driver places buffers to | ||
848 | transmit output. | ||
849 | |||
850 | What Configuration Space Layout? | ||
851 | |||
852 | Configuration space is generally used for rarely-changing or | ||
853 | initialization-time parameters. But it is a limited resource, so | ||
854 | it might be better to use a virtqueue to update configuration | ||
855 | information (the network device does this for filtering, | ||
856 | otherwise the table in the config space could potentially be very | ||
857 | large). | ||
858 | |||
859 | Note that this space is generally the guest's native endian, | ||
860 | rather than PCI's little-endian. | ||
861 | |||
862 | What Device Number? | ||
863 | |||
864 | Currently device numbers are assigned quite freely: a simple | ||
865 | request mail to the author of this document or the Linux | ||
866 | virtualization mailing list[footnote: | ||
867 | |||
868 | https://lists.linux-foundation.org/mailman/listinfo/virtualization | ||
869 | ] will be sufficient to secure a unique one. | ||
870 | |||
871 | Meanwhile for experimental drivers, use 65535 and work backwards. | ||
872 | |||
873 | How many MSI-X vectors? | ||
874 | |||
875 | Using the optional MSI-X capability devices can speed up | ||
876 | interrupt processing by removing the need to read ISR Status | ||
877 | register by guest driver (which might be an expensive operation), | ||
878 | reducing interrupt sharing between devices and queues within the | ||
879 | device, and handling interrupts from multiple CPUs. However, some | ||
880 | systems impose a limit (which might be as low as 256) on the | ||
881 | total number of MSI-X vectors that can be allocated to all | ||
882 | devices. Devices and/or device drivers should take this into | ||
883 | account, limiting the number of vectors used unless the device is | ||
884 | expected to cause a high volume of interrupts. Devices can | ||
885 | control the number of vectors used by limiting the MSI-X Table | ||
886 | Size or not presenting MSI-X capability in PCI configuration | ||
887 | space. Drivers can control this by mapping events to as small | ||
888 | number of vectors as possible, or disabling MSI-X capability | ||
889 | altogether. | ||
890 | |||
891 | Message Framing | ||
892 | |||
893 | The descriptors used for a buffer should not effect the semantics | ||
894 | of the message, except for the total length of the buffer. For | ||
895 | example, a network buffer consists of a 10 byte header followed | ||
896 | by the network packet. Whether this is presented in the ring | ||
897 | descriptor chain as (say) a 10 byte buffer and a 1514 byte | ||
898 | buffer, or a single 1524 byte buffer, or even three buffers, | ||
899 | should have no effect. | ||
900 | |||
901 | In particular, no implementation should use the descriptor | ||
902 | boundaries to determine the size of any header in a request.[footnote: | ||
903 | The current qemu device implementations mistakenly insist that | ||
904 | the first descriptor cover the header in these cases exactly, so | ||
905 | a cautious driver should arrange it so. | ||
906 | ] | ||
907 | |||
908 | Device Improvements | ||
909 | |||
910 | Any change to configuration space, or new virtqueues, or | ||
911 | behavioural changes, should be indicated by negotiation of a new | ||
912 | feature bit. This establishes clarity[footnote: | ||
913 | Even if it does mean documenting design or implementation | ||
914 | mistakes! | ||
915 | ] and avoids future expansion problems. | ||
916 | |||
917 | Clusters of functionality which are always implemented together | ||
918 | can use a single bit, but if one feature makes sense without the | ||
919 | others they should not be gratuitously grouped together to | ||
920 | conserve feature bits. We can always extend the spec when the | ||
921 | first person needs more than 24 feature bits for their device. | ||
922 | |||
923 | [LaTeX Command: printnomenclature] | ||
924 | |||
925 | Appendix A: virtio_ring.h | ||
926 | |||
927 | #ifndef VIRTIO_RING_H | ||
928 | |||
929 | #define VIRTIO_RING_H | ||
930 | |||
931 | /* An interface for efficient virtio implementation. | ||
932 | |||
933 | * | ||
934 | |||
935 | * This header is BSD licensed so anyone can use the definitions | ||
936 | |||
937 | * to implement compatible drivers/servers. | ||
938 | |||
939 | * | ||
940 | |||
941 | * Copyright 2007, 2009, IBM Corporation | ||
942 | |||
943 | * Copyright 2011, Red Hat, Inc | ||
944 | |||
945 | * All rights reserved. | ||
946 | |||
947 | * | ||
948 | |||
949 | * Redistribution and use in source and binary forms, with or | ||
950 | without | ||
951 | |||
952 | * modification, are permitted provided that the following | ||
953 | conditions | ||
954 | |||
955 | * are met: | ||
956 | |||
957 | * 1. Redistributions of source code must retain the above | ||
958 | copyright | ||
959 | |||
960 | * notice, this list of conditions and the following | ||
961 | disclaimer. | ||
962 | |||
963 | * 2. Redistributions in binary form must reproduce the above | ||
964 | copyright | ||
965 | |||
966 | * notice, this list of conditions and the following | ||
967 | disclaimer in the | ||
968 | |||
969 | * documentation and/or other materials provided with the | ||
970 | distribution. | ||
971 | |||
972 | * 3. Neither the name of IBM nor the names of its contributors | ||
973 | |||
974 | * may be used to endorse or promote products derived from | ||
975 | this software | ||
976 | |||
977 | * without specific prior written permission. | ||
978 | |||
979 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND | ||
980 | CONTRIBUTORS ``AS IS'' AND | ||
981 | |||
982 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | ||
983 | TO, THE | ||
984 | |||
985 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | ||
986 | PARTICULAR PURPOSE | ||
987 | |||
988 | * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE | ||
989 | LIABLE | ||
990 | |||
991 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
992 | CONSEQUENTIAL | ||
993 | |||
994 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
995 | SUBSTITUTE GOODS | ||
996 | |||
997 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
998 | INTERRUPTION) | ||
999 | |||
1000 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
1001 | CONTRACT, STRICT | ||
1002 | |||
1003 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING | ||
1004 | IN ANY WAY | ||
1005 | |||
1006 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
1007 | POSSIBILITY OF | ||
1008 | |||
1009 | * SUCH DAMAGE. | ||
1010 | |||
1011 | */ | ||
1012 | |||
1013 | |||
1014 | |||
1015 | /* This marks a buffer as continuing via the next field. */ | ||
1016 | |||
1017 | #define VRING_DESC_F_NEXT 1 | ||
1018 | |||
1019 | /* This marks a buffer as write-only (otherwise read-only). */ | ||
1020 | |||
1021 | #define VRING_DESC_F_WRITE 2 | ||
1022 | |||
1023 | |||
1024 | |||
1025 | /* The Host uses this in used->flags to advise the Guest: don't | ||
1026 | kick me | ||
1027 | |||
1028 | * when you add a buffer. It's unreliable, so it's simply an | ||
1029 | |||
1030 | * optimization. Guest will still kick if it's out of buffers. | ||
1031 | */ | ||
1032 | |||
1033 | #define VRING_USED_F_NO_NOTIFY 1 | ||
1034 | |||
1035 | /* The Guest uses this in avail->flags to advise the Host: don't | ||
1036 | |||
1037 | * interrupt me when you consume a buffer. It's unreliable, so | ||
1038 | it's | ||
1039 | |||
1040 | * simply an optimization. */ | ||
1041 | |||
1042 | #define VRING_AVAIL_F_NO_INTERRUPT 1 | ||
1043 | |||
1044 | |||
1045 | |||
1046 | /* Virtio ring descriptors: 16 bytes. | ||
1047 | |||
1048 | * These can chain together via "next". */ | ||
1049 | |||
1050 | struct vring_desc { | ||
1051 | |||
1052 | /* Address (guest-physical). */ | ||
1053 | |||
1054 | uint64_t addr; | ||
1055 | |||
1056 | /* Length. */ | ||
1057 | |||
1058 | uint32_t len; | ||
1059 | |||
1060 | /* The flags as indicated above. */ | ||
1061 | |||
1062 | uint16_t flags; | ||
1063 | |||
1064 | /* We chain unused descriptors via this, too */ | ||
1065 | |||
1066 | uint16_t next; | ||
1067 | |||
1068 | }; | ||
1069 | |||
1070 | |||
1071 | |||
1072 | struct vring_avail { | ||
1073 | |||
1074 | uint16_t flags; | ||
1075 | |||
1076 | uint16_t idx; | ||
1077 | |||
1078 | uint16_t ring[]; | ||
1079 | |||
1080 | uint16_t used_event; | ||
1081 | |||
1082 | }; | ||
1083 | |||
1084 | |||
1085 | |||
1086 | /* u32 is used here for ids for padding reasons. */ | ||
1087 | |||
1088 | struct vring_used_elem { | ||
1089 | |||
1090 | /* Index of start of used descriptor chain. */ | ||
1091 | |||
1092 | uint32_t id; | ||
1093 | |||
1094 | /* Total length of the descriptor chain which was written | ||
1095 | to. */ | ||
1096 | |||
1097 | uint32_t len; | ||
1098 | |||
1099 | }; | ||
1100 | |||
1101 | |||
1102 | |||
1103 | struct vring_used { | ||
1104 | |||
1105 | uint16_t flags; | ||
1106 | |||
1107 | uint16_t idx; | ||
1108 | |||
1109 | struct vring_used_elem ring[]; | ||
1110 | |||
1111 | uint16_t avail_event; | ||
1112 | |||
1113 | }; | ||
1114 | |||
1115 | |||
1116 | |||
1117 | struct vring { | ||
1118 | |||
1119 | unsigned int num; | ||
1120 | |||
1121 | |||
1122 | |||
1123 | struct vring_desc *desc; | ||
1124 | |||
1125 | struct vring_avail *avail; | ||
1126 | |||
1127 | struct vring_used *used; | ||
1128 | |||
1129 | }; | ||
1130 | |||
1131 | |||
1132 | |||
1133 | /* The standard layout for the ring is a continuous chunk of | ||
1134 | memory which | ||
1135 | |||
1136 | * looks like this. We assume num is a power of 2. | ||
1137 | |||
1138 | * | ||
1139 | |||
1140 | * struct vring { | ||
1141 | |||
1142 | * // The actual descriptors (16 bytes each) | ||
1143 | |||
1144 | * struct vring_desc desc[num]; | ||
1145 | |||
1146 | * | ||
1147 | |||
1148 | * // A ring of available descriptor heads with free-running | ||
1149 | index. | ||
1150 | |||
1151 | * __u16 avail_flags; | ||
1152 | |||
1153 | * __u16 avail_idx; | ||
1154 | |||
1155 | * __u16 available[num]; | ||
1156 | |||
1157 | * | ||
1158 | |||
1159 | * // Padding to the next align boundary. | ||
1160 | |||
1161 | * char pad[]; | ||
1162 | |||
1163 | * | ||
1164 | |||
1165 | * // A ring of used descriptor heads with free-running | ||
1166 | index. | ||
1167 | |||
1168 | * __u16 used_flags; | ||
1169 | |||
1170 | * __u16 EVENT_IDX; | ||
1171 | |||
1172 | * struct vring_used_elem used[num]; | ||
1173 | |||
1174 | * }; | ||
1175 | |||
1176 | * Note: for virtio PCI, align is 4096. | ||
1177 | |||
1178 | */ | ||
1179 | |||
1180 | static inline void vring_init(struct vring *vr, unsigned int num, | ||
1181 | void *p, | ||
1182 | |||
1183 | unsigned long align) | ||
1184 | |||
1185 | { | ||
1186 | |||
1187 | vr->num = num; | ||
1188 | |||
1189 | vr->desc = p; | ||
1190 | |||
1191 | vr->avail = p + num*sizeof(struct vring_desc); | ||
1192 | |||
1193 | vr->used = (void *)(((unsigned long)&vr->avail->ring[num] | ||
1194 | |||
1195 | + align-1) | ||
1196 | |||
1197 | & ~(align - 1)); | ||
1198 | |||
1199 | } | ||
1200 | |||
1201 | |||
1202 | |||
1203 | static inline unsigned vring_size(unsigned int num, unsigned long | ||
1204 | align) | ||
1205 | |||
1206 | { | ||
1207 | |||
1208 | return ((sizeof(struct vring_desc)*num + | ||
1209 | sizeof(uint16_t)*(2+num) | ||
1210 | |||
1211 | + align - 1) & ~(align - 1)) | ||
1212 | |||
1213 | + sizeof(uint16_t)*3 + sizeof(struct | ||
1214 | vring_used_elem)*num; | ||
1215 | |||
1216 | } | ||
1217 | |||
1218 | |||
1219 | |||
1220 | static inline int vring_need_event(uint16_t event_idx, uint16_t | ||
1221 | new_idx, uint16_t old_idx) | ||
1222 | |||
1223 | { | ||
1224 | |||
1225 | return (uint16_t)(new_idx - event_idx - 1) < | ||
1226 | (uint16_t)(new_idx - old_idx); | ||
1227 | |||
1228 | } | ||
1229 | |||
1230 | #endif /* VIRTIO_RING_H */ | ||
1231 | |||
1232 | <cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits | ||
1233 | |||
1234 | Currently there are five device-independent feature bits defined: | ||
1235 | |||
1236 | VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature | ||
1237 | indicates that the driver wants an interrupt if the device runs | ||
1238 | out of available descriptors on a virtqueue, even though | ||
1239 | interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT | ||
1240 | flag or the used_event field. An example of this is the | ||
1241 | networking driver: it doesn't need to know every time a packet | ||
1242 | is transmitted, but it does need to free the transmitted | ||
1243 | packets a finite time after they are transmitted. It can avoid | ||
1244 | using a timer if the device interrupts it when all the packets | ||
1245 | are transmitted. | ||
1246 | |||
1247 | VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature | ||
1248 | indicates that the driver can use descriptors with the | ||
1249 | VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors] | ||
1250 | . | ||
1251 | |||
1252 | VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event | ||
1253 | and the avail_event fields. If set, it indicates that the | ||
1254 | device should ignore the flags field in the available ring | ||
1255 | structure. Instead, the used_event field in this structure is | ||
1256 | used by guest to suppress device interrupts. Further, the | ||
1257 | driver should ignore the flags field in the used ring | ||
1258 | structure. Instead, the avail_event field in this structure is | ||
1259 | used by the device to suppress notifications. If unset, the | ||
1260 | driver should ignore the used_event field; the device should | ||
1261 | ignore the avail_event field; the flags field is used | ||
1262 | |||
1263 | VIRTIO_F_BAD_FEATURE(30) This feature should never be | ||
1264 | negotiated by the guest; doing so is an indication that the | ||
1265 | guest is faulty[footnote: | ||
1266 | An experimental virtio PCI driver contained in Linux version | ||
1267 | 2.6.25 had this problem, and this feature bit can be used to | ||
1268 | detect it. | ||
1269 | ] | ||
1270 | |||
1271 | VIRTIO_F_FEATURES_HIGH(31) This feature indicates that the | ||
1272 | device supports feature bits 32:63. If unset, feature bits | ||
1273 | 32:63 are unset. | ||
1274 | |||
1275 | Appendix C: Network Device | ||
1276 | |||
1277 | The virtio network device is a virtual ethernet card, and is the | ||
1278 | most complex of the devices supported so far by virtio. It has | ||
1279 | enhanced rapidly and demonstrates clearly how support for new | ||
1280 | features should be added to an existing device. Empty buffers are | ||
1281 | placed in one virtqueue for receiving packets, and outgoing | ||
1282 | packets are enqueued into another for transmission in that order. | ||
1283 | A third command queue is used to control advanced filtering | ||
1284 | features. | ||
1285 | |||
1286 | Configuration | ||
1287 | |||
1288 | Subsystem Device ID 1 | ||
1289 | |||
1290 | Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote: | ||
1291 | Only if VIRTIO_NET_F_CTRL_VQ set | ||
1292 | ] | ||
1293 | |||
1294 | Feature bits | ||
1295 | |||
1296 | VIRTIO_NET_F_CSUM (0) Device handles packets with partial | ||
1297 | checksum | ||
1298 | |||
1299 | VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial | ||
1300 | checksum | ||
1301 | |||
1302 | VIRTIO_NET_F_MAC (5) Device has given MAC address. | ||
1303 | |||
1304 | VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with | ||
1305 | any GSO type.[footnote: | ||
1306 | It was supposed to indicate segmentation offload support, but | ||
1307 | upon further investigation it became clear that multiple bits | ||
1308 | were required. | ||
1309 | ] | ||
1310 | |||
1311 | VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4. | ||
1312 | |||
1313 | VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6. | ||
1314 | |||
1315 | VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN. | ||
1316 | |||
1317 | VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO. | ||
1318 | |||
1319 | VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4. | ||
1320 | |||
1321 | VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6. | ||
1322 | |||
1323 | VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN. | ||
1324 | |||
1325 | VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO. | ||
1326 | |||
1327 | VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers. | ||
1328 | |||
1329 | VIRTIO_NET_F_STATUS (16) Configuration status field is | ||
1330 | available. | ||
1331 | |||
1332 | VIRTIO_NET_F_CTRL_VQ (17) Control channel is available. | ||
1333 | |||
1334 | VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support. | ||
1335 | |||
1336 | VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering. | ||
1337 | |||
1338 | Device configuration layout Two configuration fields are | ||
1339 | currently defined. The mac address field always exists (though | ||
1340 | is only valid if VIRTIO_NET_F_MAC is set), and the status field | ||
1341 | only exists if VIRTIO_NET_F_STATUS is set. Only one bit is | ||
1342 | currently defined for the status field: VIRTIO_NET_S_LINK_UP. #define VIRTIO_NET_S_LINK_UP 1 | ||
1343 | |||
1344 | |||
1345 | |||
1346 | struct virtio_net_config { | ||
1347 | |||
1348 | u8 mac[6]; | ||
1349 | |||
1350 | u16 status; | ||
1351 | |||
1352 | }; | ||
1353 | |||
1354 | Device Initialization | ||
1355 | |||
1356 | The initialization routine should identify the receive and | ||
1357 | transmission virtqueues. | ||
1358 | |||
1359 | If the VIRTIO_NET_F_MAC feature bit is set, the configuration | ||
1360 | space “mac” entry indicates the “physical” address of the the | ||
1361 | network card, otherwise a private MAC address should be | ||
1362 | assigned. All guests are expected to negotiate this feature if | ||
1363 | it is set. | ||
1364 | |||
1365 | If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify | ||
1366 | the control virtqueue. | ||
1367 | |||
1368 | If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link | ||
1369 | status can be read from the bottom bit of the “status” config | ||
1370 | field. Otherwise, the link should be assumed active. | ||
1371 | |||
1372 | The receive virtqueue should be filled with receive buffers. | ||
1373 | This is described in detail below in “Setting Up Receive | ||
1374 | Buffers”. | ||
1375 | |||
1376 | A driver can indicate that it will generate checksumless | ||
1377 | packets by negotating the VIRTIO_NET_F_CSUM feature. This “ | ||
1378 | checksum offload” is a common feature on modern network cards. | ||
1379 | |||
1380 | If that feature is negotiated, a driver can use TCP or UDP | ||
1381 | segmentation offload by negotiating the VIRTIO_NET_F_HOST_TSO4 | ||
1382 | (IPv4 TCP), VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and | ||
1383 | VIRTIO_NET_F_HOST_UFO (UDP fragmentation) features. It should | ||
1384 | not send TCP packets requiring segmentation offload which have | ||
1385 | the Explicit Congestion Notification bit set, unless the | ||
1386 | VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote: | ||
1387 | This is a common restriction in real, older network cards. | ||
1388 | ] | ||
1389 | |||
1390 | The converse features are also available: a driver can save the | ||
1391 | virtual device some work by negotiating these features.[footnote: | ||
1392 | For example, a network packet transported between two guests on | ||
1393 | the same system may not require checksumming at all, nor | ||
1394 | segmentation, if both guests are amenable. | ||
1395 | ] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially | ||
1396 | checksummed packets can be received, and if it can do that then | ||
1397 | the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, | ||
1398 | VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input | ||
1399 | equivalents of the features described above. See “Receiving | ||
1400 | Packets” below. | ||
1401 | |||
1402 | Device Operation | ||
1403 | |||
1404 | Packets are transmitted by placing them in the transmitq, and | ||
1405 | buffers for incoming packets are placed in the receiveq. In each | ||
1406 | case, the packet itself is preceeded by a header: | ||
1407 | |||
1408 | struct virtio_net_hdr { | ||
1409 | |||
1410 | #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 | ||
1411 | |||
1412 | u8 flags; | ||
1413 | |||
1414 | #define VIRTIO_NET_HDR_GSO_NONE 0 | ||
1415 | |||
1416 | #define VIRTIO_NET_HDR_GSO_TCPV4 1 | ||
1417 | |||
1418 | #define VIRTIO_NET_HDR_GSO_UDP 3 | ||
1419 | |||
1420 | #define VIRTIO_NET_HDR_GSO_TCPV6 4 | ||
1421 | |||
1422 | #define VIRTIO_NET_HDR_GSO_ECN 0x80 | ||
1423 | |||
1424 | u8 gso_type; | ||
1425 | |||
1426 | u16 hdr_len; | ||
1427 | |||
1428 | u16 gso_size; | ||
1429 | |||
1430 | u16 csum_start; | ||
1431 | |||
1432 | u16 csum_offset; | ||
1433 | |||
1434 | /* Only if VIRTIO_NET_F_MRG_RXBUF: */ | ||
1435 | |||
1436 | u16 num_buffers | ||
1437 | |||
1438 | }; | ||
1439 | |||
1440 | The controlq is used to control device features such as | ||
1441 | filtering. | ||
1442 | |||
1443 | Packet Transmission | ||
1444 | |||
1445 | Transmitting a single packet is simple, but varies depending on | ||
1446 | the different features the driver negotiated. | ||
1447 | |||
1448 | If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has | ||
1449 | not been fully checksummed, then the virtio_net_hdr's fields | ||
1450 | are set as follows. Otherwise, the packet must be fully | ||
1451 | checksummed, and flags is zero. | ||
1452 | |||
1453 | flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set, | ||
1454 | |||
1455 | <ite:csum_start-is-set>csum_start is set to the offset within | ||
1456 | the packet to begin checksumming, and | ||
1457 | |||
1458 | csum_offset indicates how many bytes after the csum_start the | ||
1459 | new (16 bit ones' complement) checksum should be placed.[footnote: | ||
1460 | For example, consider a partially checksummed TCP (IPv4) packet. | ||
1461 | It will have a 14 byte ethernet header and 20 byte IP header | ||
1462 | followed by the TCP header (with the TCP checksum field 16 bytes | ||
1463 | into that header). csum_start will be 14+20 = 34 (the TCP | ||
1464 | checksum includes the header), and csum_offset will be 16. The | ||
1465 | value in the TCP checksum field will be the sum of the TCP pseudo | ||
1466 | header, so that replacing it by the ones' complement checksum of | ||
1467 | the TCP header and body will give the correct result. | ||
1468 | ] | ||
1469 | |||
1470 | <enu:If-the-driver>If the driver negotiated | ||
1471 | VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires | ||
1472 | TCP segmentation or UDP fragmentation, then the “gso_type” | ||
1473 | field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP. | ||
1474 | (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this | ||
1475 | case, packets larger than 1514 bytes can be transmitted: the | ||
1476 | metadata indicates how to replicate the packet header to cut it | ||
1477 | into smaller packets. The other gso fields are set: | ||
1478 | |||
1479 | hdr_len is a hint to the device as to how much of the header | ||
1480 | needs to be kept to copy into each packet, usually set to the | ||
1481 | length of the headers, including the transport header.[footnote: | ||
1482 | Due to various bugs in implementations, this field is not useful | ||
1483 | as a guarantee of the transport header size. | ||
1484 | ] | ||
1485 | |||
1486 | gso_size is the size of the packet beyond that header (ie. | ||
1487 | MSS). | ||
1488 | |||
1489 | If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the | ||
1490 | VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well, | ||
1491 | indicating that the TCP packet has the ECN bit set.[footnote: | ||
1492 | This case is not handled by some older hardware, so is called out | ||
1493 | specifically in the protocol. | ||
1494 | ] | ||
1495 | |||
1496 | If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, | ||
1497 | the num_buffers field is set to zero. | ||
1498 | |||
1499 | The header and packet are added as one output buffer to the | ||
1500 | transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device] | ||
1501 | ).[footnote: | ||
1502 | Note that the header will be two bytes longer for the | ||
1503 | VIRTIO_NET_F_MRG_RXBUF case. | ||
1504 | ] | ||
1505 | |||
1506 | Packet Transmission Interrupt | ||
1507 | |||
1508 | Often a driver will suppress transmission interrupts using the | ||
1509 | VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers] | ||
1510 | ) and check for used packets in the transmit path of following | ||
1511 | packets. However, it will still receive interrupts if the | ||
1512 | VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that | ||
1513 | the transmission queue is completely emptied. | ||
1514 | |||
1515 | The normal behavior in this interrupt handler is to retrieve and | ||
1516 | new descriptors from the used ring and free the corresponding | ||
1517 | headers and packets. | ||
1518 | |||
1519 | Setting Up Receive Buffers | ||
1520 | |||
1521 | It is generally a good idea to keep the receive virtqueue as | ||
1522 | fully populated as possible: if it runs out, network performance | ||
1523 | will suffer. | ||
1524 | |||
1525 | If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or | ||
1526 | VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to | ||
1527 | accept packets of up to 65550 bytes long (the maximum size of a | ||
1528 | TCP or UDP packet, plus the 14 byte ethernet header), otherwise | ||
1529 | 1514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every | ||
1530 | buffer in the receive queue needs to be at least this length [footnote: | ||
1531 | Obviously each one can be split across multiple descriptor | ||
1532 | elements. | ||
1533 | ]. | ||
1534 | |||
1535 | If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at | ||
1536 | least the size of the struct virtio_net_hdr. | ||
1537 | |||
1538 | Packet Receive Interrupt | ||
1539 | |||
1540 | When a packet is copied into a buffer in the receiveq, the | ||
1541 | optimal path is to disable further interrupts for the receiveq | ||
1542 | (see [sub:Receiving-Used-Buffers]) and process packets until no | ||
1543 | more are found, then re-enable them. | ||
1544 | |||
1545 | Processing packet involves: | ||
1546 | |||
1547 | If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, | ||
1548 | then the “num_buffers” field indicates how many descriptors | ||
1549 | this packet is spread over (including this one). This allows | ||
1550 | receipt of large packets without having to allocate large | ||
1551 | buffers. In this case, there will be at least “num_buffers” in | ||
1552 | the used ring, and they should be chained together to form a | ||
1553 | single packet. The other buffers will not begin with a struct | ||
1554 | virtio_net_hdr. | ||
1555 | |||
1556 | If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or | ||
1557 | the “num_buffers” field is one, then the entire packet will be | ||
1558 | contained within this buffer, immediately following the struct | ||
1559 | virtio_net_hdr. | ||
1560 | |||
1561 | If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the | ||
1562 | VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be | ||
1563 | set: if so, the checksum on the packet is incomplete and the “ | ||
1564 | csum_start” and “csum_offset” fields indicate how to calculate | ||
1565 | it (see [ite:csum_start-is-set]). | ||
1566 | |||
1567 | If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were | ||
1568 | negotiated, then the “gso_type” may be something other than | ||
1569 | VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the | ||
1570 | desired MSS (see [enu:If-the-driver]).Control Virtqueue | ||
1571 | |||
1572 | The driver uses the control virtqueue (if VIRTIO_NET_F_VTRL_VQ is | ||
1573 | negotiated) to send commands to manipulate various features of | ||
1574 | the device which would not easily map into the configuration | ||
1575 | space. | ||
1576 | |||
1577 | All commands are of the following form: | ||
1578 | |||
1579 | struct virtio_net_ctrl { | ||
1580 | |||
1581 | u8 class; | ||
1582 | |||
1583 | u8 command; | ||
1584 | |||
1585 | u8 command-specific-data[]; | ||
1586 | |||
1587 | u8 ack; | ||
1588 | |||
1589 | }; | ||
1590 | |||
1591 | |||
1592 | |||
1593 | /* ack values */ | ||
1594 | |||
1595 | #define VIRTIO_NET_OK 0 | ||
1596 | |||
1597 | #define VIRTIO_NET_ERR 1 | ||
1598 | |||
1599 | The class, command and command-specific-data are set by the | ||
1600 | driver, and the device sets the ack byte. There is little it can | ||
1601 | do except issue a diagnostic if the ack byte is not | ||
1602 | VIRTIO_NET_OK. | ||
1603 | |||
1604 | Packet Receive Filtering | ||
1605 | |||
1606 | If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can | ||
1607 | send control commands for promiscuous mode, multicast receiving, | ||
1608 | and filtering of MAC addresses. | ||
1609 | |||
1610 | Note that in general, these commands are best-effort: unwanted | ||
1611 | packets may still arrive. | ||
1612 | |||
1613 | Setting Promiscuous Mode | ||
1614 | |||
1615 | #define VIRTIO_NET_CTRL_RX 0 | ||
1616 | |||
1617 | #define VIRTIO_NET_CTRL_RX_PROMISC 0 | ||
1618 | |||
1619 | #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 | ||
1620 | |||
1621 | The class VIRTIO_NET_CTRL_RX has two commands: | ||
1622 | VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and | ||
1623 | VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and | ||
1624 | off. The command-specific-data is one byte containing 0 (off) or | ||
1625 | 1 (on). | ||
1626 | |||
1627 | Setting MAC Address Filtering | ||
1628 | |||
1629 | struct virtio_net_ctrl_mac { | ||
1630 | |||
1631 | u32 entries; | ||
1632 | |||
1633 | u8 macs[entries][ETH_ALEN]; | ||
1634 | |||
1635 | }; | ||
1636 | |||
1637 | |||
1638 | |||
1639 | #define VIRTIO_NET_CTRL_MAC 1 | ||
1640 | |||
1641 | #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 | ||
1642 | |||
1643 | The device can filter incoming packets by any number of | ||
1644 | destination MAC addresses.[footnote: | ||
1645 | Since there are no guarentees, it can use a hash filter | ||
1646 | orsilently switch to allmulti or promiscuous mode if it is given | ||
1647 | too many addresses. | ||
1648 | ] This table is set using the class VIRTIO_NET_CTRL_MAC and the | ||
1649 | command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data | ||
1650 | is two variable length tables of 6-byte MAC addresses. The first | ||
1651 | table contains unicast addresses, and the second contains | ||
1652 | multicast addresses. | ||
1653 | |||
1654 | VLAN Filtering | ||
1655 | |||
1656 | If the driver negotiates the VIRTION_NET_F_CTRL_VLAN feature, it | ||
1657 | can control a VLAN filter table in the device. | ||
1658 | |||
1659 | #define VIRTIO_NET_CTRL_VLAN 2 | ||
1660 | |||
1661 | #define VIRTIO_NET_CTRL_VLAN_ADD 0 | ||
1662 | |||
1663 | #define VIRTIO_NET_CTRL_VLAN_DEL 1 | ||
1664 | |||
1665 | Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL | ||
1666 | command take a 16-bit VLAN id as the command-specific-data. | ||
1667 | |||
1668 | Appendix D: Block Device | ||
1669 | |||
1670 | The virtio block device is a simple virtual block device (ie. | ||
1671 | disk). Read and write requests (and other exotic requests) are | ||
1672 | placed in the queue, and serviced (probably out of order) by the | ||
1673 | device except where noted. | ||
1674 | |||
1675 | Configuration | ||
1676 | |||
1677 | Subsystem Device ID 2 | ||
1678 | |||
1679 | Virtqueues 0:requestq. | ||
1680 | |||
1681 | Feature bits | ||
1682 | |||
1683 | VIRTIO_BLK_F_BARRIER (0) Host supports request barriers. | ||
1684 | |||
1685 | VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is | ||
1686 | in “size_max”. | ||
1687 | |||
1688 | VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a | ||
1689 | request is in “seg_max”. | ||
1690 | |||
1691 | VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “ | ||
1692 | geometry”. | ||
1693 | |||
1694 | VIRTIO_BLK_F_RO (5) Device is read-only. | ||
1695 | |||
1696 | VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”. | ||
1697 | |||
1698 | VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands. | ||
1699 | |||
1700 | VIRTIO_BLK_F_FLUSH (9) Cache flush command support. | ||
1701 | |||
1702 | |||
1703 | |||
1704 | Device configuration layout The capacity of the device | ||
1705 | (expressed in 512-byte sectors) is always present. The | ||
1706 | availability of the others all depend on various feature bits | ||
1707 | as indicated above. struct virtio_blk_config { | ||
1708 | |||
1709 | u64 capacity; | ||
1710 | |||
1711 | u32 size_max; | ||
1712 | |||
1713 | u32 seg_max; | ||
1714 | |||
1715 | struct virtio_blk_geometry { | ||
1716 | |||
1717 | u16 cylinders; | ||
1718 | |||
1719 | u8 heads; | ||
1720 | |||
1721 | u8 sectors; | ||
1722 | |||
1723 | } geometry; | ||
1724 | |||
1725 | u32 blk_size; | ||
1726 | |||
1727 | |||
1728 | |||
1729 | }; | ||
1730 | |||
1731 | Device Initialization | ||
1732 | |||
1733 | The device size should be read from the “capacity” | ||
1734 | configuration field. No requests should be submitted which goes | ||
1735 | beyond this limit. | ||
1736 | |||
1737 | If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the | ||
1738 | blk_size field can be read to determine the optimal sector size | ||
1739 | for the driver to use. This does not effect the units used in | ||
1740 | the protocol (always 512 bytes), but awareness of the correct | ||
1741 | value can effect performance. | ||
1742 | |||
1743 | If the VIRTIO_BLK_F_RO feature is set by the device, any write | ||
1744 | requests will fail. | ||
1745 | |||
1746 | |||
1747 | |||
1748 | Device Operation | ||
1749 | |||
1750 | The driver queues requests to the virtqueue, and they are used by | ||
1751 | the device (not necessarily in order). Each request is of form: | ||
1752 | |||
1753 | struct virtio_blk_req { | ||
1754 | |||
1755 | |||
1756 | |||
1757 | u32 type; | ||
1758 | |||
1759 | u32 ioprio; | ||
1760 | |||
1761 | u64 sector; | ||
1762 | |||
1763 | char data[][512]; | ||
1764 | |||
1765 | u8 status; | ||
1766 | |||
1767 | }; | ||
1768 | |||
1769 | If the device has VIRTIO_BLK_F_SCSI feature, it can also support | ||
1770 | scsi packet command requests, each of these requests is of form:struct virtio_scsi_pc_req { | ||
1771 | |||
1772 | u32 type; | ||
1773 | |||
1774 | u32 ioprio; | ||
1775 | |||
1776 | u64 sector; | ||
1777 | |||
1778 | char cmd[]; | ||
1779 | |||
1780 | char data[][512]; | ||
1781 | |||
1782 | #define SCSI_SENSE_BUFFERSIZE 96 | ||
1783 | |||
1784 | u8 sense[SCSI_SENSE_BUFFERSIZE]; | ||
1785 | |||
1786 | u32 errors; | ||
1787 | |||
1788 | u32 data_len; | ||
1789 | |||
1790 | u32 sense_len; | ||
1791 | |||
1792 | u32 residual; | ||
1793 | |||
1794 | u8 status; | ||
1795 | |||
1796 | }; | ||
1797 | |||
1798 | The type of the request is either a read (VIRTIO_BLK_T_IN), a | ||
1799 | write (VIRTIO_BLK_T_OUT), a scsi packet command | ||
1800 | (VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote: | ||
1801 | the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device | ||
1802 | does not distinguish between them | ||
1803 | ]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote: | ||
1804 | the FLUSH and FLUSH_OUT types are equivalent, the device does not | ||
1805 | distinguish between them | ||
1806 | ]). If the device has VIRTIO_BLK_F_BARRIER feature the high bit | ||
1807 | (VIRTIO_BLK_T_BARRIER) indicates that this request acts as a | ||
1808 | barrier and that all preceeding requests must be complete before | ||
1809 | this one, and all following requests must not be started until | ||
1810 | this is complete. Note that a barrier does not flush caches in | ||
1811 | the underlying backend device in host, and thus does not serve as | ||
1812 | data consistency guarantee. Driver must use FLUSH request to | ||
1813 | flush the host cache. | ||
1814 | |||
1815 | #define VIRTIO_BLK_T_IN 0 | ||
1816 | |||
1817 | #define VIRTIO_BLK_T_OUT 1 | ||
1818 | |||
1819 | #define VIRTIO_BLK_T_SCSI_CMD 2 | ||
1820 | |||
1821 | #define VIRTIO_BLK_T_SCSI_CMD_OUT 3 | ||
1822 | |||
1823 | #define VIRTIO_BLK_T_FLUSH 4 | ||
1824 | |||
1825 | #define VIRTIO_BLK_T_FLUSH_OUT 5 | ||
1826 | |||
1827 | #define VIRTIO_BLK_T_BARRIER 0x80000000 | ||
1828 | |||
1829 | The ioprio field is a hint about the relative priorities of | ||
1830 | requests to the device: higher numbers indicate more important | ||
1831 | requests. | ||
1832 | |||
1833 | The sector number indicates the offset (multiplied by 512) where | ||
1834 | the read or write is to occur. This field is unused and set to 0 | ||
1835 | for scsi packet commands and for flush commands. | ||
1836 | |||
1837 | The cmd field is only present for scsi packet command requests, | ||
1838 | and indicates the command to perform. This field must reside in a | ||
1839 | single, separate read-only buffer; command length can be derived | ||
1840 | from the length of this buffer. | ||
1841 | |||
1842 | Note that these first three (four for scsi packet commands) | ||
1843 | fields are always read-only: the data field is either read-only | ||
1844 | or write-only, depending on the request. The size of the read or | ||
1845 | write can be derived from the total size of the request buffers. | ||
1846 | |||
1847 | The sense field is only present for scsi packet command requests, | ||
1848 | and indicates the buffer for scsi sense data. | ||
1849 | |||
1850 | The data_len field is only present for scsi packet command | ||
1851 | requests, this field is deprecated, and should be ignored by the | ||
1852 | driver. Historically, devices copied data length there. | ||
1853 | |||
1854 | The sense_len field is only present for scsi packet command | ||
1855 | requests and indicates the number of bytes actually written to | ||
1856 | the sense buffer. | ||
1857 | |||
1858 | The residual field is only present for scsi packet command | ||
1859 | requests and indicates the residual size, calculated as data | ||
1860 | length - number of bytes actually transferred. | ||
1861 | |||
1862 | The final status byte is written by the device: either | ||
1863 | VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest | ||
1864 | error or VIRTIO_BLK_S_UNSUPP for a request unsupported by host:#define VIRTIO_BLK_S_OK 0 | ||
1865 | |||
1866 | #define VIRTIO_BLK_S_IOERR 1 | ||
1867 | |||
1868 | #define VIRTIO_BLK_S_UNSUPP 2 | ||
1869 | |||
1870 | Historically, devices assumed that the fields type, ioprio and | ||
1871 | sector reside in a single, separate read-only buffer; the fields | ||
1872 | errors, data_len, sense_len and residual reside in a single, | ||
1873 | separate write-only buffer; the sense field in a separate | ||
1874 | write-only buffer of size 96 bytes, by itself; the fields errors, | ||
1875 | data_len, sense_len and residual in a single write-only buffer; | ||
1876 | and the status field is a separate read-only buffer of size 1 | ||
1877 | byte, by itself. | ||
1878 | |||
1879 | Appendix E: Console Device | ||
1880 | |||
1881 | The virtio console device is a simple device for data input and | ||
1882 | output. A device may have one or more ports. Each port has a pair | ||
1883 | of input and output virtqueues. Moreover, a device has a pair of | ||
1884 | control IO virtqueues. The control virtqueues are used to | ||
1885 | communicate information between the device and the driver about | ||
1886 | ports being opened and closed on either side of the connection, | ||
1887 | indication from the host about whether a particular port is a | ||
1888 | console port, adding new ports, port hot-plug/unplug, etc., and | ||
1889 | indication from the guest about whether a port or a device was | ||
1890 | successfully added, port open/close, etc.. For data IO, one or | ||
1891 | more empty buffers are placed in the receive queue for incoming | ||
1892 | data and outgoing characters are placed in the transmit queue. | ||
1893 | |||
1894 | Configuration | ||
1895 | |||
1896 | Subsystem Device ID 3 | ||
1897 | |||
1898 | Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control | ||
1899 | receiveq[footnote: | ||
1900 | Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set | ||
1901 | ], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1), | ||
1902 | ... | ||
1903 | |||
1904 | Feature bits | ||
1905 | |||
1906 | VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields | ||
1907 | are valid. | ||
1908 | |||
1909 | VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple | ||
1910 | ports; configuration fields nr_ports and max_nr_ports are | ||
1911 | valid and control virtqueues will be used. | ||
1912 | |||
1913 | Device configuration layout The size of the console is supplied | ||
1914 | in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature | ||
1915 | is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature | ||
1916 | is set, the maximum number of ports supported by the device can | ||
1917 | be fetched.struct virtio_console_config { | ||
1918 | |||
1919 | u16 cols; | ||
1920 | |||
1921 | u16 rows; | ||
1922 | |||
1923 | |||
1924 | |||
1925 | u32 max_nr_ports; | ||
1926 | |||
1927 | }; | ||
1928 | |||
1929 | Device Initialization | ||
1930 | |||
1931 | If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver | ||
1932 | can read the console dimensions from the configuration fields. | ||
1933 | |||
1934 | If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the | ||
1935 | driver can spawn multiple ports, not all of which may be | ||
1936 | attached to a console. Some could be generic ports. In this | ||
1937 | case, the control virtqueues are enabled and according to the | ||
1938 | max_nr_ports configuration-space value, the appropriate number | ||
1939 | of virtqueues are created. A control message indicating the | ||
1940 | driver is ready is sent to the host. The host can then send | ||
1941 | control messages for adding new ports to the device. After | ||
1942 | creating and initializing each port, a | ||
1943 | VIRTIO_CONSOLE_PORT_READY control message is sent to the host | ||
1944 | for that port so the host can let us know of any additional | ||
1945 | configuration options set for that port. | ||
1946 | |||
1947 | The receiveq for each port is populated with one or more | ||
1948 | receive buffers. | ||
1949 | |||
1950 | Device Operation | ||
1951 | |||
1952 | For output, a buffer containing the characters is placed in the | ||
1953 | port's transmitq.[footnote: | ||
1954 | Because this is high importance and low bandwidth, the current | ||
1955 | Linux implementation polls for the buffer to be used, rather than | ||
1956 | waiting for an interrupt, simplifying the implementation | ||
1957 | significantly. However, for generic serial ports with the | ||
1958 | O_NONBLOCK flag set, the polling limitation is relaxed and the | ||
1959 | consumed buffers are freed upon the next write or poll call or | ||
1960 | when a port is closed or hot-unplugged. | ||
1961 | ] | ||
1962 | |||
1963 | When a buffer is used in the receiveq (signalled by an | ||
1964 | interrupt), the contents is the input to the port associated | ||
1965 | with the virtqueue for which the notification was received. | ||
1966 | |||
1967 | If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a | ||
1968 | configuration change interrupt may occur. The updated size can | ||
1969 | be read from the configuration fields. | ||
1970 | |||
1971 | If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT | ||
1972 | feature, active ports are announced by the host using the | ||
1973 | VIRTIO_CONSOLE_PORT_ADD control message. The same message is | ||
1974 | used for port hot-plug as well. | ||
1975 | |||
1976 | If the host specified a port `name', a sysfs attribute is | ||
1977 | created with the name filled in, so that udev rules can be | ||
1978 | written that can create a symlink from the port's name to the | ||
1979 | char device for port discovery by applications in the guest. | ||
1980 | |||
1981 | Changes to ports' state are effected by control messages. | ||
1982 | Appropriate action is taken on the port indicated in the | ||
1983 | control message. The layout of the structure of the control | ||
1984 | buffer and the events associated are:struct virtio_console_control { | ||
1985 | |||
1986 | uint32_t id; /* Port number */ | ||
1987 | |||
1988 | uint16_t event; /* The kind of control event */ | ||
1989 | |||
1990 | uint16_t value; /* Extra information for the event */ | ||
1991 | |||
1992 | }; | ||
1993 | |||
1994 | |||
1995 | |||
1996 | /* Some events for the internal messages (control packets) */ | ||
1997 | |||
1998 | |||
1999 | |||
2000 | #define VIRTIO_CONSOLE_DEVICE_READY 0 | ||
2001 | |||
2002 | #define VIRTIO_CONSOLE_PORT_ADD 1 | ||
2003 | |||
2004 | #define VIRTIO_CONSOLE_PORT_REMOVE 2 | ||
2005 | |||
2006 | #define VIRTIO_CONSOLE_PORT_READY 3 | ||
2007 | |||
2008 | #define VIRTIO_CONSOLE_CONSOLE_PORT 4 | ||
2009 | |||
2010 | #define VIRTIO_CONSOLE_RESIZE 5 | ||
2011 | |||
2012 | #define VIRTIO_CONSOLE_PORT_OPEN 6 | ||
2013 | |||
2014 | #define VIRTIO_CONSOLE_PORT_NAME 7 | ||
2015 | |||
2016 | Appendix F: Entropy Device | ||
2017 | |||
2018 | The virtio entropy device supplies high-quality randomness for | ||
2019 | guest use. | ||
2020 | |||
2021 | Configuration | ||
2022 | |||
2023 | Subsystem Device ID 4 | ||
2024 | |||
2025 | Virtqueues 0:requestq. | ||
2026 | |||
2027 | Feature bits None currently defined | ||
2028 | |||
2029 | Device configuration layout None currently defined. | ||
2030 | |||
2031 | Device Initialization | ||
2032 | |||
2033 | The virtqueue is initialized | ||
2034 | |||
2035 | Device Operation | ||
2036 | |||
2037 | When the driver requires random bytes, it places the descriptor | ||
2038 | of one or more buffers in the queue. It will be completely filled | ||
2039 | by random data by the device. | ||
2040 | |||
2041 | Appendix G: Memory Balloon Device | ||
2042 | |||
2043 | The virtio memory balloon device is a primitive device for | ||
2044 | managing guest memory: the device asks for a certain amount of | ||
2045 | memory, and the guest supplies it (or withdraws it, if the device | ||
2046 | has more than it asks for). This allows the guest to adapt to | ||
2047 | changes in allowance of underlying physical memory. If the | ||
2048 | feature is negotiated, the device can also be used to communicate | ||
2049 | guest memory statistics to the host. | ||
2050 | |||
2051 | Configuration | ||
2052 | |||
2053 | Subsystem Device ID 5 | ||
2054 | |||
2055 | Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote: | ||
2056 | Only if VIRTIO_BALLON_F_STATS_VQ set | ||
2057 | ] | ||
2058 | |||
2059 | Feature bits | ||
2060 | |||
2061 | VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before | ||
2062 | pages from the balloon are used. | ||
2063 | |||
2064 | VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest | ||
2065 | memory statistics is present. | ||
2066 | |||
2067 | Device configuration layout Both fields of this configuration | ||
2068 | are always available. Note that they are little endian, despite | ||
2069 | convention that device fields are guest endian:struct virtio_balloon_config { | ||
2070 | |||
2071 | u32 num_pages; | ||
2072 | |||
2073 | u32 actual; | ||
2074 | |||
2075 | }; | ||
2076 | |||
2077 | Device Initialization | ||
2078 | |||
2079 | The inflate and deflate virtqueues are identified. | ||
2080 | |||
2081 | If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated: | ||
2082 | |||
2083 | Identify the stats virtqueue. | ||
2084 | |||
2085 | Add one empty buffer to the stats virtqueue and notify the | ||
2086 | host. | ||
2087 | |||
2088 | Device operation begins immediately. | ||
2089 | |||
2090 | Device Operation | ||
2091 | |||
2092 | Memory Ballooning The device is driven by the receipt of a | ||
2093 | configuration change interrupt. | ||
2094 | |||
2095 | The “num_pages” configuration field is examined. If this is | ||
2096 | greater than the “actual” number of pages, memory must be given | ||
2097 | to the balloon. If it is less than the “actual” number of | ||
2098 | pages, memory may be taken back from the balloon for general | ||
2099 | use. | ||
2100 | |||
2101 | To supply memory to the balloon (aka. inflate): | ||
2102 | |||
2103 | The driver constructs an array of addresses of unused memory | ||
2104 | pages. These addresses are divided by 4096[footnote: | ||
2105 | This is historical, and independent of the guest page size | ||
2106 | ] and the descriptor describing the resulting 32-bit array is | ||
2107 | added to the inflateq. | ||
2108 | |||
2109 | To remove memory from the balloon (aka. deflate): | ||
2110 | |||
2111 | The driver constructs an array of addresses of memory pages it | ||
2112 | has previously given to the balloon, as described above. This | ||
2113 | descriptor is added to the deflateq. | ||
2114 | |||
2115 | If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the | ||
2116 | guest may not use these requested pages until that descriptor | ||
2117 | in the deflateq has been used by the device. | ||
2118 | |||
2119 | Otherwise, the guest may begin to re-use pages previously given | ||
2120 | to the balloon before the device has acknowledged their | ||
2121 | withdrawl. [footnote: | ||
2122 | In this case, deflation advice is merely a courtesy | ||
2123 | ] | ||
2124 | |||
2125 | In either case, once the device has completed the inflation or | ||
2126 | deflation, the “actual” field of the configuration should be | ||
2127 | updated to reflect the new number of pages in the balloon.[footnote: | ||
2128 | As updates to configuration space are not atomic, this field | ||
2129 | isn't particularly reliable, but can be used to diagnose buggy | ||
2130 | guests. | ||
2131 | ] | ||
2132 | |||
2133 | Memory Statistics | ||
2134 | |||
2135 | The stats virtqueue is atypical because communication is driven | ||
2136 | by the device (not the driver). The channel becomes active at | ||
2137 | driver initialization time when the driver adds an empty buffer | ||
2138 | and notifies the device. A request for memory statistics proceeds | ||
2139 | as follows: | ||
2140 | |||
2141 | The device pushes the buffer onto the used ring and sends an | ||
2142 | interrupt. | ||
2143 | |||
2144 | The driver pops the used buffer and discards it. | ||
2145 | |||
2146 | The driver collects memory statistics and writes them into a | ||
2147 | new buffer. | ||
2148 | |||
2149 | The driver adds the buffer to the virtqueue and notifies the | ||
2150 | device. | ||
2151 | |||
2152 | The device pops the buffer (retaining it to initiate a | ||
2153 | subsequent request) and consumes the statistics. | ||
2154 | |||
2155 | Memory Statistics Format Each statistic consists of a 16 bit | ||
2156 | tag and a 64 bit value. Both quantities are represented in the | ||
2157 | native endian of the guest. All statistics are optional and the | ||
2158 | driver may choose which ones to supply. To guarantee backwards | ||
2159 | compatibility, unsupported statistics should be omitted. | ||
2160 | |||
2161 | struct virtio_balloon_stat { | ||
2162 | |||
2163 | #define VIRTIO_BALLOON_S_SWAP_IN 0 | ||
2164 | |||
2165 | #define VIRTIO_BALLOON_S_SWAP_OUT 1 | ||
2166 | |||
2167 | #define VIRTIO_BALLOON_S_MAJFLT 2 | ||
2168 | |||
2169 | #define VIRTIO_BALLOON_S_MINFLT 3 | ||
2170 | |||
2171 | #define VIRTIO_BALLOON_S_MEMFREE 4 | ||
2172 | |||
2173 | #define VIRTIO_BALLOON_S_MEMTOT 5 | ||
2174 | |||
2175 | u16 tag; | ||
2176 | |||
2177 | u64 val; | ||
2178 | |||
2179 | } __attribute__((packed)); | ||
2180 | |||
2181 | Tags | ||
2182 | |||
2183 | VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been | ||
2184 | swapped in (in bytes). | ||
2185 | |||
2186 | VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been | ||
2187 | swapped out to disk (in bytes). | ||
2188 | |||
2189 | VIRTIO_BALLOON_S_MAJFLT The number of major page faults that | ||
2190 | have occurred. | ||
2191 | |||
2192 | VIRTIO_BALLOON_S_MINFLT The number of minor page faults that | ||
2193 | have occurred. | ||
2194 | |||
2195 | VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used | ||
2196 | for any purpose (in bytes). | ||
2197 | |||
2198 | VIRTIO_BALLOON_S_MEMTOT The total amount of memory available | ||
2199 | (in bytes). | ||
2200 | |||
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 0924aaca3302..29bdf62aac09 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt | |||
@@ -123,10 +123,11 @@ be automatically shutdown if it's set to "never". | |||
123 | khugepaged runs usually at low frequency so while one may not want to | 123 | khugepaged runs usually at low frequency so while one may not want to |
124 | invoke defrag algorithms synchronously during the page faults, it | 124 | invoke defrag algorithms synchronously during the page faults, it |
125 | should be worth invoking defrag at least in khugepaged. However it's | 125 | should be worth invoking defrag at least in khugepaged. However it's |
126 | also possible to disable defrag in khugepaged: | 126 | also possible to disable defrag in khugepaged by writing 0 or enable |
127 | defrag in khugepaged by writing 1: | ||
127 | 128 | ||
128 | echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag | 129 | echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag |
129 | echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag | 130 | echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag |
130 | 131 | ||
131 | You can also control how many pages khugepaged should scan at each | 132 | You can also control how many pages khugepaged should scan at each |
132 | pass: | 133 | pass: |