diff options
Diffstat (limited to 'Documentation')
33 files changed, 1655 insertions, 766 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 44f52a4f5903..cbbd3e069945 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block | |||
@@ -60,3 +60,62 @@ Description: | |||
60 | Indicates whether the block layer should automatically | 60 | Indicates whether the block layer should automatically |
61 | generate checksums for write requests bound for | 61 | generate checksums for write requests bound for |
62 | devices that support receiving integrity metadata. | 62 | devices that support receiving integrity metadata. |
63 | |||
64 | What: /sys/block/<disk>/alignment_offset | ||
65 | Date: April 2009 | ||
66 | Contact: Martin K. Petersen <martin.petersen@oracle.com> | ||
67 | Description: | ||
68 | Storage devices may report a physical block size that is | ||
69 | bigger than the logical block size (for instance a drive | ||
70 | with 4KB physical sectors exposing 512-byte logical | ||
71 | blocks to the operating system). This parameter | ||
72 | indicates how many bytes the beginning of the device is | ||
73 | offset from the disk's natural alignment. | ||
74 | |||
75 | What: /sys/block/<disk>/<partition>/alignment_offset | ||
76 | Date: April 2009 | ||
77 | Contact: Martin K. Petersen <martin.petersen@oracle.com> | ||
78 | Description: | ||
79 | Storage devices may report a physical block size that is | ||
80 | bigger than the logical block size (for instance a drive | ||
81 | with 4KB physical sectors exposing 512-byte logical | ||
82 | blocks to the operating system). This parameter | ||
83 | indicates how many bytes the beginning of the partition | ||
84 | is offset from the disk's natural alignment. | ||
85 | |||
86 | What: /sys/block/<disk>/queue/logical_block_size | ||
87 | Date: May 2009 | ||
88 | Contact: Martin K. Petersen <martin.petersen@oracle.com> | ||
89 | Description: | ||
90 | This is the smallest unit the storage device can | ||
91 | address. It is typically 512 bytes. | ||
92 | |||
93 | What: /sys/block/<disk>/queue/physical_block_size | ||
94 | Date: May 2009 | ||
95 | Contact: Martin K. Petersen <martin.petersen@oracle.com> | ||
96 | Description: | ||
97 | This is the smallest unit the storage device can write | ||
98 | without resorting to read-modify-write operation. It is | ||
99 | usually the same as the logical block size but may be | ||
100 | bigger. One example is SATA drives with 4KB sectors | ||
101 | that expose a 512-byte logical block size to the | ||
102 | operating system. | ||
103 | |||
104 | What: /sys/block/<disk>/queue/minimum_io_size | ||
105 | Date: April 2009 | ||
106 | Contact: Martin K. Petersen <martin.petersen@oracle.com> | ||
107 | Description: | ||
108 | Storage devices may report a preferred minimum I/O size, | ||
109 | which is the smallest request the device can perform | ||
110 | without incurring a read-modify-write penalty. For disk | ||
111 | drives this is often the physical block size. For RAID | ||
112 | arrays it is often the stripe chunk size. | ||
113 | |||
114 | What: /sys/block/<disk>/queue/optimal_io_size | ||
115 | Date: April 2009 | ||
116 | Contact: Martin K. Petersen <martin.petersen@oracle.com> | ||
117 | Description: | ||
118 | Storage devices may report an optimal I/O size, which is | ||
119 | the device's preferred unit of receiving I/O. This is | ||
120 | rarely reported for disk drives. For RAID devices it is | ||
121 | usually the stripe width or the internal block size. | ||
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss new file mode 100644 index 000000000000..0a92a7c93a62 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss | |||
@@ -0,0 +1,33 @@ | |||
1 | Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/model | ||
2 | Date: March 2009 | ||
3 | Kernel Version: 2.6.30 | ||
4 | Contact: iss_storagedev@hp.com | ||
5 | Description: Displays the SCSI INQUIRY page 0 model for logical drive | ||
6 | Y of controller X. | ||
7 | |||
8 | Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/rev | ||
9 | Date: March 2009 | ||
10 | Kernel Version: 2.6.30 | ||
11 | Contact: iss_storagedev@hp.com | ||
12 | Description: Displays the SCSI INQUIRY page 0 revision for logical | ||
13 | drive Y of controller X. | ||
14 | |||
15 | Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id | ||
16 | Date: March 2009 | ||
17 | Kernel Version: 2.6.30 | ||
18 | Contact: iss_storagedev@hp.com | ||
19 | Description: Displays the SCSI INQUIRY page 83 serial number for logical | ||
20 | drive Y of controller X. | ||
21 | |||
22 | Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor | ||
23 | Date: March 2009 | ||
24 | Kernel Version: 2.6.30 | ||
25 | Contact: iss_storagedev@hp.com | ||
26 | Description: Displays the SCSI INQUIRY page 0 vendor for logical drive | ||
27 | Y of controller X. | ||
28 | |||
29 | Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY | ||
30 | Date: March 2009 | ||
31 | Kernel Version: 2.6.30 | ||
32 | Contact: iss_storagedev@hp.com | ||
33 | Description: A symbolic link to /sys/block/cciss!cXdY | ||
diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable new file mode 100644 index 000000000000..175bb4f70512 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-cache_disable | |||
@@ -0,0 +1,18 @@ | |||
1 | What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X | ||
2 | Date: August 2008 | ||
3 | KernelVersion: 2.6.27 | ||
4 | Contact: mark.langsdorf@amd.com | ||
5 | Description: These files exist in every cpu's cache index directories. | ||
6 | There are currently 2 cache_disable_# files in each | ||
7 | directory. Reading from these files on a supported | ||
8 | processor will return that cache disable index value | ||
9 | for that processor and node. Writing to one of these | ||
10 | files will cause the specificed cache index to be disabled. | ||
11 | |||
12 | Currently, only AMD Family 10h Processors support cache index | ||
13 | disable, and only for their L3 caches. See the BIOS and | ||
14 | Kernel Developer's Guide at | ||
15 | http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf | ||
16 | for formatting information and other details on the | ||
17 | cache index disable. | ||
18 | Users: joachim.deguara@amd.com | ||
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index d9aa43d78bcc..25fb8bcf32a2 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt | |||
@@ -704,12 +704,24 @@ this directory the following files can currently be found: | |||
704 | The current number of free dma_debug_entries | 704 | The current number of free dma_debug_entries |
705 | in the allocator. | 705 | in the allocator. |
706 | 706 | ||
707 | dma-api/driver-filter | ||
708 | You can write a name of a driver into this file | ||
709 | to limit the debug output to requests from that | ||
710 | particular driver. Write an empty string to | ||
711 | that file to disable the filter and see | ||
712 | all errors again. | ||
713 | |||
707 | If you have this code compiled into your kernel it will be enabled by default. | 714 | If you have this code compiled into your kernel it will be enabled by default. |
708 | If you want to boot without the bookkeeping anyway you can provide | 715 | If you want to boot without the bookkeeping anyway you can provide |
709 | 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. | 716 | 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. |
710 | Notice that you can not enable it again at runtime. You have to reboot to do | 717 | Notice that you can not enable it again at runtime. You have to reboot to do |
711 | so. | 718 | so. |
712 | 719 | ||
720 | If you want to see debug messages only for a special device driver you can | ||
721 | specify the dma_debug_driver=<drivername> parameter. This will enable the | ||
722 | driver filter at boot time. The debug code will only print errors for that | ||
723 | driver afterwards. This filter can be disabled or changed later using debugfs. | ||
724 | |||
713 | When the code disables itself at runtime this is most likely because it ran | 725 | When the code disables itself at runtime this is most likely because it ran |
714 | out of dma_debug_entries. These entries are preallocated at boot. The number | 726 | out of dma_debug_entries. These entries are preallocated at boot. The number |
715 | of preallocated entries is defined per architecture. If it is too low for you | 727 | of preallocated entries is defined per architecture. If it is too low for you |
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index b1eb661e6302..9632444f6c62 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile | |||
@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ | |||
13 | gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ | 13 | gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ |
14 | genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ | 14 | genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ |
15 | mac80211.xml debugobjects.xml sh.xml regulator.xml \ | 15 | mac80211.xml debugobjects.xml sh.xml regulator.xml \ |
16 | alsa-driver-api.xml writing-an-alsa-driver.xml | 16 | alsa-driver-api.xml writing-an-alsa-driver.xml \ |
17 | tracepoint.xml | ||
17 | 18 | ||
18 | ### | 19 | ### |
19 | # The build process is as follows (targets): | 20 | # The build process is as follows (targets): |
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl new file mode 100644 index 000000000000..b0756d0fd579 --- /dev/null +++ b/Documentation/DocBook/tracepoint.tmpl | |||
@@ -0,0 +1,89 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8"?> | ||
2 | <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" | ||
3 | "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> | ||
4 | |||
5 | <book id="Tracepoints"> | ||
6 | <bookinfo> | ||
7 | <title>The Linux Kernel Tracepoint API</title> | ||
8 | |||
9 | <authorgroup> | ||
10 | <author> | ||
11 | <firstname>Jason</firstname> | ||
12 | <surname>Baron</surname> | ||
13 | <affiliation> | ||
14 | <address> | ||
15 | <email>jbaron@redhat.com</email> | ||
16 | </address> | ||
17 | </affiliation> | ||
18 | </author> | ||
19 | </authorgroup> | ||
20 | |||
21 | <legalnotice> | ||
22 | <para> | ||
23 | This documentation is free software; you can redistribute | ||
24 | it and/or modify it under the terms of the GNU General Public | ||
25 | License as published by the Free Software Foundation; either | ||
26 | version 2 of the License, or (at your option) any later | ||
27 | version. | ||
28 | </para> | ||
29 | |||
30 | <para> | ||
31 | This program is distributed in the hope that it will be | ||
32 | useful, but WITHOUT ANY WARRANTY; without even the implied | ||
33 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
34 | See the GNU General Public License for more details. | ||
35 | </para> | ||
36 | |||
37 | <para> | ||
38 | You should have received a copy of the GNU General Public | ||
39 | License along with this program; if not, write to the Free | ||
40 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, | ||
41 | MA 02111-1307 USA | ||
42 | </para> | ||
43 | |||
44 | <para> | ||
45 | For more details see the file COPYING in the source | ||
46 | distribution of Linux. | ||
47 | </para> | ||
48 | </legalnotice> | ||
49 | </bookinfo> | ||
50 | |||
51 | <toc></toc> | ||
52 | <chapter id="intro"> | ||
53 | <title>Introduction</title> | ||
54 | <para> | ||
55 | Tracepoints are static probe points that are located in strategic points | ||
56 | throughout the kernel. 'Probes' register/unregister with tracepoints | ||
57 | via a callback mechanism. The 'probes' are strictly typed functions that | ||
58 | are passed a unique set of parameters defined by each tracepoint. | ||
59 | </para> | ||
60 | |||
61 | <para> | ||
62 | From this simple callback mechanism, 'probes' can be used to profile, debug, | ||
63 | and understand kernel behavior. There are a number of tools that provide a | ||
64 | framework for using 'probes'. These tools include Systemtap, ftrace, and | ||
65 | LTTng. | ||
66 | </para> | ||
67 | |||
68 | <para> | ||
69 | Tracepoints are defined in a number of header files via various macros. Thus, | ||
70 | the purpose of this document is to provide a clear accounting of the available | ||
71 | tracepoints. The intention is to understand not only what tracepoints are | ||
72 | available but also to understand where future tracepoints might be added. | ||
73 | </para> | ||
74 | |||
75 | <para> | ||
76 | The API presented has functions of the form: | ||
77 | <function>trace_tracepointname(function parameters)</function>. These are the | ||
78 | tracepoints callbacks that are found throughout the code. Registering and | ||
79 | unregistering probes with these callback sites is covered in the | ||
80 | <filename>Documentation/trace/*</filename> directory. | ||
81 | </para> | ||
82 | </chapter> | ||
83 | |||
84 | <chapter id="irq"> | ||
85 | <title>IRQ</title> | ||
86 | !Iinclude/trace/events/irq.h | ||
87 | </chapter> | ||
88 | |||
89 | </book> | ||
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 068848240a8b..02cced183b2d 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt | |||
@@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy). | |||
192 | The output of "cat rcu/rcudata" looks as follows: | 192 | The output of "cat rcu/rcudata" looks as follows: |
193 | 193 | ||
194 | rcu: | 194 | rcu: |
195 | 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10 | 195 | rcu: |
196 | 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10 | 196 | 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 |
197 | 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10 | 197 | 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 |
198 | 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10 | 198 | 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 |
199 | 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10 | 199 | 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10 |
200 | 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10 | 200 | 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10 |
201 | 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10 | 201 | 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10 |
202 | 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10 | 202 | 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10 |
203 | 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10 | ||
203 | rcu_bh: | 204 | rcu_bh: |
204 | 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10 | 205 | 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10 |
205 | 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10 | 206 | 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10 |
206 | 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10 | 207 | 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 |
207 | 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10 | 208 | 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10 |
208 | 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 | 209 | 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 |
209 | 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 | 210 | 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 |
210 | 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 | 211 | 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 |
211 | 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 | 212 | 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 |
212 | 213 | ||
213 | The first section lists the rcu_data structures for rcu, the second for | 214 | The first section lists the rcu_data structures for rcu, the second for |
214 | rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. | 215 | rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. |
@@ -253,12 +254,6 @@ o "pqc" indicates which grace period the last-observed quiescent | |||
253 | o "qp" indicates that RCU still expects a quiescent state from | 254 | o "qp" indicates that RCU still expects a quiescent state from |
254 | this CPU. | 255 | this CPU. |
255 | 256 | ||
256 | o "rpfq" is the number of rcu_pending() calls on this CPU required | ||
257 | to induce this CPU to invoke force_quiescent_state(). | ||
258 | |||
259 | o "rp" is low-order four hex digits of the count of how many times | ||
260 | rcu_pending() has been invoked on this CPU. | ||
261 | |||
262 | o "dt" is the current value of the dyntick counter that is incremented | 257 | o "dt" is the current value of the dyntick counter that is incremented |
263 | when entering or leaving dynticks idle state, either by the | 258 | when entering or leaving dynticks idle state, either by the |
264 | scheduler or by irq. The number after the "/" is the interrupt | 259 | scheduler or by irq. The number after the "/" is the interrupt |
@@ -305,6 +300,9 @@ o "b" is the batch limit for this CPU. If more than this number | |||
305 | of RCU callbacks is ready to invoke, then the remainder will | 300 | of RCU callbacks is ready to invoke, then the remainder will |
306 | be deferred. | 301 | be deferred. |
307 | 302 | ||
303 | There is also an rcu/rcudata.csv file with the same information in | ||
304 | comma-separated-variable spreadsheet format. | ||
305 | |||
308 | 306 | ||
309 | The output of "cat rcu/rcugp" looks as follows: | 307 | The output of "cat rcu/rcugp" looks as follows: |
310 | 308 | ||
@@ -411,3 +409,63 @@ o Each element of the form "1/1 0:127 ^0" represents one struct | |||
411 | For example, the first entry at the lowest level shows | 409 | For example, the first entry at the lowest level shows |
412 | "^0", indicating that it corresponds to bit zero in | 410 | "^0", indicating that it corresponds to bit zero in |
413 | the first entry at the middle level. | 411 | the first entry at the middle level. |
412 | |||
413 | |||
414 | The output of "cat rcu/rcu_pending" looks as follows: | ||
415 | |||
416 | rcu: | ||
417 | 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 | ||
418 | 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 | ||
419 | 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 | ||
420 | 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 | ||
421 | 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 | ||
422 | 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 | ||
423 | 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 | ||
424 | 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 | ||
425 | rcu_bh: | ||
426 | 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 | ||
427 | 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 | ||
428 | 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 | ||
429 | 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 | ||
430 | 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 | ||
431 | 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 | ||
432 | 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 | ||
433 | 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 | ||
434 | |||
435 | As always, this is once again split into "rcu" and "rcu_bh" portions. | ||
436 | The fields are as follows: | ||
437 | |||
438 | o "np" is the number of times that __rcu_pending() has been invoked | ||
439 | for the corresponding flavor of RCU. | ||
440 | |||
441 | o "qsp" is the number of times that the RCU was waiting for a | ||
442 | quiescent state from this CPU. | ||
443 | |||
444 | o "cbr" is the number of times that this CPU had RCU callbacks | ||
445 | that had passed through a grace period, and were thus ready | ||
446 | to be invoked. | ||
447 | |||
448 | o "cng" is the number of times that this CPU needed another | ||
449 | grace period while RCU was idle. | ||
450 | |||
451 | o "gpc" is the number of times that an old grace period had | ||
452 | completed, but this CPU was not yet aware of it. | ||
453 | |||
454 | o "gps" is the number of times that a new grace period had started, | ||
455 | but this CPU was not yet aware of it. | ||
456 | |||
457 | o "nf" is the number of times that this CPU suspected that the | ||
458 | current grace period had run for too long, and thus needed to | ||
459 | be forced. | ||
460 | |||
461 | Please note that "forcing" consists of sending resched IPIs | ||
462 | to holdout CPUs. If that CPU really still is in an old RCU | ||
463 | read-side critical section, then we really do have to wait for it. | ||
464 | The assumption behing "forcing" is that the CPU is not still in | ||
465 | an old RCU read-side critical section, but has not yet responded | ||
466 | for some other reason. | ||
467 | |||
468 | o "nn" is the number of times that this CPU needed nothing. Alert | ||
469 | readers will note that the rcu "nn" number for a given CPU very | ||
470 | closely matches the rcu_bh "np" number for that same CPU. This | ||
471 | is due to short-circuit evaluation in rcu_pending(). | ||
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt index 629c92e99783..34614b4c708e 100644 --- a/Documentation/Smack.txt +++ b/Documentation/Smack.txt | |||
@@ -184,8 +184,9 @@ length. Single character labels using special characters, that being anything | |||
184 | other than a letter or digit, are reserved for use by the Smack development | 184 | other than a letter or digit, are reserved for use by the Smack development |
185 | team. Smack labels are unstructured, case sensitive, and the only operation | 185 | team. Smack labels are unstructured, case sensitive, and the only operation |
186 | ever performed on them is comparison for equality. Smack labels cannot | 186 | ever performed on them is comparison for equality. Smack labels cannot |
187 | contain unprintable characters or the "/" (slash) character. Smack labels | 187 | contain unprintable characters, the "/" (slash), the "\" (backslash), the "'" |
188 | cannot begin with a '-', which is reserved for special options. | 188 | (quote) and '"' (double-quote) characters. |
189 | Smack labels cannot begin with a '-', which is reserved for special options. | ||
189 | 190 | ||
190 | There are some predefined labels: | 191 | There are some predefined labels: |
191 | 192 | ||
@@ -523,3 +524,18 @@ Smack supports some mount options: | |||
523 | 524 | ||
524 | These mount options apply to all file system types. | 525 | These mount options apply to all file system types. |
525 | 526 | ||
527 | Smack auditing | ||
528 | |||
529 | If you want Smack auditing of security events, you need to set CONFIG_AUDIT | ||
530 | in your kernel configuration. | ||
531 | By default, all denied events will be audited. You can change this behavior by | ||
532 | writing a single character to the /smack/logging file : | ||
533 | 0 : no logging | ||
534 | 1 : log denied (default) | ||
535 | 2 : log accepted | ||
536 | 3 : log denied & accepted | ||
537 | |||
538 | Events are logged as 'key=value' pairs, for each event you at least will get | ||
539 | the subjet, the object, the rights requested, the action, the kernel function | ||
540 | that triggered the event, plus other pairs depending on the type of event | ||
541 | audited. | ||
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 6fab97ea7e6b..8d2158a1c6aa 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt | |||
@@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address | |||
186 | do not have a corresponding kernel virtual address space mapping) and | 186 | do not have a corresponding kernel virtual address space mapping) and |
187 | low-memory pages. | 187 | low-memory pages. |
188 | 188 | ||
189 | Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion | 189 | Note: Please refer to Documentation/DMA-mapping.txt for a discussion |
190 | on PCI high mem DMA aspects and mapping of scatter gather lists, and support | 190 | on PCI high mem DMA aspects and mapping of scatter gather lists, and support |
191 | for 64 bit PCI. | 191 | for 64 bit PCI. |
192 | 192 | ||
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt index 4dae9a3840bf..0494f78d87e4 100644 --- a/Documentation/filesystems/gfs2-glocks.txt +++ b/Documentation/filesystems/gfs2-glocks.txt | |||
@@ -60,7 +60,7 @@ go_lock | Called for the first local holder of a lock | |||
60 | go_unlock | Called on the final local unlock of a lock | 60 | go_unlock | Called on the final local unlock of a lock |
61 | go_dump | Called to print content of object for debugfs file, or on | 61 | go_dump | Called to print content of object for debugfs file, or on |
62 | | error to dump glock to the log. | 62 | | error to dump glock to the log. |
63 | go_type; | The type of the glock, LM_TYPE_..... | 63 | go_type | The type of the glock, LM_TYPE_..... |
64 | go_min_hold_time | The minimum hold time | 64 | go_min_hold_time | The minimum hold time |
65 | 65 | ||
66 | The minimum hold time for each lock is the time after a remote lock | 66 | The minimum hold time for each lock is the time after a remote lock |
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 593004b6bbab..5e3ab8f3beff 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt | |||
@@ -11,18 +11,15 @@ their I/O so file system consistency is maintained. One of the nifty | |||
11 | features of GFS is perfect consistency -- changes made to the file system | 11 | features of GFS is perfect consistency -- changes made to the file system |
12 | on one machine show up immediately on all other machines in the cluster. | 12 | on one machine show up immediately on all other machines in the cluster. |
13 | 13 | ||
14 | GFS uses interchangable inter-node locking mechanisms. Different lock | 14 | GFS uses interchangable inter-node locking mechanisms, the currently |
15 | modules can plug into GFS and each file system selects the appropriate | 15 | supported mechanisms are: |
16 | lock module at mount time. Lock modules include: | ||
17 | 16 | ||
18 | lock_nolock -- allows gfs to be used as a local file system | 17 | lock_nolock -- allows gfs to be used as a local file system |
19 | 18 | ||
20 | lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking | 19 | lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking |
21 | The dlm is found at linux/fs/dlm/ | 20 | The dlm is found at linux/fs/dlm/ |
22 | 21 | ||
23 | In addition to interfacing with an external locking manager, a gfs lock | 22 | Lock_dlm depends on user space cluster management systems found |
24 | module is responsible for interacting with external cluster management | ||
25 | systems. Lock_dlm depends on user space cluster management systems found | ||
26 | at the URL above. | 23 | at the URL above. |
27 | 24 | ||
28 | To use gfs as a local file system, no external clustering systems are | 25 | To use gfs as a local file system, no external clustering systems are |
@@ -31,13 +28,19 @@ needed, simply: | |||
31 | $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device | 28 | $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device |
32 | $ mount -t gfs2 /dev/block_device /dir | 29 | $ mount -t gfs2 /dev/block_device /dir |
33 | 30 | ||
34 | GFS2 is not on-disk compatible with previous versions of GFS. | 31 | If you are using Fedora, you need to install the gfs2-utils package |
32 | and, for lock_dlm, you will also need to install the cman package | ||
33 | and write a cluster.conf as per the documentation. | ||
34 | |||
35 | GFS2 is not on-disk compatible with previous versions of GFS, but it | ||
36 | is pretty close. | ||
35 | 37 | ||
36 | The following man pages can be found at the URL above: | 38 | The following man pages can be found at the URL above: |
37 | gfs2_fsck to repair a filesystem | 39 | fsck.gfs2 to repair a filesystem |
38 | gfs2_grow to expand a filesystem online | 40 | gfs2_grow to expand a filesystem online |
39 | gfs2_jadd to add journals to a filesystem online | 41 | gfs2_jadd to add journals to a filesystem online |
40 | gfs2_tool to manipulate, examine and tune a filesystem | 42 | gfs2_tool to manipulate, examine and tune a filesystem |
41 | gfs2_quota to examine and change quota values in a filesystem | 43 | gfs2_quota to examine and change quota values in a filesystem |
44 | gfs2_convert to convert a gfs filesystem to gfs2 in-place | ||
42 | mount.gfs2 to help mount(8) mount a filesystem | 45 | mount.gfs2 to help mount(8) mount a filesystem |
43 | mkfs.gfs2 to make a filesystem | 46 | mkfs.gfs2 to make a filesystem |
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt new file mode 100644 index 000000000000..9dc1ff4fd536 --- /dev/null +++ b/Documentation/futex-requeue-pi.txt | |||
@@ -0,0 +1,131 @@ | |||
1 | Futex Requeue PI | ||
2 | ---------------- | ||
3 | |||
4 | Requeueing of tasks from a non-PI futex to a PI futex requires | ||
5 | special handling in order to ensure the underlying rt_mutex is never | ||
6 | left without an owner if it has waiters; doing so would break the PI | ||
7 | boosting logic [see rt-mutex-desgin.txt] For the purposes of | ||
8 | brevity, this action will be referred to as "requeue_pi" throughout | ||
9 | this document. Priority inheritance is abbreviated throughout as | ||
10 | "PI". | ||
11 | |||
12 | Motivation | ||
13 | ---------- | ||
14 | |||
15 | Without requeue_pi, the glibc implementation of | ||
16 | pthread_cond_broadcast() must resort to waking all the tasks waiting | ||
17 | on a pthread_condvar and letting them try to sort out which task | ||
18 | gets to run first in classic thundering-herd formation. An ideal | ||
19 | implementation would wake the highest-priority waiter, and leave the | ||
20 | rest to the natural wakeup inherent in unlocking the mutex | ||
21 | associated with the condvar. | ||
22 | |||
23 | Consider the simplified glibc calls: | ||
24 | |||
25 | /* caller must lock mutex */ | ||
26 | pthread_cond_wait(cond, mutex) | ||
27 | { | ||
28 | lock(cond->__data.__lock); | ||
29 | unlock(mutex); | ||
30 | do { | ||
31 | unlock(cond->__data.__lock); | ||
32 | futex_wait(cond->__data.__futex); | ||
33 | lock(cond->__data.__lock); | ||
34 | } while(...) | ||
35 | unlock(cond->__data.__lock); | ||
36 | lock(mutex); | ||
37 | } | ||
38 | |||
39 | pthread_cond_broadcast(cond) | ||
40 | { | ||
41 | lock(cond->__data.__lock); | ||
42 | unlock(cond->__data.__lock); | ||
43 | futex_requeue(cond->data.__futex, cond->mutex); | ||
44 | } | ||
45 | |||
46 | Once pthread_cond_broadcast() requeues the tasks, the cond->mutex | ||
47 | has waiters. Note that pthread_cond_wait() attempts to lock the | ||
48 | mutex only after it has returned to user space. This will leave the | ||
49 | underlying rt_mutex with waiters, and no owner, breaking the | ||
50 | previously mentioned PI-boosting algorithms. | ||
51 | |||
52 | In order to support PI-aware pthread_condvar's, the kernel needs to | ||
53 | be able to requeue tasks to PI futexes. This support implies that | ||
54 | upon a successful futex_wait system call, the caller would return to | ||
55 | user space already holding the PI futex. The glibc implementation | ||
56 | would be modified as follows: | ||
57 | |||
58 | |||
59 | /* caller must lock mutex */ | ||
60 | pthread_cond_wait_pi(cond, mutex) | ||
61 | { | ||
62 | lock(cond->__data.__lock); | ||
63 | unlock(mutex); | ||
64 | do { | ||
65 | unlock(cond->__data.__lock); | ||
66 | futex_wait_requeue_pi(cond->__data.__futex); | ||
67 | lock(cond->__data.__lock); | ||
68 | } while(...) | ||
69 | unlock(cond->__data.__lock); | ||
70 | /* the kernel acquired the the mutex for us */ | ||
71 | } | ||
72 | |||
73 | pthread_cond_broadcast_pi(cond) | ||
74 | { | ||
75 | lock(cond->__data.__lock); | ||
76 | unlock(cond->__data.__lock); | ||
77 | futex_requeue_pi(cond->data.__futex, cond->mutex); | ||
78 | } | ||
79 | |||
80 | The actual glibc implementation will likely test for PI and make the | ||
81 | necessary changes inside the existing calls rather than creating new | ||
82 | calls for the PI cases. Similar changes are needed for | ||
83 | pthread_cond_timedwait() and pthread_cond_signal(). | ||
84 | |||
85 | Implementation | ||
86 | -------------- | ||
87 | |||
88 | In order to ensure the rt_mutex has an owner if it has waiters, it | ||
89 | is necessary for both the requeue code, as well as the waiting code, | ||
90 | to be able to acquire the rt_mutex before returning to user space. | ||
91 | The requeue code cannot simply wake the waiter and leave it to | ||
92 | acquire the rt_mutex as it would open a race window between the | ||
93 | requeue call returning to user space and the waiter waking and | ||
94 | starting to run. This is especially true in the uncontended case. | ||
95 | |||
96 | The solution involves two new rt_mutex helper routines, | ||
97 | rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which | ||
98 | allow the requeue code to acquire an uncontended rt_mutex on behalf | ||
99 | of the waiter and to enqueue the waiter on a contended rt_mutex. | ||
100 | Two new system calls provide the kernel<->user interface to | ||
101 | requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI. | ||
102 | |||
103 | FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait() | ||
104 | and pthread_cond_timedwait()) to block on the initial futex and wait | ||
105 | to be requeued to a PI-aware futex. The implementation is the | ||
106 | result of a high-speed collision between futex_wait() and | ||
107 | futex_lock_pi(), with some extra logic to check for the additional | ||
108 | wake-up scenarios. | ||
109 | |||
110 | FUTEX_REQUEUE_CMP_PI is called by the waker | ||
111 | (pthread_cond_broadcast() and pthread_cond_signal()) to requeue and | ||
112 | possibly wake the waiting tasks. Internally, this system call is | ||
113 | still handled by futex_requeue (by passing requeue_pi=1). Before | ||
114 | requeueing, futex_requeue() attempts to acquire the requeue target | ||
115 | PI futex on behalf of the top waiter. If it can, this waiter is | ||
116 | woken. futex_requeue() then proceeds to requeue the remaining | ||
117 | nr_wake+nr_requeue tasks to the PI futex, calling | ||
118 | rt_mutex_start_proxy_lock() prior to each requeue to prepare the | ||
119 | task as a waiter on the underlying rt_mutex. It is possible that | ||
120 | the lock can be acquired at this stage as well, if so, the next | ||
121 | waiter is woken to finish the acquisition of the lock. | ||
122 | |||
123 | FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but | ||
124 | their sum is all that really matters. futex_requeue() will wake or | ||
125 | requeue up to nr_wake + nr_requeue tasks. It will wake only as many | ||
126 | tasks as it can acquire the lock for, which in the majority of cases | ||
127 | should be 0 as good programming practice dictates that the caller of | ||
128 | either pthread_cond_broadcast() or pthread_cond_signal() acquire the | ||
129 | mutex prior to making the call. FUTEX_REQUEUE_PI requires that | ||
130 | nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for | ||
131 | signal. | ||
diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.txt index 0c78f4b1d9d9..e77bebfa7b0d 100644 --- a/Documentation/ide/ide.txt +++ b/Documentation/ide/ide.txt | |||
@@ -216,6 +216,8 @@ Other kernel parameters for ide_core are: | |||
216 | 216 | ||
217 | * "noflush=[interface_number.device_number]" to disable flush requests | 217 | * "noflush=[interface_number.device_number]" to disable flush requests |
218 | 218 | ||
219 | * "nohpa=[interface_number.device_number]" to disable Host Protected Area | ||
220 | |||
219 | * "noprobe=[interface_number.device_number]" to skip probing | 221 | * "noprobe=[interface_number.device_number]" to skip probing |
220 | 222 | ||
221 | * "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit | 223 | * "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fd5cac013037..0bf8a882ee9e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -56,7 +56,6 @@ parameter is applicable: | |||
56 | ISAPNP ISA PnP code is enabled. | 56 | ISAPNP ISA PnP code is enabled. |
57 | ISDN Appropriate ISDN support is enabled. | 57 | ISDN Appropriate ISDN support is enabled. |
58 | JOY Appropriate joystick support is enabled. | 58 | JOY Appropriate joystick support is enabled. |
59 | KMEMTRACE kmemtrace is enabled. | ||
60 | LIBATA Libata driver is enabled | 59 | LIBATA Libata driver is enabled |
61 | LP Printer support is enabled. | 60 | LP Printer support is enabled. |
62 | LOOP Loopback device support is enabled. | 61 | LOOP Loopback device support is enabled. |
@@ -329,11 +328,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
329 | flushed before they will be reused, which | 328 | flushed before they will be reused, which |
330 | is a lot of faster | 329 | is a lot of faster |
331 | 330 | ||
332 | amd_iommu_size= [HW,X86-64] | ||
333 | Define the size of the aperture for the AMD IOMMU | ||
334 | driver. Possible values are: | ||
335 | '32M', '64M' (default), '128M', '256M', '512M', '1G' | ||
336 | |||
337 | amijoy.map= [HW,JOY] Amiga joystick support | 331 | amijoy.map= [HW,JOY] Amiga joystick support |
338 | Map of devices attached to JOY0DAT and JOY1DAT | 332 | Map of devices attached to JOY0DAT and JOY1DAT |
339 | Format: <a>,<b> | 333 | Format: <a>,<b> |
@@ -646,6 +640,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
646 | DMA-API debugging code disables itself because the | 640 | DMA-API debugging code disables itself because the |
647 | architectural default is too low. | 641 | architectural default is too low. |
648 | 642 | ||
643 | dma_debug_driver=<driver_name> | ||
644 | With this option the DMA-API debugging driver | ||
645 | filter feature can be enabled at boot time. Just | ||
646 | pass the driver to filter for as the parameter. | ||
647 | The filter can be disabled or changed to another | ||
648 | driver later using sysfs. | ||
649 | |||
649 | dscc4.setup= [NET] | 650 | dscc4.setup= [NET] |
650 | 651 | ||
651 | dtc3181e= [HW,SCSI] | 652 | dtc3181e= [HW,SCSI] |
@@ -752,12 +753,25 @@ and is between 256 and 4096 characters. It is defined in the file | |||
752 | ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. | 753 | ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. |
753 | 754 | ||
754 | ftrace=[tracer] | 755 | ftrace=[tracer] |
755 | [ftrace] will set and start the specified tracer | 756 | [FTRACE] will set and start the specified tracer |
756 | as early as possible in order to facilitate early | 757 | as early as possible in order to facilitate early |
757 | boot debugging. | 758 | boot debugging. |
758 | 759 | ||
759 | ftrace_dump_on_oops | 760 | ftrace_dump_on_oops |
760 | [ftrace] will dump the trace buffers on oops. | 761 | [FTRACE] will dump the trace buffers on oops. |
762 | |||
763 | ftrace_filter=[function-list] | ||
764 | [FTRACE] Limit the functions traced by the function | ||
765 | tracer at boot up. function-list is a comma separated | ||
766 | list of functions. This list can be changed at run | ||
767 | time by the set_ftrace_filter file in the debugfs | ||
768 | tracing directory. | ||
769 | |||
770 | ftrace_notrace=[function-list] | ||
771 | [FTRACE] Do not trace the functions specified in | ||
772 | function-list. This list can be changed at run time | ||
773 | by the set_ftrace_notrace file in the debugfs | ||
774 | tracing directory. | ||
761 | 775 | ||
762 | gamecon.map[2|3]= | 776 | gamecon.map[2|3]= |
763 | [HW,JOY] Multisystem joystick and NES/SNES/PSX pad | 777 | [HW,JOY] Multisystem joystick and NES/SNES/PSX pad |
@@ -873,11 +887,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
873 | 887 | ||
874 | ide-core.nodma= [HW] (E)IDE subsystem | 888 | ide-core.nodma= [HW] (E)IDE subsystem |
875 | Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc | 889 | Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc |
876 | .vlb_clock .pci_clock .noflush .noprobe .nowerr .cdrom | 890 | .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr |
877 | .chs .ignore_cable are additional options | 891 | .cdrom .chs .ignore_cable are additional options |
878 | See Documentation/ide/ide.txt. | ||
879 | |||
880 | idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed | ||
881 | See Documentation/ide/ide.txt. | 892 | See Documentation/ide/ide.txt. |
882 | 893 | ||
883 | ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem | 894 | ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem |
@@ -914,6 +925,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
914 | Formt: { "sha1" | "md5" } | 925 | Formt: { "sha1" | "md5" } |
915 | default: "sha1" | 926 | default: "sha1" |
916 | 927 | ||
928 | ima_tcb [IMA] | ||
929 | Load a policy which meets the needs of the Trusted | ||
930 | Computing Base. This means IMA will measure all | ||
931 | programs exec'd, files mmap'd for exec, and all files | ||
932 | opened for read by uid=0. | ||
933 | |||
917 | in2000= [HW,SCSI] | 934 | in2000= [HW,SCSI] |
918 | See header of drivers/scsi/in2000.c. | 935 | See header of drivers/scsi/in2000.c. |
919 | 936 | ||
@@ -1054,15 +1071,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1054 | use the HighMem zone if it exists, and the Normal | 1071 | use the HighMem zone if it exists, and the Normal |
1055 | zone if it does not. | 1072 | zone if it does not. |
1056 | 1073 | ||
1057 | kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } | ||
1058 | Controls whether kmemtrace is enabled | ||
1059 | at boot-time. | ||
1060 | |||
1061 | kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of | ||
1062 | subbufs kmemtrace's relay channel has. Set this | ||
1063 | higher than default (KMEMTRACE_N_SUBBUFS in code) if | ||
1064 | you experience buffer overruns. | ||
1065 | |||
1066 | kgdboc= [HW] kgdb over consoles. | 1074 | kgdboc= [HW] kgdb over consoles. |
1067 | Requires a tty driver that supports console polling. | 1075 | Requires a tty driver that supports console polling. |
1068 | (only serial suported for now) | 1076 | (only serial suported for now) |
@@ -1072,6 +1080,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1072 | Configure the RouterBoard 532 series on-chip | 1080 | Configure the RouterBoard 532 series on-chip |
1073 | Ethernet adapter MAC address. | 1081 | Ethernet adapter MAC address. |
1074 | 1082 | ||
1083 | kmemleak= [KNL] Boot-time kmemleak enable/disable | ||
1084 | Valid arguments: on, off | ||
1085 | Default: on | ||
1086 | |||
1075 | kstack=N [X86] Print N words from the kernel stack | 1087 | kstack=N [X86] Print N words from the kernel stack |
1076 | in oops dumps. | 1088 | in oops dumps. |
1077 | 1089 | ||
@@ -1575,6 +1587,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1575 | noinitrd [RAM] Tells the kernel not to load any configured | 1587 | noinitrd [RAM] Tells the kernel not to load any configured |
1576 | initial RAM disk. | 1588 | initial RAM disk. |
1577 | 1589 | ||
1590 | nointremap [X86-64, Intel-IOMMU] Do not enable interrupt | ||
1591 | remapping. | ||
1592 | |||
1578 | nointroute [IA-64] | 1593 | nointroute [IA-64] |
1579 | 1594 | ||
1580 | nojitter [IA64] Disables jitter checking for ITC timers. | 1595 | nojitter [IA64] Disables jitter checking for ITC timers. |
@@ -1660,6 +1675,14 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1660 | oprofile.timer= [HW] | 1675 | oprofile.timer= [HW] |
1661 | Use timer interrupt instead of performance counters | 1676 | Use timer interrupt instead of performance counters |
1662 | 1677 | ||
1678 | oprofile.cpu_type= Force an oprofile cpu type | ||
1679 | This might be useful if you have an older oprofile | ||
1680 | userland or if you want common events. | ||
1681 | Format: { archperfmon } | ||
1682 | archperfmon: [X86] Force use of architectural | ||
1683 | perfmon on Intel CPUs instead of the | ||
1684 | CPU specific event set. | ||
1685 | |||
1663 | osst= [HW,SCSI] SCSI Tape Driver | 1686 | osst= [HW,SCSI] SCSI Tape Driver |
1664 | Format: <buffer_size>,<write_threshold> | 1687 | Format: <buffer_size>,<write_threshold> |
1665 | See also Documentation/scsi/st.txt. | 1688 | See also Documentation/scsi/st.txt. |
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt new file mode 100644 index 000000000000..0112da3b9ab8 --- /dev/null +++ b/Documentation/kmemleak.txt | |||
@@ -0,0 +1,142 @@ | |||
1 | Kernel Memory Leak Detector | ||
2 | =========================== | ||
3 | |||
4 | Introduction | ||
5 | ------------ | ||
6 | |||
7 | Kmemleak provides a way of detecting possible kernel memory leaks in a | ||
8 | way similar to a tracing garbage collector | ||
9 | (http://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors), | ||
10 | with the difference that the orphan objects are not freed but only | ||
11 | reported via /sys/kernel/debug/kmemleak. A similar method is used by the | ||
12 | Valgrind tool (memcheck --leak-check) to detect the memory leaks in | ||
13 | user-space applications. | ||
14 | |||
15 | Usage | ||
16 | ----- | ||
17 | |||
18 | CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel | ||
19 | thread scans the memory every 10 minutes (by default) and prints any new | ||
20 | unreferenced objects found. To trigger an intermediate scan and display | ||
21 | all the possible memory leaks: | ||
22 | |||
23 | # mount -t debugfs nodev /sys/kernel/debug/ | ||
24 | # cat /sys/kernel/debug/kmemleak | ||
25 | |||
26 | Note that the orphan objects are listed in the order they were allocated | ||
27 | and one object at the beginning of the list may cause other subsequent | ||
28 | objects to be reported as orphan. | ||
29 | |||
30 | Memory scanning parameters can be modified at run-time by writing to the | ||
31 | /sys/kernel/debug/kmemleak file. The following parameters are supported: | ||
32 | |||
33 | off - disable kmemleak (irreversible) | ||
34 | stack=on - enable the task stacks scanning | ||
35 | stack=off - disable the tasks stacks scanning | ||
36 | scan=on - start the automatic memory scanning thread | ||
37 | scan=off - stop the automatic memory scanning thread | ||
38 | scan=<secs> - set the automatic memory scanning period in seconds (0 | ||
39 | to disable it) | ||
40 | |||
41 | Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on | ||
42 | the kernel command line. | ||
43 | |||
44 | Basic Algorithm | ||
45 | --------------- | ||
46 | |||
47 | The memory allocations via kmalloc, vmalloc, kmem_cache_alloc and | ||
48 | friends are traced and the pointers, together with additional | ||
49 | information like size and stack trace, are stored in a prio search tree. | ||
50 | The corresponding freeing function calls are tracked and the pointers | ||
51 | removed from the kmemleak data structures. | ||
52 | |||
53 | An allocated block of memory is considered orphan if no pointer to its | ||
54 | start address or to any location inside the block can be found by | ||
55 | scanning the memory (including saved registers). This means that there | ||
56 | might be no way for the kernel to pass the address of the allocated | ||
57 | block to a freeing function and therefore the block is considered a | ||
58 | memory leak. | ||
59 | |||
60 | The scanning algorithm steps: | ||
61 | |||
62 | 1. mark all objects as white (remaining white objects will later be | ||
63 | considered orphan) | ||
64 | 2. scan the memory starting with the data section and stacks, checking | ||
65 | the values against the addresses stored in the prio search tree. If | ||
66 | a pointer to a white object is found, the object is added to the | ||
67 | gray list | ||
68 | 3. scan the gray objects for matching addresses (some white objects | ||
69 | can become gray and added at the end of the gray list) until the | ||
70 | gray set is finished | ||
71 | 4. the remaining white objects are considered orphan and reported via | ||
72 | /sys/kernel/debug/kmemleak | ||
73 | |||
74 | Some allocated memory blocks have pointers stored in the kernel's | ||
75 | internal data structures and they cannot be detected as orphans. To | ||
76 | avoid this, kmemleak can also store the number of values pointing to an | ||
77 | address inside the block address range that need to be found so that the | ||
78 | block is not considered a leak. One example is __vmalloc(). | ||
79 | |||
80 | Kmemleak API | ||
81 | ------------ | ||
82 | |||
83 | See the include/linux/kmemleak.h header for the functions prototype. | ||
84 | |||
85 | kmemleak_init - initialize kmemleak | ||
86 | kmemleak_alloc - notify of a memory block allocation | ||
87 | kmemleak_free - notify of a memory block freeing | ||
88 | kmemleak_not_leak - mark an object as not a leak | ||
89 | kmemleak_ignore - do not scan or report an object as leak | ||
90 | kmemleak_scan_area - add scan areas inside a memory block | ||
91 | kmemleak_no_scan - do not scan a memory block | ||
92 | kmemleak_erase - erase an old value in a pointer variable | ||
93 | kmemleak_alloc_recursive - as kmemleak_alloc but checks the recursiveness | ||
94 | kmemleak_free_recursive - as kmemleak_free but checks the recursiveness | ||
95 | |||
96 | Dealing with false positives/negatives | ||
97 | -------------------------------------- | ||
98 | |||
99 | The false negatives are real memory leaks (orphan objects) but not | ||
100 | reported by kmemleak because values found during the memory scanning | ||
101 | point to such objects. To reduce the number of false negatives, kmemleak | ||
102 | provides the kmemleak_ignore, kmemleak_scan_area, kmemleak_no_scan and | ||
103 | kmemleak_erase functions (see above). The task stacks also increase the | ||
104 | amount of false negatives and their scanning is not enabled by default. | ||
105 | |||
106 | The false positives are objects wrongly reported as being memory leaks | ||
107 | (orphan). For objects known not to be leaks, kmemleak provides the | ||
108 | kmemleak_not_leak function. The kmemleak_ignore could also be used if | ||
109 | the memory block is known not to contain other pointers and it will no | ||
110 | longer be scanned. | ||
111 | |||
112 | Some of the reported leaks are only transient, especially on SMP | ||
113 | systems, because of pointers temporarily stored in CPU registers or | ||
114 | stacks. Kmemleak defines MSECS_MIN_AGE (defaulting to 1000) representing | ||
115 | the minimum age of an object to be reported as a memory leak. | ||
116 | |||
117 | Limitations and Drawbacks | ||
118 | ------------------------- | ||
119 | |||
120 | The main drawback is the reduced performance of memory allocation and | ||
121 | freeing. To avoid other penalties, the memory scanning is only performed | ||
122 | when the /sys/kernel/debug/kmemleak file is read. Anyway, this tool is | ||
123 | intended for debugging purposes where the performance might not be the | ||
124 | most important requirement. | ||
125 | |||
126 | To keep the algorithm simple, kmemleak scans for values pointing to any | ||
127 | address inside a block's address range. This may lead to an increased | ||
128 | number of false negatives. However, it is likely that a real memory leak | ||
129 | will eventually become visible. | ||
130 | |||
131 | Another source of false negatives is the data stored in non-pointer | ||
132 | values. In a future version, kmemleak could only scan the pointer | ||
133 | members in the allocated structures. This feature would solve many of | ||
134 | the false negative cases described above. | ||
135 | |||
136 | The tool can report false positives. These are cases where an allocated | ||
137 | block doesn't need to be freed (some cases in the init_call functions), | ||
138 | the pointer is calculated by other methods than the usual container_of | ||
139 | macro or the pointer is stored in a location not scanned by kmemleak. | ||
140 | |||
141 | Page allocations and ioremap are not tracked. Only the ARM and x86 | ||
142 | architectures are currently supported. | ||
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index 1f4f9e888bd1..28c8cdfcafd8 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile | |||
@@ -1,6 +1,5 @@ | |||
1 | # This creates the demonstration utility "lguest" which runs a Linux guest. | 1 | # This creates the demonstration utility "lguest" which runs a Linux guest. |
2 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE | 2 | CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE |
3 | LDLIBS:=-lz | ||
4 | 3 | ||
5 | all: lguest | 4 | all: lguest |
6 | 5 | ||
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index d36fcc0f2715..9ebcd6ef361b 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <sys/types.h> | 16 | #include <sys/types.h> |
17 | #include <sys/stat.h> | 17 | #include <sys/stat.h> |
18 | #include <sys/wait.h> | 18 | #include <sys/wait.h> |
19 | #include <sys/eventfd.h> | ||
19 | #include <fcntl.h> | 20 | #include <fcntl.h> |
20 | #include <stdbool.h> | 21 | #include <stdbool.h> |
21 | #include <errno.h> | 22 | #include <errno.h> |
@@ -59,7 +60,6 @@ typedef uint8_t u8; | |||
59 | /*:*/ | 60 | /*:*/ |
60 | 61 | ||
61 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ | 62 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ |
62 | #define NET_PEERNUM 1 | ||
63 | #define BRIDGE_PFX "bridge:" | 63 | #define BRIDGE_PFX "bridge:" |
64 | #ifndef SIOCBRADDIF | 64 | #ifndef SIOCBRADDIF |
65 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ | 65 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ |
@@ -76,19 +76,12 @@ static bool verbose; | |||
76 | do { if (verbose) printf(args); } while(0) | 76 | do { if (verbose) printf(args); } while(0) |
77 | /*:*/ | 77 | /*:*/ |
78 | 78 | ||
79 | /* File descriptors for the Waker. */ | ||
80 | struct { | ||
81 | int pipe[2]; | ||
82 | int lguest_fd; | ||
83 | } waker_fds; | ||
84 | |||
85 | /* The pointer to the start of guest memory. */ | 79 | /* The pointer to the start of guest memory. */ |
86 | static void *guest_base; | 80 | static void *guest_base; |
87 | /* The maximum guest physical address allowed, and maximum possible. */ | 81 | /* The maximum guest physical address allowed, and maximum possible. */ |
88 | static unsigned long guest_limit, guest_max; | 82 | static unsigned long guest_limit, guest_max; |
89 | /* The pipe for signal hander to write to. */ | 83 | /* The /dev/lguest file descriptor. */ |
90 | static int timeoutpipe[2]; | 84 | static int lguest_fd; |
91 | static unsigned int timeout_usec = 500; | ||
92 | 85 | ||
93 | /* a per-cpu variable indicating whose vcpu is currently running */ | 86 | /* a per-cpu variable indicating whose vcpu is currently running */ |
94 | static unsigned int __thread cpu_id; | 87 | static unsigned int __thread cpu_id; |
@@ -96,11 +89,6 @@ static unsigned int __thread cpu_id; | |||
96 | /* This is our list of devices. */ | 89 | /* This is our list of devices. */ |
97 | struct device_list | 90 | struct device_list |
98 | { | 91 | { |
99 | /* Summary information about the devices in our list: ready to pass to | ||
100 | * select() to ask which need servicing.*/ | ||
101 | fd_set infds; | ||
102 | int max_infd; | ||
103 | |||
104 | /* Counter to assign interrupt numbers. */ | 92 | /* Counter to assign interrupt numbers. */ |
105 | unsigned int next_irq; | 93 | unsigned int next_irq; |
106 | 94 | ||
@@ -126,22 +114,21 @@ struct device | |||
126 | /* The linked-list pointer. */ | 114 | /* The linked-list pointer. */ |
127 | struct device *next; | 115 | struct device *next; |
128 | 116 | ||
129 | /* The this device's descriptor, as mapped into the Guest. */ | 117 | /* The device's descriptor, as mapped into the Guest. */ |
130 | struct lguest_device_desc *desc; | 118 | struct lguest_device_desc *desc; |
131 | 119 | ||
120 | /* We can't trust desc values once Guest has booted: we use these. */ | ||
121 | unsigned int feature_len; | ||
122 | unsigned int num_vq; | ||
123 | |||
132 | /* The name of this device, for --verbose. */ | 124 | /* The name of this device, for --verbose. */ |
133 | const char *name; | 125 | const char *name; |
134 | 126 | ||
135 | /* If handle_input is set, it wants to be called when this file | ||
136 | * descriptor is ready. */ | ||
137 | int fd; | ||
138 | bool (*handle_input)(int fd, struct device *me); | ||
139 | |||
140 | /* Any queues attached to this device */ | 127 | /* Any queues attached to this device */ |
141 | struct virtqueue *vq; | 128 | struct virtqueue *vq; |
142 | 129 | ||
143 | /* Handle status being finalized (ie. feature bits stable). */ | 130 | /* Is it operational */ |
144 | void (*ready)(struct device *me); | 131 | bool running; |
145 | 132 | ||
146 | /* Device-specific data. */ | 133 | /* Device-specific data. */ |
147 | void *priv; | 134 | void *priv; |
@@ -164,22 +151,28 @@ struct virtqueue | |||
164 | /* Last available index we saw. */ | 151 | /* Last available index we saw. */ |
165 | u16 last_avail_idx; | 152 | u16 last_avail_idx; |
166 | 153 | ||
167 | /* The routine to call when the Guest pings us, or timeout. */ | 154 | /* How many are used since we sent last irq? */ |
168 | void (*handle_output)(int fd, struct virtqueue *me, bool timeout); | 155 | unsigned int pending_used; |
169 | 156 | ||
170 | /* Outstanding buffers */ | 157 | /* Eventfd where Guest notifications arrive. */ |
171 | unsigned int inflight; | 158 | int eventfd; |
172 | 159 | ||
173 | /* Is this blocked awaiting a timer? */ | 160 | /* Function for the thread which is servicing this virtqueue. */ |
174 | bool blocked; | 161 | void (*service)(struct virtqueue *vq); |
162 | pid_t thread; | ||
175 | }; | 163 | }; |
176 | 164 | ||
177 | /* Remember the arguments to the program so we can "reboot" */ | 165 | /* Remember the arguments to the program so we can "reboot" */ |
178 | static char **main_args; | 166 | static char **main_args; |
179 | 167 | ||
180 | /* Since guest is UP and we don't run at the same time, we don't need barriers. | 168 | /* The original tty settings to restore on exit. */ |
181 | * But I include them in the code in case others copy it. */ | 169 | static struct termios orig_term; |
182 | #define wmb() | 170 | |
171 | /* We have to be careful with barriers: our devices are all run in separate | ||
172 | * threads and so we need to make sure that changes visible to the Guest happen | ||
173 | * in precise order. */ | ||
174 | #define wmb() __asm__ __volatile__("" : : : "memory") | ||
175 | #define mb() __asm__ __volatile__("" : : : "memory") | ||
183 | 176 | ||
184 | /* Convert an iovec element to the given type. | 177 | /* Convert an iovec element to the given type. |
185 | * | 178 | * |
@@ -245,7 +238,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) | |||
245 | static u8 *get_feature_bits(struct device *dev) | 238 | static u8 *get_feature_bits(struct device *dev) |
246 | { | 239 | { |
247 | return (u8 *)(dev->desc + 1) | 240 | return (u8 *)(dev->desc + 1) |
248 | + dev->desc->num_vq * sizeof(struct lguest_vqconfig); | 241 | + dev->num_vq * sizeof(struct lguest_vqconfig); |
249 | } | 242 | } |
250 | 243 | ||
251 | /*L:100 The Launcher code itself takes us out into userspace, that scary place | 244 | /*L:100 The Launcher code itself takes us out into userspace, that scary place |
@@ -505,99 +498,19 @@ static void concat(char *dst, char *args[]) | |||
505 | * saw the arguments it expects when we looked at initialize() in lguest_user.c: | 498 | * saw the arguments it expects when we looked at initialize() in lguest_user.c: |
506 | * the base of Guest "physical" memory, the top physical page to allow and the | 499 | * the base of Guest "physical" memory, the top physical page to allow and the |
507 | * entry point for the Guest. */ | 500 | * entry point for the Guest. */ |
508 | static int tell_kernel(unsigned long start) | 501 | static void tell_kernel(unsigned long start) |
509 | { | 502 | { |
510 | unsigned long args[] = { LHREQ_INITIALIZE, | 503 | unsigned long args[] = { LHREQ_INITIALIZE, |
511 | (unsigned long)guest_base, | 504 | (unsigned long)guest_base, |
512 | guest_limit / getpagesize(), start }; | 505 | guest_limit / getpagesize(), start }; |
513 | int fd; | ||
514 | |||
515 | verbose("Guest: %p - %p (%#lx)\n", | 506 | verbose("Guest: %p - %p (%#lx)\n", |
516 | guest_base, guest_base + guest_limit, guest_limit); | 507 | guest_base, guest_base + guest_limit, guest_limit); |
517 | fd = open_or_die("/dev/lguest", O_RDWR); | 508 | lguest_fd = open_or_die("/dev/lguest", O_RDWR); |
518 | if (write(fd, args, sizeof(args)) < 0) | 509 | if (write(lguest_fd, args, sizeof(args)) < 0) |
519 | err(1, "Writing to /dev/lguest"); | 510 | err(1, "Writing to /dev/lguest"); |
520 | |||
521 | /* We return the /dev/lguest file descriptor to control this Guest */ | ||
522 | return fd; | ||
523 | } | 511 | } |
524 | /*:*/ | 512 | /*:*/ |
525 | 513 | ||
526 | static void add_device_fd(int fd) | ||
527 | { | ||
528 | FD_SET(fd, &devices.infds); | ||
529 | if (fd > devices.max_infd) | ||
530 | devices.max_infd = fd; | ||
531 | } | ||
532 | |||
533 | /*L:200 | ||
534 | * The Waker. | ||
535 | * | ||
536 | * With console, block and network devices, we can have lots of input which we | ||
537 | * need to process. We could try to tell the kernel what file descriptors to | ||
538 | * watch, but handing a file descriptor mask through to the kernel is fairly | ||
539 | * icky. | ||
540 | * | ||
541 | * Instead, we clone off a thread which watches the file descriptors and writes | ||
542 | * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host | ||
543 | * stop running the Guest. This causes the Launcher to return from the | ||
544 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset | ||
545 | * the LHREQ_BREAK and wake us up again. | ||
546 | * | ||
547 | * This, of course, is merely a different *kind* of icky. | ||
548 | * | ||
549 | * Given my well-known antipathy to threads, I'd prefer to use processes. But | ||
550 | * it's easier to share Guest memory with threads, and trivial to share the | ||
551 | * devices.infds as the Launcher changes it. | ||
552 | */ | ||
553 | static int waker(void *unused) | ||
554 | { | ||
555 | /* Close the write end of the pipe: only the Launcher has it open. */ | ||
556 | close(waker_fds.pipe[1]); | ||
557 | |||
558 | for (;;) { | ||
559 | fd_set rfds = devices.infds; | ||
560 | unsigned long args[] = { LHREQ_BREAK, 1 }; | ||
561 | unsigned int maxfd = devices.max_infd; | ||
562 | |||
563 | /* We also listen to the pipe from the Launcher. */ | ||
564 | FD_SET(waker_fds.pipe[0], &rfds); | ||
565 | if (waker_fds.pipe[0] > maxfd) | ||
566 | maxfd = waker_fds.pipe[0]; | ||
567 | |||
568 | /* Wait until input is ready from one of the devices. */ | ||
569 | select(maxfd+1, &rfds, NULL, NULL, NULL); | ||
570 | |||
571 | /* Message from Launcher? */ | ||
572 | if (FD_ISSET(waker_fds.pipe[0], &rfds)) { | ||
573 | char c; | ||
574 | /* If this fails, then assume Launcher has exited. | ||
575 | * Don't do anything on exit: we're just a thread! */ | ||
576 | if (read(waker_fds.pipe[0], &c, 1) != 1) | ||
577 | _exit(0); | ||
578 | continue; | ||
579 | } | ||
580 | |||
581 | /* Send LHREQ_BREAK command to snap the Launcher out of it. */ | ||
582 | pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id); | ||
583 | } | ||
584 | return 0; | ||
585 | } | ||
586 | |||
587 | /* This routine just sets up a pipe to the Waker process. */ | ||
588 | static void setup_waker(int lguest_fd) | ||
589 | { | ||
590 | /* This pipe is closed when Launcher dies, telling Waker. */ | ||
591 | if (pipe(waker_fds.pipe) != 0) | ||
592 | err(1, "Creating pipe for Waker"); | ||
593 | |||
594 | /* Waker also needs to know the lguest fd */ | ||
595 | waker_fds.lguest_fd = lguest_fd; | ||
596 | |||
597 | if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) | ||
598 | err(1, "Creating Waker"); | ||
599 | } | ||
600 | |||
601 | /* | 514 | /* |
602 | * Device Handling. | 515 | * Device Handling. |
603 | * | 516 | * |
@@ -623,49 +536,90 @@ static void *_check_pointer(unsigned long addr, unsigned int size, | |||
623 | /* Each buffer in the virtqueues is actually a chain of descriptors. This | 536 | /* Each buffer in the virtqueues is actually a chain of descriptors. This |
624 | * function returns the next descriptor in the chain, or vq->vring.num if we're | 537 | * function returns the next descriptor in the chain, or vq->vring.num if we're |
625 | * at the end. */ | 538 | * at the end. */ |
626 | static unsigned next_desc(struct virtqueue *vq, unsigned int i) | 539 | static unsigned next_desc(struct vring_desc *desc, |
540 | unsigned int i, unsigned int max) | ||
627 | { | 541 | { |
628 | unsigned int next; | 542 | unsigned int next; |
629 | 543 | ||
630 | /* If this descriptor says it doesn't chain, we're done. */ | 544 | /* If this descriptor says it doesn't chain, we're done. */ |
631 | if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) | 545 | if (!(desc[i].flags & VRING_DESC_F_NEXT)) |
632 | return vq->vring.num; | 546 | return max; |
633 | 547 | ||
634 | /* Check they're not leading us off end of descriptors. */ | 548 | /* Check they're not leading us off end of descriptors. */ |
635 | next = vq->vring.desc[i].next; | 549 | next = desc[i].next; |
636 | /* Make sure compiler knows to grab that: we don't want it changing! */ | 550 | /* Make sure compiler knows to grab that: we don't want it changing! */ |
637 | wmb(); | 551 | wmb(); |
638 | 552 | ||
639 | if (next >= vq->vring.num) | 553 | if (next >= max) |
640 | errx(1, "Desc next is %u", next); | 554 | errx(1, "Desc next is %u", next); |
641 | 555 | ||
642 | return next; | 556 | return next; |
643 | } | 557 | } |
644 | 558 | ||
559 | /* This actually sends the interrupt for this virtqueue */ | ||
560 | static void trigger_irq(struct virtqueue *vq) | ||
561 | { | ||
562 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; | ||
563 | |||
564 | /* Don't inform them if nothing used. */ | ||
565 | if (!vq->pending_used) | ||
566 | return; | ||
567 | vq->pending_used = 0; | ||
568 | |||
569 | /* If they don't want an interrupt, don't send one, unless empty. */ | ||
570 | if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) | ||
571 | && lg_last_avail(vq) != vq->vring.avail->idx) | ||
572 | return; | ||
573 | |||
574 | /* Send the Guest an interrupt tell them we used something up. */ | ||
575 | if (write(lguest_fd, buf, sizeof(buf)) != 0) | ||
576 | err(1, "Triggering irq %i", vq->config.irq); | ||
577 | } | ||
578 | |||
645 | /* This looks in the virtqueue and for the first available buffer, and converts | 579 | /* This looks in the virtqueue and for the first available buffer, and converts |
646 | * it to an iovec for convenient access. Since descriptors consist of some | 580 | * it to an iovec for convenient access. Since descriptors consist of some |
647 | * number of output then some number of input descriptors, it's actually two | 581 | * number of output then some number of input descriptors, it's actually two |
648 | * iovecs, but we pack them into one and note how many of each there were. | 582 | * iovecs, but we pack them into one and note how many of each there were. |
649 | * | 583 | * |
650 | * This function returns the descriptor number found, or vq->vring.num (which | 584 | * This function returns the descriptor number found. */ |
651 | * is never a valid descriptor number) if none was found. */ | 585 | static unsigned wait_for_vq_desc(struct virtqueue *vq, |
652 | static unsigned get_vq_desc(struct virtqueue *vq, | 586 | struct iovec iov[], |
653 | struct iovec iov[], | 587 | unsigned int *out_num, unsigned int *in_num) |
654 | unsigned int *out_num, unsigned int *in_num) | ||
655 | { | 588 | { |
656 | unsigned int i, head; | 589 | unsigned int i, head, max; |
657 | u16 last_avail; | 590 | struct vring_desc *desc; |
591 | u16 last_avail = lg_last_avail(vq); | ||
592 | |||
593 | while (last_avail == vq->vring.avail->idx) { | ||
594 | u64 event; | ||
595 | |||
596 | /* OK, tell Guest about progress up to now. */ | ||
597 | trigger_irq(vq); | ||
598 | |||
599 | /* OK, now we need to know about added descriptors. */ | ||
600 | vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
601 | |||
602 | /* They could have slipped one in as we were doing that: make | ||
603 | * sure it's written, then check again. */ | ||
604 | mb(); | ||
605 | if (last_avail != vq->vring.avail->idx) { | ||
606 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
607 | break; | ||
608 | } | ||
609 | |||
610 | /* Nothing new? Wait for eventfd to tell us they refilled. */ | ||
611 | if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) | ||
612 | errx(1, "Event read failed?"); | ||
613 | |||
614 | /* We don't need to be notified again. */ | ||
615 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
616 | } | ||
658 | 617 | ||
659 | /* Check it isn't doing very strange things with descriptor numbers. */ | 618 | /* Check it isn't doing very strange things with descriptor numbers. */ |
660 | last_avail = lg_last_avail(vq); | ||
661 | if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) | 619 | if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) |
662 | errx(1, "Guest moved used index from %u to %u", | 620 | errx(1, "Guest moved used index from %u to %u", |
663 | last_avail, vq->vring.avail->idx); | 621 | last_avail, vq->vring.avail->idx); |
664 | 622 | ||
665 | /* If there's nothing new since last we looked, return invalid. */ | ||
666 | if (vq->vring.avail->idx == last_avail) | ||
667 | return vq->vring.num; | ||
668 | |||
669 | /* Grab the next descriptor number they're advertising, and increment | 623 | /* Grab the next descriptor number they're advertising, and increment |
670 | * the index we've seen. */ | 624 | * the index we've seen. */ |
671 | head = vq->vring.avail->ring[last_avail % vq->vring.num]; | 625 | head = vq->vring.avail->ring[last_avail % vq->vring.num]; |
@@ -678,15 +632,28 @@ static unsigned get_vq_desc(struct virtqueue *vq, | |||
678 | /* When we start there are none of either input nor output. */ | 632 | /* When we start there are none of either input nor output. */ |
679 | *out_num = *in_num = 0; | 633 | *out_num = *in_num = 0; |
680 | 634 | ||
635 | max = vq->vring.num; | ||
636 | desc = vq->vring.desc; | ||
681 | i = head; | 637 | i = head; |
638 | |||
639 | /* If this is an indirect entry, then this buffer contains a descriptor | ||
640 | * table which we handle as if it's any normal descriptor chain. */ | ||
641 | if (desc[i].flags & VRING_DESC_F_INDIRECT) { | ||
642 | if (desc[i].len % sizeof(struct vring_desc)) | ||
643 | errx(1, "Invalid size for indirect buffer table"); | ||
644 | |||
645 | max = desc[i].len / sizeof(struct vring_desc); | ||
646 | desc = check_pointer(desc[i].addr, desc[i].len); | ||
647 | i = 0; | ||
648 | } | ||
649 | |||
682 | do { | 650 | do { |
683 | /* Grab the first descriptor, and check it's OK. */ | 651 | /* Grab the first descriptor, and check it's OK. */ |
684 | iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; | 652 | iov[*out_num + *in_num].iov_len = desc[i].len; |
685 | iov[*out_num + *in_num].iov_base | 653 | iov[*out_num + *in_num].iov_base |
686 | = check_pointer(vq->vring.desc[i].addr, | 654 | = check_pointer(desc[i].addr, desc[i].len); |
687 | vq->vring.desc[i].len); | ||
688 | /* If this is an input descriptor, increment that count. */ | 655 | /* If this is an input descriptor, increment that count. */ |
689 | if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) | 656 | if (desc[i].flags & VRING_DESC_F_WRITE) |
690 | (*in_num)++; | 657 | (*in_num)++; |
691 | else { | 658 | else { |
692 | /* If it's an output descriptor, they're all supposed | 659 | /* If it's an output descriptor, they're all supposed |
@@ -697,11 +664,10 @@ static unsigned get_vq_desc(struct virtqueue *vq, | |||
697 | } | 664 | } |
698 | 665 | ||
699 | /* If we've got too many, that implies a descriptor loop. */ | 666 | /* If we've got too many, that implies a descriptor loop. */ |
700 | if (*out_num + *in_num > vq->vring.num) | 667 | if (*out_num + *in_num > max) |
701 | errx(1, "Looped descriptor"); | 668 | errx(1, "Looped descriptor"); |
702 | } while ((i = next_desc(vq, i)) != vq->vring.num); | 669 | } while ((i = next_desc(desc, i, max)) != max); |
703 | 670 | ||
704 | vq->inflight++; | ||
705 | return head; | 671 | return head; |
706 | } | 672 | } |
707 | 673 | ||
@@ -719,44 +685,20 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) | |||
719 | /* Make sure buffer is written before we update index. */ | 685 | /* Make sure buffer is written before we update index. */ |
720 | wmb(); | 686 | wmb(); |
721 | vq->vring.used->idx++; | 687 | vq->vring.used->idx++; |
722 | vq->inflight--; | 688 | vq->pending_used++; |
723 | } | ||
724 | |||
725 | /* This actually sends the interrupt for this virtqueue */ | ||
726 | static void trigger_irq(int fd, struct virtqueue *vq) | ||
727 | { | ||
728 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; | ||
729 | |||
730 | /* If they don't want an interrupt, don't send one, unless empty. */ | ||
731 | if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) | ||
732 | && vq->inflight) | ||
733 | return; | ||
734 | |||
735 | /* Send the Guest an interrupt tell them we used something up. */ | ||
736 | if (write(fd, buf, sizeof(buf)) != 0) | ||
737 | err(1, "Triggering irq %i", vq->config.irq); | ||
738 | } | 689 | } |
739 | 690 | ||
740 | /* And here's the combo meal deal. Supersize me! */ | 691 | /* And here's the combo meal deal. Supersize me! */ |
741 | static void add_used_and_trigger(int fd, struct virtqueue *vq, | 692 | static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) |
742 | unsigned int head, int len) | ||
743 | { | 693 | { |
744 | add_used(vq, head, len); | 694 | add_used(vq, head, len); |
745 | trigger_irq(fd, vq); | 695 | trigger_irq(vq); |
746 | } | 696 | } |
747 | 697 | ||
748 | /* | 698 | /* |
749 | * The Console | 699 | * The Console |
750 | * | 700 | * |
751 | * Here is the input terminal setting we save, and the routine to restore them | 701 | * We associate some data with the console for our exit hack. */ |
752 | * on exit so the user gets their terminal back. */ | ||
753 | static struct termios orig_term; | ||
754 | static void restore_term(void) | ||
755 | { | ||
756 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); | ||
757 | } | ||
758 | |||
759 | /* We associate some data with the console for our exit hack. */ | ||
760 | struct console_abort | 702 | struct console_abort |
761 | { | 703 | { |
762 | /* How many times have they hit ^C? */ | 704 | /* How many times have they hit ^C? */ |
@@ -766,276 +708,275 @@ struct console_abort | |||
766 | }; | 708 | }; |
767 | 709 | ||
768 | /* This is the routine which handles console input (ie. stdin). */ | 710 | /* This is the routine which handles console input (ie. stdin). */ |
769 | static bool handle_console_input(int fd, struct device *dev) | 711 | static void console_input(struct virtqueue *vq) |
770 | { | 712 | { |
771 | int len; | 713 | int len; |
772 | unsigned int head, in_num, out_num; | 714 | unsigned int head, in_num, out_num; |
773 | struct iovec iov[dev->vq->vring.num]; | 715 | struct console_abort *abort = vq->dev->priv; |
774 | struct console_abort *abort = dev->priv; | 716 | struct iovec iov[vq->vring.num]; |
775 | |||
776 | /* First we need a console buffer from the Guests's input virtqueue. */ | ||
777 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | ||
778 | |||
779 | /* If they're not ready for input, stop listening to this file | ||
780 | * descriptor. We'll start again once they add an input buffer. */ | ||
781 | if (head == dev->vq->vring.num) | ||
782 | return false; | ||
783 | 717 | ||
718 | /* Make sure there's a descriptor waiting. */ | ||
719 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); | ||
784 | if (out_num) | 720 | if (out_num) |
785 | errx(1, "Output buffers in console in queue?"); | 721 | errx(1, "Output buffers in console in queue?"); |
786 | 722 | ||
787 | /* This is why we convert to iovecs: the readv() call uses them, and so | 723 | /* Read it in. */ |
788 | * it reads straight into the Guest's buffer. */ | 724 | len = readv(STDIN_FILENO, iov, in_num); |
789 | len = readv(dev->fd, iov, in_num); | ||
790 | if (len <= 0) { | 725 | if (len <= 0) { |
791 | /* This implies that the console is closed, is /dev/null, or | 726 | /* Ran out of input? */ |
792 | * something went terribly wrong. */ | ||
793 | warnx("Failed to get console input, ignoring console."); | 727 | warnx("Failed to get console input, ignoring console."); |
794 | /* Put the input terminal back. */ | 728 | /* For simplicity, dying threads kill the whole Launcher. So |
795 | restore_term(); | 729 | * just nap here. */ |
796 | /* Remove callback from input vq, so it doesn't restart us. */ | 730 | for (;;) |
797 | dev->vq->handle_output = NULL; | 731 | pause(); |
798 | /* Stop listening to this fd: don't call us again. */ | ||
799 | return false; | ||
800 | } | 732 | } |
801 | 733 | ||
802 | /* Tell the Guest about the new input. */ | 734 | add_used_and_trigger(vq, head, len); |
803 | add_used_and_trigger(fd, dev->vq, head, len); | ||
804 | 735 | ||
805 | /* Three ^C within one second? Exit. | 736 | /* Three ^C within one second? Exit. |
806 | * | 737 | * |
807 | * This is such a hack, but works surprisingly well. Each ^C has to be | 738 | * This is such a hack, but works surprisingly well. Each ^C has to |
808 | * in a buffer by itself, so they can't be too fast. But we check that | 739 | * be in a buffer by itself, so they can't be too fast. But we check |
809 | * we get three within about a second, so they can't be too slow. */ | 740 | * that we get three within about a second, so they can't be too |
810 | if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { | 741 | * slow. */ |
811 | if (!abort->count++) | 742 | if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { |
812 | gettimeofday(&abort->start, NULL); | ||
813 | else if (abort->count == 3) { | ||
814 | struct timeval now; | ||
815 | gettimeofday(&now, NULL); | ||
816 | if (now.tv_sec <= abort->start.tv_sec+1) { | ||
817 | unsigned long args[] = { LHREQ_BREAK, 0 }; | ||
818 | /* Close the fd so Waker will know it has to | ||
819 | * exit. */ | ||
820 | close(waker_fds.pipe[1]); | ||
821 | /* Just in case Waker is blocked in BREAK, send | ||
822 | * unbreak now. */ | ||
823 | write(fd, args, sizeof(args)); | ||
824 | exit(2); | ||
825 | } | ||
826 | abort->count = 0; | ||
827 | } | ||
828 | } else | ||
829 | /* Any other key resets the abort counter. */ | ||
830 | abort->count = 0; | 743 | abort->count = 0; |
744 | return; | ||
745 | } | ||
831 | 746 | ||
832 | /* Everything went OK! */ | 747 | abort->count++; |
833 | return true; | 748 | if (abort->count == 1) |
749 | gettimeofday(&abort->start, NULL); | ||
750 | else if (abort->count == 3) { | ||
751 | struct timeval now; | ||
752 | gettimeofday(&now, NULL); | ||
753 | /* Kill all Launcher processes with SIGINT, like normal ^C */ | ||
754 | if (now.tv_sec <= abort->start.tv_sec+1) | ||
755 | kill(0, SIGINT); | ||
756 | abort->count = 0; | ||
757 | } | ||
834 | } | 758 | } |
835 | 759 | ||
836 | /* Handling output for console is simple: we just get all the output buffers | 760 | /* This is the routine which handles console output (ie. stdout). */ |
837 | * and write them to stdout. */ | 761 | static void console_output(struct virtqueue *vq) |
838 | static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) | ||
839 | { | 762 | { |
840 | unsigned int head, out, in; | 763 | unsigned int head, out, in; |
841 | int len; | ||
842 | struct iovec iov[vq->vring.num]; | 764 | struct iovec iov[vq->vring.num]; |
843 | 765 | ||
844 | /* Keep getting output buffers from the Guest until we run out. */ | 766 | head = wait_for_vq_desc(vq, iov, &out, &in); |
845 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | 767 | if (in) |
846 | if (in) | 768 | errx(1, "Input buffers in console output queue?"); |
847 | errx(1, "Input buffers in output queue?"); | 769 | while (!iov_empty(iov, out)) { |
848 | len = writev(STDOUT_FILENO, iov, out); | 770 | int len = writev(STDOUT_FILENO, iov, out); |
849 | add_used_and_trigger(fd, vq, head, len); | 771 | if (len <= 0) |
772 | err(1, "Write to stdout gave %i", len); | ||
773 | iov_consume(iov, out, len); | ||
850 | } | 774 | } |
851 | } | 775 | add_used(vq, head, 0); |
852 | |||
853 | /* This is called when we no longer want to hear about Guest changes to a | ||
854 | * virtqueue. This is more efficient in high-traffic cases, but it means we | ||
855 | * have to set a timer to check if any more changes have occurred. */ | ||
856 | static void block_vq(struct virtqueue *vq) | ||
857 | { | ||
858 | struct itimerval itm; | ||
859 | |||
860 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
861 | vq->blocked = true; | ||
862 | |||
863 | itm.it_interval.tv_sec = 0; | ||
864 | itm.it_interval.tv_usec = 0; | ||
865 | itm.it_value.tv_sec = 0; | ||
866 | itm.it_value.tv_usec = timeout_usec; | ||
867 | |||
868 | setitimer(ITIMER_REAL, &itm, NULL); | ||
869 | } | 776 | } |
870 | 777 | ||
871 | /* | 778 | /* |
872 | * The Network | 779 | * The Network |
873 | * | 780 | * |
874 | * Handling output for network is also simple: we get all the output buffers | 781 | * Handling output for network is also simple: we get all the output buffers |
875 | * and write them (ignoring the first element) to this device's file descriptor | 782 | * and write them to /dev/net/tun. |
876 | * (/dev/net/tun). | ||
877 | */ | 783 | */ |
878 | static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) | 784 | struct net_info { |
785 | int tunfd; | ||
786 | }; | ||
787 | |||
788 | static void net_output(struct virtqueue *vq) | ||
879 | { | 789 | { |
880 | unsigned int head, out, in, num = 0; | 790 | struct net_info *net_info = vq->dev->priv; |
881 | int len; | 791 | unsigned int head, out, in; |
882 | struct iovec iov[vq->vring.num]; | 792 | struct iovec iov[vq->vring.num]; |
883 | static int last_timeout_num; | ||
884 | |||
885 | /* Keep getting output buffers from the Guest until we run out. */ | ||
886 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | ||
887 | if (in) | ||
888 | errx(1, "Input buffers in output queue?"); | ||
889 | len = writev(vq->dev->fd, iov, out); | ||
890 | if (len < 0) | ||
891 | err(1, "Writing network packet to tun"); | ||
892 | add_used_and_trigger(fd, vq, head, len); | ||
893 | num++; | ||
894 | } | ||
895 | 793 | ||
896 | /* Block further kicks and set up a timer if we saw anything. */ | 794 | head = wait_for_vq_desc(vq, iov, &out, &in); |
897 | if (!timeout && num) | 795 | if (in) |
898 | block_vq(vq); | 796 | errx(1, "Input buffers in net output queue?"); |
899 | 797 | if (writev(net_info->tunfd, iov, out) < 0) | |
900 | /* We never quite know how long should we wait before we check the | 798 | errx(1, "Write to tun failed?"); |
901 | * queue again for more packets. We start at 500 microseconds, and if | 799 | add_used(vq, head, 0); |
902 | * we get fewer packets than last time, we assume we made the timeout | 800 | } |
903 | * too small and increase it by 10 microseconds. Otherwise, we drop it | 801 | |
904 | * by one microsecond every time. It seems to work well enough. */ | 802 | /* Will reading from this file descriptor block? */ |
905 | if (timeout) { | 803 | static bool will_block(int fd) |
906 | if (num < last_timeout_num) | 804 | { |
907 | timeout_usec += 10; | 805 | fd_set fdset; |
908 | else if (timeout_usec > 1) | 806 | struct timeval zero = { 0, 0 }; |
909 | timeout_usec--; | 807 | FD_ZERO(&fdset); |
910 | last_timeout_num = num; | 808 | FD_SET(fd, &fdset); |
911 | } | 809 | return select(fd+1, &fdset, NULL, NULL, &zero) != 1; |
912 | } | 810 | } |
913 | 811 | ||
914 | /* This is where we handle a packet coming in from the tun device to our | 812 | /* This is where we handle packets coming in from the tun device to our |
915 | * Guest. */ | 813 | * Guest. */ |
916 | static bool handle_tun_input(int fd, struct device *dev) | 814 | static void net_input(struct virtqueue *vq) |
917 | { | 815 | { |
918 | unsigned int head, in_num, out_num; | ||
919 | int len; | 816 | int len; |
920 | struct iovec iov[dev->vq->vring.num]; | 817 | unsigned int head, out, in; |
921 | 818 | struct iovec iov[vq->vring.num]; | |
922 | /* First we need a network buffer from the Guests's recv virtqueue. */ | 819 | struct net_info *net_info = vq->dev->priv; |
923 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | ||
924 | if (head == dev->vq->vring.num) { | ||
925 | /* Now, it's expected that if we try to send a packet too | ||
926 | * early, the Guest won't be ready yet. Wait until the device | ||
927 | * status says it's ready. */ | ||
928 | /* FIXME: Actually want DRIVER_ACTIVE here. */ | ||
929 | |||
930 | /* Now tell it we want to know if new things appear. */ | ||
931 | dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
932 | wmb(); | ||
933 | |||
934 | /* We'll turn this back on if input buffers are registered. */ | ||
935 | return false; | ||
936 | } else if (out_num) | ||
937 | errx(1, "Output buffers in network recv queue?"); | ||
938 | |||
939 | /* Read the packet from the device directly into the Guest's buffer. */ | ||
940 | len = readv(dev->fd, iov, in_num); | ||
941 | if (len <= 0) | ||
942 | err(1, "reading network"); | ||
943 | 820 | ||
944 | /* Tell the Guest about the new packet. */ | 821 | head = wait_for_vq_desc(vq, iov, &out, &in); |
945 | add_used_and_trigger(fd, dev->vq, head, len); | 822 | if (out) |
823 | errx(1, "Output buffers in net input queue?"); | ||
946 | 824 | ||
947 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, | 825 | /* Deliver interrupt now, since we're about to sleep. */ |
948 | ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], | 826 | if (vq->pending_used && will_block(net_info->tunfd)) |
949 | head != dev->vq->vring.num ? "sent" : "discarded"); | 827 | trigger_irq(vq); |
950 | 828 | ||
951 | /* All good. */ | 829 | len = readv(net_info->tunfd, iov, in); |
952 | return true; | 830 | if (len <= 0) |
831 | err(1, "Failed to read from tun."); | ||
832 | add_used(vq, head, len); | ||
953 | } | 833 | } |
954 | 834 | ||
955 | /*L:215 This is the callback attached to the network and console input | 835 | /* This is the helper to create threads. */ |
956 | * virtqueues: it ensures we try again, in case we stopped console or net | 836 | static int do_thread(void *_vq) |
957 | * delivery because Guest didn't have any buffers. */ | ||
958 | static void enable_fd(int fd, struct virtqueue *vq, bool timeout) | ||
959 | { | 837 | { |
960 | add_device_fd(vq->dev->fd); | 838 | struct virtqueue *vq = _vq; |
961 | /* Snap the Waker out of its select loop. */ | 839 | |
962 | write(waker_fds.pipe[1], "", 1); | 840 | for (;;) |
841 | vq->service(vq); | ||
842 | return 0; | ||
963 | } | 843 | } |
964 | 844 | ||
965 | static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) | 845 | /* When a child dies, we kill our entire process group with SIGTERM. This |
846 | * also has the side effect that the shell restores the console for us! */ | ||
847 | static void kill_launcher(int signal) | ||
966 | { | 848 | { |
967 | /* We don't need to know again when Guest refills receive buffer. */ | 849 | kill(0, SIGTERM); |
968 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
969 | enable_fd(fd, vq, timeout); | ||
970 | } | 850 | } |
971 | 851 | ||
972 | /* When the Guest tells us they updated the status field, we handle it. */ | 852 | static void reset_device(struct device *dev) |
973 | static void update_device_status(struct device *dev) | ||
974 | { | 853 | { |
975 | struct virtqueue *vq; | 854 | struct virtqueue *vq; |
976 | 855 | ||
977 | /* This is a reset. */ | 856 | verbose("Resetting device %s\n", dev->name); |
978 | if (dev->desc->status == 0) { | ||
979 | verbose("Resetting device %s\n", dev->name); | ||
980 | 857 | ||
981 | /* Clear any features they've acked. */ | 858 | /* Clear any features they've acked. */ |
982 | memset(get_feature_bits(dev) + dev->desc->feature_len, 0, | 859 | memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); |
983 | dev->desc->feature_len); | ||
984 | 860 | ||
985 | /* Zero out the virtqueues. */ | 861 | /* We're going to be explicitly killing threads, so ignore them. */ |
986 | for (vq = dev->vq; vq; vq = vq->next) { | 862 | signal(SIGCHLD, SIG_IGN); |
987 | memset(vq->vring.desc, 0, | 863 | |
988 | vring_size(vq->config.num, LGUEST_VRING_ALIGN)); | 864 | /* Zero out the virtqueues, get rid of their threads */ |
989 | lg_last_avail(vq) = 0; | 865 | for (vq = dev->vq; vq; vq = vq->next) { |
866 | if (vq->thread != (pid_t)-1) { | ||
867 | kill(vq->thread, SIGTERM); | ||
868 | waitpid(vq->thread, NULL, 0); | ||
869 | vq->thread = (pid_t)-1; | ||
990 | } | 870 | } |
991 | } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { | 871 | memset(vq->vring.desc, 0, |
872 | vring_size(vq->config.num, LGUEST_VRING_ALIGN)); | ||
873 | lg_last_avail(vq) = 0; | ||
874 | } | ||
875 | dev->running = false; | ||
876 | |||
877 | /* Now we care if threads die. */ | ||
878 | signal(SIGCHLD, (void *)kill_launcher); | ||
879 | } | ||
880 | |||
881 | static void create_thread(struct virtqueue *vq) | ||
882 | { | ||
883 | /* Create stack for thread and run it. Since stack grows | ||
884 | * upwards, we point the stack pointer to the end of this | ||
885 | * region. */ | ||
886 | char *stack = malloc(32768); | ||
887 | unsigned long args[] = { LHREQ_EVENTFD, | ||
888 | vq->config.pfn*getpagesize(), 0 }; | ||
889 | |||
890 | /* Create a zero-initialized eventfd. */ | ||
891 | vq->eventfd = eventfd(0, 0); | ||
892 | if (vq->eventfd < 0) | ||
893 | err(1, "Creating eventfd"); | ||
894 | args[2] = vq->eventfd; | ||
895 | |||
896 | /* Attach an eventfd to this virtqueue: it will go off | ||
897 | * when the Guest does an LHCALL_NOTIFY for this vq. */ | ||
898 | if (write(lguest_fd, &args, sizeof(args)) != 0) | ||
899 | err(1, "Attaching eventfd"); | ||
900 | |||
901 | /* CLONE_VM: because it has to access the Guest memory, and | ||
902 | * SIGCHLD so we get a signal if it dies. */ | ||
903 | vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); | ||
904 | if (vq->thread == (pid_t)-1) | ||
905 | err(1, "Creating clone"); | ||
906 | /* We close our local copy, now the child has it. */ | ||
907 | close(vq->eventfd); | ||
908 | } | ||
909 | |||
910 | static void start_device(struct device *dev) | ||
911 | { | ||
912 | unsigned int i; | ||
913 | struct virtqueue *vq; | ||
914 | |||
915 | verbose("Device %s OK: offered", dev->name); | ||
916 | for (i = 0; i < dev->feature_len; i++) | ||
917 | verbose(" %02x", get_feature_bits(dev)[i]); | ||
918 | verbose(", accepted"); | ||
919 | for (i = 0; i < dev->feature_len; i++) | ||
920 | verbose(" %02x", get_feature_bits(dev) | ||
921 | [dev->feature_len+i]); | ||
922 | |||
923 | for (vq = dev->vq; vq; vq = vq->next) { | ||
924 | if (vq->service) | ||
925 | create_thread(vq); | ||
926 | } | ||
927 | dev->running = true; | ||
928 | } | ||
929 | |||
930 | static void cleanup_devices(void) | ||
931 | { | ||
932 | struct device *dev; | ||
933 | |||
934 | for (dev = devices.dev; dev; dev = dev->next) | ||
935 | reset_device(dev); | ||
936 | |||
937 | /* If we saved off the original terminal settings, restore them now. */ | ||
938 | if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) | ||
939 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); | ||
940 | } | ||
941 | |||
942 | /* When the Guest tells us they updated the status field, we handle it. */ | ||
943 | static void update_device_status(struct device *dev) | ||
944 | { | ||
945 | /* A zero status is a reset, otherwise it's a set of flags. */ | ||
946 | if (dev->desc->status == 0) | ||
947 | reset_device(dev); | ||
948 | else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { | ||
992 | warnx("Device %s configuration FAILED", dev->name); | 949 | warnx("Device %s configuration FAILED", dev->name); |
950 | if (dev->running) | ||
951 | reset_device(dev); | ||
993 | } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { | 952 | } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { |
994 | unsigned int i; | 953 | if (!dev->running) |
995 | 954 | start_device(dev); | |
996 | verbose("Device %s OK: offered", dev->name); | ||
997 | for (i = 0; i < dev->desc->feature_len; i++) | ||
998 | verbose(" %02x", get_feature_bits(dev)[i]); | ||
999 | verbose(", accepted"); | ||
1000 | for (i = 0; i < dev->desc->feature_len; i++) | ||
1001 | verbose(" %02x", get_feature_bits(dev) | ||
1002 | [dev->desc->feature_len+i]); | ||
1003 | |||
1004 | if (dev->ready) | ||
1005 | dev->ready(dev); | ||
1006 | } | 955 | } |
1007 | } | 956 | } |
1008 | 957 | ||
1009 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ | 958 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ |
1010 | static void handle_output(int fd, unsigned long addr) | 959 | static void handle_output(unsigned long addr) |
1011 | { | 960 | { |
1012 | struct device *i; | 961 | struct device *i; |
1013 | struct virtqueue *vq; | ||
1014 | 962 | ||
1015 | /* Check each device and virtqueue. */ | 963 | /* Check each device. */ |
1016 | for (i = devices.dev; i; i = i->next) { | 964 | for (i = devices.dev; i; i = i->next) { |
965 | struct virtqueue *vq; | ||
966 | |||
1017 | /* Notifications to device descriptors update device status. */ | 967 | /* Notifications to device descriptors update device status. */ |
1018 | if (from_guest_phys(addr) == i->desc) { | 968 | if (from_guest_phys(addr) == i->desc) { |
1019 | update_device_status(i); | 969 | update_device_status(i); |
1020 | return; | 970 | return; |
1021 | } | 971 | } |
1022 | 972 | ||
1023 | /* Notifications to virtqueues mean output has occurred. */ | 973 | /* Devices *can* be used before status is set to DRIVER_OK. */ |
1024 | for (vq = i->vq; vq; vq = vq->next) { | 974 | for (vq = i->vq; vq; vq = vq->next) { |
1025 | if (vq->config.pfn != addr/getpagesize()) | 975 | if (addr != vq->config.pfn*getpagesize()) |
1026 | continue; | 976 | continue; |
1027 | 977 | if (i->running) | |
1028 | /* Guest should acknowledge (and set features!) before | 978 | errx(1, "Notification on running %s", i->name); |
1029 | * using the device. */ | 979 | start_device(i); |
1030 | if (i->desc->status == 0) { | ||
1031 | warnx("%s gave early output", i->name); | ||
1032 | return; | ||
1033 | } | ||
1034 | |||
1035 | if (strcmp(vq->dev->name, "console") != 0) | ||
1036 | verbose("Output to %s\n", vq->dev->name); | ||
1037 | if (vq->handle_output) | ||
1038 | vq->handle_output(fd, vq, false); | ||
1039 | return; | 980 | return; |
1040 | } | 981 | } |
1041 | } | 982 | } |
@@ -1049,71 +990,6 @@ static void handle_output(int fd, unsigned long addr) | |||
1049 | strnlen(from_guest_phys(addr), guest_limit - addr)); | 990 | strnlen(from_guest_phys(addr), guest_limit - addr)); |
1050 | } | 991 | } |
1051 | 992 | ||
1052 | static void handle_timeout(int fd) | ||
1053 | { | ||
1054 | char buf[32]; | ||
1055 | struct device *i; | ||
1056 | struct virtqueue *vq; | ||
1057 | |||
1058 | /* Clear the pipe */ | ||
1059 | read(timeoutpipe[0], buf, sizeof(buf)); | ||
1060 | |||
1061 | /* Check each device and virtqueue: flush blocked ones. */ | ||
1062 | for (i = devices.dev; i; i = i->next) { | ||
1063 | for (vq = i->vq; vq; vq = vq->next) { | ||
1064 | if (!vq->blocked) | ||
1065 | continue; | ||
1066 | |||
1067 | vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
1068 | vq->blocked = false; | ||
1069 | if (vq->handle_output) | ||
1070 | vq->handle_output(fd, vq, true); | ||
1071 | } | ||
1072 | } | ||
1073 | } | ||
1074 | |||
1075 | /* This is called when the Waker wakes us up: check for incoming file | ||
1076 | * descriptors. */ | ||
1077 | static void handle_input(int fd) | ||
1078 | { | ||
1079 | /* select() wants a zeroed timeval to mean "don't wait". */ | ||
1080 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; | ||
1081 | |||
1082 | for (;;) { | ||
1083 | struct device *i; | ||
1084 | fd_set fds = devices.infds; | ||
1085 | int num; | ||
1086 | |||
1087 | num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); | ||
1088 | /* Could get interrupted */ | ||
1089 | if (num < 0) | ||
1090 | continue; | ||
1091 | /* If nothing is ready, we're done. */ | ||
1092 | if (num == 0) | ||
1093 | break; | ||
1094 | |||
1095 | /* Otherwise, call the device(s) which have readable file | ||
1096 | * descriptors and a method of handling them. */ | ||
1097 | for (i = devices.dev; i; i = i->next) { | ||
1098 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | ||
1099 | if (i->handle_input(fd, i)) | ||
1100 | continue; | ||
1101 | |||
1102 | /* If handle_input() returns false, it means we | ||
1103 | * should no longer service it. Networking and | ||
1104 | * console do this when there's no input | ||
1105 | * buffers to deliver into. Console also uses | ||
1106 | * it when it discovers that stdin is closed. */ | ||
1107 | FD_CLR(i->fd, &devices.infds); | ||
1108 | } | ||
1109 | } | ||
1110 | |||
1111 | /* Is this the timeout fd? */ | ||
1112 | if (FD_ISSET(timeoutpipe[0], &fds)) | ||
1113 | handle_timeout(fd); | ||
1114 | } | ||
1115 | } | ||
1116 | |||
1117 | /*L:190 | 993 | /*L:190 |
1118 | * Device Setup | 994 | * Device Setup |
1119 | * | 995 | * |
@@ -1129,8 +1005,8 @@ static void handle_input(int fd) | |||
1129 | static u8 *device_config(const struct device *dev) | 1005 | static u8 *device_config(const struct device *dev) |
1130 | { | 1006 | { |
1131 | return (void *)(dev->desc + 1) | 1007 | return (void *)(dev->desc + 1) |
1132 | + dev->desc->num_vq * sizeof(struct lguest_vqconfig) | 1008 | + dev->num_vq * sizeof(struct lguest_vqconfig) |
1133 | + dev->desc->feature_len * 2; | 1009 | + dev->feature_len * 2; |
1134 | } | 1010 | } |
1135 | 1011 | ||
1136 | /* This routine allocates a new "struct lguest_device_desc" from descriptor | 1012 | /* This routine allocates a new "struct lguest_device_desc" from descriptor |
@@ -1159,7 +1035,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) | |||
1159 | /* Each device descriptor is followed by the description of its virtqueues. We | 1035 | /* Each device descriptor is followed by the description of its virtqueues. We |
1160 | * specify how many descriptors the virtqueue is to have. */ | 1036 | * specify how many descriptors the virtqueue is to have. */ |
1161 | static void add_virtqueue(struct device *dev, unsigned int num_descs, | 1037 | static void add_virtqueue(struct device *dev, unsigned int num_descs, |
1162 | void (*handle_output)(int, struct virtqueue *, bool)) | 1038 | void (*service)(struct virtqueue *)) |
1163 | { | 1039 | { |
1164 | unsigned int pages; | 1040 | unsigned int pages; |
1165 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); | 1041 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); |
@@ -1174,8 +1050,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1174 | vq->next = NULL; | 1050 | vq->next = NULL; |
1175 | vq->last_avail_idx = 0; | 1051 | vq->last_avail_idx = 0; |
1176 | vq->dev = dev; | 1052 | vq->dev = dev; |
1177 | vq->inflight = 0; | 1053 | vq->service = service; |
1178 | vq->blocked = false; | 1054 | vq->thread = (pid_t)-1; |
1179 | 1055 | ||
1180 | /* Initialize the configuration. */ | 1056 | /* Initialize the configuration. */ |
1181 | vq->config.num = num_descs; | 1057 | vq->config.num = num_descs; |
@@ -1191,6 +1067,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1191 | * yet, otherwise we'd be overwriting them. */ | 1067 | * yet, otherwise we'd be overwriting them. */ |
1192 | assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); | 1068 | assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); |
1193 | memcpy(device_config(dev), &vq->config, sizeof(vq->config)); | 1069 | memcpy(device_config(dev), &vq->config, sizeof(vq->config)); |
1070 | dev->num_vq++; | ||
1194 | dev->desc->num_vq++; | 1071 | dev->desc->num_vq++; |
1195 | 1072 | ||
1196 | verbose("Virtqueue page %#lx\n", to_guest_phys(p)); | 1073 | verbose("Virtqueue page %#lx\n", to_guest_phys(p)); |
@@ -1199,15 +1076,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1199 | * second. */ | 1076 | * second. */ |
1200 | for (i = &dev->vq; *i; i = &(*i)->next); | 1077 | for (i = &dev->vq; *i; i = &(*i)->next); |
1201 | *i = vq; | 1078 | *i = vq; |
1202 | |||
1203 | /* Set the routine to call when the Guest does something to this | ||
1204 | * virtqueue. */ | ||
1205 | vq->handle_output = handle_output; | ||
1206 | |||
1207 | /* As an optimization, set the advisory "Don't Notify Me" flag if we | ||
1208 | * don't have a handler */ | ||
1209 | if (!handle_output) | ||
1210 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; | ||
1211 | } | 1079 | } |
1212 | 1080 | ||
1213 | /* The first half of the feature bitmask is for us to advertise features. The | 1081 | /* The first half of the feature bitmask is for us to advertise features. The |
@@ -1219,7 +1087,7 @@ static void add_feature(struct device *dev, unsigned bit) | |||
1219 | /* We can't extend the feature bits once we've added config bytes */ | 1087 | /* We can't extend the feature bits once we've added config bytes */ |
1220 | if (dev->desc->feature_len <= bit / CHAR_BIT) { | 1088 | if (dev->desc->feature_len <= bit / CHAR_BIT) { |
1221 | assert(dev->desc->config_len == 0); | 1089 | assert(dev->desc->config_len == 0); |
1222 | dev->desc->feature_len = (bit / CHAR_BIT) + 1; | 1090 | dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; |
1223 | } | 1091 | } |
1224 | 1092 | ||
1225 | features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); | 1093 | features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); |
@@ -1243,22 +1111,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf) | |||
1243 | * calling new_dev_desc() to allocate the descriptor and device memory. | 1111 | * calling new_dev_desc() to allocate the descriptor and device memory. |
1244 | * | 1112 | * |
1245 | * See what I mean about userspace being boring? */ | 1113 | * See what I mean about userspace being boring? */ |
1246 | static struct device *new_device(const char *name, u16 type, int fd, | 1114 | static struct device *new_device(const char *name, u16 type) |
1247 | bool (*handle_input)(int, struct device *)) | ||
1248 | { | 1115 | { |
1249 | struct device *dev = malloc(sizeof(*dev)); | 1116 | struct device *dev = malloc(sizeof(*dev)); |
1250 | 1117 | ||
1251 | /* Now we populate the fields one at a time. */ | 1118 | /* Now we populate the fields one at a time. */ |
1252 | dev->fd = fd; | ||
1253 | /* If we have an input handler for this file descriptor, then we add it | ||
1254 | * to the device_list's fdset and maxfd. */ | ||
1255 | if (handle_input) | ||
1256 | add_device_fd(dev->fd); | ||
1257 | dev->desc = new_dev_desc(type); | 1119 | dev->desc = new_dev_desc(type); |
1258 | dev->handle_input = handle_input; | ||
1259 | dev->name = name; | 1120 | dev->name = name; |
1260 | dev->vq = NULL; | 1121 | dev->vq = NULL; |
1261 | dev->ready = NULL; | 1122 | dev->feature_len = 0; |
1123 | dev->num_vq = 0; | ||
1124 | dev->running = false; | ||
1262 | 1125 | ||
1263 | /* Append to device list. Prepending to a single-linked list is | 1126 | /* Append to device list. Prepending to a single-linked list is |
1264 | * easier, but the user expects the devices to be arranged on the bus | 1127 | * easier, but the user expects the devices to be arranged on the bus |
@@ -1286,13 +1149,10 @@ static void setup_console(void) | |||
1286 | * raw input stream to the Guest. */ | 1149 | * raw input stream to the Guest. */ |
1287 | term.c_lflag &= ~(ISIG|ICANON|ECHO); | 1150 | term.c_lflag &= ~(ISIG|ICANON|ECHO); |
1288 | tcsetattr(STDIN_FILENO, TCSANOW, &term); | 1151 | tcsetattr(STDIN_FILENO, TCSANOW, &term); |
1289 | /* If we exit gracefully, the original settings will be | ||
1290 | * restored so the user can see what they're typing. */ | ||
1291 | atexit(restore_term); | ||
1292 | } | 1152 | } |
1293 | 1153 | ||
1294 | dev = new_device("console", VIRTIO_ID_CONSOLE, | 1154 | dev = new_device("console", VIRTIO_ID_CONSOLE); |
1295 | STDIN_FILENO, handle_console_input); | 1155 | |
1296 | /* We store the console state in dev->priv, and initialize it. */ | 1156 | /* We store the console state in dev->priv, and initialize it. */ |
1297 | dev->priv = malloc(sizeof(struct console_abort)); | 1157 | dev->priv = malloc(sizeof(struct console_abort)); |
1298 | ((struct console_abort *)dev->priv)->count = 0; | 1158 | ((struct console_abort *)dev->priv)->count = 0; |
@@ -1301,31 +1161,13 @@ static void setup_console(void) | |||
1301 | * they put something the input queue, we make sure we're listening to | 1161 | * they put something the input queue, we make sure we're listening to |
1302 | * stdin. When they put something in the output queue, we write it to | 1162 | * stdin. When they put something in the output queue, we write it to |
1303 | * stdout. */ | 1163 | * stdout. */ |
1304 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1164 | add_virtqueue(dev, VIRTQUEUE_NUM, console_input); |
1305 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); | 1165 | add_virtqueue(dev, VIRTQUEUE_NUM, console_output); |
1306 | 1166 | ||
1307 | verbose("device %u: console\n", devices.device_num++); | 1167 | verbose("device %u: console\n", ++devices.device_num); |
1308 | } | 1168 | } |
1309 | /*:*/ | 1169 | /*:*/ |
1310 | 1170 | ||
1311 | static void timeout_alarm(int sig) | ||
1312 | { | ||
1313 | write(timeoutpipe[1], "", 1); | ||
1314 | } | ||
1315 | |||
1316 | static void setup_timeout(void) | ||
1317 | { | ||
1318 | if (pipe(timeoutpipe) != 0) | ||
1319 | err(1, "Creating timeout pipe"); | ||
1320 | |||
1321 | if (fcntl(timeoutpipe[1], F_SETFL, | ||
1322 | fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) | ||
1323 | err(1, "Making timeout pipe nonblocking"); | ||
1324 | |||
1325 | add_device_fd(timeoutpipe[0]); | ||
1326 | signal(SIGALRM, timeout_alarm); | ||
1327 | } | ||
1328 | |||
1329 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a | 1171 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a |
1330 | * --sharenet=<name> option which opens or creates a named pipe. This can be | 1172 | * --sharenet=<name> option which opens or creates a named pipe. This can be |
1331 | * used to send packets to another guest in a 1:1 manner. | 1173 | * used to send packets to another guest in a 1:1 manner. |
@@ -1447,21 +1289,23 @@ static int get_tun_device(char tapif[IFNAMSIZ]) | |||
1447 | static void setup_tun_net(char *arg) | 1289 | static void setup_tun_net(char *arg) |
1448 | { | 1290 | { |
1449 | struct device *dev; | 1291 | struct device *dev; |
1450 | int netfd, ipfd; | 1292 | struct net_info *net_info = malloc(sizeof(*net_info)); |
1293 | int ipfd; | ||
1451 | u32 ip = INADDR_ANY; | 1294 | u32 ip = INADDR_ANY; |
1452 | bool bridging = false; | 1295 | bool bridging = false; |
1453 | char tapif[IFNAMSIZ], *p; | 1296 | char tapif[IFNAMSIZ], *p; |
1454 | struct virtio_net_config conf; | 1297 | struct virtio_net_config conf; |
1455 | 1298 | ||
1456 | netfd = get_tun_device(tapif); | 1299 | net_info->tunfd = get_tun_device(tapif); |
1457 | 1300 | ||
1458 | /* First we create a new network device. */ | 1301 | /* First we create a new network device. */ |
1459 | dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); | 1302 | dev = new_device("net", VIRTIO_ID_NET); |
1303 | dev->priv = net_info; | ||
1460 | 1304 | ||
1461 | /* Network devices need a receive and a send queue, just like | 1305 | /* Network devices need a receive and a send queue, just like |
1462 | * console. */ | 1306 | * console. */ |
1463 | add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); | 1307 | add_virtqueue(dev, VIRTQUEUE_NUM, net_input); |
1464 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); | 1308 | add_virtqueue(dev, VIRTQUEUE_NUM, net_output); |
1465 | 1309 | ||
1466 | /* We need a socket to perform the magic network ioctls to bring up the | 1310 | /* We need a socket to perform the magic network ioctls to bring up the |
1467 | * tap interface, connect to the bridge etc. Any socket will do! */ | 1311 | * tap interface, connect to the bridge etc. Any socket will do! */ |
@@ -1502,6 +1346,8 @@ static void setup_tun_net(char *arg) | |||
1502 | add_feature(dev, VIRTIO_NET_F_HOST_TSO4); | 1346 | add_feature(dev, VIRTIO_NET_F_HOST_TSO4); |
1503 | add_feature(dev, VIRTIO_NET_F_HOST_TSO6); | 1347 | add_feature(dev, VIRTIO_NET_F_HOST_TSO6); |
1504 | add_feature(dev, VIRTIO_NET_F_HOST_ECN); | 1348 | add_feature(dev, VIRTIO_NET_F_HOST_ECN); |
1349 | /* We handle indirect ring entries */ | ||
1350 | add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); | ||
1505 | set_config(dev, sizeof(conf), &conf); | 1351 | set_config(dev, sizeof(conf), &conf); |
1506 | 1352 | ||
1507 | /* We don't need the socket any more; setup is done. */ | 1353 | /* We don't need the socket any more; setup is done. */ |
@@ -1550,20 +1396,18 @@ struct vblk_info | |||
1550 | * Remember that the block device is handled by a separate I/O thread. We head | 1396 | * Remember that the block device is handled by a separate I/O thread. We head |
1551 | * straight into the core of that thread here: | 1397 | * straight into the core of that thread here: |
1552 | */ | 1398 | */ |
1553 | static bool service_io(struct device *dev) | 1399 | static void blk_request(struct virtqueue *vq) |
1554 | { | 1400 | { |
1555 | struct vblk_info *vblk = dev->priv; | 1401 | struct vblk_info *vblk = vq->dev->priv; |
1556 | unsigned int head, out_num, in_num, wlen; | 1402 | unsigned int head, out_num, in_num, wlen; |
1557 | int ret; | 1403 | int ret; |
1558 | u8 *in; | 1404 | u8 *in; |
1559 | struct virtio_blk_outhdr *out; | 1405 | struct virtio_blk_outhdr *out; |
1560 | struct iovec iov[dev->vq->vring.num]; | 1406 | struct iovec iov[vq->vring.num]; |
1561 | off64_t off; | 1407 | off64_t off; |
1562 | 1408 | ||
1563 | /* See if there's a request waiting. If not, nothing to do. */ | 1409 | /* Get the next request. */ |
1564 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 1410 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
1565 | if (head == dev->vq->vring.num) | ||
1566 | return false; | ||
1567 | 1411 | ||
1568 | /* Every block request should contain at least one output buffer | 1412 | /* Every block request should contain at least one output buffer |
1569 | * (detailing the location on disk and the type of request) and one | 1413 | * (detailing the location on disk and the type of request) and one |
@@ -1637,83 +1481,21 @@ static bool service_io(struct device *dev) | |||
1637 | if (out->type & VIRTIO_BLK_T_BARRIER) | 1481 | if (out->type & VIRTIO_BLK_T_BARRIER) |
1638 | fdatasync(vblk->fd); | 1482 | fdatasync(vblk->fd); |
1639 | 1483 | ||
1640 | /* We can't trigger an IRQ, because we're not the Launcher. It does | 1484 | add_used(vq, head, wlen); |
1641 | * that when we tell it we're done. */ | ||
1642 | add_used(dev->vq, head, wlen); | ||
1643 | return true; | ||
1644 | } | ||
1645 | |||
1646 | /* This is the thread which actually services the I/O. */ | ||
1647 | static int io_thread(void *_dev) | ||
1648 | { | ||
1649 | struct device *dev = _dev; | ||
1650 | struct vblk_info *vblk = dev->priv; | ||
1651 | char c; | ||
1652 | |||
1653 | /* Close other side of workpipe so we get 0 read when main dies. */ | ||
1654 | close(vblk->workpipe[1]); | ||
1655 | /* Close the other side of the done_fd pipe. */ | ||
1656 | close(dev->fd); | ||
1657 | |||
1658 | /* When this read fails, it means Launcher died, so we follow. */ | ||
1659 | while (read(vblk->workpipe[0], &c, 1) == 1) { | ||
1660 | /* We acknowledge each request immediately to reduce latency, | ||
1661 | * rather than waiting until we've done them all. I haven't | ||
1662 | * measured to see if it makes any difference. | ||
1663 | * | ||
1664 | * That would be an interesting test, wouldn't it? You could | ||
1665 | * also try having more than one I/O thread. */ | ||
1666 | while (service_io(dev)) | ||
1667 | write(vblk->done_fd, &c, 1); | ||
1668 | } | ||
1669 | return 0; | ||
1670 | } | ||
1671 | |||
1672 | /* Now we've seen the I/O thread, we return to the Launcher to see what happens | ||
1673 | * when that thread tells us it's completed some I/O. */ | ||
1674 | static bool handle_io_finish(int fd, struct device *dev) | ||
1675 | { | ||
1676 | char c; | ||
1677 | |||
1678 | /* If the I/O thread died, presumably it printed the error, so we | ||
1679 | * simply exit. */ | ||
1680 | if (read(dev->fd, &c, 1) != 1) | ||
1681 | exit(1); | ||
1682 | |||
1683 | /* It did some work, so trigger the irq. */ | ||
1684 | trigger_irq(fd, dev->vq); | ||
1685 | return true; | ||
1686 | } | ||
1687 | |||
1688 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ | ||
1689 | static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) | ||
1690 | { | ||
1691 | struct vblk_info *vblk = vq->dev->priv; | ||
1692 | char c = 0; | ||
1693 | |||
1694 | /* Wake up I/O thread and tell it to go to work! */ | ||
1695 | if (write(vblk->workpipe[1], &c, 1) != 1) | ||
1696 | /* Presumably it indicated why it died. */ | ||
1697 | exit(1); | ||
1698 | } | 1485 | } |
1699 | 1486 | ||
1700 | /*L:198 This actually sets up a virtual block device. */ | 1487 | /*L:198 This actually sets up a virtual block device. */ |
1701 | static void setup_block_file(const char *filename) | 1488 | static void setup_block_file(const char *filename) |
1702 | { | 1489 | { |
1703 | int p[2]; | ||
1704 | struct device *dev; | 1490 | struct device *dev; |
1705 | struct vblk_info *vblk; | 1491 | struct vblk_info *vblk; |
1706 | void *stack; | ||
1707 | struct virtio_blk_config conf; | 1492 | struct virtio_blk_config conf; |
1708 | 1493 | ||
1709 | /* This is the pipe the I/O thread will use to tell us I/O is done. */ | ||
1710 | pipe(p); | ||
1711 | |||
1712 | /* The device responds to return from I/O thread. */ | 1494 | /* The device responds to return from I/O thread. */ |
1713 | dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); | 1495 | dev = new_device("block", VIRTIO_ID_BLOCK); |
1714 | 1496 | ||
1715 | /* The device has one virtqueue, where the Guest places requests. */ | 1497 | /* The device has one virtqueue, where the Guest places requests. */ |
1716 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); | 1498 | add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); |
1717 | 1499 | ||
1718 | /* Allocate the room for our own bookkeeping */ | 1500 | /* Allocate the room for our own bookkeeping */ |
1719 | vblk = dev->priv = malloc(sizeof(*vblk)); | 1501 | vblk = dev->priv = malloc(sizeof(*vblk)); |
@@ -1735,49 +1517,29 @@ static void setup_block_file(const char *filename) | |||
1735 | 1517 | ||
1736 | set_config(dev, sizeof(conf), &conf); | 1518 | set_config(dev, sizeof(conf), &conf); |
1737 | 1519 | ||
1738 | /* The I/O thread writes to this end of the pipe when done. */ | ||
1739 | vblk->done_fd = p[1]; | ||
1740 | |||
1741 | /* This is the second pipe, which is how we tell the I/O thread about | ||
1742 | * more work. */ | ||
1743 | pipe(vblk->workpipe); | ||
1744 | |||
1745 | /* Create stack for thread and run it. Since stack grows upwards, we | ||
1746 | * point the stack pointer to the end of this region. */ | ||
1747 | stack = malloc(32768); | ||
1748 | /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from | ||
1749 | * becoming a zombie. */ | ||
1750 | if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) | ||
1751 | err(1, "Creating clone"); | ||
1752 | |||
1753 | /* We don't need to keep the I/O thread's end of the pipes open. */ | ||
1754 | close(vblk->done_fd); | ||
1755 | close(vblk->workpipe[0]); | ||
1756 | |||
1757 | verbose("device %u: virtblock %llu sectors\n", | 1520 | verbose("device %u: virtblock %llu sectors\n", |
1758 | devices.device_num, le64_to_cpu(conf.capacity)); | 1521 | ++devices.device_num, le64_to_cpu(conf.capacity)); |
1759 | } | 1522 | } |
1760 | 1523 | ||
1524 | struct rng_info { | ||
1525 | int rfd; | ||
1526 | }; | ||
1527 | |||
1761 | /* Our random number generator device reads from /dev/random into the Guest's | 1528 | /* Our random number generator device reads from /dev/random into the Guest's |
1762 | * input buffers. The usual case is that the Guest doesn't want random numbers | 1529 | * input buffers. The usual case is that the Guest doesn't want random numbers |
1763 | * and so has no buffers although /dev/random is still readable, whereas | 1530 | * and so has no buffers although /dev/random is still readable, whereas |
1764 | * console is the reverse. | 1531 | * console is the reverse. |
1765 | * | 1532 | * |
1766 | * The same logic applies, however. */ | 1533 | * The same logic applies, however. */ |
1767 | static bool handle_rng_input(int fd, struct device *dev) | 1534 | static void rng_input(struct virtqueue *vq) |
1768 | { | 1535 | { |
1769 | int len; | 1536 | int len; |
1770 | unsigned int head, in_num, out_num, totlen = 0; | 1537 | unsigned int head, in_num, out_num, totlen = 0; |
1771 | struct iovec iov[dev->vq->vring.num]; | 1538 | struct rng_info *rng_info = vq->dev->priv; |
1539 | struct iovec iov[vq->vring.num]; | ||
1772 | 1540 | ||
1773 | /* First we need a buffer from the Guests's virtqueue. */ | 1541 | /* First we need a buffer from the Guests's virtqueue. */ |
1774 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 1542 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
1775 | |||
1776 | /* If they're not ready for input, stop listening to this file | ||
1777 | * descriptor. We'll start again once they add an input buffer. */ | ||
1778 | if (head == dev->vq->vring.num) | ||
1779 | return false; | ||
1780 | |||
1781 | if (out_num) | 1543 | if (out_num) |
1782 | errx(1, "Output buffers in rng?"); | 1544 | errx(1, "Output buffers in rng?"); |
1783 | 1545 | ||
@@ -1785,7 +1547,7 @@ static bool handle_rng_input(int fd, struct device *dev) | |||
1785 | * it reads straight into the Guest's buffer. We loop to make sure we | 1547 | * it reads straight into the Guest's buffer. We loop to make sure we |
1786 | * fill it. */ | 1548 | * fill it. */ |
1787 | while (!iov_empty(iov, in_num)) { | 1549 | while (!iov_empty(iov, in_num)) { |
1788 | len = readv(dev->fd, iov, in_num); | 1550 | len = readv(rng_info->rfd, iov, in_num); |
1789 | if (len <= 0) | 1551 | if (len <= 0) |
1790 | err(1, "Read from /dev/random gave %i", len); | 1552 | err(1, "Read from /dev/random gave %i", len); |
1791 | iov_consume(iov, in_num, len); | 1553 | iov_consume(iov, in_num, len); |
@@ -1793,25 +1555,23 @@ static bool handle_rng_input(int fd, struct device *dev) | |||
1793 | } | 1555 | } |
1794 | 1556 | ||
1795 | /* Tell the Guest about the new input. */ | 1557 | /* Tell the Guest about the new input. */ |
1796 | add_used_and_trigger(fd, dev->vq, head, totlen); | 1558 | add_used(vq, head, totlen); |
1797 | |||
1798 | /* Everything went OK! */ | ||
1799 | return true; | ||
1800 | } | 1559 | } |
1801 | 1560 | ||
1802 | /* And this creates a "hardware" random number device for the Guest. */ | 1561 | /* And this creates a "hardware" random number device for the Guest. */ |
1803 | static void setup_rng(void) | 1562 | static void setup_rng(void) |
1804 | { | 1563 | { |
1805 | struct device *dev; | 1564 | struct device *dev; |
1806 | int fd; | 1565 | struct rng_info *rng_info = malloc(sizeof(*rng_info)); |
1807 | 1566 | ||
1808 | fd = open_or_die("/dev/random", O_RDONLY); | 1567 | rng_info->rfd = open_or_die("/dev/random", O_RDONLY); |
1809 | 1568 | ||
1810 | /* The device responds to return from I/O thread. */ | 1569 | /* The device responds to return from I/O thread. */ |
1811 | dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); | 1570 | dev = new_device("rng", VIRTIO_ID_RNG); |
1571 | dev->priv = rng_info; | ||
1812 | 1572 | ||
1813 | /* The device has one virtqueue, where the Guest places inbufs. */ | 1573 | /* The device has one virtqueue, where the Guest places inbufs. */ |
1814 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1574 | add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); |
1815 | 1575 | ||
1816 | verbose("device %u: rng\n", devices.device_num++); | 1576 | verbose("device %u: rng\n", devices.device_num++); |
1817 | } | 1577 | } |
@@ -1827,17 +1587,18 @@ static void __attribute__((noreturn)) restart_guest(void) | |||
1827 | for (i = 3; i < FD_SETSIZE; i++) | 1587 | for (i = 3; i < FD_SETSIZE; i++) |
1828 | close(i); | 1588 | close(i); |
1829 | 1589 | ||
1830 | /* The exec automatically gets rid of the I/O and Waker threads. */ | 1590 | /* Reset all the devices (kills all threads). */ |
1591 | cleanup_devices(); | ||
1592 | |||
1831 | execv(main_args[0], main_args); | 1593 | execv(main_args[0], main_args); |
1832 | err(1, "Could not exec %s", main_args[0]); | 1594 | err(1, "Could not exec %s", main_args[0]); |
1833 | } | 1595 | } |
1834 | 1596 | ||
1835 | /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves | 1597 | /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves |
1836 | * its input and output, and finally, lays it to rest. */ | 1598 | * its input and output, and finally, lays it to rest. */ |
1837 | static void __attribute__((noreturn)) run_guest(int lguest_fd) | 1599 | static void __attribute__((noreturn)) run_guest(void) |
1838 | { | 1600 | { |
1839 | for (;;) { | 1601 | for (;;) { |
1840 | unsigned long args[] = { LHREQ_BREAK, 0 }; | ||
1841 | unsigned long notify_addr; | 1602 | unsigned long notify_addr; |
1842 | int readval; | 1603 | int readval; |
1843 | 1604 | ||
@@ -1848,8 +1609,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1848 | /* One unsigned long means the Guest did HCALL_NOTIFY */ | 1609 | /* One unsigned long means the Guest did HCALL_NOTIFY */ |
1849 | if (readval == sizeof(notify_addr)) { | 1610 | if (readval == sizeof(notify_addr)) { |
1850 | verbose("Notify on address %#lx\n", notify_addr); | 1611 | verbose("Notify on address %#lx\n", notify_addr); |
1851 | handle_output(lguest_fd, notify_addr); | 1612 | handle_output(notify_addr); |
1852 | continue; | ||
1853 | /* ENOENT means the Guest died. Reading tells us why. */ | 1613 | /* ENOENT means the Guest died. Reading tells us why. */ |
1854 | } else if (errno == ENOENT) { | 1614 | } else if (errno == ENOENT) { |
1855 | char reason[1024] = { 0 }; | 1615 | char reason[1024] = { 0 }; |
@@ -1858,19 +1618,9 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1858 | /* ERESTART means that we need to reboot the guest */ | 1618 | /* ERESTART means that we need to reboot the guest */ |
1859 | } else if (errno == ERESTART) { | 1619 | } else if (errno == ERESTART) { |
1860 | restart_guest(); | 1620 | restart_guest(); |
1861 | /* EAGAIN means a signal (timeout). | 1621 | /* Anything else means a bug or incompatible change. */ |
1862 | * Anything else means a bug or incompatible change. */ | 1622 | } else |
1863 | } else if (errno != EAGAIN) | ||
1864 | err(1, "Running guest failed"); | 1623 | err(1, "Running guest failed"); |
1865 | |||
1866 | /* Only service input on thread for CPU 0. */ | ||
1867 | if (cpu_id != 0) | ||
1868 | continue; | ||
1869 | |||
1870 | /* Service input, then unset the BREAK to release the Waker. */ | ||
1871 | handle_input(lguest_fd); | ||
1872 | if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) | ||
1873 | err(1, "Resetting break"); | ||
1874 | } | 1624 | } |
1875 | } | 1625 | } |
1876 | /*L:240 | 1626 | /*L:240 |
@@ -1904,8 +1654,8 @@ int main(int argc, char *argv[]) | |||
1904 | /* Memory, top-level pagetable, code startpoint and size of the | 1654 | /* Memory, top-level pagetable, code startpoint and size of the |
1905 | * (optional) initrd. */ | 1655 | * (optional) initrd. */ |
1906 | unsigned long mem = 0, start, initrd_size = 0; | 1656 | unsigned long mem = 0, start, initrd_size = 0; |
1907 | /* Two temporaries and the /dev/lguest file descriptor. */ | 1657 | /* Two temporaries. */ |
1908 | int i, c, lguest_fd; | 1658 | int i, c; |
1909 | /* The boot information for the Guest. */ | 1659 | /* The boot information for the Guest. */ |
1910 | struct boot_params *boot; | 1660 | struct boot_params *boot; |
1911 | /* If they specify an initrd file to load. */ | 1661 | /* If they specify an initrd file to load. */ |
@@ -1913,18 +1663,10 @@ int main(int argc, char *argv[]) | |||
1913 | 1663 | ||
1914 | /* Save the args: we "reboot" by execing ourselves again. */ | 1664 | /* Save the args: we "reboot" by execing ourselves again. */ |
1915 | main_args = argv; | 1665 | main_args = argv; |
1916 | /* We don't "wait" for the children, so prevent them from becoming | ||
1917 | * zombies. */ | ||
1918 | signal(SIGCHLD, SIG_IGN); | ||
1919 | 1666 | ||
1920 | /* First we initialize the device list. Since console and network | 1667 | /* First we initialize the device list. We keep a pointer to the last |
1921 | * device receive input from a file descriptor, we keep an fdset | 1668 | * device, and the next interrupt number to use for devices (1: |
1922 | * (infds) and the maximum fd number (max_infd) with the head of the | 1669 | * remember that 0 is used by the timer). */ |
1923 | * list. We also keep a pointer to the last device. Finally, we keep | ||
1924 | * the next interrupt number to use for devices (1: remember that 0 is | ||
1925 | * used by the timer). */ | ||
1926 | FD_ZERO(&devices.infds); | ||
1927 | devices.max_infd = -1; | ||
1928 | devices.lastdev = NULL; | 1670 | devices.lastdev = NULL; |
1929 | devices.next_irq = 1; | 1671 | devices.next_irq = 1; |
1930 | 1672 | ||
@@ -1982,9 +1724,6 @@ int main(int argc, char *argv[]) | |||
1982 | /* We always have a console device */ | 1724 | /* We always have a console device */ |
1983 | setup_console(); | 1725 | setup_console(); |
1984 | 1726 | ||
1985 | /* We can timeout waiting for Guest network transmit. */ | ||
1986 | setup_timeout(); | ||
1987 | |||
1988 | /* Now we load the kernel */ | 1727 | /* Now we load the kernel */ |
1989 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); | 1728 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
1990 | 1729 | ||
@@ -2023,15 +1762,16 @@ int main(int argc, char *argv[]) | |||
2023 | 1762 | ||
2024 | /* We tell the kernel to initialize the Guest: this returns the open | 1763 | /* We tell the kernel to initialize the Guest: this returns the open |
2025 | * /dev/lguest file descriptor. */ | 1764 | * /dev/lguest file descriptor. */ |
2026 | lguest_fd = tell_kernel(start); | 1765 | tell_kernel(start); |
1766 | |||
1767 | /* Ensure that we terminate if a child dies. */ | ||
1768 | signal(SIGCHLD, kill_launcher); | ||
2027 | 1769 | ||
2028 | /* We clone off a thread, which wakes the Launcher whenever one of the | 1770 | /* If we exit via err(), this kills all the threads, restores tty. */ |
2029 | * input file descriptors needs attention. We call this the Waker, and | 1771 | atexit(cleanup_devices); |
2030 | * we'll cover it in a moment. */ | ||
2031 | setup_waker(lguest_fd); | ||
2032 | 1772 | ||
2033 | /* Finally, run the Guest. This doesn't return. */ | 1773 | /* Finally, run the Guest. This doesn't return. */ |
2034 | run_guest(lguest_fd); | 1774 | run_guest(); |
2035 | } | 1775 | } |
2036 | /*:*/ | 1776 | /*:*/ |
2037 | 1777 | ||
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 28c747362f95..efb3a6a045a2 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt | |||
@@ -37,7 +37,6 @@ Running Lguest: | |||
37 | "Paravirtualized guest support" = Y | 37 | "Paravirtualized guest support" = Y |
38 | "Lguest guest support" = Y | 38 | "Lguest guest support" = Y |
39 | "High Memory Support" = off/4GB | 39 | "High Memory Support" = off/4GB |
40 | "PAE (Physical Address Extension) Support" = N | ||
41 | "Alignment value to which kernel should be aligned" = 0x100000 | 40 | "Alignment value to which kernel should be aligned" = 0x100000 |
42 | (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and | 41 | (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and |
43 | CONFIG_PHYSICAL_ALIGN=0x100000) | 42 | CONFIG_PHYSICAL_ALIGN=0x100000) |
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f5b7127f54ac..7f5809eddee6 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
@@ -31,6 +31,7 @@ Contents: | |||
31 | 31 | ||
32 | - Locking functions. | 32 | - Locking functions. |
33 | - Interrupt disabling functions. | 33 | - Interrupt disabling functions. |
34 | - Sleep and wake-up functions. | ||
34 | - Miscellaneous functions. | 35 | - Miscellaneous functions. |
35 | 36 | ||
36 | (*) Inter-CPU locking barrier effects. | 37 | (*) Inter-CPU locking barrier effects. |
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some | |||
1217 | other means. | 1218 | other means. |
1218 | 1219 | ||
1219 | 1220 | ||
1221 | SLEEP AND WAKE-UP FUNCTIONS | ||
1222 | --------------------------- | ||
1223 | |||
1224 | Sleeping and waking on an event flagged in global data can be viewed as an | ||
1225 | interaction between two pieces of data: the task state of the task waiting for | ||
1226 | the event and the global data used to indicate the event. To make sure that | ||
1227 | these appear to happen in the right order, the primitives to begin the process | ||
1228 | of going to sleep, and the primitives to initiate a wake up imply certain | ||
1229 | barriers. | ||
1230 | |||
1231 | Firstly, the sleeper normally follows something like this sequence of events: | ||
1232 | |||
1233 | for (;;) { | ||
1234 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1235 | if (event_indicated) | ||
1236 | break; | ||
1237 | schedule(); | ||
1238 | } | ||
1239 | |||
1240 | A general memory barrier is interpolated automatically by set_current_state() | ||
1241 | after it has altered the task state: | ||
1242 | |||
1243 | CPU 1 | ||
1244 | =============================== | ||
1245 | set_current_state(); | ||
1246 | set_mb(); | ||
1247 | STORE current->state | ||
1248 | <general barrier> | ||
1249 | LOAD event_indicated | ||
1250 | |||
1251 | set_current_state() may be wrapped by: | ||
1252 | |||
1253 | prepare_to_wait(); | ||
1254 | prepare_to_wait_exclusive(); | ||
1255 | |||
1256 | which therefore also imply a general memory barrier after setting the state. | ||
1257 | The whole sequence above is available in various canned forms, all of which | ||
1258 | interpolate the memory barrier in the right place: | ||
1259 | |||
1260 | wait_event(); | ||
1261 | wait_event_interruptible(); | ||
1262 | wait_event_interruptible_exclusive(); | ||
1263 | wait_event_interruptible_timeout(); | ||
1264 | wait_event_killable(); | ||
1265 | wait_event_timeout(); | ||
1266 | wait_on_bit(); | ||
1267 | wait_on_bit_lock(); | ||
1268 | |||
1269 | |||
1270 | Secondly, code that performs a wake up normally follows something like this: | ||
1271 | |||
1272 | event_indicated = 1; | ||
1273 | wake_up(&event_wait_queue); | ||
1274 | |||
1275 | or: | ||
1276 | |||
1277 | event_indicated = 1; | ||
1278 | wake_up_process(event_daemon); | ||
1279 | |||
1280 | A write memory barrier is implied by wake_up() and co. if and only if they wake | ||
1281 | something up. The barrier occurs before the task state is cleared, and so sits | ||
1282 | between the STORE to indicate the event and the STORE to set TASK_RUNNING: | ||
1283 | |||
1284 | CPU 1 CPU 2 | ||
1285 | =============================== =============================== | ||
1286 | set_current_state(); STORE event_indicated | ||
1287 | set_mb(); wake_up(); | ||
1288 | STORE current->state <write barrier> | ||
1289 | <general barrier> STORE current->state | ||
1290 | LOAD event_indicated | ||
1291 | |||
1292 | The available waker functions include: | ||
1293 | |||
1294 | complete(); | ||
1295 | wake_up(); | ||
1296 | wake_up_all(); | ||
1297 | wake_up_bit(); | ||
1298 | wake_up_interruptible(); | ||
1299 | wake_up_interruptible_all(); | ||
1300 | wake_up_interruptible_nr(); | ||
1301 | wake_up_interruptible_poll(); | ||
1302 | wake_up_interruptible_sync(); | ||
1303 | wake_up_interruptible_sync_poll(); | ||
1304 | wake_up_locked(); | ||
1305 | wake_up_locked_poll(); | ||
1306 | wake_up_nr(); | ||
1307 | wake_up_poll(); | ||
1308 | wake_up_process(); | ||
1309 | |||
1310 | |||
1311 | [!] Note that the memory barriers implied by the sleeper and the waker do _not_ | ||
1312 | order multiple stores before the wake-up with respect to loads of those stored | ||
1313 | values after the sleeper has called set_current_state(). For instance, if the | ||
1314 | sleeper does: | ||
1315 | |||
1316 | set_current_state(TASK_INTERRUPTIBLE); | ||
1317 | if (event_indicated) | ||
1318 | break; | ||
1319 | __set_current_state(TASK_RUNNING); | ||
1320 | do_something(my_data); | ||
1321 | |||
1322 | and the waker does: | ||
1323 | |||
1324 | my_data = value; | ||
1325 | event_indicated = 1; | ||
1326 | wake_up(&event_wait_queue); | ||
1327 | |||
1328 | there's no guarantee that the change to event_indicated will be perceived by | ||
1329 | the sleeper as coming after the change to my_data. In such a circumstance, the | ||
1330 | code on both sides must interpolate its own memory barriers between the | ||
1331 | separate data accesses. Thus the above sleeper ought to do: | ||
1332 | |||
1333 | set_current_state(TASK_INTERRUPTIBLE); | ||
1334 | if (event_indicated) { | ||
1335 | smp_rmb(); | ||
1336 | do_something(my_data); | ||
1337 | } | ||
1338 | |||
1339 | and the waker should do: | ||
1340 | |||
1341 | my_data = value; | ||
1342 | smp_wmb(); | ||
1343 | event_indicated = 1; | ||
1344 | wake_up(&event_wait_queue); | ||
1345 | |||
1346 | |||
1220 | MISCELLANEOUS FUNCTIONS | 1347 | MISCELLANEOUS FUNCTIONS |
1221 | ----------------------- | 1348 | ----------------------- |
1222 | 1349 | ||
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED? | |||
1366 | 1493 | ||
1367 | Under normal operation, memory operation reordering is generally not going to | 1494 | Under normal operation, memory operation reordering is generally not going to |
1368 | be a problem as a single-threaded linear piece of code will still appear to | 1495 | be a problem as a single-threaded linear piece of code will still appear to |
1369 | work correctly, even if it's in an SMP kernel. There are, however, three | 1496 | work correctly, even if it's in an SMP kernel. There are, however, four |
1370 | circumstances in which reordering definitely _could_ be a problem: | 1497 | circumstances in which reordering definitely _could_ be a problem: |
1371 | 1498 | ||
1372 | (*) Interprocessor interaction. | 1499 | (*) Interprocessor interaction. |
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 421e7d00ffd0..c9abbd86bc18 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt | |||
@@ -75,9 +75,6 @@ may need to apply in domain-specific ways to their devices: | |||
75 | struct bus_type { | 75 | struct bus_type { |
76 | ... | 76 | ... |
77 | int (*suspend)(struct device *dev, pm_message_t state); | 77 | int (*suspend)(struct device *dev, pm_message_t state); |
78 | int (*suspend_late)(struct device *dev, pm_message_t state); | ||
79 | |||
80 | int (*resume_early)(struct device *dev); | ||
81 | int (*resume)(struct device *dev); | 78 | int (*resume)(struct device *dev); |
82 | }; | 79 | }; |
83 | 80 | ||
@@ -226,20 +223,7 @@ The phases are seen by driver notifications issued in this order: | |||
226 | 223 | ||
227 | This call should handle parts of device suspend logic that require | 224 | This call should handle parts of device suspend logic that require |
228 | sleeping. It probably does work to quiesce the device which hasn't | 225 | sleeping. It probably does work to quiesce the device which hasn't |
229 | been abstracted into class.suspend() or bus.suspend_late(). | 226 | been abstracted into class.suspend(). |
230 | |||
231 | 3 bus.suspend_late(dev, message) is called with IRQs disabled, and | ||
232 | with only one CPU active. Until the bus.resume_early() phase | ||
233 | completes (see later), IRQs are not enabled again. This method | ||
234 | won't be exposed by all busses; for message based busses like USB, | ||
235 | I2C, or SPI, device interactions normally require IRQs. This bus | ||
236 | call may be morphed into a driver call with bus-specific parameters. | ||
237 | |||
238 | This call might save low level hardware state that might otherwise | ||
239 | be lost in the upcoming low power state, and actually put the | ||
240 | device into a low power state ... so that in some cases the device | ||
241 | may stay partly usable until this late. This "late" call may also | ||
242 | help when coping with hardware that behaves badly. | ||
243 | 227 | ||
244 | The pm_message_t parameter is currently used to refine those semantics | 228 | The pm_message_t parameter is currently used to refine those semantics |
245 | (described later). | 229 | (described later). |
@@ -351,19 +335,11 @@ devices processing each phase's calls before the next phase begins. | |||
351 | 335 | ||
352 | The phases are seen by driver notifications issued in this order: | 336 | The phases are seen by driver notifications issued in this order: |
353 | 337 | ||
354 | 1 bus.resume_early(dev) is called with IRQs disabled, and with | 338 | 1 bus.resume(dev) reverses the effects of bus.suspend(). This may |
355 | only one CPU active. As with bus.suspend_late(), this method | 339 | be morphed into a device driver call with bus-specific parameters; |
356 | won't be supported on busses that require IRQs in order to | 340 | implementations may sleep. |
357 | interact with devices. | ||
358 | |||
359 | This reverses the effects of bus.suspend_late(). | ||
360 | |||
361 | 2 bus.resume(dev) is called next. This may be morphed into a device | ||
362 | driver call with bus-specific parameters; implementations may sleep. | ||
363 | |||
364 | This reverses the effects of bus.suspend(). | ||
365 | 341 | ||
366 | 3 class.resume(dev) is called for devices associated with a class | 342 | 2 class.resume(dev) is called for devices associated with a class |
367 | that has such a method. Implementations may sleep. | 343 | that has such a method. Implementations may sleep. |
368 | 344 | ||
369 | This reverses the effects of class.suspend(), and would usually | 345 | This reverses the effects of class.suspend(), and would usually |
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 5ba4d3fc625a..1df7f9cdab05 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
@@ -4,6 +4,7 @@ | |||
4 | CONTENTS | 4 | CONTENTS |
5 | ======== | 5 | ======== |
6 | 6 | ||
7 | 0. WARNING | ||
7 | 1. Overview | 8 | 1. Overview |
8 | 1.1 The problem | 9 | 1.1 The problem |
9 | 1.2 The solution | 10 | 1.2 The solution |
@@ -14,6 +15,23 @@ CONTENTS | |||
14 | 3. Future plans | 15 | 3. Future plans |
15 | 16 | ||
16 | 17 | ||
18 | 0. WARNING | ||
19 | ========== | ||
20 | |||
21 | Fiddling with these settings can result in an unstable system, the knobs are | ||
22 | root only and assumes root knows what he is doing. | ||
23 | |||
24 | Most notable: | ||
25 | |||
26 | * very small values in sched_rt_period_us can result in an unstable | ||
27 | system when the period is smaller than either the available hrtimer | ||
28 | resolution, or the time it takes to handle the budget refresh itself. | ||
29 | |||
30 | * very small values in sched_rt_runtime_us can result in an unstable | ||
31 | system when the runtime is so small the system has difficulty making | ||
32 | forward progress (NOTE: the migration thread and kstopmachine both | ||
33 | are real-time processes). | ||
34 | |||
17 | 1. Overview | 35 | 1. Overview |
18 | =========== | 36 | =========== |
19 | 37 | ||
@@ -169,7 +187,7 @@ get their allocated time. | |||
169 | 187 | ||
170 | Implementing SCHED_EDF might take a while to complete. Priority Inheritance is | 188 | Implementing SCHED_EDF might take a while to complete. Priority Inheritance is |
171 | the biggest challenge as the current linux PI infrastructure is geared towards | 189 | the biggest challenge as the current linux PI infrastructure is geared towards |
172 | the limited static priority levels 0-139. With deadline scheduling you need to | 190 | the limited static priority levels 0-99. With deadline scheduling you need to |
173 | do deadline inheritance (since priority is inversely proportional to the | 191 | do deadline inheritance (since priority is inversely proportional to the |
174 | deadline delta (deadline - now). | 192 | deadline delta (deadline - now). |
175 | 193 | ||
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 012858d2b119..5c08d96f407c 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt | |||
@@ -460,6 +460,25 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
460 | 460 | ||
461 | The power-management is supported. | 461 | The power-management is supported. |
462 | 462 | ||
463 | Module snd-ctxfi | ||
464 | ---------------- | ||
465 | |||
466 | Module for Creative Sound Blaster X-Fi boards (20k1 / 20k2 chips) | ||
467 | * Creative Sound Blaster X-Fi Titanium Fatal1ty Champion Series | ||
468 | * Creative Sound Blaster X-Fi Titanium Fatal1ty Professional Series | ||
469 | * Creative Sound Blaster X-Fi Titanium Professional Audio | ||
470 | * Creative Sound Blaster X-Fi Titanium | ||
471 | * Creative Sound Blaster X-Fi Elite Pro | ||
472 | * Creative Sound Blaster X-Fi Platinum | ||
473 | * Creative Sound Blaster X-Fi Fatal1ty | ||
474 | * Creative Sound Blaster X-Fi XtremeGamer | ||
475 | * Creative Sound Blaster X-Fi XtremeMusic | ||
476 | |||
477 | reference_rate - reference sample rate, 44100 or 48000 (default) | ||
478 | multiple - multiple to ref. sample rate, 1 or 2 (default) | ||
479 | |||
480 | This module supports multiple cards. | ||
481 | |||
463 | Module snd-darla20 | 482 | Module snd-darla20 |
464 | ------------------ | 483 | ------------------ |
465 | 484 | ||
@@ -925,6 +944,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
925 | * Onkyo SE-90PCI | 944 | * Onkyo SE-90PCI |
926 | * Onkyo SE-200PCI | 945 | * Onkyo SE-200PCI |
927 | * ESI Juli@ | 946 | * ESI Juli@ |
947 | * ESI Maya44 | ||
928 | * Hercules Fortissimo IV | 948 | * Hercules Fortissimo IV |
929 | * EGO-SYS WaveTerminal 192M | 949 | * EGO-SYS WaveTerminal 192M |
930 | 950 | ||
@@ -933,7 +953,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
933 | prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192, | 953 | prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192, |
934 | juli, aureon51, aureon71, universe, ap192, k8x800, | 954 | juli, aureon51, aureon71, universe, ap192, k8x800, |
935 | phase22, phase28, ms300, av710, se200pci, se90pci, | 955 | phase22, phase28, ms300, av710, se200pci, se90pci, |
936 | fortissimo4, sn25p, WT192M | 956 | fortissimo4, sn25p, WT192M, maya44 |
937 | 957 | ||
938 | This module supports multiple cards and autoprobe. | 958 | This module supports multiple cards and autoprobe. |
939 | 959 | ||
@@ -1093,6 +1113,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1093 | This module supports multiple cards. | 1113 | This module supports multiple cards. |
1094 | The driver requires the firmware loader support on kernel. | 1114 | The driver requires the firmware loader support on kernel. |
1095 | 1115 | ||
1116 | Module snd-lx6464es | ||
1117 | ------------------- | ||
1118 | |||
1119 | Module for Digigram LX6464ES boards | ||
1120 | |||
1121 | This module supports multiple cards. | ||
1122 | |||
1096 | Module snd-maestro3 | 1123 | Module snd-maestro3 |
1097 | ------------------- | 1124 | ------------------- |
1098 | 1125 | ||
@@ -1543,13 +1570,15 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1543 | Module snd-sc6000 | 1570 | Module snd-sc6000 |
1544 | ----------------- | 1571 | ----------------- |
1545 | 1572 | ||
1546 | Module for Gallant SC-6000 soundcard. | 1573 | Module for Gallant SC-6000 soundcard and later models: SC-6600 |
1574 | and SC-7000. | ||
1547 | 1575 | ||
1548 | port - Port # (0x220 or 0x240) | 1576 | port - Port # (0x220 or 0x240) |
1549 | mss_port - MSS Port # (0x530 or 0xe80) | 1577 | mss_port - MSS Port # (0x530 or 0xe80) |
1550 | irq - IRQ # (5,7,9,10,11) | 1578 | irq - IRQ # (5,7,9,10,11) |
1551 | mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq | 1579 | mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq |
1552 | dma - DMA # (1,3,0) | 1580 | dma - DMA # (1,3,0) |
1581 | joystick - Enable gameport - 0 = disable (default), 1 = enable | ||
1553 | 1582 | ||
1554 | This module supports multiple cards. | 1583 | This module supports multiple cards. |
1555 | 1584 | ||
@@ -1859,7 +1888,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1859 | ------------------- | 1888 | ------------------- |
1860 | 1889 | ||
1861 | Module for sound cards based on the Asus AV100/AV200 chips, | 1890 | Module for sound cards based on the Asus AV100/AV200 chips, |
1862 | i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), and Essence STX. | 1891 | i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), Essence ST |
1892 | (Deluxe) and Essence STX. | ||
1863 | 1893 | ||
1864 | This module supports autoprobe and multiple cards. | 1894 | This module supports autoprobe and multiple cards. |
1865 | 1895 | ||
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index 322869fc8a9e..de8e10a94103 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt | |||
@@ -36,6 +36,7 @@ ALC260 | |||
36 | acer Acer TravelMate | 36 | acer Acer TravelMate |
37 | will Will laptops (PB V7900) | 37 | will Will laptops (PB V7900) |
38 | replacer Replacer 672V | 38 | replacer Replacer 672V |
39 | favorit100 Maxdata Favorit 100XS | ||
39 | basic fixed pin assignment (old default model) | 40 | basic fixed pin assignment (old default model) |
40 | test for testing/debugging purpose, almost all controls can | 41 | test for testing/debugging purpose, almost all controls can |
41 | adjusted. Appearing only when compiled with | 42 | adjusted. Appearing only when compiled with |
@@ -85,10 +86,11 @@ ALC269 | |||
85 | eeepc-p703 ASUS Eeepc P703 P900A | 86 | eeepc-p703 ASUS Eeepc P703 P900A |
86 | eeepc-p901 ASUS Eeepc P901 S101 | 87 | eeepc-p901 ASUS Eeepc P901 S101 |
87 | fujitsu FSC Amilo | 88 | fujitsu FSC Amilo |
89 | lifebook Fujitsu Lifebook S6420 | ||
88 | auto auto-config reading BIOS (default) | 90 | auto auto-config reading BIOS (default) |
89 | 91 | ||
90 | ALC662/663 | 92 | ALC662/663/272 |
91 | ========== | 93 | ============== |
92 | 3stack-dig 3-stack (2-channel) with SPDIF | 94 | 3stack-dig 3-stack (2-channel) with SPDIF |
93 | 3stack-6ch 3-stack (6-channel) | 95 | 3stack-6ch 3-stack (6-channel) |
94 | 3stack-6ch-dig 3-stack (6-channel) with SPDIF | 96 | 3stack-6ch-dig 3-stack (6-channel) with SPDIF |
@@ -107,6 +109,9 @@ ALC662/663 | |||
107 | asus-mode4 ASUS | 109 | asus-mode4 ASUS |
108 | asus-mode5 ASUS | 110 | asus-mode5 ASUS |
109 | asus-mode6 ASUS | 111 | asus-mode6 ASUS |
112 | dell Dell with ALC272 | ||
113 | dell-zm1 Dell ZM1 with ALC272 | ||
114 | samsung-nc10 Samsung NC10 mini notebook | ||
110 | auto auto-config reading BIOS (default) | 115 | auto auto-config reading BIOS (default) |
111 | 116 | ||
112 | ALC882/885 | 117 | ALC882/885 |
@@ -118,6 +123,7 @@ ALC882/885 | |||
118 | asus-a7j ASUS A7J | 123 | asus-a7j ASUS A7J |
119 | asus-a7m ASUS A7M | 124 | asus-a7m ASUS A7M |
120 | macpro MacPro support | 125 | macpro MacPro support |
126 | mb5 Macbook 5,1 | ||
121 | mbp3 Macbook Pro rev3 | 127 | mbp3 Macbook Pro rev3 |
122 | imac24 iMac 24'' with jack detection | 128 | imac24 iMac 24'' with jack detection |
123 | w2jc ASUS W2JC | 129 | w2jc ASUS W2JC |
@@ -133,10 +139,12 @@ ALC883/888 | |||
133 | acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc) | 139 | acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc) |
134 | acer-aspire Acer Aspire 9810 | 140 | acer-aspire Acer Aspire 9810 |
135 | acer-aspire-4930g Acer Aspire 4930G | 141 | acer-aspire-4930g Acer Aspire 4930G |
142 | acer-aspire-8930g Acer Aspire 8930G | ||
136 | medion Medion Laptops | 143 | medion Medion Laptops |
137 | medion-md2 Medion MD2 | 144 | medion-md2 Medion MD2 |
138 | targa-dig Targa/MSI | 145 | targa-dig Targa/MSI |
139 | targa-2ch-dig Targs/MSI with 2-channel | 146 | targa-2ch-dig Targa/MSI with 2-channel |
147 | targa-8ch-dig Targa/MSI with 8-channel (MSI GX620) | ||
140 | laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE) | 148 | laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE) |
141 | lenovo-101e Lenovo 101E | 149 | lenovo-101e Lenovo 101E |
142 | lenovo-nb0763 Lenovo NB0763 | 150 | lenovo-nb0763 Lenovo NB0763 |
@@ -150,6 +158,9 @@ ALC883/888 | |||
150 | fujitsu-pi2515 Fujitsu AMILO Pi2515 | 158 | fujitsu-pi2515 Fujitsu AMILO Pi2515 |
151 | fujitsu-xa3530 Fujitsu AMILO XA3530 | 159 | fujitsu-xa3530 Fujitsu AMILO XA3530 |
152 | 3stack-6ch-intel Intel DG33* boards | 160 | 3stack-6ch-intel Intel DG33* boards |
161 | asus-p5q ASUS P5Q-EM boards | ||
162 | mb31 MacBook 3,1 | ||
163 | sony-vaio-tt Sony VAIO TT | ||
153 | auto auto-config reading BIOS (default) | 164 | auto auto-config reading BIOS (default) |
154 | 165 | ||
155 | ALC861/660 | 166 | ALC861/660 |
@@ -348,6 +359,7 @@ STAC92HD71B* | |||
348 | hp-m4 HP mini 1000 | 359 | hp-m4 HP mini 1000 |
349 | hp-dv5 HP dv series | 360 | hp-dv5 HP dv series |
350 | hp-hdx HP HDX series | 361 | hp-hdx HP HDX series |
362 | hp-dv4-1222nr HP dv4-1222nr (with LED support) | ||
351 | auto BIOS setup (default) | 363 | auto BIOS setup (default) |
352 | 364 | ||
353 | STAC92HD73* | 365 | STAC92HD73* |
diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt index cfac20cf9e33..381908d8ca42 100644 --- a/Documentation/sound/alsa/Procfile.txt +++ b/Documentation/sound/alsa/Procfile.txt | |||
@@ -88,26 +88,34 @@ card*/pcm*/info | |||
88 | substreams, etc. | 88 | substreams, etc. |
89 | 89 | ||
90 | card*/pcm*/xrun_debug | 90 | card*/pcm*/xrun_debug |
91 | This file appears when CONFIG_SND_DEBUG=y. | 91 | This file appears when CONFIG_SND_DEBUG=y and |
92 | This shows the status of xrun (= buffer overrun/xrun) debug of | 92 | CONFIG_PCM_XRUN_DEBUG=y. |
93 | ALSA PCM middle layer, as an integer from 0 to 2. The value | 93 | This shows the status of xrun (= buffer overrun/xrun) and |
94 | can be changed by writing to this file, such as | 94 | invalid PCM position debug/check of ALSA PCM middle layer. |
95 | 95 | It takes an integer value, can be changed by writing to this | |
96 | # cat 2 > /proc/asound/card0/pcm0p/xrun_debug | 96 | file, such as |
97 | 97 | ||
98 | When this value is greater than 0, the driver will show the | 98 | # cat 5 > /proc/asound/card0/pcm0p/xrun_debug |
99 | messages to kernel log when an xrun is detected. The debug | 99 | |
100 | message is shown also when the invalid H/W pointer is detected | 100 | The value consists of the following bit flags: |
101 | at the update of periods (usually called from the interrupt | 101 | bit 0 = Enable XRUN/jiffies debug messages |
102 | bit 1 = Show stack trace at XRUN / jiffies check | ||
103 | bit 2 = Enable additional jiffies check | ||
104 | |||
105 | When the bit 0 is set, the driver will show the messages to | ||
106 | kernel log when an xrun is detected. The debug message is | ||
107 | shown also when the invalid H/W pointer is detected at the | ||
108 | update of periods (usually called from the interrupt | ||
102 | handler). | 109 | handler). |
103 | 110 | ||
104 | When this value is greater than 1, the driver will show the | 111 | When the bit 1 is set, the driver will show the stack trace |
105 | stack trace additionally. This may help the debugging. | 112 | additionally. This may help the debugging. |
106 | 113 | ||
107 | Since 2.6.30, this option also enables the hwptr check using | 114 | Since 2.6.30, this option can enable the hwptr check using |
108 | jiffies. This detects spontaneous invalid pointer callback | 115 | jiffies. This detects spontaneous invalid pointer callback |
109 | values, but can be lead to too much corrections for a (mostly | 116 | values, but can be lead to too much corrections for a (mostly |
110 | buggy) hardware that doesn't give smooth pointer updates. | 117 | buggy) hardware that doesn't give smooth pointer updates. |
118 | This feature is enabled via the bit 2. | ||
111 | 119 | ||
112 | card*/pcm*/sub*/info | 120 | card*/pcm*/sub*/info |
113 | The general information of this PCM sub-stream. | 121 | The general information of this PCM sub-stream. |
diff --git a/Documentation/sound/alsa/README.maya44 b/Documentation/sound/alsa/README.maya44 new file mode 100644 index 000000000000..0e41576fa13e --- /dev/null +++ b/Documentation/sound/alsa/README.maya44 | |||
@@ -0,0 +1,163 @@ | |||
1 | NOTE: The following is the original document of Rainer's patch that the | ||
2 | current maya44 code based on. Some contents might be obsoleted, but I | ||
3 | keep here as reference -- tiwai | ||
4 | |||
5 | ---------------------------------------------------------------- | ||
6 | |||
7 | STATE OF DEVELOPMENT: | ||
8 | |||
9 | This driver is being developed on the initiative of Piotr Makowski (oponek@gmail.com) and financed by Lars Bergmann. | ||
10 | Development is carried out by Rainer Zimmermann (mail@lightshed.de). | ||
11 | |||
12 | ESI provided a sample Maya44 card for the development work. | ||
13 | |||
14 | However, unfortunately it has turned out difficult to get detailed programming information, so I (Rainer Zimmermann) had to find out some card-specific information by experiment and conjecture. Some information (in particular, several GPIO bits) is still missing. | ||
15 | |||
16 | This is the first testing version of the Maya44 driver released to the alsa-devel mailing list (Feb 5, 2008). | ||
17 | |||
18 | |||
19 | The following functions work, as tested by Rainer Zimmermann and Piotr Makowski: | ||
20 | |||
21 | - playback and capture at all sampling rates | ||
22 | - input/output level | ||
23 | - crossmixing | ||
24 | - line/mic switch | ||
25 | - phantom power switch | ||
26 | - analogue monitor a.k.a bypass | ||
27 | |||
28 | |||
29 | The following functions *should* work, but are not fully tested: | ||
30 | |||
31 | - Channel 3+4 analogue - S/PDIF input switching | ||
32 | - S/PDIF output | ||
33 | - all inputs/outputs on the M/IO/DIO extension card | ||
34 | - internal/external clock selection | ||
35 | |||
36 | |||
37 | *In particular, we would appreciate testing of these functions by anyone who has access to an M/IO/DIO extension card.* | ||
38 | |||
39 | |||
40 | Things that do not seem to work: | ||
41 | |||
42 | - The level meters ("multi track") in 'alsamixer' do not seem to react to signals in (if this is a bug, it would probably be in the existing ICE1724 code). | ||
43 | |||
44 | - Ardour 2.1 seems to work only via JACK, not using ALSA directly or via OSS. This still needs to be tracked down. | ||
45 | |||
46 | |||
47 | DRIVER DETAILS: | ||
48 | |||
49 | the following files were added: | ||
50 | |||
51 | pci/ice1724/maya44.c - Maya44 specific code | ||
52 | pci/ice1724/maya44.h | ||
53 | pci/ice1724/ice1724.patch | ||
54 | pci/ice1724/ice1724.h.patch - PROPOSED patch to ice1724.h (see SAMPLING RATES) | ||
55 | i2c/other/wm8776.c - low-level access routines for Wolfson WM8776 codecs | ||
56 | include/wm8776.h | ||
57 | |||
58 | |||
59 | Note that the wm8776.c code is meant to be card-independent and does not actually register the codec with the ALSA infrastructure. | ||
60 | This is done in maya44.c, mainly because some of the WM8776 controls are used in Maya44-specific ways, and should be named appropriately. | ||
61 | |||
62 | |||
63 | the following files were created in pci/ice1724, simply #including the corresponding file from the alsa-kernel tree: | ||
64 | |||
65 | wtm.h | ||
66 | vt1720_mobo.h | ||
67 | revo.h | ||
68 | prodigy192.h | ||
69 | pontis.h | ||
70 | phase.h | ||
71 | maya44.h | ||
72 | juli.h | ||
73 | aureon.h | ||
74 | amp.h | ||
75 | envy24ht.h | ||
76 | se.h | ||
77 | prodigy_hifi.h | ||
78 | |||
79 | |||
80 | *I hope this is the correct way to do things.* | ||
81 | |||
82 | |||
83 | SAMPLING RATES: | ||
84 | |||
85 | The Maya44 card (or more exactly, the Wolfson WM8776 codecs) allow a maximum sampling rate of 192 kHz for playback and 92 kHz for capture. | ||
86 | |||
87 | As the ICE1724 chip only allows one global sampling rate, this is handled as follows: | ||
88 | |||
89 | * setting the sampling rate on any open PCM device on the maya44 card will always set the *global* sampling rate for all playback and capture channels. | ||
90 | |||
91 | * In the current state of the driver, setting rates of up to 192 kHz is permitted even for capture devices. | ||
92 | |||
93 | *AVOID CAPTURING AT RATES ABOVE 96kHz*, even though it may appear to work. The codec cannot actually capture at such rates, meaning poor quality. | ||
94 | |||
95 | |||
96 | I propose some additional code for limiting the sampling rate when setting on a capture pcm device. However because of the global sampling rate, this logic would be somewhat problematic. | ||
97 | |||
98 | The proposed code (currently deactivated) is in ice1712.h.patch, ice1724.c and maya44.c (in pci/ice1712). | ||
99 | |||
100 | |||
101 | SOUND DEVICES: | ||
102 | |||
103 | PCM devices correspond to inputs/outputs as follows (assuming Maya44 is card #0): | ||
104 | |||
105 | hw:0,0 input - stereo, analog input 1+2 | ||
106 | hw:0,0 output - stereo, analog output 1+2 | ||
107 | hw:0,1 input - stereo, analog input 3+4 OR S/PDIF input | ||
108 | hw:0,1 output - stereo, analog output 3+4 (and SPDIF out) | ||
109 | |||
110 | |||
111 | NAMING OF MIXER CONTROLS: | ||
112 | |||
113 | (for more information about the signal flow, please refer to the block diagram on p.24 of the ESI Maya44 manual, or in the ESI windows software). | ||
114 | |||
115 | |||
116 | PCM: (digital) output level for channel 1+2 | ||
117 | PCM 1: same for channel 3+4 | ||
118 | |||
119 | Mic Phantom+48V: switch for +48V phantom power for electrostatic microphones on input 1/2. | ||
120 | Make sure this is not turned on while any other source is connected to input 1/2. | ||
121 | It might damage the source and/or the maya44 card. | ||
122 | |||
123 | Mic/Line input: if switch is is on, input jack 1/2 is microphone input (mono), otherwise line input (stereo). | ||
124 | |||
125 | Bypass: analogue bypass from ADC input to output for channel 1+2. Same as "Monitor" in the windows driver. | ||
126 | Bypass 1: same for channel 3+4. | ||
127 | |||
128 | Crossmix: cross-mixer from channels 1+2 to channels 3+4 | ||
129 | Crossmix 1: cross-mixer from channels 3+4 to channels 1+2 | ||
130 | |||
131 | IEC958 Output: switch for S/PDIF output. | ||
132 | This is not supported by the ESI windows driver. | ||
133 | S/PDIF should output the same signal as channel 3+4. [untested!] | ||
134 | |||
135 | |||
136 | Digitial output selectors: | ||
137 | |||
138 | These switches allow a direct digital routing from the ADCs to the DACs. | ||
139 | Each switch determines where the digital input data to one of the DACs comes from. | ||
140 | They are not supported by the ESI windows driver. | ||
141 | For normal operation, they should all be set to "PCM out". | ||
142 | |||
143 | H/W: Output source channel 1 | ||
144 | H/W 1: Output source channel 2 | ||
145 | H/W 2: Output source channel 3 | ||
146 | H/W 3: Output source channel 4 | ||
147 | |||
148 | H/W 4 ... H/W 9: unknown function, left in to enable testing. | ||
149 | Possibly some of these control S/PDIF output(s). | ||
150 | If these turn out to be unused, they will go away in later driver versions. | ||
151 | |||
152 | Selectable values for each of the digital output selectors are: | ||
153 | "PCM out" -> DAC output of the corresponding channel (default setting) | ||
154 | "Input 1"... | ||
155 | "Input 4" -> direct routing from ADC output of the selected input channel | ||
156 | |||
157 | |||
158 | -------- | ||
159 | |||
160 | Feb 14, 2008 | ||
161 | Rainer Zimmermann | ||
162 | mail@lightshed.de | ||
163 | |||
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt index 9e6763264a2e..9ac842be9b4f 100644 --- a/Documentation/sound/alsa/soc/dapm.txt +++ b/Documentation/sound/alsa/soc/dapm.txt | |||
@@ -62,6 +62,7 @@ Audio DAPM widgets fall into a number of types:- | |||
62 | o Mic - Mic (and optional Jack) | 62 | o Mic - Mic (and optional Jack) |
63 | o Line - Line Input/Output (and optional Jack) | 63 | o Line - Line Input/Output (and optional Jack) |
64 | o Speaker - Speaker | 64 | o Speaker - Speaker |
65 | o Supply - Power or clock supply widget used by other widgets. | ||
65 | o Pre - Special PRE widget (exec before all others) | 66 | o Pre - Special PRE widget (exec before all others) |
66 | o Post - Special POST widget (exec after all others) | 67 | o Post - Special POST widget (exec after all others) |
67 | 68 | ||
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index f11ca7979fa6..322a00bb99d9 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -32,6 +32,7 @@ show up in /proc/sys/kernel: | |||
32 | - kstack_depth_to_print [ X86 only ] | 32 | - kstack_depth_to_print [ X86 only ] |
33 | - l2cr [ PPC only ] | 33 | - l2cr [ PPC only ] |
34 | - modprobe ==> Documentation/debugging-modules.txt | 34 | - modprobe ==> Documentation/debugging-modules.txt |
35 | - modules_disabled | ||
35 | - msgmax | 36 | - msgmax |
36 | - msgmnb | 37 | - msgmnb |
37 | - msgmni | 38 | - msgmni |
@@ -184,6 +185,16 @@ kernel stack. | |||
184 | 185 | ||
185 | ============================================================== | 186 | ============================================================== |
186 | 187 | ||
188 | modules_disabled: | ||
189 | |||
190 | A toggle value indicating if modules are allowed to be loaded | ||
191 | in an otherwise modular kernel. This toggle defaults to off | ||
192 | (0), but can be set true (1). Once true, modules can be | ||
193 | neither loaded nor unloaded, and the toggle cannot be set back | ||
194 | to false. | ||
195 | |||
196 | ============================================================== | ||
197 | |||
187 | osrelease, ostype & version: | 198 | osrelease, ostype & version: |
188 | 199 | ||
189 | # cat osrelease | 200 | # cat osrelease |
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt new file mode 100644 index 000000000000..f157d7594ea7 --- /dev/null +++ b/Documentation/trace/events.txt | |||
@@ -0,0 +1,90 @@ | |||
1 | Event Tracing | ||
2 | |||
3 | Documentation written by Theodore Ts'o | ||
4 | Updated by Li Zefan | ||
5 | |||
6 | 1. Introduction | ||
7 | =============== | ||
8 | |||
9 | Tracepoints (see Documentation/trace/tracepoints.txt) can be used | ||
10 | without creating custom kernel modules to register probe functions | ||
11 | using the event tracing infrastructure. | ||
12 | |||
13 | Not all tracepoints can be traced using the event tracing system; | ||
14 | the kernel developer must provide code snippets which define how the | ||
15 | tracing information is saved into the tracing buffer, and how the | ||
16 | tracing information should be printed. | ||
17 | |||
18 | 2. Using Event Tracing | ||
19 | ====================== | ||
20 | |||
21 | 2.1 Via the 'set_event' interface | ||
22 | --------------------------------- | ||
23 | |||
24 | The events which are available for tracing can be found in the file | ||
25 | /debug/tracing/available_events. | ||
26 | |||
27 | To enable a particular event, such as 'sched_wakeup', simply echo it | ||
28 | to /debug/tracing/set_event. For example: | ||
29 | |||
30 | # echo sched_wakeup >> /debug/tracing/set_event | ||
31 | |||
32 | [ Note: '>>' is necessary, otherwise it will firstly disable | ||
33 | all the events. ] | ||
34 | |||
35 | To disable an event, echo the event name to the set_event file prefixed | ||
36 | with an exclamation point: | ||
37 | |||
38 | # echo '!sched_wakeup' >> /debug/tracing/set_event | ||
39 | |||
40 | To disable all events, echo an empty line to the set_event file: | ||
41 | |||
42 | # echo > /debug/tracing/set_event | ||
43 | |||
44 | To enable all events, echo '*:*' or '*:' to the set_event file: | ||
45 | |||
46 | # echo *:* > /debug/tracing/set_event | ||
47 | |||
48 | The events are organized into subsystems, such as ext4, irq, sched, | ||
49 | etc., and a full event name looks like this: <subsystem>:<event>. The | ||
50 | subsystem name is optional, but it is displayed in the available_events | ||
51 | file. All of the events in a subsystem can be specified via the syntax | ||
52 | "<subsystem>:*"; for example, to enable all irq events, you can use the | ||
53 | command: | ||
54 | |||
55 | # echo 'irq:*' > /debug/tracing/set_event | ||
56 | |||
57 | 2.2 Via the 'enable' toggle | ||
58 | --------------------------- | ||
59 | |||
60 | The events available are also listed in /debug/tracing/events/ hierarchy | ||
61 | of directories. | ||
62 | |||
63 | To enable event 'sched_wakeup': | ||
64 | |||
65 | # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable | ||
66 | |||
67 | To disable it: | ||
68 | |||
69 | # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable | ||
70 | |||
71 | To enable all events in sched subsystem: | ||
72 | |||
73 | # echo 1 > /debug/tracing/events/sched/enable | ||
74 | |||
75 | To eanble all events: | ||
76 | |||
77 | # echo 1 > /debug/tracing/events/enable | ||
78 | |||
79 | When reading one of these enable files, there are four results: | ||
80 | |||
81 | 0 - all events this file affects are disabled | ||
82 | 1 - all events this file affects are enabled | ||
83 | X - there is a mixture of events enabled and disabled | ||
84 | ? - this file does not affect any event | ||
85 | |||
86 | 3. Defining an event-enabled tracepoint | ||
87 | ======================================= | ||
88 | |||
89 | See The example provided in samples/trace_events | ||
90 | |||
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e693813..2a82d8602944 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
@@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured. | |||
179 | 179 | ||
180 | Function call tracer to trace all kernel functions. | 180 | Function call tracer to trace all kernel functions. |
181 | 181 | ||
182 | "function_graph_tracer" | 182 | "function_graph" |
183 | 183 | ||
184 | Similar to the function tracer except that the | 184 | Similar to the function tracer except that the |
185 | function tracer probes the functions on their entry | 185 | function tracer probes the functions on their entry |
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice | |||
518 | values starting at 100 (nice -20). Below is a quick chart to map | 518 | values starting at 100 (nice -20). Below is a quick chart to map |
519 | the kernel priority to user land priorities. | 519 | the kernel priority to user land priorities. |
520 | 520 | ||
521 | Kernel priority: 0 to 99 ==> user RT priority 99 to 0 | 521 | Kernel Space User Space |
522 | Kernel priority: 100 to 139 ==> user nice -20 to 19 | 522 | =============================================================== |
523 | Kernel priority: 140 ==> idle task priority | 523 | 0(high) to 98(low) user RT priority 99(high) to 1(low) |
524 | with SCHED_RR or SCHED_FIFO | ||
525 | --------------------------------------------------------------- | ||
526 | 99 sched_priority is not used in scheduling | ||
527 | decisions(it must be specified as 0) | ||
528 | --------------------------------------------------------------- | ||
529 | 100(high) to 139(low) user nice -20(high) to 19(low) | ||
530 | --------------------------------------------------------------- | ||
531 | 140 idle task priority | ||
532 | --------------------------------------------------------------- | ||
524 | 533 | ||
525 | The task states are: | 534 | The task states are: |
526 | 535 | ||
diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt new file mode 100644 index 000000000000..cd805e16dc27 --- /dev/null +++ b/Documentation/trace/power.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | The power tracer collects detailed information about C-state and P-state | ||
2 | transitions, instead of just looking at the high-level "average" | ||
3 | information. | ||
4 | |||
5 | There is a helper script found in scrips/tracing/power.pl in the kernel | ||
6 | sources which can be used to parse this information and create a | ||
7 | Scalable Vector Graphics (SVG) picture from the trace data. | ||
8 | |||
9 | To use this tracer: | ||
10 | |||
11 | echo 0 > /sys/kernel/debug/tracing/tracing_enabled | ||
12 | echo power > /sys/kernel/debug/tracing/current_tracer | ||
13 | echo 1 > /sys/kernel/debug/tracing/tracing_enabled | ||
14 | sleep 1 | ||
15 | echo 0 > /sys/kernel/debug/tracing/tracing_enabled | ||
16 | cat /sys/kernel/debug/tracing/trace | \ | ||
17 | perl scripts/tracing/power.pl > out.sv | ||
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index e0203662f9e9..8da3a795083f 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt | |||
@@ -50,6 +50,10 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format | |||
50 | Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical | 50 | Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical |
51 | pointer to single linked list of struct setup_data. | 51 | pointer to single linked list of struct setup_data. |
52 | 52 | ||
53 | Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment | ||
54 | beyond the kernel_alignment added, new init_size and | ||
55 | pref_address fields. Added extended boot loader IDs. | ||
56 | |||
53 | **** MEMORY LAYOUT | 57 | **** MEMORY LAYOUT |
54 | 58 | ||
55 | The traditional memory map for the kernel loader, used for Image or | 59 | The traditional memory map for the kernel loader, used for Image or |
@@ -168,12 +172,13 @@ Offset Proto Name Meaning | |||
168 | 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) | 172 | 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) |
169 | 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only | 173 | 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only |
170 | 0224/2 2.01+ heap_end_ptr Free memory after setup end | 174 | 0224/2 2.01+ heap_end_ptr Free memory after setup end |
171 | 0226/2 N/A pad1 Unused | 175 | 0226/1 2.02+(3 ext_loader_ver Extended boot loader version |
176 | 0227/1 2.02+(3 ext_loader_type Extended boot loader ID | ||
172 | 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line | 177 | 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line |
173 | 022C/4 2.03+ ramdisk_max Highest legal initrd address | 178 | 022C/4 2.03+ ramdisk_max Highest legal initrd address |
174 | 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel | 179 | 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel |
175 | 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not | 180 | 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not |
176 | 0235/1 N/A pad2 Unused | 181 | 0235/1 2.10+ min_alignment Minimum alignment, as a power of two |
177 | 0236/2 N/A pad3 Unused | 182 | 0236/2 N/A pad3 Unused |
178 | 0238/4 2.06+ cmdline_size Maximum size of the kernel command line | 183 | 0238/4 2.06+ cmdline_size Maximum size of the kernel command line |
179 | 023C/4 2.07+ hardware_subarch Hardware subarchitecture | 184 | 023C/4 2.07+ hardware_subarch Hardware subarchitecture |
@@ -182,6 +187,8 @@ Offset Proto Name Meaning | |||
182 | 024C/4 2.08+ payload_length Length of kernel payload | 187 | 024C/4 2.08+ payload_length Length of kernel payload |
183 | 0250/8 2.09+ setup_data 64-bit physical pointer to linked list | 188 | 0250/8 2.09+ setup_data 64-bit physical pointer to linked list |
184 | of struct setup_data | 189 | of struct setup_data |
190 | 0258/8 2.10+ pref_address Preferred loading address | ||
191 | 0260/4 2.10+ init_size Linear memory required during initialization | ||
185 | 192 | ||
186 | (1) For backwards compatibility, if the setup_sects field contains 0, the | 193 | (1) For backwards compatibility, if the setup_sects field contains 0, the |
187 | real value is 4. | 194 | real value is 4. |
@@ -190,6 +197,8 @@ Offset Proto Name Meaning | |||
190 | field are unusable, which means the size of a bzImage kernel | 197 | field are unusable, which means the size of a bzImage kernel |
191 | cannot be determined. | 198 | cannot be determined. |
192 | 199 | ||
200 | (3) Ignored, but safe to set, for boot protocols 2.02-2.09. | ||
201 | |||
193 | If the "HdrS" (0x53726448) magic number is not found at offset 0x202, | 202 | If the "HdrS" (0x53726448) magic number is not found at offset 0x202, |
194 | the boot protocol version is "old". Loading an old kernel, the | 203 | the boot protocol version is "old". Loading an old kernel, the |
195 | following parameters should be assumed: | 204 | following parameters should be assumed: |
@@ -343,18 +352,32 @@ Protocol: 2.00+ | |||
343 | 0xTV here, where T is an identifier for the boot loader and V is | 352 | 0xTV here, where T is an identifier for the boot loader and V is |
344 | a version number. Otherwise, enter 0xFF here. | 353 | a version number. Otherwise, enter 0xFF here. |
345 | 354 | ||
355 | For boot loader IDs above T = 0xD, write T = 0xE to this field and | ||
356 | write the extended ID minus 0x10 to the ext_loader_type field. | ||
357 | Similarly, the ext_loader_ver field can be used to provide more than | ||
358 | four bits for the bootloader version. | ||
359 | |||
360 | For example, for T = 0x15, V = 0x234, write: | ||
361 | |||
362 | type_of_loader <- 0xE4 | ||
363 | ext_loader_type <- 0x05 | ||
364 | ext_loader_ver <- 0x23 | ||
365 | |||
346 | Assigned boot loader ids: | 366 | Assigned boot loader ids: |
347 | 0 LILO (0x00 reserved for pre-2.00 bootloader) | 367 | 0 LILO (0x00 reserved for pre-2.00 bootloader) |
348 | 1 Loadlin | 368 | 1 Loadlin |
349 | 2 bootsect-loader (0x20, all other values reserved) | 369 | 2 bootsect-loader (0x20, all other values reserved) |
350 | 3 SYSLINUX | 370 | 3 Syslinux |
351 | 4 EtherBoot | 371 | 4 Etherboot/gPXE |
352 | 5 ELILO | 372 | 5 ELILO |
353 | 7 GRUB | 373 | 7 GRUB |
354 | 8 U-BOOT | 374 | 8 U-Boot |
355 | 9 Xen | 375 | 9 Xen |
356 | A Gujin | 376 | A Gujin |
357 | B Qemu | 377 | B Qemu |
378 | C Arcturus Networks uCbootloader | ||
379 | E Extended (see ext_loader_type) | ||
380 | F Special (0xFF = undefined) | ||
358 | 381 | ||
359 | Please contact <hpa@zytor.com> if you need a bootloader ID | 382 | Please contact <hpa@zytor.com> if you need a bootloader ID |
360 | value assigned. | 383 | value assigned. |
@@ -453,6 +476,35 @@ Protocol: 2.01+ | |||
453 | Set this field to the offset (from the beginning of the real-mode | 476 | Set this field to the offset (from the beginning of the real-mode |
454 | code) of the end of the setup stack/heap, minus 0x0200. | 477 | code) of the end of the setup stack/heap, minus 0x0200. |
455 | 478 | ||
479 | Field name: ext_loader_ver | ||
480 | Type: write (optional) | ||
481 | Offset/size: 0x226/1 | ||
482 | Protocol: 2.02+ | ||
483 | |||
484 | This field is used as an extension of the version number in the | ||
485 | type_of_loader field. The total version number is considered to be | ||
486 | (type_of_loader & 0x0f) + (ext_loader_ver << 4). | ||
487 | |||
488 | The use of this field is boot loader specific. If not written, it | ||
489 | is zero. | ||
490 | |||
491 | Kernels prior to 2.6.31 did not recognize this field, but it is safe | ||
492 | to write for protocol version 2.02 or higher. | ||
493 | |||
494 | Field name: ext_loader_type | ||
495 | Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0) | ||
496 | Offset/size: 0x227/1 | ||
497 | Protocol: 2.02+ | ||
498 | |||
499 | This field is used as an extension of the type number in | ||
500 | type_of_loader field. If the type in type_of_loader is 0xE, then | ||
501 | the actual type is (ext_loader_type + 0x10). | ||
502 | |||
503 | This field is ignored if the type in type_of_loader is not 0xE. | ||
504 | |||
505 | Kernels prior to 2.6.31 did not recognize this field, but it is safe | ||
506 | to write for protocol version 2.02 or higher. | ||
507 | |||
456 | Field name: cmd_line_ptr | 508 | Field name: cmd_line_ptr |
457 | Type: write (obligatory) | 509 | Type: write (obligatory) |
458 | Offset/size: 0x228/4 | 510 | Offset/size: 0x228/4 |
@@ -482,11 +534,19 @@ Protocol: 2.03+ | |||
482 | 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) | 534 | 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) |
483 | 535 | ||
484 | Field name: kernel_alignment | 536 | Field name: kernel_alignment |
485 | Type: read (reloc) | 537 | Type: read/modify (reloc) |
486 | Offset/size: 0x230/4 | 538 | Offset/size: 0x230/4 |
487 | Protocol: 2.05+ | 539 | Protocol: 2.05+ (read), 2.10+ (modify) |
540 | |||
541 | Alignment unit required by the kernel (if relocatable_kernel is | ||
542 | true.) A relocatable kernel that is loaded at an alignment | ||
543 | incompatible with the value in this field will be realigned during | ||
544 | kernel initialization. | ||
488 | 545 | ||
489 | Alignment unit required by the kernel (if relocatable_kernel is true.) | 546 | Starting with protocol version 2.10, this reflects the kernel |
547 | alignment preferred for optimal performance; it is possible for the | ||
548 | loader to modify this field to permit a lesser alignment. See the | ||
549 | min_alignment and pref_address field below. | ||
490 | 550 | ||
491 | Field name: relocatable_kernel | 551 | Field name: relocatable_kernel |
492 | Type: read (reloc) | 552 | Type: read (reloc) |
@@ -498,6 +558,22 @@ Protocol: 2.05+ | |||
498 | After loading, the boot loader must set the code32_start field to | 558 | After loading, the boot loader must set the code32_start field to |
499 | point to the loaded code, or to a boot loader hook. | 559 | point to the loaded code, or to a boot loader hook. |
500 | 560 | ||
561 | Field name: min_alignment | ||
562 | Type: read (reloc) | ||
563 | Offset/size: 0x235/1 | ||
564 | Protocol: 2.10+ | ||
565 | |||
566 | This field, if nonzero, indicates as a power of two the minimum | ||
567 | alignment required, as opposed to preferred, by the kernel to boot. | ||
568 | If a boot loader makes use of this field, it should update the | ||
569 | kernel_alignment field with the alignment unit desired; typically: | ||
570 | |||
571 | kernel_alignment = 1 << min_alignment | ||
572 | |||
573 | There may be a considerable performance cost with an excessively | ||
574 | misaligned kernel. Therefore, a loader should typically try each | ||
575 | power-of-two alignment from kernel_alignment down to this alignment. | ||
576 | |||
501 | Field name: cmdline_size | 577 | Field name: cmdline_size |
502 | Type: read | 578 | Type: read |
503 | Offset/size: 0x238/4 | 579 | Offset/size: 0x238/4 |
@@ -582,6 +658,36 @@ Protocol: 2.09+ | |||
582 | sure to consider the case where the linked list already contains | 658 | sure to consider the case where the linked list already contains |
583 | entries. | 659 | entries. |
584 | 660 | ||
661 | Field name: pref_address | ||
662 | Type: read (reloc) | ||
663 | Offset/size: 0x258/8 | ||
664 | Protocol: 2.10+ | ||
665 | |||
666 | This field, if nonzero, represents a preferred load address for the | ||
667 | kernel. A relocating bootloader should attempt to load at this | ||
668 | address if possible. | ||
669 | |||
670 | A non-relocatable kernel will unconditionally move itself and to run | ||
671 | at this address. | ||
672 | |||
673 | Field name: init_size | ||
674 | Type: read | ||
675 | Offset/size: 0x25c/4 | ||
676 | |||
677 | This field indicates the amount of linear contiguous memory starting | ||
678 | at the kernel runtime start address that the kernel needs before it | ||
679 | is capable of examining its memory map. This is not the same thing | ||
680 | as the total amount of memory the kernel needs to boot, but it can | ||
681 | be used by a relocating boot loader to help select a safe load | ||
682 | address for the kernel. | ||
683 | |||
684 | The kernel runtime start address is determined by the following algorithm: | ||
685 | |||
686 | if (relocatable_kernel) | ||
687 | runtime_start = align_up(load_address, kernel_alignment) | ||
688 | else | ||
689 | runtime_start = pref_address | ||
690 | |||
585 | 691 | ||
586 | **** THE IMAGE CHECKSUM | 692 | **** THE IMAGE CHECKSUM |
587 | 693 | ||
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 34c13040a718..2db5893d6c97 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt | |||
@@ -150,11 +150,6 @@ NUMA | |||
150 | Otherwise, the remaining system RAM is allocated to an | 150 | Otherwise, the remaining system RAM is allocated to an |
151 | additional node. | 151 | additional node. |
152 | 152 | ||
153 | numa=hotadd=percent | ||
154 | Only allow hotadd memory to preallocate page structures upto | ||
155 | percent of already available memory. | ||
156 | numa=hotadd=0 will disable hotadd memory. | ||
157 | |||
158 | ACPI | 153 | ACPI |
159 | 154 | ||
160 | acpi=off Don't enable ACPI | 155 | acpi=off Don't enable ACPI |
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 29b52b14d0b4..d6498e3cd713 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt | |||
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables: | |||
6 | 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm | 6 | 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm |
7 | hole caused by [48:63] sign extension | 7 | hole caused by [48:63] sign extension |
8 | ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole | 8 | ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole |
9 | ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory | 9 | ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory |
10 | ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole | 10 | ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole |
11 | ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space | 11 | ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space |
12 | ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) | 12 | ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole |
13 | ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) | ||
13 | ... unused hole ... | 14 | ... unused hole ... |
14 | ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 | 15 | ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 |
15 | ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space | 16 | ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space |