diff options
Diffstat (limited to 'Documentation')
303 files changed, 15379 insertions, 3756 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 06b982affe76..dd10b51b4e65 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX | |||
@@ -250,6 +250,8 @@ numastat.txt | |||
250 | - info on how to read Numa policy hit/miss statistics in sysfs. | 250 | - info on how to read Numa policy hit/miss statistics in sysfs. |
251 | oops-tracing.txt | 251 | oops-tracing.txt |
252 | - how to decode those nasty internal kernel error dump messages. | 252 | - how to decode those nasty internal kernel error dump messages. |
253 | padata.txt | ||
254 | - An introduction to the "padata" parallel execution API | ||
253 | parisc/ | 255 | parisc/ |
254 | - directory with info on using Linux on PA-RISC architecture. | 256 | - directory with info on using Linux on PA-RISC architecture. |
255 | parport.txt | 257 | parport.txt |
diff --git a/Documentation/ABI/obsolete/sysfs-bus-usb b/Documentation/ABI/obsolete/sysfs-bus-usb new file mode 100644 index 000000000000..bd096d33fbc7 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-bus-usb | |||
@@ -0,0 +1,31 @@ | |||
1 | What: /sys/bus/usb/devices/.../power/level | ||
2 | Date: March 2007 | ||
3 | KernelVersion: 2.6.21 | ||
4 | Contact: Alan Stern <stern@rowland.harvard.edu> | ||
5 | Description: | ||
6 | Each USB device directory will contain a file named | ||
7 | power/level. This file holds a power-level setting for | ||
8 | the device, either "on" or "auto". | ||
9 | |||
10 | "on" means that the device is not allowed to autosuspend, | ||
11 | although normal suspends for system sleep will still | ||
12 | be honored. "auto" means the device will autosuspend | ||
13 | and autoresume in the usual manner, according to the | ||
14 | capabilities of its driver. | ||
15 | |||
16 | During normal use, devices should be left in the "auto" | ||
17 | level. The "on" level is meant for administrative uses. | ||
18 | If you want to suspend a device immediately but leave it | ||
19 | free to wake up in response to I/O requests, you should | ||
20 | write "0" to power/autosuspend. | ||
21 | |||
22 | Device not capable of proper suspend and resume should be | ||
23 | left in the "on" level. Although the USB spec requires | ||
24 | devices to support suspend/resume, many of them do not. | ||
25 | In fact so many don't that by default, the USB core | ||
26 | initializes all non-hub devices in the "on" level. Some | ||
27 | drivers may change this setting when they are bound. | ||
28 | |||
29 | This file is deprecated and will be removed after 2010. | ||
30 | Use the power/control file instead; it does exactly the | ||
31 | same thing. | ||
diff --git a/Documentation/ABI/obsolete/sysfs-class-rfkill b/Documentation/ABI/obsolete/sysfs-class-rfkill new file mode 100644 index 000000000000..4201d5b05515 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-class-rfkill | |||
@@ -0,0 +1,29 @@ | |||
1 | rfkill - radio frequency (RF) connector kill switch support | ||
2 | |||
3 | For details to this subsystem look at Documentation/rfkill.txt. | ||
4 | |||
5 | What: /sys/class/rfkill/rfkill[0-9]+/state | ||
6 | Date: 09-Jul-2007 | ||
7 | KernelVersion v2.6.22 | ||
8 | Contact: linux-wireless@vger.kernel.org | ||
9 | Description: Current state of the transmitter. | ||
10 | This file is deprecated and sheduled to be removed in 2014, | ||
11 | because its not possible to express the 'soft and hard block' | ||
12 | state of the rfkill driver. | ||
13 | Values: A numeric value. | ||
14 | 0: RFKILL_STATE_SOFT_BLOCKED | ||
15 | transmitter is turned off by software | ||
16 | 1: RFKILL_STATE_UNBLOCKED | ||
17 | transmitter is (potentially) active | ||
18 | 2: RFKILL_STATE_HARD_BLOCKED | ||
19 | transmitter is forced off by something outside of | ||
20 | the driver's control. | ||
21 | |||
22 | What: /sys/class/rfkill/rfkill[0-9]+/claim | ||
23 | Date: 09-Jul-2007 | ||
24 | KernelVersion v2.6.22 | ||
25 | Contact: linux-wireless@vger.kernel.org | ||
26 | Description: This file is deprecated because there no longer is a way to | ||
27 | claim just control over a single rfkill instance. | ||
28 | This file is scheduled to be removed in 2012. | ||
29 | Values: 0: Kernel handles events | ||
diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill new file mode 100644 index 000000000000..097f522c33bb --- /dev/null +++ b/Documentation/ABI/stable/sysfs-class-rfkill | |||
@@ -0,0 +1,67 @@ | |||
1 | rfkill - radio frequency (RF) connector kill switch support | ||
2 | |||
3 | For details to this subsystem look at Documentation/rfkill.txt. | ||
4 | |||
5 | For the deprecated /sys/class/rfkill/*/state and | ||
6 | /sys/class/rfkill/*/claim knobs of this interface look in | ||
7 | Documentation/ABI/obsolete/sysfs-class-rfkill. | ||
8 | |||
9 | What: /sys/class/rfkill | ||
10 | Date: 09-Jul-2007 | ||
11 | KernelVersion: v2.6.22 | ||
12 | Contact: linux-wireless@vger.kernel.org, | ||
13 | Description: The rfkill class subsystem folder. | ||
14 | Each registered rfkill driver is represented by an rfkillX | ||
15 | subfolder (X being an integer > 0). | ||
16 | |||
17 | |||
18 | What: /sys/class/rfkill/rfkill[0-9]+/name | ||
19 | Date: 09-Jul-2007 | ||
20 | KernelVersion v2.6.22 | ||
21 | Contact: linux-wireless@vger.kernel.org | ||
22 | Description: Name assigned by driver to this key (interface or driver name). | ||
23 | Values: arbitrary string. | ||
24 | |||
25 | |||
26 | What: /sys/class/rfkill/rfkill[0-9]+/type | ||
27 | Date: 09-Jul-2007 | ||
28 | KernelVersion v2.6.22 | ||
29 | Contact: linux-wireless@vger.kernel.org | ||
30 | Description: Driver type string ("wlan", "bluetooth", etc). | ||
31 | Values: See include/linux/rfkill.h. | ||
32 | |||
33 | |||
34 | What: /sys/class/rfkill/rfkill[0-9]+/persistent | ||
35 | Date: 09-Jul-2007 | ||
36 | KernelVersion v2.6.22 | ||
37 | Contact: linux-wireless@vger.kernel.org | ||
38 | Description: Whether the soft blocked state is initialised from non-volatile | ||
39 | storage at startup. | ||
40 | Values: A numeric value. | ||
41 | 0: false | ||
42 | 1: true | ||
43 | |||
44 | |||
45 | What: /sys/class/rfkill/rfkill[0-9]+/hard | ||
46 | Date: 12-March-2010 | ||
47 | KernelVersion v2.6.34 | ||
48 | Contact: linux-wireless@vger.kernel.org | ||
49 | Description: Current hardblock state. This file is read only. | ||
50 | Values: A numeric value. | ||
51 | 0: inactive | ||
52 | The transmitter is (potentially) active. | ||
53 | 1: active | ||
54 | The transmitter is forced off by something outside of | ||
55 | the driver's control. | ||
56 | |||
57 | |||
58 | What: /sys/class/rfkill/rfkill[0-9]+/soft | ||
59 | Date: 12-March-2010 | ||
60 | KernelVersion v2.6.34 | ||
61 | Contact: linux-wireless@vger.kernel.org | ||
62 | Description: Current softblock state. This file is read and write. | ||
63 | Values: A numeric value. | ||
64 | 0: inactive | ||
65 | The transmitter is (potentially) active. | ||
66 | 1: active | ||
67 | The transmitter is turned off by software. | ||
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node new file mode 100644 index 000000000000..49b82cad7003 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-devices-node | |||
@@ -0,0 +1,7 @@ | |||
1 | What: /sys/devices/system/node/nodeX | ||
2 | Date: October 2002 | ||
3 | Contact: Linux Memory Management list <linux-mm@kvack.org> | ||
4 | Description: | ||
5 | When CONFIG_NUMA is enabled, this is a directory containing | ||
6 | information on node X such as what CPUs are local to the | ||
7 | node. | ||
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index d2f90334bb93..4873c759d535 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block | |||
@@ -128,3 +128,17 @@ Description: | |||
128 | preferred request size for workloads where sustained | 128 | preferred request size for workloads where sustained |
129 | throughput is desired. If no optimal I/O size is | 129 | throughput is desired. If no optimal I/O size is |
130 | reported this file contains 0. | 130 | reported this file contains 0. |
131 | |||
132 | What: /sys/block/<disk>/queue/nomerges | ||
133 | Date: January 2010 | ||
134 | Contact: | ||
135 | Description: | ||
136 | Standard I/O elevator operations include attempts to | ||
137 | merge contiguous I/Os. For known random I/O loads these | ||
138 | attempts will always fail and result in extra cycles | ||
139 | being spent in the kernel. This allows one to turn off | ||
140 | this behavior on one of two ways: When set to 1, complex | ||
141 | merge checks are disabled, but the simple one-shot merges | ||
142 | with the previous I/O request are enabled. When set to 2, | ||
143 | all merge tries are disabled. The default value is 0 - | ||
144 | which enables all types of merge tries. | ||
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 25be3250f7d6..428676cfa61e 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci | |||
@@ -133,6 +133,46 @@ Description: | |||
133 | The symbolic link points to the PCI device sysfs entry of the | 133 | The symbolic link points to the PCI device sysfs entry of the |
134 | Physical Function this device associates with. | 134 | Physical Function this device associates with. |
135 | 135 | ||
136 | |||
137 | What: /sys/bus/pci/slots/... | ||
138 | Date: April 2005 (possibly older) | ||
139 | KernelVersion: 2.6.12 (possibly older) | ||
140 | Contact: linux-pci@vger.kernel.org | ||
141 | Description: | ||
142 | When the appropriate driver is loaded, it will create a | ||
143 | directory per claimed physical PCI slot in | ||
144 | /sys/bus/pci/slots/. The names of these directories are | ||
145 | specific to the driver, which in turn, are specific to the | ||
146 | platform, but in general, should match the label on the | ||
147 | machine's physical chassis. | ||
148 | |||
149 | The drivers that can create slot directories include the | ||
150 | PCI hotplug drivers, and as of 2.6.27, the pci_slot driver. | ||
151 | |||
152 | The slot directories contain, at a minimum, a file named | ||
153 | 'address' which contains the PCI bus:device:function tuple. | ||
154 | Other files may appear as well, but are specific to the | ||
155 | driver. | ||
156 | |||
157 | What: /sys/bus/pci/slots/.../function[0-7] | ||
158 | Date: March 2010 | ||
159 | KernelVersion: 2.6.35 | ||
160 | Contact: linux-pci@vger.kernel.org | ||
161 | Description: | ||
162 | If PCI slot directories (as described above) are created, | ||
163 | and the physical slot is actually populated with a device, | ||
164 | symbolic links in the slot directory pointing to the | ||
165 | device's PCI functions are created as well. | ||
166 | |||
167 | What: /sys/bus/pci/devices/.../slot | ||
168 | Date: March 2010 | ||
169 | KernelVersion: 2.6.35 | ||
170 | Contact: linux-pci@vger.kernel.org | ||
171 | Description: | ||
172 | If PCI slot directories (as described above) are created, | ||
173 | a symbolic link pointing to the slot directory will be | ||
174 | created as well. | ||
175 | |||
136 | What: /sys/bus/pci/slots/.../module | 176 | What: /sys/bus/pci/slots/.../module |
137 | Date: June 2009 | 177 | Date: June 2009 |
138 | Contact: linux-pci@vger.kernel.org | 178 | Contact: linux-pci@vger.kernel.org |
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb index a07c0f366f91..294aa864a60a 100644 --- a/Documentation/ABI/testing/sysfs-bus-usb +++ b/Documentation/ABI/testing/sysfs-bus-usb | |||
@@ -14,34 +14,6 @@ Description: | |||
14 | The autosuspend delay for newly-created devices is set to | 14 | The autosuspend delay for newly-created devices is set to |
15 | the value of the usbcore.autosuspend module parameter. | 15 | the value of the usbcore.autosuspend module parameter. |
16 | 16 | ||
17 | What: /sys/bus/usb/devices/.../power/level | ||
18 | Date: March 2007 | ||
19 | KernelVersion: 2.6.21 | ||
20 | Contact: Alan Stern <stern@rowland.harvard.edu> | ||
21 | Description: | ||
22 | Each USB device directory will contain a file named | ||
23 | power/level. This file holds a power-level setting for | ||
24 | the device, either "on" or "auto". | ||
25 | |||
26 | "on" means that the device is not allowed to autosuspend, | ||
27 | although normal suspends for system sleep will still | ||
28 | be honored. "auto" means the device will autosuspend | ||
29 | and autoresume in the usual manner, according to the | ||
30 | capabilities of its driver. | ||
31 | |||
32 | During normal use, devices should be left in the "auto" | ||
33 | level. The "on" level is meant for administrative uses. | ||
34 | If you want to suspend a device immediately but leave it | ||
35 | free to wake up in response to I/O requests, you should | ||
36 | write "0" to power/autosuspend. | ||
37 | |||
38 | Device not capable of proper suspend and resume should be | ||
39 | left in the "on" level. Although the USB spec requires | ||
40 | devices to support suspend/resume, many of them do not. | ||
41 | In fact so many don't that by default, the USB core | ||
42 | initializes all non-hub devices in the "on" level. Some | ||
43 | drivers may change this setting when they are bound. | ||
44 | |||
45 | What: /sys/bus/usb/devices/.../power/persist | 17 | What: /sys/bus/usb/devices/.../power/persist |
46 | Date: May 2007 | 18 | Date: May 2007 |
47 | KernelVersion: 2.6.23 | 19 | KernelVersion: 2.6.23 |
@@ -159,3 +131,14 @@ Description: | |||
159 | device. This is useful to ensure auto probing won't | 131 | device. This is useful to ensure auto probing won't |
160 | match the driver to the device. For example: | 132 | match the driver to the device. For example: |
161 | # echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id | 133 | # echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id |
134 | |||
135 | What: /sys/bus/usb/device/.../avoid_reset_quirk | ||
136 | Date: December 2009 | ||
137 | Contact: Oliver Neukum <oliver@neukum.org> | ||
138 | Description: | ||
139 | Writing 1 to this file tells the kernel that this | ||
140 | device will morph into another mode when it is reset. | ||
141 | Drivers will not use reset for error handling for | ||
142 | such devices. | ||
143 | Users: | ||
144 | usb_modeswitch | ||
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power new file mode 100644 index 000000000000..78c7baca3587 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-power | |||
@@ -0,0 +1,20 @@ | |||
1 | What: /sys/class/power/ds2760-battery.*/charge_now | ||
2 | Date: May 2010 | ||
3 | KernelVersion: 2.6.35 | ||
4 | Contact: Daniel Mack <daniel@caiaq.de> | ||
5 | Description: | ||
6 | This file is writeable and can be used to set the current | ||
7 | coloumb counter value inside the battery monitor chip. This | ||
8 | is needed for unavoidable corrections of aging batteries. | ||
9 | A userspace daemon can monitor the battery charging logic | ||
10 | and once the counter drops out of considerable bounds, take | ||
11 | appropriate action. | ||
12 | |||
13 | What: /sys/class/power/ds2760-battery.*/charge_full | ||
14 | Date: May 2010 | ||
15 | KernelVersion: 2.6.35 | ||
16 | Contact: Daniel Mack <daniel@caiaq.de> | ||
17 | Description: | ||
18 | This file is writeable and can be used to set the assumed | ||
19 | battery 'full level'. As batteries age, this value has to be | ||
20 | amended over time. | ||
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index bf1627b02a03..aba7d989208c 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory | |||
@@ -43,7 +43,7 @@ Date: September 2008 | |||
43 | Contact: Badari Pulavarty <pbadari@us.ibm.com> | 43 | Contact: Badari Pulavarty <pbadari@us.ibm.com> |
44 | Description: | 44 | Description: |
45 | The file /sys/devices/system/memory/memoryX/state | 45 | The file /sys/devices/system/memory/memoryX/state |
46 | is read-write. When read, it's contents show the | 46 | is read-write. When read, its contents show the |
47 | online/offline state of the memory section. When written, | 47 | online/offline state of the memory section. When written, |
48 | root can toggle the the online/offline state of a removable | 48 | root can toggle the the online/offline state of a removable |
49 | memory section (see removable file description above) | 49 | memory section (see removable file description above) |
diff --git a/Documentation/ABI/testing/sysfs-devices-node b/Documentation/ABI/testing/sysfs-devices-node new file mode 100644 index 000000000000..453a210c3ceb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-node | |||
@@ -0,0 +1,7 @@ | |||
1 | What: /sys/devices/system/node/nodeX/compact | ||
2 | Date: February 2010 | ||
3 | Contact: Mel Gorman <mel@csn.ul.ie> | ||
4 | Description: | ||
5 | When this file is written to, all memory within that node | ||
6 | will be compacted. When it completes, memory will be freed | ||
7 | into blocks which have as many contiguous pages as possible | ||
diff --git a/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget b/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget new file mode 100644 index 000000000000..34034027b13c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget | |||
@@ -0,0 +1,9 @@ | |||
1 | What: /sys/devices/platform/_UDC_/gadget/suspended | ||
2 | Date: April 2010 | ||
3 | Contact: Fabien Chouteau <fabien.chouteau@barco.com> | ||
4 | Description: | ||
5 | Show the suspend state of an USB composite gadget. | ||
6 | 1 -> suspended | ||
7 | 0 -> resumed | ||
8 | |||
9 | (_UDC_ is the name of the USB Device Controller driver) | ||
diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power new file mode 100644 index 000000000000..6123c523bfd7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-power | |||
@@ -0,0 +1,79 @@ | |||
1 | What: /sys/devices/.../power/ | ||
2 | Date: January 2009 | ||
3 | Contact: Rafael J. Wysocki <rjw@sisk.pl> | ||
4 | Description: | ||
5 | The /sys/devices/.../power directory contains attributes | ||
6 | allowing the user space to check and modify some power | ||
7 | management related properties of given device. | ||
8 | |||
9 | What: /sys/devices/.../power/wakeup | ||
10 | Date: January 2009 | ||
11 | Contact: Rafael J. Wysocki <rjw@sisk.pl> | ||
12 | Description: | ||
13 | The /sys/devices/.../power/wakeup attribute allows the user | ||
14 | space to check if the device is enabled to wake up the system | ||
15 | from sleep states, such as the memory sleep state (suspend to | ||
16 | RAM) and hibernation (suspend to disk), and to enable or disable | ||
17 | it to do that as desired. | ||
18 | |||
19 | Some devices support "wakeup" events, which are hardware signals | ||
20 | used to activate the system from a sleep state. Such devices | ||
21 | have one of the following two values for the sysfs power/wakeup | ||
22 | file: | ||
23 | |||
24 | + "enabled\n" to issue the events; | ||
25 | + "disabled\n" not to do so; | ||
26 | |||
27 | In that cases the user space can change the setting represented | ||
28 | by the contents of this file by writing either "enabled", or | ||
29 | "disabled" to it. | ||
30 | |||
31 | For the devices that are not capable of generating system wakeup | ||
32 | events this file contains "\n". In that cases the user space | ||
33 | cannot modify the contents of this file and the device cannot be | ||
34 | enabled to wake up the system. | ||
35 | |||
36 | What: /sys/devices/.../power/control | ||
37 | Date: January 2009 | ||
38 | Contact: Rafael J. Wysocki <rjw@sisk.pl> | ||
39 | Description: | ||
40 | The /sys/devices/.../power/control attribute allows the user | ||
41 | space to control the run-time power management of the device. | ||
42 | |||
43 | All devices have one of the following two values for the | ||
44 | power/control file: | ||
45 | |||
46 | + "auto\n" to allow the device to be power managed at run time; | ||
47 | + "on\n" to prevent the device from being power managed; | ||
48 | |||
49 | The default for all devices is "auto", which means that they may | ||
50 | be subject to automatic power management, depending on their | ||
51 | drivers. Changing this attribute to "on" prevents the driver | ||
52 | from power managing the device at run time. Doing that while | ||
53 | the device is suspended causes it to be woken up. | ||
54 | |||
55 | What: /sys/devices/.../power/async | ||
56 | Date: January 2009 | ||
57 | Contact: Rafael J. Wysocki <rjw@sisk.pl> | ||
58 | Description: | ||
59 | The /sys/devices/.../async attribute allows the user space to | ||
60 | enable or diasble the device's suspend and resume callbacks to | ||
61 | be executed asynchronously (ie. in separate threads, in parallel | ||
62 | with the main suspend/resume thread) during system-wide power | ||
63 | transitions (eg. suspend to RAM, hibernation). | ||
64 | |||
65 | All devices have one of the following two values for the | ||
66 | power/async file: | ||
67 | |||
68 | + "enabled\n" to permit the asynchronous suspend/resume; | ||
69 | + "disabled\n" to forbid it; | ||
70 | |||
71 | The value of this attribute may be changed by writing either | ||
72 | "enabled", or "disabled" to it. | ||
73 | |||
74 | It generally is unsafe to permit the asynchronous suspend/resume | ||
75 | of a device unless it is certain that all of the PM dependencies | ||
76 | of the device are known to the PM core. However, for some | ||
77 | devices this attribute is set to "enabled" by bus type code or | ||
78 | device drivers and in that cases it should be safe to leave the | ||
79 | default value. | ||
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-picolcd b/Documentation/ABI/testing/sysfs-driver-hid-picolcd new file mode 100644 index 000000000000..08579e7e1e89 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-driver-hid-picolcd | |||
@@ -0,0 +1,43 @@ | |||
1 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/operation_mode | ||
2 | Date: March 2010 | ||
3 | Contact: Bruno Prémont <bonbons@linux-vserver.org> | ||
4 | Description: Make it possible to switch the PicoLCD device between LCD | ||
5 | (firmware) and bootloader (flasher) operation modes. | ||
6 | |||
7 | Reading: returns list of available modes, the active mode being | ||
8 | enclosed in brackets ('[' and ']') | ||
9 | |||
10 | Writing: causes operation mode switch. Permitted values are | ||
11 | the non-active mode names listed when read. | ||
12 | |||
13 | Note: when switching mode the current PicoLCD HID device gets | ||
14 | disconnected and reconnects after above delay (see attribute | ||
15 | operation_mode_delay for its value). | ||
16 | |||
17 | |||
18 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/operation_mode_delay | ||
19 | Date: April 2010 | ||
20 | Contact: Bruno Prémont <bonbons@linux-vserver.org> | ||
21 | Description: Delay PicoLCD waits before restarting in new mode when | ||
22 | operation_mode has changed. | ||
23 | |||
24 | Reading/Writing: It is expressed in ms and permitted range is | ||
25 | 0..30000ms. | ||
26 | |||
27 | |||
28 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/fb_update_rate | ||
29 | Date: March 2010 | ||
30 | Contact: Bruno Prémont <bonbons@linux-vserver.org> | ||
31 | Description: Make it possible to adjust defio refresh rate. | ||
32 | |||
33 | Reading: returns list of available refresh rates (expressed in Hz), | ||
34 | the active refresh rate being enclosed in brackets ('[' and ']') | ||
35 | |||
36 | Writing: accepts new refresh rate expressed in integer Hz | ||
37 | within permitted rates. | ||
38 | |||
39 | Note: As device can barely do 2 complete refreshes a second | ||
40 | it only makes sense to adjust this value if only one or two | ||
41 | tiles get changed and it's not appropriate to expect the application | ||
42 | to flush it's tiny changes explicitely at higher than default rate. | ||
43 | |||
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-prodikeys b/Documentation/ABI/testing/sysfs-driver-hid-prodikeys new file mode 100644 index 000000000000..05d988c29a83 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-driver-hid-prodikeys | |||
@@ -0,0 +1,29 @@ | |||
1 | What: /sys/bus/hid/drivers/prodikeys/.../channel | ||
2 | Date: April 2010 | ||
3 | KernelVersion: 2.6.34 | ||
4 | Contact: Don Prince <dhprince.devel@yahoo.co.uk> | ||
5 | Description: | ||
6 | Allows control (via software) the midi channel to which | ||
7 | that the pc-midi keyboard will output.midi data. | ||
8 | Range: 0..15 | ||
9 | Type: Read/write | ||
10 | What: /sys/bus/hid/drivers/prodikeys/.../sustain | ||
11 | Date: April 2010 | ||
12 | KernelVersion: 2.6.34 | ||
13 | Contact: Don Prince <dhprince.devel@yahoo.co.uk> | ||
14 | Description: | ||
15 | Allows control (via software) the sustain duration of a | ||
16 | note held by the pc-midi driver. | ||
17 | 0 means sustain mode is disabled. | ||
18 | Range: 0..5000 (milliseconds) | ||
19 | Type: Read/write | ||
20 | What: /sys/bus/hid/drivers/prodikeys/.../octave | ||
21 | Date: April 2010 | ||
22 | KernelVersion: 2.6.34 | ||
23 | Contact: Don Prince <dhprince.devel@yahoo.co.uk> | ||
24 | Description: | ||
25 | Controls the octave shift modifier in the pc-midi driver. | ||
26 | The octave can be shifted via software up/down 2 octaves. | ||
27 | 0 means the no ocatve shift. | ||
28 | Range: -2..2 (minus 2 to plus 2) | ||
29 | Type: Read/Write | ||
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone new file mode 100644 index 000000000000..88340a23ce91 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone | |||
@@ -0,0 +1,111 @@ | |||
1 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_dpi | ||
2 | Date: March 2010 | ||
3 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
4 | Description: It is possible to switch the dpi setting of the mouse with the | ||
5 | press of a button. | ||
6 | When read, this file returns the raw number of the actual dpi | ||
7 | setting reported by the mouse. This number has to be further | ||
8 | processed to receive the real dpi value. | ||
9 | |||
10 | VALUE DPI | ||
11 | 1 800 | ||
12 | 2 1200 | ||
13 | 3 1600 | ||
14 | 4 2000 | ||
15 | 5 2400 | ||
16 | 6 3200 | ||
17 | |||
18 | This file is readonly. | ||
19 | |||
20 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_profile | ||
21 | Date: March 2010 | ||
22 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
23 | Description: When read, this file returns the number of the actual profile. | ||
24 | This file is readonly. | ||
25 | |||
26 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/firmware_version | ||
27 | Date: March 2010 | ||
28 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
29 | Description: When read, this file returns the raw integer version number of the | ||
30 | firmware reported by the mouse. Using the integer value eases | ||
31 | further usage in other programs. To receive the real version | ||
32 | number the decimal point has to be shifted 2 positions to the | ||
33 | left. E.g. a returned value of 138 means 1.38 | ||
34 | This file is readonly. | ||
35 | |||
36 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/kone_driver_version | ||
37 | Date: March 2010 | ||
38 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
39 | Description: When read, this file returns the driver version. | ||
40 | The format of the string is "v<major>.<minor>.<patchlevel>". | ||
41 | This attribute is used by the userland tools to find the sysfs- | ||
42 | paths of installed kone-mice and determine the capabilites of | ||
43 | the driver. Versions of this driver for old kernels replace | ||
44 | usbhid instead of generic-usb. The way to scan for this file | ||
45 | has been chosen to provide a consistent way for all supported | ||
46 | kernel versions. | ||
47 | This file is readonly. | ||
48 | |||
49 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/profile[1-5] | ||
50 | Date: March 2010 | ||
51 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
52 | Description: The mouse can store 5 profiles which can be switched by the | ||
53 | press of a button. A profile holds informations like button | ||
54 | mappings, sensitivity, the colors of the 5 leds and light | ||
55 | effects. | ||
56 | When read, these files return the respective profile. The | ||
57 | returned data is 975 bytes in size. | ||
58 | When written, this file lets one write the respective profile | ||
59 | data back to the mouse. The data has to be 975 bytes long. | ||
60 | The mouse will reject invalid data, whereas the profile number | ||
61 | stored in the profile doesn't need to fit the number of the | ||
62 | store. | ||
63 | |||
64 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/settings | ||
65 | Date: March 2010 | ||
66 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
67 | Description: When read, this file returns the settings stored in the mouse. | ||
68 | The size of the data is 36 bytes and holds information like the | ||
69 | startup_profile, tcu state and calibration_data. | ||
70 | When written, this file lets write settings back to the mouse. | ||
71 | The data has to be 36 bytes long. The mouse will reject invalid | ||
72 | data. | ||
73 | |||
74 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/startup_profile | ||
75 | Date: March 2010 | ||
76 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
77 | Description: The integer value of this attribute ranges from 1 to 5. | ||
78 | When read, this attribute returns the number of the profile | ||
79 | that's active when the mouse is powered on. | ||
80 | When written, this file sets the number of the startup profile | ||
81 | and the mouse activates this profile immediately. | ||
82 | |||
83 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/tcu | ||
84 | Date: March 2010 | ||
85 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
86 | Description: The mouse has a "Tracking Control Unit" which lets the user | ||
87 | calibrate the laser power to fit the mousepad surface. | ||
88 | When read, this file returns the current state of the TCU, | ||
89 | where 0 means off and 1 means on. | ||
90 | Writing 0 in this file will switch the TCU off. | ||
91 | Writing 1 in this file will start the calibration which takes | ||
92 | around 6 seconds to complete and activates the TCU. | ||
93 | |||
94 | What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/weight | ||
95 | Date: March 2010 | ||
96 | Contact: Stefan Achatz <erazor_de@users.sourceforge.net> | ||
97 | Description: The mouse can be equipped with one of four supplied weights | ||
98 | ranging from 5 to 20 grams which are recognized by the mouse | ||
99 | and its value can be read out. When read, this file returns the | ||
100 | raw value returned by the mouse which eases further processing | ||
101 | in other software. | ||
102 | The values map to the weights as follows: | ||
103 | |||
104 | VALUE WEIGHT | ||
105 | 0 none | ||
106 | 1 5g | ||
107 | 2 10g | ||
108 | 3 15g | ||
109 | 4 20g | ||
110 | |||
111 | This file is readonly. | ||
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-laptop b/Documentation/ABI/testing/sysfs-platform-asus-laptop index a1cb660c50cf..1d775390e856 100644 --- a/Documentation/ABI/testing/sysfs-platform-asus-laptop +++ b/Documentation/ABI/testing/sysfs-platform-asus-laptop | |||
@@ -1,4 +1,4 @@ | |||
1 | What: /sys/devices/platform/asus-laptop/display | 1 | What: /sys/devices/platform/asus_laptop/display |
2 | Date: January 2007 | 2 | Date: January 2007 |
3 | KernelVersion: 2.6.20 | 3 | KernelVersion: 2.6.20 |
4 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 4 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
@@ -13,7 +13,7 @@ Description: | |||
13 | Ex: - 0 (0000b) means no display | 13 | Ex: - 0 (0000b) means no display |
14 | - 3 (0011b) CRT+LCD. | 14 | - 3 (0011b) CRT+LCD. |
15 | 15 | ||
16 | What: /sys/devices/platform/asus-laptop/gps | 16 | What: /sys/devices/platform/asus_laptop/gps |
17 | Date: January 2007 | 17 | Date: January 2007 |
18 | KernelVersion: 2.6.20 | 18 | KernelVersion: 2.6.20 |
19 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 19 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
@@ -21,7 +21,7 @@ Description: | |||
21 | Control the gps device. 1 means on, 0 means off. | 21 | Control the gps device. 1 means on, 0 means off. |
22 | Users: Lapsus | 22 | Users: Lapsus |
23 | 23 | ||
24 | What: /sys/devices/platform/asus-laptop/ledd | 24 | What: /sys/devices/platform/asus_laptop/ledd |
25 | Date: January 2007 | 25 | Date: January 2007 |
26 | KernelVersion: 2.6.20 | 26 | KernelVersion: 2.6.20 |
27 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 27 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
@@ -29,11 +29,11 @@ Description: | |||
29 | Some models like the W1N have a LED display that can be | 29 | Some models like the W1N have a LED display that can be |
30 | used to display several informations. | 30 | used to display several informations. |
31 | To control the LED display, use the following : | 31 | To control the LED display, use the following : |
32 | echo 0x0T000DDD > /sys/devices/platform/asus-laptop/ | 32 | echo 0x0T000DDD > /sys/devices/platform/asus_laptop/ |
33 | where T control the 3 letters display, and DDD the 3 digits display. | 33 | where T control the 3 letters display, and DDD the 3 digits display. |
34 | The DDD table can be found in Documentation/laptops/asus-laptop.txt | 34 | The DDD table can be found in Documentation/laptops/asus-laptop.txt |
35 | 35 | ||
36 | What: /sys/devices/platform/asus-laptop/bluetooth | 36 | What: /sys/devices/platform/asus_laptop/bluetooth |
37 | Date: January 2007 | 37 | Date: January 2007 |
38 | KernelVersion: 2.6.20 | 38 | KernelVersion: 2.6.20 |
39 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 39 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
@@ -42,7 +42,7 @@ Description: | |||
42 | This may control the led, the device or both. | 42 | This may control the led, the device or both. |
43 | Users: Lapsus | 43 | Users: Lapsus |
44 | 44 | ||
45 | What: /sys/devices/platform/asus-laptop/wlan | 45 | What: /sys/devices/platform/asus_laptop/wlan |
46 | Date: January 2007 | 46 | Date: January 2007 |
47 | KernelVersion: 2.6.20 | 47 | KernelVersion: 2.6.20 |
48 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 48 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
diff --git a/Documentation/ABI/testing/sysfs-platform-eeepc-laptop b/Documentation/ABI/testing/sysfs-platform-eeepc-laptop index 7445dfb321b5..5b026c69587a 100644 --- a/Documentation/ABI/testing/sysfs-platform-eeepc-laptop +++ b/Documentation/ABI/testing/sysfs-platform-eeepc-laptop | |||
@@ -1,4 +1,4 @@ | |||
1 | What: /sys/devices/platform/eeepc-laptop/disp | 1 | What: /sys/devices/platform/eeepc/disp |
2 | Date: May 2008 | 2 | Date: May 2008 |
3 | KernelVersion: 2.6.26 | 3 | KernelVersion: 2.6.26 |
4 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 4 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
@@ -9,21 +9,21 @@ Description: | |||
9 | - 3 = LCD+CRT | 9 | - 3 = LCD+CRT |
10 | If you run X11, you should use xrandr instead. | 10 | If you run X11, you should use xrandr instead. |
11 | 11 | ||
12 | What: /sys/devices/platform/eeepc-laptop/camera | 12 | What: /sys/devices/platform/eeepc/camera |
13 | Date: May 2008 | 13 | Date: May 2008 |
14 | KernelVersion: 2.6.26 | 14 | KernelVersion: 2.6.26 |
15 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 15 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
16 | Description: | 16 | Description: |
17 | Control the camera. 1 means on, 0 means off. | 17 | Control the camera. 1 means on, 0 means off. |
18 | 18 | ||
19 | What: /sys/devices/platform/eeepc-laptop/cardr | 19 | What: /sys/devices/platform/eeepc/cardr |
20 | Date: May 2008 | 20 | Date: May 2008 |
21 | KernelVersion: 2.6.26 | 21 | KernelVersion: 2.6.26 |
22 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 22 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
23 | Description: | 23 | Description: |
24 | Control the card reader. 1 means on, 0 means off. | 24 | Control the card reader. 1 means on, 0 means off. |
25 | 25 | ||
26 | What: /sys/devices/platform/eeepc-laptop/cpufv | 26 | What: /sys/devices/platform/eeepc/cpufv |
27 | Date: Jun 2009 | 27 | Date: Jun 2009 |
28 | KernelVersion: 2.6.31 | 28 | KernelVersion: 2.6.31 |
29 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 29 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
@@ -42,7 +42,7 @@ Description: | |||
42 | `------------ Availables modes | 42 | `------------ Availables modes |
43 | For example, 0x301 means: mode 1 selected, 3 available modes. | 43 | For example, 0x301 means: mode 1 selected, 3 available modes. |
44 | 44 | ||
45 | What: /sys/devices/platform/eeepc-laptop/available_cpufv | 45 | What: /sys/devices/platform/eeepc/available_cpufv |
46 | Date: Jun 2009 | 46 | Date: Jun 2009 |
47 | KernelVersion: 2.6.31 | 47 | KernelVersion: 2.6.31 |
48 | Contact: "Corentin Chary" <corentincj@iksaif.net> | 48 | Contact: "Corentin Chary" <corentincj@iksaif.net> |
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index dcff4d0623ad..d6a801f45b48 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power | |||
@@ -101,3 +101,16 @@ Description: | |||
101 | 101 | ||
102 | CAUTION: Using it will cause your machine's real-time (CMOS) | 102 | CAUTION: Using it will cause your machine's real-time (CMOS) |
103 | clock to be set to a random invalid time after a resume. | 103 | clock to be set to a random invalid time after a resume. |
104 | |||
105 | What: /sys/power/pm_async | ||
106 | Date: January 2009 | ||
107 | Contact: Rafael J. Wysocki <rjw@sisk.pl> | ||
108 | Description: | ||
109 | The /sys/power/pm_async file controls the switch allowing the | ||
110 | user space to enable or disable asynchronous suspend and resume | ||
111 | of devices. If enabled, this feature will cause some device | ||
112 | drivers' suspend and resume callbacks to be executed in parallel | ||
113 | with each other and with the main suspend thread. It is enabled | ||
114 | if this file contains "1", which is the default. It may be | ||
115 | disabled by writing "0" to this file, in which case all devices | ||
116 | will be suspended and resumed synchronously. | ||
diff --git a/Documentation/ABI/testing/sysfs-wacom b/Documentation/ABI/testing/sysfs-wacom new file mode 100644 index 000000000000..1517976e25c4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-wacom | |||
@@ -0,0 +1,10 @@ | |||
1 | What: /sys/class/hidraw/hidraw*/device/speed | ||
2 | Date: April 2010 | ||
3 | Kernel Version: 2.6.35 | ||
4 | Contact: linux-bluetooth@vger.kernel.org | ||
5 | Description: | ||
6 | The /sys/class/hidraw/hidraw*/device/speed file controls | ||
7 | reporting speed of wacom bluetooth tablet. Reading from | ||
8 | this file returns 1 if tablet reports in high speed mode | ||
9 | or 0 otherwise. Writing to this file one of these values | ||
10 | switches reporting speed. | ||
diff --git a/Documentation/Changes b/Documentation/Changes index f08b313cd235..eca9f6e6fbe6 100644 --- a/Documentation/Changes +++ b/Documentation/Changes | |||
@@ -49,7 +49,7 @@ o oprofile 0.9 # oprofiled --version | |||
49 | o udev 081 # udevinfo -V | 49 | o udev 081 # udevinfo -V |
50 | o grub 0.93 # grub --version | 50 | o grub 0.93 # grub --version |
51 | o mcelog 0.6 | 51 | o mcelog 0.6 |
52 | o iptables 1.4.1 # iptables -V | 52 | o iptables 1.4.2 # iptables -V |
53 | 53 | ||
54 | 54 | ||
55 | Kernel compilation | 55 | Kernel compilation |
diff --git a/Documentation/PCI/PCI-DMA-mapping.txt b/Documentation/DMA-API-HOWTO.txt index ecad88d9fe59..98ce51796f71 100644 --- a/Documentation/PCI/PCI-DMA-mapping.txt +++ b/Documentation/DMA-API-HOWTO.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | Dynamic DMA mapping | 1 | Dynamic DMA mapping Guide |
2 | =================== | 2 | ========================= |
3 | 3 | ||
4 | David S. Miller <davem@redhat.com> | 4 | David S. Miller <davem@redhat.com> |
5 | Richard Henderson <rth@cygnus.com> | 5 | Richard Henderson <rth@cygnus.com> |
6 | Jakub Jelinek <jakub@redhat.com> | 6 | Jakub Jelinek <jakub@redhat.com> |
7 | 7 | ||
8 | This document describes the DMA mapping system in terms of the pci_ | 8 | This is a guide to device driver writers on how to use the DMA API |
9 | API. For a similar API that works for generic devices, see | 9 | with example pseudo-code. For a concise description of the API, see |
10 | DMA-API.txt. | 10 | DMA-API.txt. |
11 | 11 | ||
12 | Most of the 64bit platforms have special hardware that translates bus | 12 | Most of the 64bit platforms have special hardware that translates bus |
@@ -26,12 +26,15 @@ mapped only for the time they are actually used and unmapped after the DMA | |||
26 | transfer. | 26 | transfer. |
27 | 27 | ||
28 | The following API will work of course even on platforms where no such | 28 | The following API will work of course even on platforms where no such |
29 | hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on | 29 | hardware exists. |
30 | top of the virt_to_bus interface. | 30 | |
31 | Note that the DMA API works with any bus independent of the underlying | ||
32 | microprocessor architecture. You should use the DMA API rather than | ||
33 | the bus specific DMA API (e.g. pci_dma_*). | ||
31 | 34 | ||
32 | First of all, you should make sure | 35 | First of all, you should make sure |
33 | 36 | ||
34 | #include <linux/pci.h> | 37 | #include <linux/dma-mapping.h> |
35 | 38 | ||
36 | is in your driver. This file will obtain for you the definition of the | 39 | is in your driver. This file will obtain for you the definition of the |
37 | dma_addr_t (which can hold any valid DMA address for the platform) | 40 | dma_addr_t (which can hold any valid DMA address for the platform) |
@@ -78,44 +81,43 @@ for you to DMA from/to. | |||
78 | DMA addressing limitations | 81 | DMA addressing limitations |
79 | 82 | ||
80 | Does your device have any DMA addressing limitations? For example, is | 83 | Does your device have any DMA addressing limitations? For example, is |
81 | your device only capable of driving the low order 24-bits of address | 84 | your device only capable of driving the low order 24-bits of address? |
82 | on the PCI bus for SAC DMA transfers? If so, you need to inform the | 85 | If so, you need to inform the kernel of this fact. |
83 | PCI layer of this fact. | ||
84 | 86 | ||
85 | By default, the kernel assumes that your device can address the full | 87 | By default, the kernel assumes that your device can address the full |
86 | 32-bits in a SAC cycle. For a 64-bit DAC capable device, this needs | 88 | 32-bits. For a 64-bit capable device, this needs to be increased. |
87 | to be increased. And for a device with limitations, as discussed in | 89 | And for a device with limitations, as discussed in the previous |
88 | the previous paragraph, it needs to be decreased. | 90 | paragraph, it needs to be decreased. |
89 | 91 | ||
90 | pci_alloc_consistent() by default will return 32-bit DMA addresses. | 92 | Special note about PCI: PCI-X specification requires PCI-X devices to |
91 | PCI-X specification requires PCI-X devices to support 64-bit | 93 | support 64-bit addressing (DAC) for all transactions. And at least |
92 | addressing (DAC) for all transactions. And at least one platform (SGI | 94 | one platform (SGI SN2) requires 64-bit consistent allocations to |
93 | SN2) requires 64-bit consistent allocations to operate correctly when | 95 | operate correctly when the IO bus is in PCI-X mode. |
94 | the IO bus is in PCI-X mode. Therefore, like with pci_set_dma_mask(), | 96 | |
95 | it's good practice to call pci_set_consistent_dma_mask() to set the | 97 | For correct operation, you must interrogate the kernel in your device |
96 | appropriate mask even if your device only supports 32-bit DMA | 98 | probe routine to see if the DMA controller on the machine can properly |
97 | (default) and especially if it's a PCI-X device. | 99 | support the DMA addressing limitation your device has. It is good |
98 | 100 | style to do this even if your device holds the default setting, | |
99 | For correct operation, you must interrogate the PCI layer in your | ||
100 | device probe routine to see if the PCI controller on the machine can | ||
101 | properly support the DMA addressing limitation your device has. It is | ||
102 | good style to do this even if your device holds the default setting, | ||
103 | because this shows that you did think about these issues wrt. your | 101 | because this shows that you did think about these issues wrt. your |
104 | device. | 102 | device. |
105 | 103 | ||
106 | The query is performed via a call to pci_set_dma_mask(): | 104 | The query is performed via a call to dma_set_mask(): |
107 | 105 | ||
108 | int pci_set_dma_mask(struct pci_dev *pdev, u64 device_mask); | 106 | int dma_set_mask(struct device *dev, u64 mask); |
109 | 107 | ||
110 | The query for consistent allocations is performed via a call to | 108 | The query for consistent allocations is performed via a call to |
111 | pci_set_consistent_dma_mask(): | 109 | dma_set_coherent_mask(): |
112 | 110 | ||
113 | int pci_set_consistent_dma_mask(struct pci_dev *pdev, u64 device_mask); | 111 | int dma_set_coherent_mask(struct device *dev, u64 mask); |
114 | 112 | ||
115 | Here, pdev is a pointer to the PCI device struct of your device, and | 113 | Here, dev is a pointer to the device struct of your device, and mask |
116 | device_mask is a bit mask describing which bits of a PCI address your | 114 | is a bit mask describing which bits of an address your device |
117 | device supports. It returns zero if your card can perform DMA | 115 | supports. It returns zero if your card can perform DMA properly on |
118 | properly on the machine given the address mask you provided. | 116 | the machine given the address mask you provided. In general, the |
117 | device struct of your device is embedded in the bus specific device | ||
118 | struct of your device. For example, a pointer to the device struct of | ||
119 | your PCI device is pdev->dev (pdev is a pointer to the PCI device | ||
120 | struct of your device). | ||
119 | 121 | ||
120 | If it returns non-zero, your device cannot perform DMA properly on | 122 | If it returns non-zero, your device cannot perform DMA properly on |
121 | this platform, and attempting to do so will result in undefined | 123 | this platform, and attempting to do so will result in undefined |
@@ -133,31 +135,30 @@ of your driver reports that performance is bad or that the device is not | |||
133 | even detected, you can ask them for the kernel messages to find out | 135 | even detected, you can ask them for the kernel messages to find out |
134 | exactly why. | 136 | exactly why. |
135 | 137 | ||
136 | The standard 32-bit addressing PCI device would do something like | 138 | The standard 32-bit addressing device would do something like this: |
137 | this: | ||
138 | 139 | ||
139 | if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { | 140 | if (dma_set_mask(dev, DMA_BIT_MASK(32))) { |
140 | printk(KERN_WARNING | 141 | printk(KERN_WARNING |
141 | "mydev: No suitable DMA available.\n"); | 142 | "mydev: No suitable DMA available.\n"); |
142 | goto ignore_this_device; | 143 | goto ignore_this_device; |
143 | } | 144 | } |
144 | 145 | ||
145 | Another common scenario is a 64-bit capable device. The approach | 146 | Another common scenario is a 64-bit capable device. The approach here |
146 | here is to try for 64-bit DAC addressing, but back down to a | 147 | is to try for 64-bit addressing, but back down to a 32-bit mask that |
147 | 32-bit mask should that fail. The PCI platform code may fail the | 148 | should not fail. The kernel may fail the 64-bit mask not because the |
148 | 64-bit mask not because the platform is not capable of 64-bit | 149 | platform is not capable of 64-bit addressing. Rather, it may fail in |
149 | addressing. Rather, it may fail in this case simply because | 150 | this case simply because 32-bit addressing is done more efficiently |
150 | 32-bit SAC addressing is done more efficiently than DAC addressing. | 151 | than 64-bit addressing. For example, Sparc64 PCI SAC addressing is |
151 | Sparc64 is one platform which behaves in this way. | 152 | more efficient than DAC addressing. |
152 | 153 | ||
153 | Here is how you would handle a 64-bit capable device which can drive | 154 | Here is how you would handle a 64-bit capable device which can drive |
154 | all 64-bits when accessing streaming DMA: | 155 | all 64-bits when accessing streaming DMA: |
155 | 156 | ||
156 | int using_dac; | 157 | int using_dac; |
157 | 158 | ||
158 | if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { | 159 | if (!dma_set_mask(dev, DMA_BIT_MASK(64))) { |
159 | using_dac = 1; | 160 | using_dac = 1; |
160 | } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { | 161 | } else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) { |
161 | using_dac = 0; | 162 | using_dac = 0; |
162 | } else { | 163 | } else { |
163 | printk(KERN_WARNING | 164 | printk(KERN_WARNING |
@@ -170,36 +171,36 @@ the case would look like this: | |||
170 | 171 | ||
171 | int using_dac, consistent_using_dac; | 172 | int using_dac, consistent_using_dac; |
172 | 173 | ||
173 | if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { | 174 | if (!dma_set_mask(dev, DMA_BIT_MASK(64))) { |
174 | using_dac = 1; | 175 | using_dac = 1; |
175 | consistent_using_dac = 1; | 176 | consistent_using_dac = 1; |
176 | pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); | 177 | dma_set_coherent_mask(dev, DMA_BIT_MASK(64)); |
177 | } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { | 178 | } else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) { |
178 | using_dac = 0; | 179 | using_dac = 0; |
179 | consistent_using_dac = 0; | 180 | consistent_using_dac = 0; |
180 | pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); | 181 | dma_set_coherent_mask(dev, DMA_BIT_MASK(32)); |
181 | } else { | 182 | } else { |
182 | printk(KERN_WARNING | 183 | printk(KERN_WARNING |
183 | "mydev: No suitable DMA available.\n"); | 184 | "mydev: No suitable DMA available.\n"); |
184 | goto ignore_this_device; | 185 | goto ignore_this_device; |
185 | } | 186 | } |
186 | 187 | ||
187 | pci_set_consistent_dma_mask() will always be able to set the same or a | 188 | dma_set_coherent_mask() will always be able to set the same or a |
188 | smaller mask as pci_set_dma_mask(). However for the rare case that a | 189 | smaller mask as dma_set_mask(). However for the rare case that a |
189 | device driver only uses consistent allocations, one would have to | 190 | device driver only uses consistent allocations, one would have to |
190 | check the return value from pci_set_consistent_dma_mask(). | 191 | check the return value from dma_set_coherent_mask(). |
191 | 192 | ||
192 | Finally, if your device can only drive the low 24-bits of | 193 | Finally, if your device can only drive the low 24-bits of |
193 | address during PCI bus mastering you might do something like: | 194 | address you might do something like: |
194 | 195 | ||
195 | if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) { | 196 | if (dma_set_mask(dev, DMA_BIT_MASK(24))) { |
196 | printk(KERN_WARNING | 197 | printk(KERN_WARNING |
197 | "mydev: 24-bit DMA addressing not available.\n"); | 198 | "mydev: 24-bit DMA addressing not available.\n"); |
198 | goto ignore_this_device; | 199 | goto ignore_this_device; |
199 | } | 200 | } |
200 | 201 | ||
201 | When pci_set_dma_mask() is successful, and returns zero, the PCI layer | 202 | When dma_set_mask() is successful, and returns zero, the kernel saves |
202 | saves away this mask you have provided. The PCI layer will use this | 203 | away this mask you have provided. The kernel will use this |
203 | information later when you make DMA mappings. | 204 | information later when you make DMA mappings. |
204 | 205 | ||
205 | There is a case which we are aware of at this time, which is worth | 206 | There is a case which we are aware of at this time, which is worth |
@@ -208,7 +209,7 @@ functions (for example a sound card provides playback and record | |||
208 | functions) and the various different functions have _different_ | 209 | functions) and the various different functions have _different_ |
209 | DMA addressing limitations, you may wish to probe each mask and | 210 | DMA addressing limitations, you may wish to probe each mask and |
210 | only provide the functionality which the machine can handle. It | 211 | only provide the functionality which the machine can handle. It |
211 | is important that the last call to pci_set_dma_mask() be for the | 212 | is important that the last call to dma_set_mask() be for the |
212 | most specific mask. | 213 | most specific mask. |
213 | 214 | ||
214 | Here is pseudo-code showing how this might be done: | 215 | Here is pseudo-code showing how this might be done: |
@@ -217,17 +218,17 @@ Here is pseudo-code showing how this might be done: | |||
217 | #define RECORD_ADDRESS_BITS DMA_BIT_MASK(24) | 218 | #define RECORD_ADDRESS_BITS DMA_BIT_MASK(24) |
218 | 219 | ||
219 | struct my_sound_card *card; | 220 | struct my_sound_card *card; |
220 | struct pci_dev *pdev; | 221 | struct device *dev; |
221 | 222 | ||
222 | ... | 223 | ... |
223 | if (!pci_set_dma_mask(pdev, PLAYBACK_ADDRESS_BITS)) { | 224 | if (!dma_set_mask(dev, PLAYBACK_ADDRESS_BITS)) { |
224 | card->playback_enabled = 1; | 225 | card->playback_enabled = 1; |
225 | } else { | 226 | } else { |
226 | card->playback_enabled = 0; | 227 | card->playback_enabled = 0; |
227 | printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n", | 228 | printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n", |
228 | card->name); | 229 | card->name); |
229 | } | 230 | } |
230 | if (!pci_set_dma_mask(pdev, RECORD_ADDRESS_BITS)) { | 231 | if (!dma_set_mask(dev, RECORD_ADDRESS_BITS)) { |
231 | card->record_enabled = 1; | 232 | card->record_enabled = 1; |
232 | } else { | 233 | } else { |
233 | card->record_enabled = 0; | 234 | card->record_enabled = 0; |
@@ -252,8 +253,8 @@ There are two types of DMA mappings: | |||
252 | Think of "consistent" as "synchronous" or "coherent". | 253 | Think of "consistent" as "synchronous" or "coherent". |
253 | 254 | ||
254 | The current default is to return consistent memory in the low 32 | 255 | The current default is to return consistent memory in the low 32 |
255 | bits of the PCI bus space. However, for future compatibility you | 256 | bits of the bus space. However, for future compatibility you should |
256 | should set the consistent mask even if this default is fine for your | 257 | set the consistent mask even if this default is fine for your |
257 | driver. | 258 | driver. |
258 | 259 | ||
259 | Good examples of what to use consistent mappings for are: | 260 | Good examples of what to use consistent mappings for are: |
@@ -285,9 +286,9 @@ There are two types of DMA mappings: | |||
285 | found in PCI bridges (such as by reading a register's value | 286 | found in PCI bridges (such as by reading a register's value |
286 | after writing it). | 287 | after writing it). |
287 | 288 | ||
288 | - Streaming DMA mappings which are usually mapped for one DMA transfer, | 289 | - Streaming DMA mappings which are usually mapped for one DMA |
289 | unmapped right after it (unless you use pci_dma_sync_* below) and for which | 290 | transfer, unmapped right after it (unless you use dma_sync_* below) |
290 | hardware can optimize for sequential accesses. | 291 | and for which hardware can optimize for sequential accesses. |
291 | 292 | ||
292 | This of "streaming" as "asynchronous" or "outside the coherency | 293 | This of "streaming" as "asynchronous" or "outside the coherency |
293 | domain". | 294 | domain". |
@@ -302,8 +303,8 @@ There are two types of DMA mappings: | |||
302 | optimizations the hardware allows. To this end, when using | 303 | optimizations the hardware allows. To this end, when using |
303 | such mappings you must be explicit about what you want to happen. | 304 | such mappings you must be explicit about what you want to happen. |
304 | 305 | ||
305 | Neither type of DMA mapping has alignment restrictions that come | 306 | Neither type of DMA mapping has alignment restrictions that come from |
306 | from PCI, although some devices may have such restrictions. | 307 | the underlying bus, although some devices may have such restrictions. |
307 | Also, systems with caches that aren't DMA-coherent will work better | 308 | Also, systems with caches that aren't DMA-coherent will work better |
308 | when the underlying buffers don't share cache lines with other data. | 309 | when the underlying buffers don't share cache lines with other data. |
309 | 310 | ||
@@ -315,33 +316,27 @@ you should do: | |||
315 | 316 | ||
316 | dma_addr_t dma_handle; | 317 | dma_addr_t dma_handle; |
317 | 318 | ||
318 | cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle); | 319 | cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp); |
319 | |||
320 | where pdev is a struct pci_dev *. This may be called in interrupt context. | ||
321 | You should use dma_alloc_coherent (see DMA-API.txt) for buses | ||
322 | where devices don't have struct pci_dev (like ISA, EISA). | ||
323 | 320 | ||
324 | This argument is needed because the DMA translations may be bus | 321 | where device is a struct device *. This may be called in interrupt |
325 | specific (and often is private to the bus which the device is attached | 322 | context with the GFP_ATOMIC flag. |
326 | to). | ||
327 | 323 | ||
328 | Size is the length of the region you want to allocate, in bytes. | 324 | Size is the length of the region you want to allocate, in bytes. |
329 | 325 | ||
330 | This routine will allocate RAM for that region, so it acts similarly to | 326 | This routine will allocate RAM for that region, so it acts similarly to |
331 | __get_free_pages (but takes size instead of a page order). If your | 327 | __get_free_pages (but takes size instead of a page order). If your |
332 | driver needs regions sized smaller than a page, you may prefer using | 328 | driver needs regions sized smaller than a page, you may prefer using |
333 | the pci_pool interface, described below. | 329 | the dma_pool interface, described below. |
334 | 330 | ||
335 | The consistent DMA mapping interfaces, for non-NULL pdev, will by | 331 | The consistent DMA mapping interfaces, for non-NULL dev, will by |
336 | default return a DMA address which is SAC (Single Address Cycle) | 332 | default return a DMA address which is 32-bit addressable. Even if the |
337 | addressable. Even if the device indicates (via PCI dma mask) that it | 333 | device indicates (via DMA mask) that it may address the upper 32-bits, |
338 | may address the upper 32-bits and thus perform DAC cycles, consistent | 334 | consistent allocation will only return > 32-bit addresses for DMA if |
339 | allocation will only return > 32-bit PCI addresses for DMA if the | 335 | the consistent DMA mask has been explicitly changed via |
340 | consistent dma mask has been explicitly changed via | 336 | dma_set_coherent_mask(). This is true of the dma_pool interface as |
341 | pci_set_consistent_dma_mask(). This is true of the pci_pool interface | 337 | well. |
342 | as well. | 338 | |
343 | 339 | dma_alloc_coherent returns two values: the virtual address which you | |
344 | pci_alloc_consistent returns two values: the virtual address which you | ||
345 | can use to access it from the CPU and dma_handle which you pass to the | 340 | can use to access it from the CPU and dma_handle which you pass to the |
346 | card. | 341 | card. |
347 | 342 | ||
@@ -354,54 +349,54 @@ buffer you receive will not cross a 64K boundary. | |||
354 | 349 | ||
355 | To unmap and free such a DMA region, you call: | 350 | To unmap and free such a DMA region, you call: |
356 | 351 | ||
357 | pci_free_consistent(pdev, size, cpu_addr, dma_handle); | 352 | dma_free_coherent(dev, size, cpu_addr, dma_handle); |
358 | 353 | ||
359 | where pdev, size are the same as in the above call and cpu_addr and | 354 | where dev, size are the same as in the above call and cpu_addr and |
360 | dma_handle are the values pci_alloc_consistent returned to you. | 355 | dma_handle are the values dma_alloc_coherent returned to you. |
361 | This function may not be called in interrupt context. | 356 | This function may not be called in interrupt context. |
362 | 357 | ||
363 | If your driver needs lots of smaller memory regions, you can write | 358 | If your driver needs lots of smaller memory regions, you can write |
364 | custom code to subdivide pages returned by pci_alloc_consistent, | 359 | custom code to subdivide pages returned by dma_alloc_coherent, |
365 | or you can use the pci_pool API to do that. A pci_pool is like | 360 | or you can use the dma_pool API to do that. A dma_pool is like |
366 | a kmem_cache, but it uses pci_alloc_consistent not __get_free_pages. | 361 | a kmem_cache, but it uses dma_alloc_coherent not __get_free_pages. |
367 | Also, it understands common hardware constraints for alignment, | 362 | Also, it understands common hardware constraints for alignment, |
368 | like queue heads needing to be aligned on N byte boundaries. | 363 | like queue heads needing to be aligned on N byte boundaries. |
369 | 364 | ||
370 | Create a pci_pool like this: | 365 | Create a dma_pool like this: |
371 | 366 | ||
372 | struct pci_pool *pool; | 367 | struct dma_pool *pool; |
373 | 368 | ||
374 | pool = pci_pool_create(name, pdev, size, align, alloc); | 369 | pool = dma_pool_create(name, dev, size, align, alloc); |
375 | 370 | ||
376 | The "name" is for diagnostics (like a kmem_cache name); pdev and size | 371 | The "name" is for diagnostics (like a kmem_cache name); dev and size |
377 | are as above. The device's hardware alignment requirement for this | 372 | are as above. The device's hardware alignment requirement for this |
378 | type of data is "align" (which is expressed in bytes, and must be a | 373 | type of data is "align" (which is expressed in bytes, and must be a |
379 | power of two). If your device has no boundary crossing restrictions, | 374 | power of two). If your device has no boundary crossing restrictions, |
380 | pass 0 for alloc; passing 4096 says memory allocated from this pool | 375 | pass 0 for alloc; passing 4096 says memory allocated from this pool |
381 | must not cross 4KByte boundaries (but at that time it may be better to | 376 | must not cross 4KByte boundaries (but at that time it may be better to |
382 | go for pci_alloc_consistent directly instead). | 377 | go for dma_alloc_coherent directly instead). |
383 | 378 | ||
384 | Allocate memory from a pci pool like this: | 379 | Allocate memory from a dma pool like this: |
385 | 380 | ||
386 | cpu_addr = pci_pool_alloc(pool, flags, &dma_handle); | 381 | cpu_addr = dma_pool_alloc(pool, flags, &dma_handle); |
387 | 382 | ||
388 | flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor | 383 | flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor |
389 | holding SMP locks), SLAB_ATOMIC otherwise. Like pci_alloc_consistent, | 384 | holding SMP locks), SLAB_ATOMIC otherwise. Like dma_alloc_coherent, |
390 | this returns two values, cpu_addr and dma_handle. | 385 | this returns two values, cpu_addr and dma_handle. |
391 | 386 | ||
392 | Free memory that was allocated from a pci_pool like this: | 387 | Free memory that was allocated from a dma_pool like this: |
393 | 388 | ||
394 | pci_pool_free(pool, cpu_addr, dma_handle); | 389 | dma_pool_free(pool, cpu_addr, dma_handle); |
395 | 390 | ||
396 | where pool is what you passed to pci_pool_alloc, and cpu_addr and | 391 | where pool is what you passed to dma_pool_alloc, and cpu_addr and |
397 | dma_handle are the values pci_pool_alloc returned. This function | 392 | dma_handle are the values dma_pool_alloc returned. This function |
398 | may be called in interrupt context. | 393 | may be called in interrupt context. |
399 | 394 | ||
400 | Destroy a pci_pool by calling: | 395 | Destroy a dma_pool by calling: |
401 | 396 | ||
402 | pci_pool_destroy(pool); | 397 | dma_pool_destroy(pool); |
403 | 398 | ||
404 | Make sure you've called pci_pool_free for all memory allocated | 399 | Make sure you've called dma_pool_free for all memory allocated |
405 | from a pool before you destroy the pool. This function may not | 400 | from a pool before you destroy the pool. This function may not |
406 | be called in interrupt context. | 401 | be called in interrupt context. |
407 | 402 | ||
@@ -411,15 +406,15 @@ The interfaces described in subsequent portions of this document | |||
411 | take a DMA direction argument, which is an integer and takes on | 406 | take a DMA direction argument, which is an integer and takes on |
412 | one of the following values: | 407 | one of the following values: |
413 | 408 | ||
414 | PCI_DMA_BIDIRECTIONAL | 409 | DMA_BIDIRECTIONAL |
415 | PCI_DMA_TODEVICE | 410 | DMA_TO_DEVICE |
416 | PCI_DMA_FROMDEVICE | 411 | DMA_FROM_DEVICE |
417 | PCI_DMA_NONE | 412 | DMA_NONE |
418 | 413 | ||
419 | One should provide the exact DMA direction if you know it. | 414 | One should provide the exact DMA direction if you know it. |
420 | 415 | ||
421 | PCI_DMA_TODEVICE means "from main memory to the PCI device" | 416 | DMA_TO_DEVICE means "from main memory to the device" |
422 | PCI_DMA_FROMDEVICE means "from the PCI device to main memory" | 417 | DMA_FROM_DEVICE means "from the device to main memory" |
423 | It is the direction in which the data moves during the DMA | 418 | It is the direction in which the data moves during the DMA |
424 | transfer. | 419 | transfer. |
425 | 420 | ||
@@ -427,12 +422,12 @@ You are _strongly_ encouraged to specify this as precisely | |||
427 | as you possibly can. | 422 | as you possibly can. |
428 | 423 | ||
429 | If you absolutely cannot know the direction of the DMA transfer, | 424 | If you absolutely cannot know the direction of the DMA transfer, |
430 | specify PCI_DMA_BIDIRECTIONAL. It means that the DMA can go in | 425 | specify DMA_BIDIRECTIONAL. It means that the DMA can go in |
431 | either direction. The platform guarantees that you may legally | 426 | either direction. The platform guarantees that you may legally |
432 | specify this, and that it will work, but this may be at the | 427 | specify this, and that it will work, but this may be at the |
433 | cost of performance for example. | 428 | cost of performance for example. |
434 | 429 | ||
435 | The value PCI_DMA_NONE is to be used for debugging. One can | 430 | The value DMA_NONE is to be used for debugging. One can |
436 | hold this in a data structure before you come to know the | 431 | hold this in a data structure before you come to know the |
437 | precise direction, and this will help catch cases where your | 432 | precise direction, and this will help catch cases where your |
438 | direction tracking logic has failed to set things up properly. | 433 | direction tracking logic has failed to set things up properly. |
@@ -442,21 +437,21 @@ potential platform-specific optimizations of such) is for debugging. | |||
442 | Some platforms actually have a write permission boolean which DMA | 437 | Some platforms actually have a write permission boolean which DMA |
443 | mappings can be marked with, much like page protections in the user | 438 | mappings can be marked with, much like page protections in the user |
444 | program address space. Such platforms can and do report errors in the | 439 | program address space. Such platforms can and do report errors in the |
445 | kernel logs when the PCI controller hardware detects violation of the | 440 | kernel logs when the DMA controller hardware detects violation of the |
446 | permission setting. | 441 | permission setting. |
447 | 442 | ||
448 | Only streaming mappings specify a direction, consistent mappings | 443 | Only streaming mappings specify a direction, consistent mappings |
449 | implicitly have a direction attribute setting of | 444 | implicitly have a direction attribute setting of |
450 | PCI_DMA_BIDIRECTIONAL. | 445 | DMA_BIDIRECTIONAL. |
451 | 446 | ||
452 | The SCSI subsystem tells you the direction to use in the | 447 | The SCSI subsystem tells you the direction to use in the |
453 | 'sc_data_direction' member of the SCSI command your driver is | 448 | 'sc_data_direction' member of the SCSI command your driver is |
454 | working on. | 449 | working on. |
455 | 450 | ||
456 | For Networking drivers, it's a rather simple affair. For transmit | 451 | For Networking drivers, it's a rather simple affair. For transmit |
457 | packets, map/unmap them with the PCI_DMA_TODEVICE direction | 452 | packets, map/unmap them with the DMA_TO_DEVICE direction |
458 | specifier. For receive packets, just the opposite, map/unmap them | 453 | specifier. For receive packets, just the opposite, map/unmap them |
459 | with the PCI_DMA_FROMDEVICE direction specifier. | 454 | with the DMA_FROM_DEVICE direction specifier. |
460 | 455 | ||
461 | Using Streaming DMA mappings | 456 | Using Streaming DMA mappings |
462 | 457 | ||
@@ -467,43 +462,43 @@ scatterlist. | |||
467 | 462 | ||
468 | To map a single region, you do: | 463 | To map a single region, you do: |
469 | 464 | ||
470 | struct pci_dev *pdev = mydev->pdev; | 465 | struct device *dev = &my_dev->dev; |
471 | dma_addr_t dma_handle; | 466 | dma_addr_t dma_handle; |
472 | void *addr = buffer->ptr; | 467 | void *addr = buffer->ptr; |
473 | size_t size = buffer->len; | 468 | size_t size = buffer->len; |
474 | 469 | ||
475 | dma_handle = pci_map_single(pdev, addr, size, direction); | 470 | dma_handle = dma_map_single(dev, addr, size, direction); |
476 | 471 | ||
477 | and to unmap it: | 472 | and to unmap it: |
478 | 473 | ||
479 | pci_unmap_single(pdev, dma_handle, size, direction); | 474 | dma_unmap_single(dev, dma_handle, size, direction); |
480 | 475 | ||
481 | You should call pci_unmap_single when the DMA activity is finished, e.g. | 476 | You should call dma_unmap_single when the DMA activity is finished, e.g. |
482 | from the interrupt which told you that the DMA transfer is done. | 477 | from the interrupt which told you that the DMA transfer is done. |
483 | 478 | ||
484 | Using cpu pointers like this for single mappings has a disadvantage, | 479 | Using cpu pointers like this for single mappings has a disadvantage, |
485 | you cannot reference HIGHMEM memory in this way. Thus, there is a | 480 | you cannot reference HIGHMEM memory in this way. Thus, there is a |
486 | map/unmap interface pair akin to pci_{map,unmap}_single. These | 481 | map/unmap interface pair akin to dma_{map,unmap}_single. These |
487 | interfaces deal with page/offset pairs instead of cpu pointers. | 482 | interfaces deal with page/offset pairs instead of cpu pointers. |
488 | Specifically: | 483 | Specifically: |
489 | 484 | ||
490 | struct pci_dev *pdev = mydev->pdev; | 485 | struct device *dev = &my_dev->dev; |
491 | dma_addr_t dma_handle; | 486 | dma_addr_t dma_handle; |
492 | struct page *page = buffer->page; | 487 | struct page *page = buffer->page; |
493 | unsigned long offset = buffer->offset; | 488 | unsigned long offset = buffer->offset; |
494 | size_t size = buffer->len; | 489 | size_t size = buffer->len; |
495 | 490 | ||
496 | dma_handle = pci_map_page(pdev, page, offset, size, direction); | 491 | dma_handle = dma_map_page(dev, page, offset, size, direction); |
497 | 492 | ||
498 | ... | 493 | ... |
499 | 494 | ||
500 | pci_unmap_page(pdev, dma_handle, size, direction); | 495 | dma_unmap_page(dev, dma_handle, size, direction); |
501 | 496 | ||
502 | Here, "offset" means byte offset within the given page. | 497 | Here, "offset" means byte offset within the given page. |
503 | 498 | ||
504 | With scatterlists, you map a region gathered from several regions by: | 499 | With scatterlists, you map a region gathered from several regions by: |
505 | 500 | ||
506 | int i, count = pci_map_sg(pdev, sglist, nents, direction); | 501 | int i, count = dma_map_sg(dev, sglist, nents, direction); |
507 | struct scatterlist *sg; | 502 | struct scatterlist *sg; |
508 | 503 | ||
509 | for_each_sg(sglist, sg, count, i) { | 504 | for_each_sg(sglist, sg, count, i) { |
@@ -527,16 +522,16 @@ accessed sg->address and sg->length as shown above. | |||
527 | 522 | ||
528 | To unmap a scatterlist, just call: | 523 | To unmap a scatterlist, just call: |
529 | 524 | ||
530 | pci_unmap_sg(pdev, sglist, nents, direction); | 525 | dma_unmap_sg(dev, sglist, nents, direction); |
531 | 526 | ||
532 | Again, make sure DMA activity has already finished. | 527 | Again, make sure DMA activity has already finished. |
533 | 528 | ||
534 | PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be | 529 | PLEASE NOTE: The 'nents' argument to the dma_unmap_sg call must be |
535 | the _same_ one you passed into the pci_map_sg call, | 530 | the _same_ one you passed into the dma_map_sg call, |
536 | it should _NOT_ be the 'count' value _returned_ from the | 531 | it should _NOT_ be the 'count' value _returned_ from the |
537 | pci_map_sg call. | 532 | dma_map_sg call. |
538 | 533 | ||
539 | Every pci_map_{single,sg} call should have its pci_unmap_{single,sg} | 534 | Every dma_map_{single,sg} call should have its dma_unmap_{single,sg} |
540 | counterpart, because the bus address space is a shared resource (although | 535 | counterpart, because the bus address space is a shared resource (although |
541 | in some ports the mapping is per each BUS so less devices contend for the | 536 | in some ports the mapping is per each BUS so less devices contend for the |
542 | same bus address space) and you could render the machine unusable by eating | 537 | same bus address space) and you could render the machine unusable by eating |
@@ -547,14 +542,14 @@ the data in between the DMA transfers, the buffer needs to be synced | |||
547 | properly in order for the cpu and device to see the most uptodate and | 542 | properly in order for the cpu and device to see the most uptodate and |
548 | correct copy of the DMA buffer. | 543 | correct copy of the DMA buffer. |
549 | 544 | ||
550 | So, firstly, just map it with pci_map_{single,sg}, and after each DMA | 545 | So, firstly, just map it with dma_map_{single,sg}, and after each DMA |
551 | transfer call either: | 546 | transfer call either: |
552 | 547 | ||
553 | pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction); | 548 | dma_sync_single_for_cpu(dev, dma_handle, size, direction); |
554 | 549 | ||
555 | or: | 550 | or: |
556 | 551 | ||
557 | pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction); | 552 | dma_sync_sg_for_cpu(dev, sglist, nents, direction); |
558 | 553 | ||
559 | as appropriate. | 554 | as appropriate. |
560 | 555 | ||
@@ -562,27 +557,27 @@ Then, if you wish to let the device get at the DMA area again, | |||
562 | finish accessing the data with the cpu, and then before actually | 557 | finish accessing the data with the cpu, and then before actually |
563 | giving the buffer to the hardware call either: | 558 | giving the buffer to the hardware call either: |
564 | 559 | ||
565 | pci_dma_sync_single_for_device(pdev, dma_handle, size, direction); | 560 | dma_sync_single_for_device(dev, dma_handle, size, direction); |
566 | 561 | ||
567 | or: | 562 | or: |
568 | 563 | ||
569 | pci_dma_sync_sg_for_device(dev, sglist, nents, direction); | 564 | dma_sync_sg_for_device(dev, sglist, nents, direction); |
570 | 565 | ||
571 | as appropriate. | 566 | as appropriate. |
572 | 567 | ||
573 | After the last DMA transfer call one of the DMA unmap routines | 568 | After the last DMA transfer call one of the DMA unmap routines |
574 | pci_unmap_{single,sg}. If you don't touch the data from the first pci_map_* | 569 | dma_unmap_{single,sg}. If you don't touch the data from the first dma_map_* |
575 | call till pci_unmap_*, then you don't have to call the pci_dma_sync_* | 570 | call till dma_unmap_*, then you don't have to call the dma_sync_* |
576 | routines at all. | 571 | routines at all. |
577 | 572 | ||
578 | Here is pseudo code which shows a situation in which you would need | 573 | Here is pseudo code which shows a situation in which you would need |
579 | to use the pci_dma_sync_*() interfaces. | 574 | to use the dma_sync_*() interfaces. |
580 | 575 | ||
581 | my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len) | 576 | my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len) |
582 | { | 577 | { |
583 | dma_addr_t mapping; | 578 | dma_addr_t mapping; |
584 | 579 | ||
585 | mapping = pci_map_single(cp->pdev, buffer, len, PCI_DMA_FROMDEVICE); | 580 | mapping = dma_map_single(cp->dev, buffer, len, DMA_FROM_DEVICE); |
586 | 581 | ||
587 | cp->rx_buf = buffer; | 582 | cp->rx_buf = buffer; |
588 | cp->rx_len = len; | 583 | cp->rx_len = len; |
@@ -606,25 +601,25 @@ to use the pci_dma_sync_*() interfaces. | |||
606 | * the DMA transfer with the CPU first | 601 | * the DMA transfer with the CPU first |
607 | * so that we see updated contents. | 602 | * so that we see updated contents. |
608 | */ | 603 | */ |
609 | pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma, | 604 | dma_sync_single_for_cpu(&cp->dev, cp->rx_dma, |
610 | cp->rx_len, | 605 | cp->rx_len, |
611 | PCI_DMA_FROMDEVICE); | 606 | DMA_FROM_DEVICE); |
612 | 607 | ||
613 | /* Now it is safe to examine the buffer. */ | 608 | /* Now it is safe to examine the buffer. */ |
614 | hp = (struct my_card_header *) cp->rx_buf; | 609 | hp = (struct my_card_header *) cp->rx_buf; |
615 | if (header_is_ok(hp)) { | 610 | if (header_is_ok(hp)) { |
616 | pci_unmap_single(cp->pdev, cp->rx_dma, cp->rx_len, | 611 | dma_unmap_single(&cp->dev, cp->rx_dma, cp->rx_len, |
617 | PCI_DMA_FROMDEVICE); | 612 | DMA_FROM_DEVICE); |
618 | pass_to_upper_layers(cp->rx_buf); | 613 | pass_to_upper_layers(cp->rx_buf); |
619 | make_and_setup_new_rx_buf(cp); | 614 | make_and_setup_new_rx_buf(cp); |
620 | } else { | 615 | } else { |
621 | /* Just sync the buffer and give it back | 616 | /* Just sync the buffer and give it back |
622 | * to the card. | 617 | * to the card. |
623 | */ | 618 | */ |
624 | pci_dma_sync_single_for_device(cp->pdev, | 619 | dma_sync_single_for_device(&cp->dev, |
625 | cp->rx_dma, | 620 | cp->rx_dma, |
626 | cp->rx_len, | 621 | cp->rx_len, |
627 | PCI_DMA_FROMDEVICE); | 622 | DMA_FROM_DEVICE); |
628 | give_rx_buf_to_card(cp); | 623 | give_rx_buf_to_card(cp); |
629 | } | 624 | } |
630 | } | 625 | } |
@@ -634,19 +629,49 @@ Drivers converted fully to this interface should not use virt_to_bus any | |||
634 | longer, nor should they use bus_to_virt. Some drivers have to be changed a | 629 | longer, nor should they use bus_to_virt. Some drivers have to be changed a |
635 | little bit, because there is no longer an equivalent to bus_to_virt in the | 630 | little bit, because there is no longer an equivalent to bus_to_virt in the |
636 | dynamic DMA mapping scheme - you have to always store the DMA addresses | 631 | dynamic DMA mapping scheme - you have to always store the DMA addresses |
637 | returned by the pci_alloc_consistent, pci_pool_alloc, and pci_map_single | 632 | returned by the dma_alloc_coherent, dma_pool_alloc, and dma_map_single |
638 | calls (pci_map_sg stores them in the scatterlist itself if the platform | 633 | calls (dma_map_sg stores them in the scatterlist itself if the platform |
639 | supports dynamic DMA mapping in hardware) in your driver structures and/or | 634 | supports dynamic DMA mapping in hardware) in your driver structures and/or |
640 | in the card registers. | 635 | in the card registers. |
641 | 636 | ||
642 | All PCI drivers should be using these interfaces with no exceptions. | 637 | All drivers should be using these interfaces with no exceptions. It |
643 | It is planned to completely remove virt_to_bus() and bus_to_virt() as | 638 | is planned to completely remove virt_to_bus() and bus_to_virt() as |
644 | they are entirely deprecated. Some ports already do not provide these | 639 | they are entirely deprecated. Some ports already do not provide these |
645 | as it is impossible to correctly support them. | 640 | as it is impossible to correctly support them. |
646 | 641 | ||
642 | Handling Errors | ||
643 | |||
644 | DMA address space is limited on some architectures and an allocation | ||
645 | failure can be determined by: | ||
646 | |||
647 | - checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0 | ||
648 | |||
649 | - checking the returned dma_addr_t of dma_map_single and dma_map_page | ||
650 | by using dma_mapping_error(): | ||
651 | |||
652 | dma_addr_t dma_handle; | ||
653 | |||
654 | dma_handle = dma_map_single(dev, addr, size, direction); | ||
655 | if (dma_mapping_error(dev, dma_handle)) { | ||
656 | /* | ||
657 | * reduce current DMA mapping usage, | ||
658 | * delay and try again later or | ||
659 | * reset driver. | ||
660 | */ | ||
661 | } | ||
662 | |||
663 | Networking drivers must call dev_kfree_skb to free the socket buffer | ||
664 | and return NETDEV_TX_OK if the DMA mapping fails on the transmit hook | ||
665 | (ndo_start_xmit). This means that the socket buffer is just dropped in | ||
666 | the failure case. | ||
667 | |||
668 | SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping | ||
669 | fails in the queuecommand hook. This means that the SCSI subsystem | ||
670 | passes the command to the driver again later. | ||
671 | |||
647 | Optimizing Unmap State Space Consumption | 672 | Optimizing Unmap State Space Consumption |
648 | 673 | ||
649 | On many platforms, pci_unmap_{single,page}() is simply a nop. | 674 | On many platforms, dma_unmap_{single,page}() is simply a nop. |
650 | Therefore, keeping track of the mapping address and length is a waste | 675 | Therefore, keeping track of the mapping address and length is a waste |
651 | of space. Instead of filling your drivers up with ifdefs and the like | 676 | of space. Instead of filling your drivers up with ifdefs and the like |
652 | to "work around" this (which would defeat the whole purpose of a | 677 | to "work around" this (which would defeat the whole purpose of a |
@@ -655,7 +680,7 @@ portable API) the following facilities are provided. | |||
655 | Actually, instead of describing the macros one by one, we'll | 680 | Actually, instead of describing the macros one by one, we'll |
656 | transform some example code. | 681 | transform some example code. |
657 | 682 | ||
658 | 1) Use DECLARE_PCI_UNMAP_{ADDR,LEN} in state saving structures. | 683 | 1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures. |
659 | Example, before: | 684 | Example, before: |
660 | 685 | ||
661 | struct ring_state { | 686 | struct ring_state { |
@@ -668,14 +693,11 @@ transform some example code. | |||
668 | 693 | ||
669 | struct ring_state { | 694 | struct ring_state { |
670 | struct sk_buff *skb; | 695 | struct sk_buff *skb; |
671 | DECLARE_PCI_UNMAP_ADDR(mapping) | 696 | DEFINE_DMA_UNMAP_ADDR(mapping); |
672 | DECLARE_PCI_UNMAP_LEN(len) | 697 | DEFINE_DMA_UNMAP_LEN(len); |
673 | }; | 698 | }; |
674 | 699 | ||
675 | NOTE: DO NOT put a semicolon at the end of the DECLARE_*() | 700 | 2) Use dma_unmap_{addr,len}_set to set these values. |
676 | macro. | ||
677 | |||
678 | 2) Use pci_unmap_{addr,len}_set to set these values. | ||
679 | Example, before: | 701 | Example, before: |
680 | 702 | ||
681 | ringp->mapping = FOO; | 703 | ringp->mapping = FOO; |
@@ -683,21 +705,21 @@ transform some example code. | |||
683 | 705 | ||
684 | after: | 706 | after: |
685 | 707 | ||
686 | pci_unmap_addr_set(ringp, mapping, FOO); | 708 | dma_unmap_addr_set(ringp, mapping, FOO); |
687 | pci_unmap_len_set(ringp, len, BAR); | 709 | dma_unmap_len_set(ringp, len, BAR); |
688 | 710 | ||
689 | 3) Use pci_unmap_{addr,len} to access these values. | 711 | 3) Use dma_unmap_{addr,len} to access these values. |
690 | Example, before: | 712 | Example, before: |
691 | 713 | ||
692 | pci_unmap_single(pdev, ringp->mapping, ringp->len, | 714 | dma_unmap_single(dev, ringp->mapping, ringp->len, |
693 | PCI_DMA_FROMDEVICE); | 715 | DMA_FROM_DEVICE); |
694 | 716 | ||
695 | after: | 717 | after: |
696 | 718 | ||
697 | pci_unmap_single(pdev, | 719 | dma_unmap_single(dev, |
698 | pci_unmap_addr(ringp, mapping), | 720 | dma_unmap_addr(ringp, mapping), |
699 | pci_unmap_len(ringp, len), | 721 | dma_unmap_len(ringp, len), |
700 | PCI_DMA_FROMDEVICE); | 722 | DMA_FROM_DEVICE); |
701 | 723 | ||
702 | It really should be self-explanatory. We treat the ADDR and LEN | 724 | It really should be self-explanatory. We treat the ADDR and LEN |
703 | separately, because it is possible for an implementation to only | 725 | separately, because it is possible for an implementation to only |
@@ -711,46 +733,29 @@ to "Closing". | |||
711 | 733 | ||
712 | 1) Struct scatterlist requirements. | 734 | 1) Struct scatterlist requirements. |
713 | 735 | ||
714 | Struct scatterlist must contain, at a minimum, the following | 736 | Don't invent the architecture specific struct scatterlist; just use |
715 | members: | 737 | <asm-generic/scatterlist.h>. You need to enable |
716 | 738 | CONFIG_NEED_SG_DMA_LENGTH if the architecture supports IOMMUs | |
717 | struct page *page; | 739 | (including software IOMMU). |
718 | unsigned int offset; | ||
719 | unsigned int length; | ||
720 | |||
721 | The base address is specified by a "page+offset" pair. | ||
722 | |||
723 | Previous versions of struct scatterlist contained a "void *address" | ||
724 | field that was sometimes used instead of page+offset. As of Linux | ||
725 | 2.5., page+offset is always used, and the "address" field has been | ||
726 | deleted. | ||
727 | |||
728 | 2) More to come... | ||
729 | |||
730 | Handling Errors | ||
731 | |||
732 | DMA address space is limited on some architectures and an allocation | ||
733 | failure can be determined by: | ||
734 | 740 | ||
735 | - checking if pci_alloc_consistent returns NULL or pci_map_sg returns 0 | 741 | 2) ARCH_KMALLOC_MINALIGN |
736 | 742 | ||
737 | - checking the returned dma_addr_t of pci_map_single and pci_map_page | 743 | Architectures must ensure that kmalloc'ed buffer is |
738 | by using pci_dma_mapping_error(): | 744 | DMA-safe. Drivers and subsystems depend on it. If an architecture |
745 | isn't fully DMA-coherent (i.e. hardware doesn't ensure that data in | ||
746 | the CPU cache is identical to data in main memory), | ||
747 | ARCH_KMALLOC_MINALIGN must be set so that the memory allocator | ||
748 | makes sure that kmalloc'ed buffer doesn't share a cache line with | ||
749 | the others. See arch/arm/include/asm/cache.h as an example. | ||
739 | 750 | ||
740 | dma_addr_t dma_handle; | 751 | Note that ARCH_KMALLOC_MINALIGN is about DMA memory alignment |
741 | 752 | constraints. You don't need to worry about the architecture data | |
742 | dma_handle = pci_map_single(pdev, addr, size, direction); | 753 | alignment constraints (e.g. the alignment constraints about 64-bit |
743 | if (pci_dma_mapping_error(pdev, dma_handle)) { | 754 | objects). |
744 | /* | ||
745 | * reduce current DMA mapping usage, | ||
746 | * delay and try again later or | ||
747 | * reset driver. | ||
748 | */ | ||
749 | } | ||
750 | 755 | ||
751 | Closing | 756 | Closing |
752 | 757 | ||
753 | This document, and the API itself, would not be in it's current | 758 | This document, and the API itself, would not be in its current |
754 | form without the feedback and suggestions from numerous individuals. | 759 | form without the feedback and suggestions from numerous individuals. |
755 | We would like to specifically mention, in no particular order, the | 760 | We would like to specifically mention, in no particular order, the |
756 | following people: | 761 | following people: |
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 5aceb88b3f8b..05e2ae236865 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt | |||
@@ -4,20 +4,18 @@ | |||
4 | James E.J. Bottomley <James.Bottomley@HansenPartnership.com> | 4 | James E.J. Bottomley <James.Bottomley@HansenPartnership.com> |
5 | 5 | ||
6 | This document describes the DMA API. For a more gentle introduction | 6 | This document describes the DMA API. For a more gentle introduction |
7 | phrased in terms of the pci_ equivalents (and actual examples) see | 7 | of the API (and actual examples) see |
8 | Documentation/PCI/PCI-DMA-mapping.txt. | 8 | Documentation/DMA-API-HOWTO.txt. |
9 | 9 | ||
10 | This API is split into two pieces. Part I describes the API and the | 10 | This API is split into two pieces. Part I describes the API. Part II |
11 | corresponding pci_ API. Part II describes the extensions to the API | 11 | describes the extensions to the API for supporting non-consistent |
12 | for supporting non-consistent memory machines. Unless you know that | 12 | memory machines. Unless you know that your driver absolutely has to |
13 | your driver absolutely has to support non-consistent platforms (this | 13 | support non-consistent platforms (this is usually only legacy |
14 | is usually only legacy platforms) you should only use the API | 14 | platforms) you should only use the API described in part I. |
15 | described in part I. | ||
16 | 15 | ||
17 | Part I - pci_ and dma_ Equivalent API | 16 | Part I - dma_ API |
18 | ------------------------------------- | 17 | ------------------------------------- |
19 | 18 | ||
20 | To get the pci_ API, you must #include <linux/pci.h> | ||
21 | To get the dma_ API, you must #include <linux/dma-mapping.h> | 19 | To get the dma_ API, you must #include <linux/dma-mapping.h> |
22 | 20 | ||
23 | 21 | ||
@@ -27,9 +25,6 @@ Part Ia - Using large dma-coherent buffers | |||
27 | void * | 25 | void * |
28 | dma_alloc_coherent(struct device *dev, size_t size, | 26 | dma_alloc_coherent(struct device *dev, size_t size, |
29 | dma_addr_t *dma_handle, gfp_t flag) | 27 | dma_addr_t *dma_handle, gfp_t flag) |
30 | void * | ||
31 | pci_alloc_consistent(struct pci_dev *dev, size_t size, | ||
32 | dma_addr_t *dma_handle) | ||
33 | 28 | ||
34 | Consistent memory is memory for which a write by either the device or | 29 | Consistent memory is memory for which a write by either the device or |
35 | the processor can immediately be read by the processor or device | 30 | the processor can immediately be read by the processor or device |
@@ -53,15 +48,11 @@ The simplest way to do that is to use the dma_pool calls (see below). | |||
53 | The flag parameter (dma_alloc_coherent only) allows the caller to | 48 | The flag parameter (dma_alloc_coherent only) allows the caller to |
54 | specify the GFP_ flags (see kmalloc) for the allocation (the | 49 | specify the GFP_ flags (see kmalloc) for the allocation (the |
55 | implementation may choose to ignore flags that affect the location of | 50 | implementation may choose to ignore flags that affect the location of |
56 | the returned memory, like GFP_DMA). For pci_alloc_consistent, you | 51 | the returned memory, like GFP_DMA). |
57 | must assume GFP_ATOMIC behaviour. | ||
58 | 52 | ||
59 | void | 53 | void |
60 | dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, | 54 | dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, |
61 | dma_addr_t dma_handle) | 55 | dma_addr_t dma_handle) |
62 | void | ||
63 | pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr, | ||
64 | dma_addr_t dma_handle) | ||
65 | 56 | ||
66 | Free the region of consistent memory you previously allocated. dev, | 57 | Free the region of consistent memory you previously allocated. dev, |
67 | size and dma_handle must all be the same as those passed into the | 58 | size and dma_handle must all be the same as those passed into the |
@@ -89,10 +80,6 @@ for alignment, like queue heads needing to be aligned on N-byte boundaries. | |||
89 | dma_pool_create(const char *name, struct device *dev, | 80 | dma_pool_create(const char *name, struct device *dev, |
90 | size_t size, size_t align, size_t alloc); | 81 | size_t size, size_t align, size_t alloc); |
91 | 82 | ||
92 | struct pci_pool * | ||
93 | pci_pool_create(const char *name, struct pci_device *dev, | ||
94 | size_t size, size_t align, size_t alloc); | ||
95 | |||
96 | The pool create() routines initialize a pool of dma-coherent buffers | 83 | The pool create() routines initialize a pool of dma-coherent buffers |
97 | for use with a given device. It must be called in a context which | 84 | for use with a given device. It must be called in a context which |
98 | can sleep. | 85 | can sleep. |
@@ -108,9 +95,6 @@ from this pool must not cross 4KByte boundaries. | |||
108 | void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags, | 95 | void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags, |
109 | dma_addr_t *dma_handle); | 96 | dma_addr_t *dma_handle); |
110 | 97 | ||
111 | void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags, | ||
112 | dma_addr_t *dma_handle); | ||
113 | |||
114 | This allocates memory from the pool; the returned memory will meet the size | 98 | This allocates memory from the pool; the returned memory will meet the size |
115 | and alignment requirements specified at creation time. Pass GFP_ATOMIC to | 99 | and alignment requirements specified at creation time. Pass GFP_ATOMIC to |
116 | prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks), | 100 | prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks), |
@@ -122,9 +106,6 @@ pool's device. | |||
122 | void dma_pool_free(struct dma_pool *pool, void *vaddr, | 106 | void dma_pool_free(struct dma_pool *pool, void *vaddr, |
123 | dma_addr_t addr); | 107 | dma_addr_t addr); |
124 | 108 | ||
125 | void pci_pool_free(struct pci_pool *pool, void *vaddr, | ||
126 | dma_addr_t addr); | ||
127 | |||
128 | This puts memory back into the pool. The pool is what was passed to | 109 | This puts memory back into the pool. The pool is what was passed to |
129 | the pool allocation routine; the cpu (vaddr) and dma addresses are what | 110 | the pool allocation routine; the cpu (vaddr) and dma addresses are what |
130 | were returned when that routine allocated the memory being freed. | 111 | were returned when that routine allocated the memory being freed. |
@@ -132,8 +113,6 @@ were returned when that routine allocated the memory being freed. | |||
132 | 113 | ||
133 | void dma_pool_destroy(struct dma_pool *pool); | 114 | void dma_pool_destroy(struct dma_pool *pool); |
134 | 115 | ||
135 | void pci_pool_destroy(struct pci_pool *pool); | ||
136 | |||
137 | The pool destroy() routines free the resources of the pool. They must be | 116 | The pool destroy() routines free the resources of the pool. They must be |
138 | called in a context which can sleep. Make sure you've freed all allocated | 117 | called in a context which can sleep. Make sure you've freed all allocated |
139 | memory back to the pool before you destroy it. | 118 | memory back to the pool before you destroy it. |
@@ -144,8 +123,6 @@ Part Ic - DMA addressing limitations | |||
144 | 123 | ||
145 | int | 124 | int |
146 | dma_supported(struct device *dev, u64 mask) | 125 | dma_supported(struct device *dev, u64 mask) |
147 | int | ||
148 | pci_dma_supported(struct pci_dev *hwdev, u64 mask) | ||
149 | 126 | ||
150 | Checks to see if the device can support DMA to the memory described by | 127 | Checks to see if the device can support DMA to the memory described by |
151 | mask. | 128 | mask. |
@@ -159,8 +136,14 @@ driver writers. | |||
159 | 136 | ||
160 | int | 137 | int |
161 | dma_set_mask(struct device *dev, u64 mask) | 138 | dma_set_mask(struct device *dev, u64 mask) |
139 | |||
140 | Checks to see if the mask is possible and updates the device | ||
141 | parameters if it is. | ||
142 | |||
143 | Returns: 0 if successful and a negative error if not. | ||
144 | |||
162 | int | 145 | int |
163 | pci_set_dma_mask(struct pci_device *dev, u64 mask) | 146 | dma_set_coherent_mask(struct device *dev, u64 mask) |
164 | 147 | ||
165 | Checks to see if the mask is possible and updates the device | 148 | Checks to see if the mask is possible and updates the device |
166 | parameters if it is. | 149 | parameters if it is. |
@@ -187,9 +170,6 @@ Part Id - Streaming DMA mappings | |||
187 | dma_addr_t | 170 | dma_addr_t |
188 | dma_map_single(struct device *dev, void *cpu_addr, size_t size, | 171 | dma_map_single(struct device *dev, void *cpu_addr, size_t size, |
189 | enum dma_data_direction direction) | 172 | enum dma_data_direction direction) |
190 | dma_addr_t | ||
191 | pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size, | ||
192 | int direction) | ||
193 | 173 | ||
194 | Maps a piece of processor virtual memory so it can be accessed by the | 174 | Maps a piece of processor virtual memory so it can be accessed by the |
195 | device and returns the physical handle of the memory. | 175 | device and returns the physical handle of the memory. |
@@ -198,14 +178,10 @@ The direction for both api's may be converted freely by casting. | |||
198 | However the dma_ API uses a strongly typed enumerator for its | 178 | However the dma_ API uses a strongly typed enumerator for its |
199 | direction: | 179 | direction: |
200 | 180 | ||
201 | DMA_NONE = PCI_DMA_NONE no direction (used for | 181 | DMA_NONE no direction (used for debugging) |
202 | debugging) | 182 | DMA_TO_DEVICE data is going from the memory to the device |
203 | DMA_TO_DEVICE = PCI_DMA_TODEVICE data is going from the | 183 | DMA_FROM_DEVICE data is coming from the device to the memory |
204 | memory to the device | 184 | DMA_BIDIRECTIONAL direction isn't known |
205 | DMA_FROM_DEVICE = PCI_DMA_FROMDEVICE data is coming from | ||
206 | the device to the | ||
207 | memory | ||
208 | DMA_BIDIRECTIONAL = PCI_DMA_BIDIRECTIONAL direction isn't known | ||
209 | 185 | ||
210 | Notes: Not all memory regions in a machine can be mapped by this | 186 | Notes: Not all memory regions in a machine can be mapped by this |
211 | API. Further, regions that appear to be physically contiguous in | 187 | API. Further, regions that appear to be physically contiguous in |
@@ -268,9 +244,6 @@ cache lines are updated with data that the device may have changed). | |||
268 | void | 244 | void |
269 | dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, | 245 | dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, |
270 | enum dma_data_direction direction) | 246 | enum dma_data_direction direction) |
271 | void | ||
272 | pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, | ||
273 | size_t size, int direction) | ||
274 | 247 | ||
275 | Unmaps the region previously mapped. All the parameters passed in | 248 | Unmaps the region previously mapped. All the parameters passed in |
276 | must be identical to those passed in (and returned) by the mapping | 249 | must be identical to those passed in (and returned) by the mapping |
@@ -280,15 +253,9 @@ dma_addr_t | |||
280 | dma_map_page(struct device *dev, struct page *page, | 253 | dma_map_page(struct device *dev, struct page *page, |
281 | unsigned long offset, size_t size, | 254 | unsigned long offset, size_t size, |
282 | enum dma_data_direction direction) | 255 | enum dma_data_direction direction) |
283 | dma_addr_t | ||
284 | pci_map_page(struct pci_dev *hwdev, struct page *page, | ||
285 | unsigned long offset, size_t size, int direction) | ||
286 | void | 256 | void |
287 | dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, | 257 | dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, |
288 | enum dma_data_direction direction) | 258 | enum dma_data_direction direction) |
289 | void | ||
290 | pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address, | ||
291 | size_t size, int direction) | ||
292 | 259 | ||
293 | API for mapping and unmapping for pages. All the notes and warnings | 260 | API for mapping and unmapping for pages. All the notes and warnings |
294 | for the other mapping APIs apply here. Also, although the <offset> | 261 | for the other mapping APIs apply here. Also, although the <offset> |
@@ -299,9 +266,6 @@ cache width is. | |||
299 | int | 266 | int |
300 | dma_mapping_error(struct device *dev, dma_addr_t dma_addr) | 267 | dma_mapping_error(struct device *dev, dma_addr_t dma_addr) |
301 | 268 | ||
302 | int | ||
303 | pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr) | ||
304 | |||
305 | In some circumstances dma_map_single and dma_map_page will fail to create | 269 | In some circumstances dma_map_single and dma_map_page will fail to create |
306 | a mapping. A driver can check for these errors by testing the returned | 270 | a mapping. A driver can check for these errors by testing the returned |
307 | dma address with dma_mapping_error(). A non-zero return value means the mapping | 271 | dma address with dma_mapping_error(). A non-zero return value means the mapping |
@@ -311,9 +275,6 @@ reduce current DMA mapping usage or delay and try again later). | |||
311 | int | 275 | int |
312 | dma_map_sg(struct device *dev, struct scatterlist *sg, | 276 | dma_map_sg(struct device *dev, struct scatterlist *sg, |
313 | int nents, enum dma_data_direction direction) | 277 | int nents, enum dma_data_direction direction) |
314 | int | ||
315 | pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, | ||
316 | int nents, int direction) | ||
317 | 278 | ||
318 | Returns: the number of physical segments mapped (this may be shorter | 279 | Returns: the number of physical segments mapped (this may be shorter |
319 | than <nents> passed in if some elements of the scatter/gather list are | 280 | than <nents> passed in if some elements of the scatter/gather list are |
@@ -353,9 +314,6 @@ accessed sg->address and sg->length as shown above. | |||
353 | void | 314 | void |
354 | dma_unmap_sg(struct device *dev, struct scatterlist *sg, | 315 | dma_unmap_sg(struct device *dev, struct scatterlist *sg, |
355 | int nhwentries, enum dma_data_direction direction) | 316 | int nhwentries, enum dma_data_direction direction) |
356 | void | ||
357 | pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, | ||
358 | int nents, int direction) | ||
359 | 317 | ||
360 | Unmap the previously mapped scatter/gather list. All the parameters | 318 | Unmap the previously mapped scatter/gather list. All the parameters |
361 | must be the same as those and passed in to the scatter/gather mapping | 319 | must be the same as those and passed in to the scatter/gather mapping |
@@ -365,21 +323,23 @@ Note: <nents> must be the number you passed in, *not* the number of | |||
365 | physical entries returned. | 323 | physical entries returned. |
366 | 324 | ||
367 | void | 325 | void |
368 | dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, | 326 | dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, |
369 | enum dma_data_direction direction) | 327 | enum dma_data_direction direction) |
370 | void | 328 | void |
371 | pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle, | 329 | dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, |
372 | size_t size, int direction) | 330 | enum dma_data_direction direction) |
373 | void | 331 | void |
374 | dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, | 332 | dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, |
375 | enum dma_data_direction direction) | 333 | enum dma_data_direction direction) |
376 | void | 334 | void |
377 | pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, | 335 | dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, |
378 | int nelems, int direction) | 336 | enum dma_data_direction direction) |
379 | 337 | ||
380 | Synchronise a single contiguous or scatter/gather mapping. All the | 338 | Synchronise a single contiguous or scatter/gather mapping for the cpu |
381 | parameters must be the same as those passed into the single mapping | 339 | and device. With the sync_sg API, all the parameters must be the same |
382 | API. | 340 | as those passed into the single mapping API. With the sync_single API, |
341 | you can use dma_handle and size parameters that aren't identical to | ||
342 | those passed into the single mapping API to do a partial sync. | ||
383 | 343 | ||
384 | Notes: You must do this: | 344 | Notes: You must do this: |
385 | 345 | ||
@@ -461,9 +421,9 @@ void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr, | |||
461 | Part II - Advanced dma_ usage | 421 | Part II - Advanced dma_ usage |
462 | ----------------------------- | 422 | ----------------------------- |
463 | 423 | ||
464 | Warning: These pieces of the DMA API have no PCI equivalent. They | 424 | Warning: These pieces of the DMA API should not be used in the |
465 | should also not be used in the majority of cases, since they cater for | 425 | majority of cases, since they cater for unlikely corner cases that |
466 | unlikely corner cases that don't belong in usual drivers. | 426 | don't belong in usual drivers. |
467 | 427 | ||
468 | If you don't understand how cache line coherency works between a | 428 | If you don't understand how cache line coherency works between a |
469 | processor and an I/O device, you should not be using this part of the | 429 | processor and an I/O device, you should not be using this part of the |
@@ -514,16 +474,6 @@ into the width returned by this call. It will also always be a power | |||
514 | of two for easy alignment. | 474 | of two for easy alignment. |
515 | 475 | ||
516 | void | 476 | void |
517 | dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, | ||
518 | unsigned long offset, size_t size, | ||
519 | enum dma_data_direction direction) | ||
520 | |||
521 | Does a partial sync, starting at offset and continuing for size. You | ||
522 | must be careful to observe the cache alignment and width when doing | ||
523 | anything like this. You must also be extra careful about accessing | ||
524 | memory you intend to sync partially. | ||
525 | |||
526 | void | ||
527 | dma_cache_sync(struct device *dev, void *vaddr, size_t size, | 477 | dma_cache_sync(struct device *dev, void *vaddr, size_t size, |
528 | enum dma_data_direction direction) | 478 | enum dma_data_direction direction) |
529 | 479 | ||
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 325cfd1d6d99..c7e5dc7e8cb3 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile | |||
@@ -14,7 +14,7 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ | |||
14 | genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ | 14 | genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ |
15 | mac80211.xml debugobjects.xml sh.xml regulator.xml \ | 15 | mac80211.xml debugobjects.xml sh.xml regulator.xml \ |
16 | alsa-driver-api.xml writing-an-alsa-driver.xml \ | 16 | alsa-driver-api.xml writing-an-alsa-driver.xml \ |
17 | tracepoint.xml media.xml | 17 | tracepoint.xml media.xml drm.xml |
18 | 18 | ||
19 | ### | 19 | ### |
20 | # The build process is as follows (targets): | 20 | # The build process is as follows (targets): |
diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index f9a6e2c75f12..1b2dd4fc3db2 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl | |||
@@ -45,7 +45,7 @@ | |||
45 | </sect1> | 45 | </sect1> |
46 | 46 | ||
47 | <sect1><title>Atomic and pointer manipulation</title> | 47 | <sect1><title>Atomic and pointer manipulation</title> |
48 | !Iarch/x86/include/asm/atomic_32.h | 48 | !Iarch/x86/include/asm/atomic.h |
49 | !Iarch/x86/include/asm/unaligned.h | 49 | !Iarch/x86/include/asm/unaligned.h |
50 | </sect1> | 50 | </sect1> |
51 | 51 | ||
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl index 3ed88126ab8f..c1ed6a49e598 100644 --- a/Documentation/DocBook/deviceiobook.tmpl +++ b/Documentation/DocBook/deviceiobook.tmpl | |||
@@ -316,7 +316,7 @@ CPU B: spin_unlock_irqrestore(&dev_lock, flags) | |||
316 | 316 | ||
317 | <chapter id="pubfunctions"> | 317 | <chapter id="pubfunctions"> |
318 | <title>Public Functions Provided</title> | 318 | <title>Public Functions Provided</title> |
319 | !Iarch/x86/include/asm/io_32.h | 319 | !Iarch/x86/include/asm/io.h |
320 | !Elib/iomap.c | 320 | !Elib/iomap.c |
321 | </chapter> | 321 | </chapter> |
322 | 322 | ||
diff --git a/Documentation/DocBook/drm.tmpl b/Documentation/DocBook/drm.tmpl new file mode 100644 index 000000000000..7583dc7cf64d --- /dev/null +++ b/Documentation/DocBook/drm.tmpl | |||
@@ -0,0 +1,839 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8"?> | ||
2 | <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" | ||
3 | "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> | ||
4 | |||
5 | <book id="drmDevelopersGuide"> | ||
6 | <bookinfo> | ||
7 | <title>Linux DRM Developer's Guide</title> | ||
8 | |||
9 | <copyright> | ||
10 | <year>2008-2009</year> | ||
11 | <holder> | ||
12 | Intel Corporation (Jesse Barnes <jesse.barnes@intel.com>) | ||
13 | </holder> | ||
14 | </copyright> | ||
15 | |||
16 | <legalnotice> | ||
17 | <para> | ||
18 | The contents of this file may be used under the terms of the GNU | ||
19 | General Public License version 2 (the "GPL") as distributed in | ||
20 | the kernel source COPYING file. | ||
21 | </para> | ||
22 | </legalnotice> | ||
23 | </bookinfo> | ||
24 | |||
25 | <toc></toc> | ||
26 | |||
27 | <!-- Introduction --> | ||
28 | |||
29 | <chapter id="drmIntroduction"> | ||
30 | <title>Introduction</title> | ||
31 | <para> | ||
32 | The Linux DRM layer contains code intended to support the needs | ||
33 | of complex graphics devices, usually containing programmable | ||
34 | pipelines well suited to 3D graphics acceleration. Graphics | ||
35 | drivers in the kernel can make use of DRM functions to make | ||
36 | tasks like memory management, interrupt handling and DMA easier, | ||
37 | and provide a uniform interface to applications. | ||
38 | </para> | ||
39 | <para> | ||
40 | A note on versions: this guide covers features found in the DRM | ||
41 | tree, including the TTM memory manager, output configuration and | ||
42 | mode setting, and the new vblank internals, in addition to all | ||
43 | the regular features found in current kernels. | ||
44 | </para> | ||
45 | <para> | ||
46 | [Insert diagram of typical DRM stack here] | ||
47 | </para> | ||
48 | </chapter> | ||
49 | |||
50 | <!-- Internals --> | ||
51 | |||
52 | <chapter id="drmInternals"> | ||
53 | <title>DRM Internals</title> | ||
54 | <para> | ||
55 | This chapter documents DRM internals relevant to driver authors | ||
56 | and developers working to add support for the latest features to | ||
57 | existing drivers. | ||
58 | </para> | ||
59 | <para> | ||
60 | First, we'll go over some typical driver initialization | ||
61 | requirements, like setting up command buffers, creating an | ||
62 | initial output configuration, and initializing core services. | ||
63 | Subsequent sections will cover core internals in more detail, | ||
64 | providing implementation notes and examples. | ||
65 | </para> | ||
66 | <para> | ||
67 | The DRM layer provides several services to graphics drivers, | ||
68 | many of them driven by the application interfaces it provides | ||
69 | through libdrm, the library that wraps most of the DRM ioctls. | ||
70 | These include vblank event handling, memory | ||
71 | management, output management, framebuffer management, command | ||
72 | submission & fencing, suspend/resume support, and DMA | ||
73 | services. | ||
74 | </para> | ||
75 | <para> | ||
76 | The core of every DRM driver is struct drm_device. Drivers | ||
77 | will typically statically initialize a drm_device structure, | ||
78 | then pass it to drm_init() at load time. | ||
79 | </para> | ||
80 | |||
81 | <!-- Internals: driver init --> | ||
82 | |||
83 | <sect1> | ||
84 | <title>Driver initialization</title> | ||
85 | <para> | ||
86 | Before calling the DRM initialization routines, the driver must | ||
87 | first create and fill out a struct drm_device structure. | ||
88 | </para> | ||
89 | <programlisting> | ||
90 | static struct drm_driver driver = { | ||
91 | /* don't use mtrr's here, the Xserver or user space app should | ||
92 | * deal with them for intel hardware. | ||
93 | */ | ||
94 | .driver_features = | ||
95 | DRIVER_USE_AGP | DRIVER_REQUIRE_AGP | | ||
96 | DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_MODESET, | ||
97 | .load = i915_driver_load, | ||
98 | .unload = i915_driver_unload, | ||
99 | .firstopen = i915_driver_firstopen, | ||
100 | .lastclose = i915_driver_lastclose, | ||
101 | .preclose = i915_driver_preclose, | ||
102 | .save = i915_save, | ||
103 | .restore = i915_restore, | ||
104 | .device_is_agp = i915_driver_device_is_agp, | ||
105 | .get_vblank_counter = i915_get_vblank_counter, | ||
106 | .enable_vblank = i915_enable_vblank, | ||
107 | .disable_vblank = i915_disable_vblank, | ||
108 | .irq_preinstall = i915_driver_irq_preinstall, | ||
109 | .irq_postinstall = i915_driver_irq_postinstall, | ||
110 | .irq_uninstall = i915_driver_irq_uninstall, | ||
111 | .irq_handler = i915_driver_irq_handler, | ||
112 | .reclaim_buffers = drm_core_reclaim_buffers, | ||
113 | .get_map_ofs = drm_core_get_map_ofs, | ||
114 | .get_reg_ofs = drm_core_get_reg_ofs, | ||
115 | .fb_probe = intelfb_probe, | ||
116 | .fb_remove = intelfb_remove, | ||
117 | .fb_resize = intelfb_resize, | ||
118 | .master_create = i915_master_create, | ||
119 | .master_destroy = i915_master_destroy, | ||
120 | #if defined(CONFIG_DEBUG_FS) | ||
121 | .debugfs_init = i915_debugfs_init, | ||
122 | .debugfs_cleanup = i915_debugfs_cleanup, | ||
123 | #endif | ||
124 | .gem_init_object = i915_gem_init_object, | ||
125 | .gem_free_object = i915_gem_free_object, | ||
126 | .gem_vm_ops = &i915_gem_vm_ops, | ||
127 | .ioctls = i915_ioctls, | ||
128 | .fops = { | ||
129 | .owner = THIS_MODULE, | ||
130 | .open = drm_open, | ||
131 | .release = drm_release, | ||
132 | .ioctl = drm_ioctl, | ||
133 | .mmap = drm_mmap, | ||
134 | .poll = drm_poll, | ||
135 | .fasync = drm_fasync, | ||
136 | #ifdef CONFIG_COMPAT | ||
137 | .compat_ioctl = i915_compat_ioctl, | ||
138 | #endif | ||
139 | }, | ||
140 | .pci_driver = { | ||
141 | .name = DRIVER_NAME, | ||
142 | .id_table = pciidlist, | ||
143 | .probe = probe, | ||
144 | .remove = __devexit_p(drm_cleanup_pci), | ||
145 | }, | ||
146 | .name = DRIVER_NAME, | ||
147 | .desc = DRIVER_DESC, | ||
148 | .date = DRIVER_DATE, | ||
149 | .major = DRIVER_MAJOR, | ||
150 | .minor = DRIVER_MINOR, | ||
151 | .patchlevel = DRIVER_PATCHLEVEL, | ||
152 | }; | ||
153 | </programlisting> | ||
154 | <para> | ||
155 | In the example above, taken from the i915 DRM driver, the driver | ||
156 | sets several flags indicating what core features it supports. | ||
157 | We'll go over the individual callbacks in later sections. Since | ||
158 | flags indicate which features your driver supports to the DRM | ||
159 | core, you need to set most of them prior to calling drm_init(). Some, | ||
160 | like DRIVER_MODESET can be set later based on user supplied parameters, | ||
161 | but that's the exception rather than the rule. | ||
162 | </para> | ||
163 | <variablelist> | ||
164 | <title>Driver flags</title> | ||
165 | <varlistentry> | ||
166 | <term>DRIVER_USE_AGP</term> | ||
167 | <listitem><para> | ||
168 | Driver uses AGP interface | ||
169 | </para></listitem> | ||
170 | </varlistentry> | ||
171 | <varlistentry> | ||
172 | <term>DRIVER_REQUIRE_AGP</term> | ||
173 | <listitem><para> | ||
174 | Driver needs AGP interface to function. | ||
175 | </para></listitem> | ||
176 | </varlistentry> | ||
177 | <varlistentry> | ||
178 | <term>DRIVER_USE_MTRR</term> | ||
179 | <listitem> | ||
180 | <para> | ||
181 | Driver uses MTRR interface for mapping memory. Deprecated. | ||
182 | </para> | ||
183 | </listitem> | ||
184 | </varlistentry> | ||
185 | <varlistentry> | ||
186 | <term>DRIVER_PCI_DMA</term> | ||
187 | <listitem><para> | ||
188 | Driver is capable of PCI DMA. Deprecated. | ||
189 | </para></listitem> | ||
190 | </varlistentry> | ||
191 | <varlistentry> | ||
192 | <term>DRIVER_SG</term> | ||
193 | <listitem><para> | ||
194 | Driver can perform scatter/gather DMA. Deprecated. | ||
195 | </para></listitem> | ||
196 | </varlistentry> | ||
197 | <varlistentry> | ||
198 | <term>DRIVER_HAVE_DMA</term> | ||
199 | <listitem><para>Driver supports DMA. Deprecated.</para></listitem> | ||
200 | </varlistentry> | ||
201 | <varlistentry> | ||
202 | <term>DRIVER_HAVE_IRQ</term><term>DRIVER_IRQ_SHARED</term> | ||
203 | <listitem> | ||
204 | <para> | ||
205 | DRIVER_HAVE_IRQ indicates whether the driver has a IRQ | ||
206 | handler, DRIVER_IRQ_SHARED indicates whether the device & | ||
207 | handler support shared IRQs (note that this is required of | ||
208 | PCI drivers). | ||
209 | </para> | ||
210 | </listitem> | ||
211 | </varlistentry> | ||
212 | <varlistentry> | ||
213 | <term>DRIVER_DMA_QUEUE</term> | ||
214 | <listitem> | ||
215 | <para> | ||
216 | If the driver queues DMA requests and completes them | ||
217 | asynchronously, this flag should be set. Deprecated. | ||
218 | </para> | ||
219 | </listitem> | ||
220 | </varlistentry> | ||
221 | <varlistentry> | ||
222 | <term>DRIVER_FB_DMA</term> | ||
223 | <listitem> | ||
224 | <para> | ||
225 | Driver supports DMA to/from the framebuffer. Deprecated. | ||
226 | </para> | ||
227 | </listitem> | ||
228 | </varlistentry> | ||
229 | <varlistentry> | ||
230 | <term>DRIVER_MODESET</term> | ||
231 | <listitem> | ||
232 | <para> | ||
233 | Driver supports mode setting interfaces. | ||
234 | </para> | ||
235 | </listitem> | ||
236 | </varlistentry> | ||
237 | </variablelist> | ||
238 | <para> | ||
239 | In this specific case, the driver requires AGP and supports | ||
240 | IRQs. DMA, as we'll see, is handled by device specific ioctls | ||
241 | in this case. It also supports the kernel mode setting APIs, though | ||
242 | unlike in the actual i915 driver source, this example unconditionally | ||
243 | exports KMS capability. | ||
244 | </para> | ||
245 | </sect1> | ||
246 | |||
247 | <!-- Internals: driver load --> | ||
248 | |||
249 | <sect1> | ||
250 | <title>Driver load</title> | ||
251 | <para> | ||
252 | In the previous section, we saw what a typical drm_driver | ||
253 | structure might look like. One of the more important fields in | ||
254 | the structure is the hook for the load function. | ||
255 | </para> | ||
256 | <programlisting> | ||
257 | static struct drm_driver driver = { | ||
258 | ... | ||
259 | .load = i915_driver_load, | ||
260 | ... | ||
261 | }; | ||
262 | </programlisting> | ||
263 | <para> | ||
264 | The load function has many responsibilities: allocating a driver | ||
265 | private structure, specifying supported performance counters, | ||
266 | configuring the device (e.g. mapping registers & command | ||
267 | buffers), initializing the memory manager, and setting up the | ||
268 | initial output configuration. | ||
269 | </para> | ||
270 | <para> | ||
271 | Note that the tasks performed at driver load time must not | ||
272 | conflict with DRM client requirements. For instance, if user | ||
273 | level mode setting drivers are in use, it would be problematic | ||
274 | to perform output discovery & configuration at load time. | ||
275 | Likewise, if pre-memory management aware user level drivers are | ||
276 | in use, memory management and command buffer setup may need to | ||
277 | be omitted. These requirements are driver specific, and care | ||
278 | needs to be taken to keep both old and new applications and | ||
279 | libraries working. The i915 driver supports the "modeset" | ||
280 | module parameter to control whether advanced features are | ||
281 | enabled at load time or in legacy fashion. If compatibility is | ||
282 | a concern (e.g. with drivers converted over to the new interfaces | ||
283 | from the old ones), care must be taken to prevent incompatible | ||
284 | device initialization and control with the currently active | ||
285 | userspace drivers. | ||
286 | </para> | ||
287 | |||
288 | <sect2> | ||
289 | <title>Driver private & performance counters</title> | ||
290 | <para> | ||
291 | The driver private hangs off the main drm_device structure and | ||
292 | can be used for tracking various device specific bits of | ||
293 | information, like register offsets, command buffer status, | ||
294 | register state for suspend/resume, etc. At load time, a | ||
295 | driver can simply allocate one and set drm_device.dev_priv | ||
296 | appropriately; at unload the driver can free it and set | ||
297 | drm_device.dev_priv to NULL. | ||
298 | </para> | ||
299 | <para> | ||
300 | The DRM supports several counters which can be used for rough | ||
301 | performance characterization. Note that the DRM stat counter | ||
302 | system is not often used by applications, and supporting | ||
303 | additional counters is completely optional. | ||
304 | </para> | ||
305 | <para> | ||
306 | These interfaces are deprecated and should not be used. If performance | ||
307 | monitoring is desired, the developer should investigate and | ||
308 | potentially enhance the kernel perf and tracing infrastructure to export | ||
309 | GPU related performance information to performance monitoring | ||
310 | tools and applications. | ||
311 | </para> | ||
312 | </sect2> | ||
313 | |||
314 | <sect2> | ||
315 | <title>Configuring the device</title> | ||
316 | <para> | ||
317 | Obviously, device configuration will be device specific. | ||
318 | However, there are several common operations: finding a | ||
319 | device's PCI resources, mapping them, and potentially setting | ||
320 | up an IRQ handler. | ||
321 | </para> | ||
322 | <para> | ||
323 | Finding & mapping resources is fairly straightforward. The | ||
324 | DRM wrapper functions, drm_get_resource_start() and | ||
325 | drm_get_resource_len() can be used to find BARs on the given | ||
326 | drm_device struct. Once those values have been retrieved, the | ||
327 | driver load function can call drm_addmap() to create a new | ||
328 | mapping for the BAR in question. Note you'll probably want a | ||
329 | drm_local_map_t in your driver private structure to track any | ||
330 | mappings you create. | ||
331 | <!-- !Fdrivers/gpu/drm/drm_bufs.c drm_get_resource_* --> | ||
332 | <!-- !Finclude/drm/drmP.h drm_local_map_t --> | ||
333 | </para> | ||
334 | <para> | ||
335 | if compatibility with other operating systems isn't a concern | ||
336 | (DRM drivers can run under various BSD variants and OpenSolaris), | ||
337 | native Linux calls can be used for the above, e.g. pci_resource_* | ||
338 | and iomap*/iounmap. See the Linux device driver book for more | ||
339 | info. | ||
340 | </para> | ||
341 | <para> | ||
342 | Once you have a register map, you can use the DRM_READn() and | ||
343 | DRM_WRITEn() macros to access the registers on your device, or | ||
344 | use driver specific versions to offset into your MMIO space | ||
345 | relative to a driver specific base pointer (see I915_READ for | ||
346 | example). | ||
347 | </para> | ||
348 | <para> | ||
349 | If your device supports interrupt generation, you may want to | ||
350 | setup an interrupt handler at driver load time as well. This | ||
351 | is done using the drm_irq_install() function. If your device | ||
352 | supports vertical blank interrupts, it should call | ||
353 | drm_vblank_init() to initialize the core vblank handling code before | ||
354 | enabling interrupts on your device. This ensures the vblank related | ||
355 | structures are allocated and allows the core to handle vblank events. | ||
356 | </para> | ||
357 | <!--!Fdrivers/char/drm/drm_irq.c drm_irq_install--> | ||
358 | <para> | ||
359 | Once your interrupt handler is registered (it'll use your | ||
360 | drm_driver.irq_handler as the actual interrupt handling | ||
361 | function), you can safely enable interrupts on your device, | ||
362 | assuming any other state your interrupt handler uses is also | ||
363 | initialized. | ||
364 | </para> | ||
365 | <para> | ||
366 | Another task that may be necessary during configuration is | ||
367 | mapping the video BIOS. On many devices, the VBIOS describes | ||
368 | device configuration, LCD panel timings (if any), and contains | ||
369 | flags indicating device state. Mapping the BIOS can be done | ||
370 | using the pci_map_rom() call, a convenience function that | ||
371 | takes care of mapping the actual ROM, whether it has been | ||
372 | shadowed into memory (typically at address 0xc0000) or exists | ||
373 | on the PCI device in the ROM BAR. Note that once you've | ||
374 | mapped the ROM and extracted any necessary information, be | ||
375 | sure to unmap it; on many devices the ROM address decoder is | ||
376 | shared with other BARs, so leaving it mapped can cause | ||
377 | undesired behavior like hangs or memory corruption. | ||
378 | <!--!Fdrivers/pci/rom.c pci_map_rom--> | ||
379 | </para> | ||
380 | </sect2> | ||
381 | |||
382 | <sect2> | ||
383 | <title>Memory manager initialization</title> | ||
384 | <para> | ||
385 | In order to allocate command buffers, cursor memory, scanout | ||
386 | buffers, etc., as well as support the latest features provided | ||
387 | by packages like Mesa and the X.Org X server, your driver | ||
388 | should support a memory manager. | ||
389 | </para> | ||
390 | <para> | ||
391 | If your driver supports memory management (it should!), you'll | ||
392 | need to set that up at load time as well. How you intialize | ||
393 | it depends on which memory manager you're using, TTM or GEM. | ||
394 | </para> | ||
395 | <sect3> | ||
396 | <title>TTM initialization</title> | ||
397 | <para> | ||
398 | TTM (for Translation Table Manager) manages video memory and | ||
399 | aperture space for graphics devices. TTM supports both UMA devices | ||
400 | and devices with dedicated video RAM (VRAM), i.e. most discrete | ||
401 | graphics devices. If your device has dedicated RAM, supporting | ||
402 | TTM is desireable. TTM also integrates tightly with your | ||
403 | driver specific buffer execution function. See the radeon | ||
404 | driver for examples. | ||
405 | </para> | ||
406 | <para> | ||
407 | The core TTM structure is the ttm_bo_driver struct. It contains | ||
408 | several fields with function pointers for initializing the TTM, | ||
409 | allocating and freeing memory, waiting for command completion | ||
410 | and fence synchronization, and memory migration. See the | ||
411 | radeon_ttm.c file for an example of usage. | ||
412 | </para> | ||
413 | <para> | ||
414 | The ttm_global_reference structure is made up of several fields: | ||
415 | </para> | ||
416 | <programlisting> | ||
417 | struct ttm_global_reference { | ||
418 | enum ttm_global_types global_type; | ||
419 | size_t size; | ||
420 | void *object; | ||
421 | int (*init) (struct ttm_global_reference *); | ||
422 | void (*release) (struct ttm_global_reference *); | ||
423 | }; | ||
424 | </programlisting> | ||
425 | <para> | ||
426 | There should be one global reference structure for your memory | ||
427 | manager as a whole, and there will be others for each object | ||
428 | created by the memory manager at runtime. Your global TTM should | ||
429 | have a type of TTM_GLOBAL_TTM_MEM. The size field for the global | ||
430 | object should be sizeof(struct ttm_mem_global), and the init and | ||
431 | release hooks should point at your driver specific init and | ||
432 | release routines, which will probably eventually call | ||
433 | ttm_mem_global_init and ttm_mem_global_release respectively. | ||
434 | </para> | ||
435 | <para> | ||
436 | Once your global TTM accounting structure is set up and initialized | ||
437 | (done by calling ttm_global_item_ref on the global object you | ||
438 | just created), you'll need to create a buffer object TTM to | ||
439 | provide a pool for buffer object allocation by clients and the | ||
440 | kernel itself. The type of this object should be TTM_GLOBAL_TTM_BO, | ||
441 | and its size should be sizeof(struct ttm_bo_global). Again, | ||
442 | driver specific init and release functions can be provided, | ||
443 | likely eventually calling ttm_bo_global_init and | ||
444 | ttm_bo_global_release, respectively. Also like the previous | ||
445 | object, ttm_global_item_ref is used to create an initial reference | ||
446 | count for the TTM, which will call your initalization function. | ||
447 | </para> | ||
448 | </sect3> | ||
449 | <sect3> | ||
450 | <title>GEM initialization</title> | ||
451 | <para> | ||
452 | GEM is an alternative to TTM, designed specifically for UMA | ||
453 | devices. It has simpler initialization and execution requirements | ||
454 | than TTM, but has no VRAM management capability. Core GEM | ||
455 | initialization is comprised of a basic drm_mm_init call to create | ||
456 | a GTT DRM MM object, which provides an address space pool for | ||
457 | object allocation. In a KMS configuration, the driver will | ||
458 | need to allocate and initialize a command ring buffer following | ||
459 | basic GEM initialization. Most UMA devices have a so-called | ||
460 | "stolen" memory region, which provides space for the initial | ||
461 | framebuffer and large, contiguous memory regions required by the | ||
462 | device. This space is not typically managed by GEM, and must | ||
463 | be initialized separately into its own DRM MM object. | ||
464 | </para> | ||
465 | <para> | ||
466 | Initialization will be driver specific, and will depend on | ||
467 | the architecture of the device. In the case of Intel | ||
468 | integrated graphics chips like 965GM, GEM initialization can | ||
469 | be done by calling the internal GEM init function, | ||
470 | i915_gem_do_init(). Since the 965GM is a UMA device | ||
471 | (i.e. it doesn't have dedicated VRAM), GEM will manage | ||
472 | making regular RAM available for GPU operations. Memory set | ||
473 | aside by the BIOS (called "stolen" memory by the i915 | ||
474 | driver) will be managed by the DRM memrange allocator; the | ||
475 | rest of the aperture will be managed by GEM. | ||
476 | <programlisting> | ||
477 | /* Basic memrange allocator for stolen space (aka vram) */ | ||
478 | drm_memrange_init(&dev_priv->vram, 0, prealloc_size); | ||
479 | /* Let GEM Manage from end of prealloc space to end of aperture */ | ||
480 | i915_gem_do_init(dev, prealloc_size, agp_size); | ||
481 | </programlisting> | ||
482 | <!--!Edrivers/char/drm/drm_memrange.c--> | ||
483 | </para> | ||
484 | <para> | ||
485 | Once the memory manager has been set up, we can allocate the | ||
486 | command buffer. In the i915 case, this is also done with a | ||
487 | GEM function, i915_gem_init_ringbuffer(). | ||
488 | </para> | ||
489 | </sect3> | ||
490 | </sect2> | ||
491 | |||
492 | <sect2> | ||
493 | <title>Output configuration</title> | ||
494 | <para> | ||
495 | The final initialization task is output configuration. This involves | ||
496 | finding and initializing the CRTCs, encoders and connectors | ||
497 | for your device, creating an initial configuration and | ||
498 | registering a framebuffer console driver. | ||
499 | </para> | ||
500 | <sect3> | ||
501 | <title>Output discovery and initialization</title> | ||
502 | <para> | ||
503 | Several core functions exist to create CRTCs, encoders and | ||
504 | connectors, namely drm_crtc_init(), drm_connector_init() and | ||
505 | drm_encoder_init(), along with several "helper" functions to | ||
506 | perform common tasks. | ||
507 | </para> | ||
508 | <para> | ||
509 | Connectors should be registered with sysfs once they've been | ||
510 | detected and initialized, using the | ||
511 | drm_sysfs_connector_add() function. Likewise, when they're | ||
512 | removed from the system, they should be destroyed with | ||
513 | drm_sysfs_connector_remove(). | ||
514 | </para> | ||
515 | <programlisting> | ||
516 | <![CDATA[ | ||
517 | void intel_crt_init(struct drm_device *dev) | ||
518 | { | ||
519 | struct drm_connector *connector; | ||
520 | struct intel_output *intel_output; | ||
521 | |||
522 | intel_output = kzalloc(sizeof(struct intel_output), GFP_KERNEL); | ||
523 | if (!intel_output) | ||
524 | return; | ||
525 | |||
526 | connector = &intel_output->base; | ||
527 | drm_connector_init(dev, &intel_output->base, | ||
528 | &intel_crt_connector_funcs, DRM_MODE_CONNECTOR_VGA); | ||
529 | |||
530 | drm_encoder_init(dev, &intel_output->enc, &intel_crt_enc_funcs, | ||
531 | DRM_MODE_ENCODER_DAC); | ||
532 | |||
533 | drm_mode_connector_attach_encoder(&intel_output->base, | ||
534 | &intel_output->enc); | ||
535 | |||
536 | /* Set up the DDC bus. */ | ||
537 | intel_output->ddc_bus = intel_i2c_create(dev, GPIOA, "CRTDDC_A"); | ||
538 | if (!intel_output->ddc_bus) { | ||
539 | dev_printk(KERN_ERR, &dev->pdev->dev, "DDC bus registration " | ||
540 | "failed.\n"); | ||
541 | return; | ||
542 | } | ||
543 | |||
544 | intel_output->type = INTEL_OUTPUT_ANALOG; | ||
545 | connector->interlace_allowed = 0; | ||
546 | connector->doublescan_allowed = 0; | ||
547 | |||
548 | drm_encoder_helper_add(&intel_output->enc, &intel_crt_helper_funcs); | ||
549 | drm_connector_helper_add(connector, &intel_crt_connector_helper_funcs); | ||
550 | |||
551 | drm_sysfs_connector_add(connector); | ||
552 | } | ||
553 | ]]> | ||
554 | </programlisting> | ||
555 | <para> | ||
556 | In the example above (again, taken from the i915 driver), a | ||
557 | CRT connector and encoder combination is created. A device | ||
558 | specific i2c bus is also created, for fetching EDID data and | ||
559 | performing monitor detection. Once the process is complete, | ||
560 | the new connector is regsitered with sysfs, to make its | ||
561 | properties available to applications. | ||
562 | </para> | ||
563 | <sect4> | ||
564 | <title>Helper functions and core functions</title> | ||
565 | <para> | ||
566 | Since many PC-class graphics devices have similar display output | ||
567 | designs, the DRM provides a set of helper functions to make | ||
568 | output management easier. The core helper routines handle | ||
569 | encoder re-routing and disabling of unused functions following | ||
570 | mode set. Using the helpers is optional, but recommended for | ||
571 | devices with PC-style architectures (i.e. a set of display planes | ||
572 | for feeding pixels to encoders which are in turn routed to | ||
573 | connectors). Devices with more complex requirements needing | ||
574 | finer grained management can opt to use the core callbacks | ||
575 | directly. | ||
576 | </para> | ||
577 | <para> | ||
578 | [Insert typical diagram here.] [Insert OMAP style config here.] | ||
579 | </para> | ||
580 | </sect4> | ||
581 | <para> | ||
582 | For each encoder, CRTC and connector, several functions must | ||
583 | be provided, depending on the object type. Encoder objects | ||
584 | need should provide a DPMS (basically on/off) function, mode fixup | ||
585 | (for converting requested modes into native hardware timings), | ||
586 | and prepare, set and commit functions for use by the core DRM | ||
587 | helper functions. Connector helpers need to provide mode fetch and | ||
588 | validity functions as well as an encoder matching function for | ||
589 | returing an ideal encoder for a given connector. The core | ||
590 | connector functions include a DPMS callback, (deprecated) | ||
591 | save/restore routines, detection, mode probing, property handling, | ||
592 | and cleanup functions. | ||
593 | </para> | ||
594 | <!--!Edrivers/char/drm/drm_crtc.h--> | ||
595 | <!--!Edrivers/char/drm/drm_crtc.c--> | ||
596 | <!--!Edrivers/char/drm/drm_crtc_helper.c--> | ||
597 | </sect3> | ||
598 | </sect2> | ||
599 | </sect1> | ||
600 | |||
601 | <!-- Internals: vblank handling --> | ||
602 | |||
603 | <sect1> | ||
604 | <title>VBlank event handling</title> | ||
605 | <para> | ||
606 | The DRM core exposes two vertical blank related ioctls: | ||
607 | DRM_IOCTL_WAIT_VBLANK and DRM_IOCTL_MODESET_CTL. | ||
608 | <!--!Edrivers/char/drm/drm_irq.c--> | ||
609 | </para> | ||
610 | <para> | ||
611 | DRM_IOCTL_WAIT_VBLANK takes a struct drm_wait_vblank structure | ||
612 | as its argument, and is used to block or request a signal when a | ||
613 | specified vblank event occurs. | ||
614 | </para> | ||
615 | <para> | ||
616 | DRM_IOCTL_MODESET_CTL should be called by application level | ||
617 | drivers before and after mode setting, since on many devices the | ||
618 | vertical blank counter will be reset at that time. Internally, | ||
619 | the DRM snapshots the last vblank count when the ioctl is called | ||
620 | with the _DRM_PRE_MODESET command so that the counter won't go | ||
621 | backwards (which is dealt with when _DRM_POST_MODESET is used). | ||
622 | </para> | ||
623 | <para> | ||
624 | To support the functions above, the DRM core provides several | ||
625 | helper functions for tracking vertical blank counters, and | ||
626 | requires drivers to provide several callbacks: | ||
627 | get_vblank_counter(), enable_vblank() and disable_vblank(). The | ||
628 | core uses get_vblank_counter() to keep the counter accurate | ||
629 | across interrupt disable periods. It should return the current | ||
630 | vertical blank event count, which is often tracked in a device | ||
631 | register. The enable and disable vblank callbacks should enable | ||
632 | and disable vertical blank interrupts, respectively. In the | ||
633 | absence of DRM clients waiting on vblank events, the core DRM | ||
634 | code will use the disable_vblank() function to disable | ||
635 | interrupts, which saves power. They'll be re-enabled again when | ||
636 | a client calls the vblank wait ioctl above. | ||
637 | </para> | ||
638 | <para> | ||
639 | Devices that don't provide a count register can simply use an | ||
640 | internal atomic counter incremented on every vertical blank | ||
641 | interrupt, and can make their enable and disable vblank | ||
642 | functions into no-ops. | ||
643 | </para> | ||
644 | </sect1> | ||
645 | |||
646 | <sect1> | ||
647 | <title>Memory management</title> | ||
648 | <para> | ||
649 | The memory manager lies at the heart of many DRM operations, and | ||
650 | is also required to support advanced client features like OpenGL | ||
651 | pbuffers. The DRM currently contains two memory managers, TTM | ||
652 | and GEM. | ||
653 | </para> | ||
654 | |||
655 | <sect2> | ||
656 | <title>The Translation Table Manager (TTM)</title> | ||
657 | <para> | ||
658 | TTM was developed by Tungsten Graphics, primarily by Thomas | ||
659 | Hellström, and is intended to be a flexible, high performance | ||
660 | graphics memory manager. | ||
661 | </para> | ||
662 | <para> | ||
663 | Drivers wishing to support TTM must fill out a drm_bo_driver | ||
664 | structure. | ||
665 | </para> | ||
666 | <para> | ||
667 | TTM design background and information belongs here. | ||
668 | </para> | ||
669 | </sect2> | ||
670 | |||
671 | <sect2> | ||
672 | <title>The Graphics Execution Manager (GEM)</title> | ||
673 | <para> | ||
674 | GEM is an Intel project, authored by Eric Anholt and Keith | ||
675 | Packard. It provides simpler interfaces than TTM, and is well | ||
676 | suited for UMA devices. | ||
677 | </para> | ||
678 | <para> | ||
679 | GEM-enabled drivers must provide gem_init_object() and | ||
680 | gem_free_object() callbacks to support the core memory | ||
681 | allocation routines. They should also provide several driver | ||
682 | specific ioctls to support command execution, pinning, buffer | ||
683 | read & write, mapping, and domain ownership transfers. | ||
684 | </para> | ||
685 | <para> | ||
686 | On a fundamental level, GEM involves several operations: memory | ||
687 | allocation and freeing, command execution, and aperture management | ||
688 | at command execution time. Buffer object allocation is relatively | ||
689 | straightforward and largely provided by Linux's shmem layer, which | ||
690 | provides memory to back each object. When mapped into the GTT | ||
691 | or used in a command buffer, the backing pages for an object are | ||
692 | flushed to memory and marked write combined so as to be coherent | ||
693 | with the GPU. Likewise, when the GPU finishes rendering to an object, | ||
694 | if the CPU accesses it, it must be made coherent with the CPU's view | ||
695 | of memory, usually involving GPU cache flushing of various kinds. | ||
696 | This core CPU<->GPU coherency management is provided by the GEM | ||
697 | set domain function, which evaluates an object's current domain and | ||
698 | performs any necessary flushing or synchronization to put the object | ||
699 | into the desired coherency domain (note that the object may be busy, | ||
700 | i.e. an active render target; in that case the set domain function | ||
701 | will block the client and wait for rendering to complete before | ||
702 | performing any necessary flushing operations). | ||
703 | </para> | ||
704 | <para> | ||
705 | Perhaps the most important GEM function is providing a command | ||
706 | execution interface to clients. Client programs construct command | ||
707 | buffers containing references to previously allocated memory objects | ||
708 | and submit them to GEM. At that point, GEM will take care to bind | ||
709 | all the objects into the GTT, execute the buffer, and provide | ||
710 | necessary synchronization between clients accessing the same buffers. | ||
711 | This often involves evicting some objects from the GTT and re-binding | ||
712 | others (a fairly expensive operation), and providing relocation | ||
713 | support which hides fixed GTT offsets from clients. Clients must | ||
714 | take care not to submit command buffers that reference more objects | ||
715 | than can fit in the GTT or GEM will reject them and no rendering | ||
716 | will occur. Similarly, if several objects in the buffer require | ||
717 | fence registers to be allocated for correct rendering (e.g. 2D blits | ||
718 | on pre-965 chips), care must be taken not to require more fence | ||
719 | registers than are available to the client. Such resource management | ||
720 | should be abstracted from the client in libdrm. | ||
721 | </para> | ||
722 | </sect2> | ||
723 | |||
724 | </sect1> | ||
725 | |||
726 | <!-- Output management --> | ||
727 | <sect1> | ||
728 | <title>Output management</title> | ||
729 | <para> | ||
730 | At the core of the DRM output management code is a set of | ||
731 | structures representing CRTCs, encoders and connectors. | ||
732 | </para> | ||
733 | <para> | ||
734 | A CRTC is an abstraction representing a part of the chip that | ||
735 | contains a pointer to a scanout buffer. Therefore, the number | ||
736 | of CRTCs available determines how many independent scanout | ||
737 | buffers can be active at any given time. The CRTC structure | ||
738 | contains several fields to support this: a pointer to some video | ||
739 | memory, a display mode, and an (x, y) offset into the video | ||
740 | memory to support panning or configurations where one piece of | ||
741 | video memory spans multiple CRTCs. | ||
742 | </para> | ||
743 | <para> | ||
744 | An encoder takes pixel data from a CRTC and converts it to a | ||
745 | format suitable for any attached connectors. On some devices, | ||
746 | it may be possible to have a CRTC send data to more than one | ||
747 | encoder. In that case, both encoders would receive data from | ||
748 | the same scanout buffer, resulting in a "cloned" display | ||
749 | configuration across the connectors attached to each encoder. | ||
750 | </para> | ||
751 | <para> | ||
752 | A connector is the final destination for pixel data on a device, | ||
753 | and usually connects directly to an external display device like | ||
754 | a monitor or laptop panel. A connector can only be attached to | ||
755 | one encoder at a time. The connector is also the structure | ||
756 | where information about the attached display is kept, so it | ||
757 | contains fields for display data, EDID data, DPMS & | ||
758 | connection status, and information about modes supported on the | ||
759 | attached displays. | ||
760 | </para> | ||
761 | <!--!Edrivers/char/drm/drm_crtc.c--> | ||
762 | </sect1> | ||
763 | |||
764 | <sect1> | ||
765 | <title>Framebuffer management</title> | ||
766 | <para> | ||
767 | In order to set a mode on a given CRTC, encoder and connector | ||
768 | configuration, clients need to provide a framebuffer object which | ||
769 | will provide a source of pixels for the CRTC to deliver to the encoder(s) | ||
770 | and ultimately the connector(s) in the configuration. A framebuffer | ||
771 | is fundamentally a driver specific memory object, made into an opaque | ||
772 | handle by the DRM addfb function. Once an fb has been created this | ||
773 | way it can be passed to the KMS mode setting routines for use in | ||
774 | a configuration. | ||
775 | </para> | ||
776 | </sect1> | ||
777 | |||
778 | <sect1> | ||
779 | <title>Command submission & fencing</title> | ||
780 | <para> | ||
781 | This should cover a few device specific command submission | ||
782 | implementations. | ||
783 | </para> | ||
784 | </sect1> | ||
785 | |||
786 | <sect1> | ||
787 | <title>Suspend/resume</title> | ||
788 | <para> | ||
789 | The DRM core provides some suspend/resume code, but drivers | ||
790 | wanting full suspend/resume support should provide save() and | ||
791 | restore() functions. These will be called at suspend, | ||
792 | hibernate, or resume time, and should perform any state save or | ||
793 | restore required by your device across suspend or hibernate | ||
794 | states. | ||
795 | </para> | ||
796 | </sect1> | ||
797 | |||
798 | <sect1> | ||
799 | <title>DMA services</title> | ||
800 | <para> | ||
801 | This should cover how DMA mapping etc. is supported by the core. | ||
802 | These functions are deprecated and should not be used. | ||
803 | </para> | ||
804 | </sect1> | ||
805 | </chapter> | ||
806 | |||
807 | <!-- External interfaces --> | ||
808 | |||
809 | <chapter id="drmExternals"> | ||
810 | <title>Userland interfaces</title> | ||
811 | <para> | ||
812 | The DRM core exports several interfaces to applications, | ||
813 | generally intended to be used through corresponding libdrm | ||
814 | wrapper functions. In addition, drivers export device specific | ||
815 | interfaces for use by userspace drivers & device aware | ||
816 | applications through ioctls and sysfs files. | ||
817 | </para> | ||
818 | <para> | ||
819 | External interfaces include: memory mapping, context management, | ||
820 | DMA operations, AGP management, vblank control, fence | ||
821 | management, memory management, and output management. | ||
822 | </para> | ||
823 | <para> | ||
824 | Cover generic ioctls and sysfs layout here. Only need high | ||
825 | level info, since man pages will cover the rest. | ||
826 | </para> | ||
827 | </chapter> | ||
828 | |||
829 | <!-- API reference --> | ||
830 | |||
831 | <appendix id="drmDriverApi"> | ||
832 | <title>DRM Driver API</title> | ||
833 | <para> | ||
834 | Include auto-generated API reference here (need to reference it | ||
835 | from paragraphs above too). | ||
836 | </para> | ||
837 | </appendix> | ||
838 | |||
839 | </book> | ||
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index 5cff41a5fa7c..55f12ac37acd 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl | |||
@@ -4,7 +4,7 @@ | |||
4 | 4 | ||
5 | <book id="kgdbOnLinux"> | 5 | <book id="kgdbOnLinux"> |
6 | <bookinfo> | 6 | <bookinfo> |
7 | <title>Using kgdb and the kgdb Internals</title> | 7 | <title>Using kgdb, kdb and the kernel debugger internals</title> |
8 | 8 | ||
9 | <authorgroup> | 9 | <authorgroup> |
10 | <author> | 10 | <author> |
@@ -17,33 +17,8 @@ | |||
17 | </affiliation> | 17 | </affiliation> |
18 | </author> | 18 | </author> |
19 | </authorgroup> | 19 | </authorgroup> |
20 | |||
21 | <authorgroup> | ||
22 | <author> | ||
23 | <firstname>Tom</firstname> | ||
24 | <surname>Rini</surname> | ||
25 | <affiliation> | ||
26 | <address> | ||
27 | <email>trini@kernel.crashing.org</email> | ||
28 | </address> | ||
29 | </affiliation> | ||
30 | </author> | ||
31 | </authorgroup> | ||
32 | |||
33 | <authorgroup> | ||
34 | <author> | ||
35 | <firstname>Amit S.</firstname> | ||
36 | <surname>Kale</surname> | ||
37 | <affiliation> | ||
38 | <address> | ||
39 | <email>amitkale@linsyssoft.com</email> | ||
40 | </address> | ||
41 | </affiliation> | ||
42 | </author> | ||
43 | </authorgroup> | ||
44 | |||
45 | <copyright> | 20 | <copyright> |
46 | <year>2008</year> | 21 | <year>2008,2010</year> |
47 | <holder>Wind River Systems, Inc.</holder> | 22 | <holder>Wind River Systems, Inc.</holder> |
48 | </copyright> | 23 | </copyright> |
49 | <copyright> | 24 | <copyright> |
@@ -69,41 +44,76 @@ | |||
69 | <chapter id="Introduction"> | 44 | <chapter id="Introduction"> |
70 | <title>Introduction</title> | 45 | <title>Introduction</title> |
71 | <para> | 46 | <para> |
72 | kgdb is a source level debugger for linux kernel. It is used along | 47 | The kernel has two different debugger front ends (kdb and kgdb) |
73 | with gdb to debug a linux kernel. The expectation is that gdb can | 48 | which interface to the debug core. It is possible to use either |
74 | be used to "break in" to the kernel to inspect memory, variables | 49 | of the debugger front ends and dynamically transition between them |
75 | and look through call stack information similar to what an | 50 | if you configure the kernel properly at compile and runtime. |
76 | application developer would use gdb for. It is possible to place | 51 | </para> |
77 | breakpoints in kernel code and perform some limited execution | 52 | <para> |
78 | stepping. | 53 | Kdb is simplistic shell-style interface which you can use on a |
54 | system console with a keyboard or serial console. You can use it | ||
55 | to inspect memory, registers, process lists, dmesg, and even set | ||
56 | breakpoints to stop in a certain location. Kdb is not a source | ||
57 | level debugger, although you can set breakpoints and execute some | ||
58 | basic kernel run control. Kdb is mainly aimed at doing some | ||
59 | analysis to aid in development or diagnosing kernel problems. You | ||
60 | can access some symbols by name in kernel built-ins or in kernel | ||
61 | modules if the code was built | ||
62 | with <symbol>CONFIG_KALLSYMS</symbol>. | ||
63 | </para> | ||
64 | <para> | ||
65 | Kgdb is intended to be used as a source level debugger for the | ||
66 | Linux kernel. It is used along with gdb to debug a Linux kernel. | ||
67 | The expectation is that gdb can be used to "break in" to the | ||
68 | kernel to inspect memory, variables and look through call stack | ||
69 | information similar to the way an application developer would use | ||
70 | gdb to debug an application. It is possible to place breakpoints | ||
71 | in kernel code and perform some limited execution stepping. | ||
79 | </para> | 72 | </para> |
80 | <para> | 73 | <para> |
81 | Two machines are required for using kgdb. One of these machines is a | 74 | Two machines are required for using kgdb. One of these machines is |
82 | development machine and the other is a test machine. The kernel | 75 | a development machine and the other is the target machine. The |
83 | to be debugged runs on the test machine. The development machine | 76 | kernel to be debugged runs on the target machine. The development |
84 | runs an instance of gdb against the vmlinux file which contains | 77 | machine runs an instance of gdb against the vmlinux file which |
85 | the symbols (not boot image such as bzImage, zImage, uImage...). | 78 | contains the symbols (not boot image such as bzImage, zImage, |
86 | In gdb the developer specifies the connection parameters and | 79 | uImage...). In gdb the developer specifies the connection |
87 | connects to kgdb. The type of connection a developer makes with | 80 | parameters and connects to kgdb. The type of connection a |
88 | gdb depends on the availability of kgdb I/O modules compiled as | 81 | developer makes with gdb depends on the availability of kgdb I/O |
89 | builtin's or kernel modules in the test machine's kernel. | 82 | modules compiled as built-ins or loadable kernel modules in the test |
83 | machine's kernel. | ||
90 | </para> | 84 | </para> |
91 | </chapter> | 85 | </chapter> |
92 | <chapter id="CompilingAKernel"> | 86 | <chapter id="CompilingAKernel"> |
93 | <title>Compiling a kernel</title> | 87 | <title>Compiling a kernel</title> |
88 | <para> | ||
89 | <itemizedlist> | ||
90 | <listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem> | ||
91 | <listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem> | ||
92 | </itemizedlist> | ||
93 | </para> | ||
94 | <sect1 id="CompileKGDB"> | ||
95 | <title>Kernel config options for kgdb</title> | ||
94 | <para> | 96 | <para> |
95 | To enable <symbol>CONFIG_KGDB</symbol> you should first turn on | 97 | To enable <symbol>CONFIG_KGDB</symbol> you should first turn on |
96 | "Prompt for development and/or incomplete code/drivers" | 98 | "Prompt for development and/or incomplete code/drivers" |
97 | (CONFIG_EXPERIMENTAL) in "General setup", then under the | 99 | (CONFIG_EXPERIMENTAL) in "General setup", then under the |
98 | "Kernel debugging" select "KGDB: kernel debugging with remote gdb". | 100 | "Kernel debugging" select "KGDB: kernel debugger". |
101 | </para> | ||
102 | <para> | ||
103 | While it is not a hard requirement that you have symbols in your | ||
104 | vmlinux file, gdb tends not to be very useful without the symbolic | ||
105 | data, so you will want to turn | ||
106 | on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the | ||
107 | kernel with debug info" in the config menu. | ||
99 | </para> | 108 | </para> |
100 | <para> | 109 | <para> |
101 | It is advised, but not required that you turn on the | 110 | It is advised, but not required that you turn on the |
102 | CONFIG_FRAME_POINTER kernel option. This option inserts code to | 111 | <symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the |
103 | into the compiled executable which saves the frame information in | 112 | kernel with frame pointers" in the config menu. This option |
104 | registers or on the stack at different points which will allow a | 113 | inserts code to into the compiled executable which saves the frame |
105 | debugger such as gdb to more accurately construct stack back traces | 114 | information in registers or on the stack at different points which |
106 | while debugging the kernel. | 115 | allows a debugger such as gdb to more accurately construct |
116 | stack back traces while debugging the kernel. | ||
107 | </para> | 117 | </para> |
108 | <para> | 118 | <para> |
109 | If the architecture that you are using supports the kernel option | 119 | If the architecture that you are using supports the kernel option |
@@ -116,38 +126,160 @@ | |||
116 | this option. | 126 | this option. |
117 | </para> | 127 | </para> |
118 | <para> | 128 | <para> |
119 | Next you should choose one of more I/O drivers to interconnect debugging | 129 | Next you should choose one of more I/O drivers to interconnect |
120 | host and debugged target. Early boot debugging requires a KGDB | 130 | debugging host and debugged target. Early boot debugging requires |
121 | I/O driver that supports early debugging and the driver must be | 131 | a KGDB I/O driver that supports early debugging and the driver |
122 | built into the kernel directly. Kgdb I/O driver configuration | 132 | must be built into the kernel directly. Kgdb I/O driver |
123 | takes place via kernel or module parameters, see following | 133 | configuration takes place via kernel or module parameters which |
124 | chapter. | 134 | you can learn more about in the in the section that describes the |
135 | parameter "kgdboc". | ||
125 | </para> | 136 | </para> |
126 | <para> | 137 | <para>Here is an example set of .config symbols to enable or |
127 | The kgdb test compile options are described in the kgdb test suite chapter. | 138 | disable for kgdb: |
139 | <itemizedlist> | ||
140 | <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem> | ||
141 | <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem> | ||
142 | <listitem><para>CONFIG_KGDB=y</para></listitem> | ||
143 | <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem> | ||
144 | </itemizedlist> | ||
128 | </para> | 145 | </para> |
129 | 146 | </sect1> | |
147 | <sect1 id="CompileKDB"> | ||
148 | <title>Kernel config options for kdb</title> | ||
149 | <para>Kdb is quite a bit more complex than the simple gdbstub | ||
150 | sitting on top of the kernel's debug core. Kdb must implement a | ||
151 | shell, and also adds some helper functions in other parts of the | ||
152 | kernel, responsible for printing out interesting data such as what | ||
153 | you would see if you ran "lsmod", or "ps". In order to build kdb | ||
154 | into the kernel you follow the same steps as you would for kgdb. | ||
155 | </para> | ||
156 | <para>The main config option for kdb | ||
157 | is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB: | ||
158 | include kdb frontend for kgdb" in the config menu. In theory you | ||
159 | would have already also selected an I/O driver such as the | ||
160 | CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a | ||
161 | serial port, when you were configuring kgdb. | ||
162 | </para> | ||
163 | <para>If you want to use a PS/2-style keyboard with kdb, you would | ||
164 | select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as | ||
165 | input device" in the config menu. The CONFIG_KDB_KEYBOARD option | ||
166 | is not used for anything in the gdb interface to kgdb. The | ||
167 | CONFIG_KDB_KEYBOARD option only works with kdb. | ||
168 | </para> | ||
169 | <para>Here is an example set of .config symbols to enable/disable kdb: | ||
170 | <itemizedlist> | ||
171 | <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem> | ||
172 | <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem> | ||
173 | <listitem><para>CONFIG_KGDB=y</para></listitem> | ||
174 | <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem> | ||
175 | <listitem><para>CONFIG_KGDB_KDB=y</para></listitem> | ||
176 | <listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem> | ||
177 | </itemizedlist> | ||
178 | </para> | ||
179 | </sect1> | ||
130 | </chapter> | 180 | </chapter> |
131 | <chapter id="EnableKGDB"> | 181 | <chapter id="kgdbKernelArgs"> |
132 | <title>Enable kgdb for debugging</title> | 182 | <title>Kernel Debugger Boot Arguments</title> |
133 | <para> | 183 | <para>This section describes the various runtime kernel |
134 | In order to use kgdb you must activate it by passing configuration | 184 | parameters that affect the configuration of the kernel debugger. |
135 | information to one of the kgdb I/O drivers. If you do not pass any | 185 | The following chapter covers using kdb and kgdb as well as |
136 | configuration information kgdb will not do anything at all. Kgdb | 186 | provides some examples of the configuration parameters.</para> |
137 | will only actively hook up to the kernel trap hooks if a kgdb I/O | 187 | <sect1 id="kgdboc"> |
138 | driver is loaded and configured. If you unconfigure a kgdb I/O | 188 | <title>Kernel parameter: kgdboc</title> |
139 | driver, kgdb will unregister all the kernel hook points. | 189 | <para>The kgdboc driver was originally an abbreviation meant to |
190 | stand for "kgdb over console". Today it is the primary mechanism | ||
191 | to configure how to communicate from gdb to kgdb as well as the | ||
192 | devices you want to use to interact with the kdb shell. | ||
193 | </para> | ||
194 | <para>For kgdb/gdb, kgdboc is designed to work with a single serial | ||
195 | port. It is intended to cover the circumstance where you want to | ||
196 | use a serial console as your primary console as well as using it to | ||
197 | perform kernel debugging. It is also possible to use kgdb on a | ||
198 | serial port which is not designated as a system console. Kgdboc | ||
199 | may be configured as a kernel built-in or a kernel loadable module. | ||
200 | You can only make use of <constant>kgdbwait</constant> and early | ||
201 | debugging if you build kgdboc into the kernel as a built-in. | ||
140 | </para> | 202 | </para> |
203 | <sect2 id="kgdbocArgs"> | ||
204 | <title>kgdboc arguments</title> | ||
205 | <para>Usage: <constant>kgdboc=[kbd][[,]serial_device][,baud]</constant></para> | ||
206 | <sect3 id="kgdbocArgs1"> | ||
207 | <title>Using loadable module or built-in</title> | ||
141 | <para> | 208 | <para> |
142 | All drivers can be reconfigured at run time, if | 209 | <orderedlist> |
143 | <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol> | 210 | <listitem><para>As a kernel built-in:</para> |
144 | are enabled, by echo'ing a new config string to | 211 | <para>Use the kernel boot argument: <constant>kgdboc=<tty-device>,[baud]</constant></para></listitem> |
145 | <constant>/sys/module/<driver>/parameter/<option></constant>. | 212 | <listitem> |
146 | The driver can be unconfigured by passing an empty string. You cannot | 213 | <para>As a kernel loadable module:</para> |
147 | change the configuration while the debugger is attached. Make sure | 214 | <para>Use the command: <constant>modprobe kgdboc kgdboc=<tty-device>,[baud]</constant></para> |
148 | to detach the debugger with the <constant>detach</constant> command | 215 | <para>Here are two examples of how you might formate the kgdboc |
149 | prior to trying unconfigure a kgdb I/O driver. | 216 | string. The first is for an x86 target using the first serial port. |
217 | The second example is for the ARM Versatile AB using the second | ||
218 | serial port. | ||
219 | <orderedlist> | ||
220 | <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem> | ||
221 | <listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem> | ||
222 | </orderedlist> | ||
150 | </para> | 223 | </para> |
224 | </listitem> | ||
225 | </orderedlist></para> | ||
226 | </sect3> | ||
227 | <sect3 id="kgdbocArgs2"> | ||
228 | <title>Configure kgdboc at runtime with sysfs</title> | ||
229 | <para>At run time you can enable or disable kgdboc by echoing a | ||
230 | parameters into the sysfs. Here are two examples:</para> | ||
231 | <orderedlist> | ||
232 | <listitem><para>Enable kgdboc on ttyS0</para> | ||
233 | <para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> | ||
234 | <listitem><para>Disable kgdboc</para> | ||
235 | <para><constant>echo "" > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> | ||
236 | </orderedlist> | ||
237 | <para>NOTE: You do not need to specify the baud if you are | ||
238 | configuring the console on tty which is already configured or | ||
239 | open.</para> | ||
240 | </sect3> | ||
241 | <sect3 id="kgdbocArgs3"> | ||
242 | <title>More examples</title> | ||
243 | <para>You can configure kgdboc to use the keyboard, and or a serial device | ||
244 | depending on if you are using kdb and or kgdb, in one of the | ||
245 | following scenarios. | ||
246 | <orderedlist> | ||
247 | <listitem><para>kdb and kgdb over only a serial port</para> | ||
248 | <para><constant>kgdboc=<serial_device>[,baud]</constant></para> | ||
249 | <para>Example: <constant>kgdboc=ttyS0,115200</constant></para> | ||
250 | </listitem> | ||
251 | <listitem><para>kdb and kgdb with keyboard and a serial port</para> | ||
252 | <para><constant>kgdboc=kbd,<serial_device>[,baud]</constant></para> | ||
253 | <para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para> | ||
254 | </listitem> | ||
255 | <listitem><para>kdb with a keyboard</para> | ||
256 | <para><constant>kgdboc=kbd</constant></para> | ||
257 | </listitem> | ||
258 | </orderedlist> | ||
259 | </para> | ||
260 | </sect3> | ||
261 | <para>NOTE: Kgdboc does not support interrupting the target via the | ||
262 | gdb remote protocol. You must manually send a sysrq-g unless you | ||
263 | have a proxy that splits console output to a terminal program. | ||
264 | A console proxy has a separate TCP port for the debugger and a separate | ||
265 | TCP port for the "human" console. The proxy can take care of sending | ||
266 | the sysrq-g for you. | ||
267 | </para> | ||
268 | <para>When using kgdboc with no debugger proxy, you can end up | ||
269 | connecting the debugger at one of two entry points. If an | ||
270 | exception occurs after you have loaded kgdboc, a message should | ||
271 | print on the console stating it is waiting for the debugger. In | ||
272 | this case you disconnect your terminal program and then connect the | ||
273 | debugger in its place. If you want to interrupt the target system | ||
274 | and forcibly enter a debug session you have to issue a Sysrq | ||
275 | sequence and then type the letter <constant>g</constant>. Then | ||
276 | you disconnect the terminal session and connect gdb. Your options | ||
277 | if you don't like this are to hack gdb to send the sysrq-g for you | ||
278 | as well as on the initial connect, or to use a debugger proxy that | ||
279 | allows an unmodified gdb to do the debugging. | ||
280 | </para> | ||
281 | </sect2> | ||
282 | </sect1> | ||
151 | <sect1 id="kgdbwait"> | 283 | <sect1 id="kgdbwait"> |
152 | <title>Kernel parameter: kgdbwait</title> | 284 | <title>Kernel parameter: kgdbwait</title> |
153 | <para> | 285 | <para> |
@@ -162,103 +294,204 @@ | |||
162 | </para> | 294 | </para> |
163 | <para> | 295 | <para> |
164 | The kernel will stop and wait as early as the I/O driver and | 296 | The kernel will stop and wait as early as the I/O driver and |
165 | architecture will allow when you use this option. If you build the | 297 | architecture allows when you use this option. If you build the |
166 | kgdb I/O driver as a kernel module kgdbwait will not do anything. | 298 | kgdb I/O driver as a loadable kernel module kgdbwait will not do |
299 | anything. | ||
167 | </para> | 300 | </para> |
168 | </sect1> | 301 | </sect1> |
169 | <sect1 id="kgdboc"> | 302 | <sect1 id="kgdbcon"> |
170 | <title>Kernel parameter: kgdboc</title> | 303 | <title>Kernel parameter: kgdbcon</title> |
171 | <para> | 304 | <para> The kgdbcon feature allows you to see printk() messages |
172 | The kgdboc driver was originally an abbreviation meant to stand for | 305 | inside gdb while gdb is connected to the kernel. Kdb does not make |
173 | "kgdb over console". Kgdboc is designed to work with a single | 306 | use of the kgdbcon feature. |
174 | serial port. It was meant to cover the circumstance | 307 | </para> |
175 | where you wanted to use a serial console as your primary console as | 308 | <para>Kgdb supports using the gdb serial protocol to send console |
176 | well as using it to perform kernel debugging. Of course you can | 309 | messages to the debugger when the debugger is connected and running. |
177 | also use kgdboc without assigning a console to the same port. | 310 | There are two ways to activate this feature. |
311 | <orderedlist> | ||
312 | <listitem><para>Activate with the kernel command line option:</para> | ||
313 | <para><constant>kgdbcon</constant></para> | ||
314 | </listitem> | ||
315 | <listitem><para>Use sysfs before configuring an I/O driver</para> | ||
316 | <para> | ||
317 | <constant>echo 1 > /sys/module/kgdb/parameters/kgdb_use_con</constant> | ||
318 | </para> | ||
319 | <para> | ||
320 | NOTE: If you do this after you configure the kgdb I/O driver, the | ||
321 | setting will not take effect until the next point the I/O is | ||
322 | reconfigured. | ||
323 | </para> | ||
324 | </listitem> | ||
325 | </orderedlist> | ||
326 | <para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an | ||
327 | active system console. An example incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant> | ||
328 | </para> | ||
329 | <para>It is possible to use this option with kgdboc on a tty that is not a system console. | ||
330 | </para> | ||
178 | </para> | 331 | </para> |
179 | <sect2 id="UsingKgdboc"> | 332 | </sect1> |
180 | <title>Using kgdboc</title> | 333 | </chapter> |
181 | <para> | 334 | <chapter id="usingKDB"> |
182 | You can configure kgdboc via sysfs or a module or kernel boot line | 335 | <title>Using kdb</title> |
183 | parameter depending on if you build with CONFIG_KGDBOC as a module | ||
184 | or built-in. | ||
185 | <orderedlist> | ||
186 | <listitem><para>From the module load or build-in</para> | ||
187 | <para><constant>kgdboc=<tty-device>,[baud]</constant></para> | ||
188 | <para> | 336 | <para> |
189 | The example here would be if your console port was typically ttyS0, you would use something like <constant>kgdboc=ttyS0,115200</constant> or on the ARM Versatile AB you would likely use <constant>kgdboc=ttyAMA0,115200</constant> | 337 | </para> |
338 | <sect1 id="quickKDBserial"> | ||
339 | <title>Quick start for kdb on a serial port</title> | ||
340 | <para>This is a quick example of how to use kdb.</para> | ||
341 | <para><orderedlist> | ||
342 | <listitem><para>Boot kernel with arguments: | ||
343 | <itemizedlist> | ||
344 | <listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem> | ||
345 | </itemizedlist></para> | ||
346 | <para>OR</para> | ||
347 | <para>Configure kgdboc after the kernel booted; assuming you are using a serial port console: | ||
348 | <itemizedlist> | ||
349 | <listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> | ||
350 | </itemizedlist> | ||
190 | </para> | 351 | </para> |
191 | </listitem> | 352 | </listitem> |
192 | <listitem><para>From sysfs</para> | 353 | <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para> |
193 | <para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para> | 354 | <itemizedlist> |
355 | <listitem><para>When logged in as root or with a super user session you can run:</para> | ||
356 | <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> | ||
357 | <listitem><para>Example using minicom 2.2</para> | ||
358 | <para>Press: <constant>Control-a</constant></para> | ||
359 | <para>Press: <constant>f</constant></para> | ||
360 | <para>Press: <constant>g</constant></para> | ||
194 | </listitem> | 361 | </listitem> |
195 | </orderedlist> | 362 | <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para> |
196 | </para> | 363 | <para>Press: <constant>Control-]</constant></para> |
197 | <para> | 364 | <para>Type in:<constant>send break</constant></para> |
198 | NOTE: Kgdboc does not support interrupting the target via the | 365 | <para>Press: <constant>Enter</constant></para> |
199 | gdb remote protocol. You must manually send a sysrq-g unless you | 366 | <para>Press: <constant>g</constant></para> |
200 | have a proxy that splits console output to a terminal problem and | 367 | </listitem> |
201 | has a separate port for the debugger to connect to that sends the | 368 | </itemizedlist> |
202 | sysrq-g for you. | 369 | </listitem> |
370 | <listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para> | ||
371 | <para>Some useful commands in kdb include: | ||
372 | <itemizedlist> | ||
373 | <listitem><para>lsmod -- Shows where kernel modules are loaded</para></listitem> | ||
374 | <listitem><para>ps -- Displays only the active processes</para></listitem> | ||
375 | <listitem><para>ps A -- Shows all the processes</para></listitem> | ||
376 | <listitem><para>summary -- Shows kernel version info and memory usage</para></listitem> | ||
377 | <listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem> | ||
378 | <listitem><para>dmesg -- View the kernel syslog buffer</para></listitem> | ||
379 | <listitem><para>go -- Continue the system</para></listitem> | ||
380 | </itemizedlist> | ||
203 | </para> | 381 | </para> |
204 | <para>When using kgdboc with no debugger proxy, you can end up | 382 | </listitem> |
205 | connecting the debugger for one of two entry points. If an | 383 | <listitem> |
206 | exception occurs after you have loaded kgdboc a message should print | 384 | <para>When you are done using kdb you need to consider rebooting the |
207 | on the console stating it is waiting for the debugger. In case you | 385 | system or using the "go" command to resuming normal kernel |
208 | disconnect your terminal program and then connect the debugger in | 386 | execution. If you have paused the kernel for a lengthy period of |
209 | its place. If you want to interrupt the target system and forcibly | 387 | time, applications that rely on timely networking or anything to do |
210 | enter a debug session you have to issue a Sysrq sequence and then | 388 | with real wall clock time could be adversely affected, so you |
211 | type the letter <constant>g</constant>. Then you disconnect the | 389 | should take this into consideration when using the kernel |
212 | terminal session and connect gdb. Your options if you don't like | 390 | debugger.</para> |
213 | this are to hack gdb to send the sysrq-g for you as well as on the | 391 | </listitem> |
214 | initial connect, or to use a debugger proxy that allows an | 392 | </orderedlist></para> |
215 | unmodified gdb to do the debugging. | 393 | </sect1> |
394 | <sect1 id="quickKDBkeyboard"> | ||
395 | <title>Quick start for kdb using a keyboard connected console</title> | ||
396 | <para>This is a quick example of how to use kdb with a keyboard.</para> | ||
397 | <para><orderedlist> | ||
398 | <listitem><para>Boot kernel with arguments: | ||
399 | <itemizedlist> | ||
400 | <listitem><para><constant>kgdboc=kbd</constant></para></listitem> | ||
401 | </itemizedlist></para> | ||
402 | <para>OR</para> | ||
403 | <para>Configure kgdboc after the kernel booted: | ||
404 | <itemizedlist> | ||
405 | <listitem><para><constant>echo kbd > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> | ||
406 | </itemizedlist> | ||
216 | </para> | 407 | </para> |
217 | </sect2> | 408 | </listitem> |
409 | <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para> | ||
410 | <itemizedlist> | ||
411 | <listitem><para>When logged in as root or with a super user session you can run:</para> | ||
412 | <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> | ||
413 | <listitem><para>Example using a laptop keyboard</para> | ||
414 | <para>Press and hold down: <constant>Alt</constant></para> | ||
415 | <para>Press and hold down: <constant>Fn</constant></para> | ||
416 | <para>Press and release the key with the label: <constant>SysRq</constant></para> | ||
417 | <para>Release: <constant>Fn</constant></para> | ||
418 | <para>Press and release: <constant>g</constant></para> | ||
419 | <para>Release: <constant>Alt</constant></para> | ||
420 | </listitem> | ||
421 | <listitem><para>Example using a PS/2 101-key keyboard</para> | ||
422 | <para>Press and hold down: <constant>Alt</constant></para> | ||
423 | <para>Press and release the key with the label: <constant>SysRq</constant></para> | ||
424 | <para>Press and release: <constant>g</constant></para> | ||
425 | <para>Release: <constant>Alt</constant></para> | ||
426 | </listitem> | ||
427 | </itemizedlist> | ||
428 | </listitem> | ||
429 | <listitem> | ||
430 | <para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para> | ||
431 | </listitem> | ||
432 | </orderedlist></para> | ||
218 | </sect1> | 433 | </sect1> |
219 | <sect1 id="kgdbcon"> | 434 | </chapter> |
220 | <title>Kernel parameter: kgdbcon</title> | 435 | <chapter id="EnableKGDB"> |
221 | <para> | 436 | <title>Using kgdb / gdb</title> |
222 | Kgdb supports using the gdb serial protocol to send console messages | 437 | <para>In order to use kgdb you must activate it by passing |
223 | to the debugger when the debugger is connected and running. There | 438 | configuration information to one of the kgdb I/O drivers. If you |
224 | are two ways to activate this feature. | 439 | do not pass any configuration information kgdb will not do anything |
440 | at all. Kgdb will only actively hook up to the kernel trap hooks | ||
441 | if a kgdb I/O driver is loaded and configured. If you unconfigure | ||
442 | a kgdb I/O driver, kgdb will unregister all the kernel hook points. | ||
443 | </para> | ||
444 | <para> All kgdb I/O drivers can be reconfigured at run time, if | ||
445 | <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol> | ||
446 | are enabled, by echo'ing a new config string to | ||
447 | <constant>/sys/module/<driver>/parameter/<option></constant>. | ||
448 | The driver can be unconfigured by passing an empty string. You cannot | ||
449 | change the configuration while the debugger is attached. Make sure | ||
450 | to detach the debugger with the <constant>detach</constant> command | ||
451 | prior to trying to unconfigure a kgdb I/O driver. | ||
452 | </para> | ||
453 | <sect1 id="ConnectingGDB"> | ||
454 | <title>Connecting with gdb to a serial port</title> | ||
225 | <orderedlist> | 455 | <orderedlist> |
226 | <listitem><para>Activate with the kernel command line option:</para> | 456 | <listitem><para>Configure kgdboc</para> |
227 | <para><constant>kgdbcon</constant></para> | 457 | <para>Boot kernel with arguments: |
458 | <itemizedlist> | ||
459 | <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem> | ||
460 | </itemizedlist></para> | ||
461 | <para>OR</para> | ||
462 | <para>Configure kgdboc after the kernel booted: | ||
463 | <itemizedlist> | ||
464 | <listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem> | ||
465 | </itemizedlist></para> | ||
228 | </listitem> | 466 | </listitem> |
229 | <listitem><para>Use sysfs before configuring an io driver</para> | 467 | <listitem> |
230 | <para> | 468 | <para>Stop kernel execution (break into the debugger)</para> |
231 | <constant>echo 1 > /sys/module/kgdb/parameters/kgdb_use_con</constant> | 469 | <para>In order to connect to gdb via kgdboc, the kernel must |
232 | </para> | 470 | first be stopped. There are several ways to stop the kernel which |
233 | <para> | 471 | include using kgdbwait as a boot argument, via a sysrq-g, or running |
234 | NOTE: If you do this after you configure the kgdb I/O driver, the | 472 | the kernel until it takes an exception where it waits for the |
235 | setting will not take effect until the next point the I/O is | 473 | debugger to attach. |
236 | reconfigured. | 474 | <itemizedlist> |
237 | </para> | 475 | <listitem><para>When logged in as root or with a super user session you can run:</para> |
476 | <para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem> | ||
477 | <listitem><para>Example using minicom 2.2</para> | ||
478 | <para>Press: <constant>Control-a</constant></para> | ||
479 | <para>Press: <constant>f</constant></para> | ||
480 | <para>Press: <constant>g</constant></para> | ||
238 | </listitem> | 481 | </listitem> |
239 | </orderedlist> | 482 | <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para> |
240 | </para> | 483 | <para>Press: <constant>Control-]</constant></para> |
241 | <para> | 484 | <para>Type in:<constant>send break</constant></para> |
242 | IMPORTANT NOTE: Using this option with kgdb over the console | 485 | <para>Press: <constant>Enter</constant></para> |
243 | (kgdboc) is not supported. | 486 | <para>Press: <constant>g</constant></para> |
487 | </listitem> | ||
488 | </itemizedlist> | ||
244 | </para> | 489 | </para> |
245 | </sect1> | 490 | </listitem> |
246 | </chapter> | 491 | <listitem> |
247 | <chapter id="ConnectingGDB"> | 492 | <para>Connect from from gdb</para> |
248 | <title>Connecting gdb</title> | ||
249 | <para> | ||
250 | If you are using kgdboc, you need to have used kgdbwait as a boot | ||
251 | argument, issued a sysrq-g, or the system you are going to debug | ||
252 | has already taken an exception and is waiting for the debugger to | ||
253 | attach before you can connect gdb. | ||
254 | </para> | ||
255 | <para> | ||
256 | If you are not using different kgdb I/O driver other than kgdboc, | ||
257 | you should be able to connect and the target will automatically | ||
258 | respond. | ||
259 | </para> | ||
260 | <para> | 493 | <para> |
261 | Example (using a serial port): | 494 | Example (using a directly connected port): |
262 | </para> | 495 | </para> |
263 | <programlisting> | 496 | <programlisting> |
264 | % gdb ./vmlinux | 497 | % gdb ./vmlinux |
@@ -266,7 +499,7 @@ | |||
266 | (gdb) target remote /dev/ttyS0 | 499 | (gdb) target remote /dev/ttyS0 |
267 | </programlisting> | 500 | </programlisting> |
268 | <para> | 501 | <para> |
269 | Example (kgdb to a terminal server on tcp port 2012): | 502 | Example (kgdb to a terminal server on TCP port 2012): |
270 | </para> | 503 | </para> |
271 | <programlisting> | 504 | <programlisting> |
272 | % gdb ./vmlinux | 505 | % gdb ./vmlinux |
@@ -283,6 +516,83 @@ | |||
283 | communications. You do this prior to issuing the <constant>target | 516 | communications. You do this prior to issuing the <constant>target |
284 | remote</constant> command by typing in: <constant>set debug remote 1</constant> | 517 | remote</constant> command by typing in: <constant>set debug remote 1</constant> |
285 | </para> | 518 | </para> |
519 | </listitem> | ||
520 | </orderedlist> | ||
521 | <para>Remember if you continue in gdb, and need to "break in" again, | ||
522 | you need to issue an other sysrq-g. It is easy to create a simple | ||
523 | entry point by putting a breakpoint at <constant>sys_sync</constant> | ||
524 | and then you can run "sync" from a shell or script to break into the | ||
525 | debugger.</para> | ||
526 | </sect1> | ||
527 | </chapter> | ||
528 | <chapter id="switchKdbKgdb"> | ||
529 | <title>kgdb and kdb interoperability</title> | ||
530 | <para>It is possible to transition between kdb and kgdb dynamically. | ||
531 | The debug core will remember which you used the last time and | ||
532 | automatically start in the same mode.</para> | ||
533 | <sect1> | ||
534 | <title>Switching between kdb and kgdb</title> | ||
535 | <sect2> | ||
536 | <title>Switching from kgdb to kdb</title> | ||
537 | <para> | ||
538 | There are two ways to switch from kgdb to kdb: you can use gdb to | ||
539 | issue a maintenance packet, or you can blindly type the command $3#33. | ||
540 | Whenever kernel debugger stops in kgdb mode it will print the | ||
541 | message <constant>KGDB or $3#33 for KDB</constant>. It is important | ||
542 | to note that you have to type the sequence correctly in one pass. | ||
543 | You cannot type a backspace or delete because kgdb will interpret | ||
544 | that as part of the debug stream. | ||
545 | <orderedlist> | ||
546 | <listitem><para>Change from kgdb to kdb by blindly typing:</para> | ||
547 | <para><constant>$3#33</constant></para></listitem> | ||
548 | <listitem><para>Change from kgdb to kdb with gdb</para> | ||
549 | <para><constant>maintenance packet 3</constant></para> | ||
550 | <para>NOTE: Now you must kill gdb. Typically you press control-z and | ||
551 | issue the command: kill -9 %</para></listitem> | ||
552 | </orderedlist> | ||
553 | </para> | ||
554 | </sect2> | ||
555 | <sect2> | ||
556 | <title>Change from kdb to kgdb</title> | ||
557 | <para>There are two ways you can change from kdb to kgdb. You can | ||
558 | manually enter kgdb mode by issuing the kgdb command from the kdb | ||
559 | shell prompt, or you can connect gdb while the kdb shell prompt is | ||
560 | active. The kdb shell looks for the typical first commands that gdb | ||
561 | would issue with the gdb remote protocol and if it sees one of those | ||
562 | commands it automatically changes into kgdb mode.</para> | ||
563 | <orderedlist> | ||
564 | <listitem><para>From kdb issue the command:</para> | ||
565 | <para><constant>kgdb</constant></para> | ||
566 | <para>Now disconnect your terminal program and connect gdb in its place</para></listitem> | ||
567 | <listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem> | ||
568 | </orderedlist> | ||
569 | </sect2> | ||
570 | </sect1> | ||
571 | <sect1> | ||
572 | <title>Running kdb commands from gdb</title> | ||
573 | <para>It is possible to run a limited set of kdb commands from gdb, | ||
574 | using the gdb monitor command. You don't want to execute any of the | ||
575 | run control or breakpoint operations, because it can disrupt the | ||
576 | state of the kernel debugger. You should be using gdb for | ||
577 | breakpoints and run control operations if you have gdb connected. | ||
578 | The more useful commands to run are things like lsmod, dmesg, ps or | ||
579 | possibly some of the memory information commands. To see all the kdb | ||
580 | commands you can run <constant>monitor help</constant>.</para> | ||
581 | <para>Example: | ||
582 | <informalexample><programlisting> | ||
583 | (gdb) monitor ps | ||
584 | 1 idle process (state I) and | ||
585 | 27 sleeping system daemon (state M) processes suppressed, | ||
586 | use 'ps A' to see all. | ||
587 | Task Addr Pid Parent [*] cpu State Thread Command | ||
588 | |||
589 | 0xc78291d0 1 0 0 0 S 0xc7829404 init | ||
590 | 0xc7954150 942 1 0 0 S 0xc7954384 dropbear | ||
591 | 0xc78789c0 944 1 0 0 S 0xc7878bf4 sh | ||
592 | (gdb) | ||
593 | </programlisting></informalexample> | ||
594 | </para> | ||
595 | </sect1> | ||
286 | </chapter> | 596 | </chapter> |
287 | <chapter id="KGDBTestSuite"> | 597 | <chapter id="KGDBTestSuite"> |
288 | <title>kgdb Test Suite</title> | 598 | <title>kgdb Test Suite</title> |
@@ -309,34 +619,36 @@ | |||
309 | </para> | 619 | </para> |
310 | </chapter> | 620 | </chapter> |
311 | <chapter id="CommonBackEndReq"> | 621 | <chapter id="CommonBackEndReq"> |
312 | <title>KGDB Internals</title> | 622 | <title>Kernel Debugger Internals</title> |
313 | <sect1 id="kgdbArchitecture"> | 623 | <sect1 id="kgdbArchitecture"> |
314 | <title>Architecture Specifics</title> | 624 | <title>Architecture Specifics</title> |
315 | <para> | 625 | <para> |
316 | Kgdb is organized into three basic components: | 626 | The kernel debugger is organized into a number of components: |
317 | <orderedlist> | 627 | <orderedlist> |
318 | <listitem><para>kgdb core</para> | 628 | <listitem><para>The debug core</para> |
319 | <para> | 629 | <para> |
320 | The kgdb core is found in kernel/kgdb.c. It contains: | 630 | The debug core is found in kernel/debugger/debug_core.c. It contains: |
321 | <itemizedlist> | 631 | <itemizedlist> |
322 | <listitem><para>All the logic to implement the gdb serial protocol</para></listitem> | 632 | <listitem><para>A generic OS exception handler which includes |
323 | <listitem><para>A generic OS exception handler which includes sync'ing the processors into a stopped state on an multi cpu system.</para></listitem> | 633 | sync'ing the processors into a stopped state on an multi-CPU |
634 | system.</para></listitem> | ||
324 | <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem> | 635 | <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem> |
325 | <listitem><para>The API to make calls to the arch specific kgdb implementation</para></listitem> | 636 | <listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem> |
326 | <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem> | 637 | <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem> |
327 | <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem> | 638 | <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem> |
639 | <listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem> | ||
328 | </itemizedlist> | 640 | </itemizedlist> |
329 | </para> | 641 | </para> |
330 | </listitem> | 642 | </listitem> |
331 | <listitem><para>kgdb arch specific implementation</para> | 643 | <listitem><para>kgdb arch-specific implementation</para> |
332 | <para> | 644 | <para> |
333 | This implementation is generally found in arch/*/kernel/kgdb.c. | 645 | This implementation is generally found in arch/*/kernel/kgdb.c. |
334 | As an example, arch/x86/kernel/kgdb.c contains the specifics to | 646 | As an example, arch/x86/kernel/kgdb.c contains the specifics to |
335 | implement HW breakpoint as well as the initialization to | 647 | implement HW breakpoint as well as the initialization to |
336 | dynamically register and unregister for the trap handlers on | 648 | dynamically register and unregister for the trap handlers on |
337 | this architecture. The arch specific portion implements: | 649 | this architecture. The arch-specific portion implements: |
338 | <itemizedlist> | 650 | <itemizedlist> |
339 | <listitem><para>contains an arch specific trap catcher which | 651 | <listitem><para>contains an arch-specific trap catcher which |
340 | invokes kgdb_handle_exception() to start kgdb about doing its | 652 | invokes kgdb_handle_exception() to start kgdb about doing its |
341 | work</para></listitem> | 653 | work</para></listitem> |
342 | <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem> | 654 | <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem> |
@@ -347,11 +659,35 @@ | |||
347 | </itemizedlist> | 659 | </itemizedlist> |
348 | </para> | 660 | </para> |
349 | </listitem> | 661 | </listitem> |
662 | <listitem><para>gdbstub frontend (aka kgdb)</para> | ||
663 | <para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para> | ||
664 | <itemizedlist> | ||
665 | <listitem><para>All the logic to implement the gdb serial protocol</para></listitem> | ||
666 | </itemizedlist> | ||
667 | </listitem> | ||
668 | <listitem><para>kdb frontend</para> | ||
669 | <para>The kdb debugger shell is broken down into a number of | ||
670 | components. The kdb core is located in kernel/debug/kdb. There | ||
671 | are a number of helper functions in some of the other kernel | ||
672 | components to make it possible for kdb to examine and report | ||
673 | information about the kernel without taking locks that could | ||
674 | cause a kernel deadlock. The kdb core contains implements the following functionality.</para> | ||
675 | <itemizedlist> | ||
676 | <listitem><para>A simple shell</para></listitem> | ||
677 | <listitem><para>The kdb core command set</para></listitem> | ||
678 | <listitem><para>A registration API to register additional kdb shell commands.</para> | ||
679 | <para>A good example of a self-contained kdb module is the "ftdump" command for dumping the ftrace buffer. See: kernel/trace/trace_kdb.c</para></listitem> | ||
680 | <listitem><para>The implementation for kdb_printf() which | ||
681 | emits messages directly to I/O drivers, bypassing the kernel | ||
682 | log.</para></listitem> | ||
683 | <listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem> | ||
684 | </itemizedlist> | ||
685 | </listitem> | ||
350 | <listitem><para>kgdb I/O driver</para> | 686 | <listitem><para>kgdb I/O driver</para> |
351 | <para> | 687 | <para> |
352 | Each kgdb I/O driver has to provide an implemenation for the following: | 688 | Each kgdb I/O driver has to provide an implementation for the following: |
353 | <itemizedlist> | 689 | <itemizedlist> |
354 | <listitem><para>configuration via builtin or module</para></listitem> | 690 | <listitem><para>configuration via built-in or module</para></listitem> |
355 | <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem> | 691 | <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem> |
356 | <listitem><para>read and write character interface</para></listitem> | 692 | <listitem><para>read and write character interface</para></listitem> |
357 | <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem> | 693 | <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem> |
@@ -416,15 +752,15 @@ | |||
416 | underlying low level to the hardware driver having "polling hooks" | 752 | underlying low level to the hardware driver having "polling hooks" |
417 | which the to which the tty driver is attached. In the initial | 753 | which the to which the tty driver is attached. In the initial |
418 | implementation of kgdboc it the serial_core was changed to expose a | 754 | implementation of kgdboc it the serial_core was changed to expose a |
419 | low level uart hook for doing polled mode reading and writing of a | 755 | low level UART hook for doing polled mode reading and writing of a |
420 | single character while in an atomic context. When kgdb makes an I/O | 756 | single character while in an atomic context. When kgdb makes an I/O |
421 | request to the debugger, kgdboc invokes a call back in the serial | 757 | request to the debugger, kgdboc invokes a call back in the serial |
422 | core which in turn uses the call back in the uart driver. It is | 758 | core which in turn uses the call back in the UART driver. It is |
423 | certainly possible to extend kgdboc to work with non-uart based | 759 | certainly possible to extend kgdboc to work with non-UART based |
424 | consoles in the future. | 760 | consoles in the future. |
425 | </para> | 761 | </para> |
426 | <para> | 762 | <para> |
427 | When using kgdboc with a uart, the uart driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting> | 763 | When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting> |
428 | #ifdef CONFIG_CONSOLE_POLL | 764 | #ifdef CONFIG_CONSOLE_POLL |
429 | .poll_get_char = serial8250_get_poll_char, | 765 | .poll_get_char = serial8250_get_poll_char, |
430 | .poll_put_char = serial8250_put_poll_char, | 766 | .poll_put_char = serial8250_put_poll_char, |
@@ -434,7 +770,7 @@ | |||
434 | <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above. | 770 | <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above. |
435 | Keep in mind that polling hooks have to be implemented in such a way | 771 | Keep in mind that polling hooks have to be implemented in such a way |
436 | that they can be called from an atomic context and have to restore | 772 | that they can be called from an atomic context and have to restore |
437 | the state of the uart chip on return such that the system can return | 773 | the state of the UART chip on return such that the system can return |
438 | to normal when the debugger detaches. You need to be very careful | 774 | to normal when the debugger detaches. You need to be very careful |
439 | with any kind of lock you consider, because failing here is most | 775 | with any kind of lock you consider, because failing here is most |
440 | going to mean pressing the reset button. | 776 | going to mean pressing the reset button. |
@@ -453,6 +789,10 @@ | |||
453 | <itemizedlist> | 789 | <itemizedlist> |
454 | <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> | 790 | <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> |
455 | </itemizedlist> | 791 | </itemizedlist> |
792 | In Jan 2010 this document was updated to include kdb. | ||
793 | <itemizedlist> | ||
794 | <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> | ||
795 | </itemizedlist> | ||
456 | </para> | 796 | </para> |
457 | </chapter> | 797 | </chapter> |
458 | </book> | 798 | </book> |
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl index ba9975771503..8c5411cfeaf0 100644 --- a/Documentation/DocBook/libata.tmpl +++ b/Documentation/DocBook/libata.tmpl | |||
@@ -81,16 +81,14 @@ void (*port_disable) (struct ata_port *); | |||
81 | </programlisting> | 81 | </programlisting> |
82 | 82 | ||
83 | <para> | 83 | <para> |
84 | Called from ata_bus_probe() and ata_bus_reset() error paths, | 84 | Called from ata_bus_probe() error path, as well as when |
85 | as well as when unregistering from the SCSI module (rmmod, hot | 85 | unregistering from the SCSI module (rmmod, hot unplug). |
86 | unplug). | ||
87 | This function should do whatever needs to be done to take the | 86 | This function should do whatever needs to be done to take the |
88 | port out of use. In most cases, ata_port_disable() can be used | 87 | port out of use. In most cases, ata_port_disable() can be used |
89 | as this hook. | 88 | as this hook. |
90 | </para> | 89 | </para> |
91 | <para> | 90 | <para> |
92 | Called from ata_bus_probe() on a failed probe. | 91 | Called from ata_bus_probe() on a failed probe. |
93 | Called from ata_bus_reset() on a failed bus reset. | ||
94 | Called from ata_scsi_release(). | 92 | Called from ata_scsi_release(). |
95 | </para> | 93 | </para> |
96 | 94 | ||
@@ -107,10 +105,6 @@ void (*dev_config) (struct ata_port *, struct ata_device *); | |||
107 | issue of SET FEATURES - XFER MODE, and prior to operation. | 105 | issue of SET FEATURES - XFER MODE, and prior to operation. |
108 | </para> | 106 | </para> |
109 | <para> | 107 | <para> |
110 | Called by ata_device_add() after ata_dev_identify() determines | ||
111 | a device is present. | ||
112 | </para> | ||
113 | <para> | ||
114 | This entry may be specified as NULL in ata_port_operations. | 108 | This entry may be specified as NULL in ata_port_operations. |
115 | </para> | 109 | </para> |
116 | 110 | ||
@@ -154,8 +148,8 @@ unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned in | |||
154 | 148 | ||
155 | <sect2><title>Taskfile read/write</title> | 149 | <sect2><title>Taskfile read/write</title> |
156 | <programlisting> | 150 | <programlisting> |
157 | void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf); | 151 | void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf); |
158 | void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); | 152 | void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf); |
159 | </programlisting> | 153 | </programlisting> |
160 | 154 | ||
161 | <para> | 155 | <para> |
@@ -164,36 +158,35 @@ void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); | |||
164 | hardware registers / DMA buffers, to obtain the current set of | 158 | hardware registers / DMA buffers, to obtain the current set of |
165 | taskfile register values. | 159 | taskfile register values. |
166 | Most drivers for taskfile-based hardware (PIO or MMIO) use | 160 | Most drivers for taskfile-based hardware (PIO or MMIO) use |
167 | ata_tf_load() and ata_tf_read() for these hooks. | 161 | ata_sff_tf_load() and ata_sff_tf_read() for these hooks. |
168 | </para> | 162 | </para> |
169 | 163 | ||
170 | </sect2> | 164 | </sect2> |
171 | 165 | ||
172 | <sect2><title>PIO data read/write</title> | 166 | <sect2><title>PIO data read/write</title> |
173 | <programlisting> | 167 | <programlisting> |
174 | void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); | 168 | void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); |
175 | </programlisting> | 169 | </programlisting> |
176 | 170 | ||
177 | <para> | 171 | <para> |
178 | All bmdma-style drivers must implement this hook. This is the low-level | 172 | All bmdma-style drivers must implement this hook. This is the low-level |
179 | operation that actually copies the data bytes during a PIO data | 173 | operation that actually copies the data bytes during a PIO data |
180 | transfer. | 174 | transfer. |
181 | Typically the driver | 175 | Typically the driver will choose one of ata_sff_data_xfer_noirq(), |
182 | will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or | 176 | ata_sff_data_xfer(), or ata_sff_data_xfer32(). |
183 | ata_mmio_data_xfer(). | ||
184 | </para> | 177 | </para> |
185 | 178 | ||
186 | </sect2> | 179 | </sect2> |
187 | 180 | ||
188 | <sect2><title>ATA command execute</title> | 181 | <sect2><title>ATA command execute</title> |
189 | <programlisting> | 182 | <programlisting> |
190 | void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf); | 183 | void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf); |
191 | </programlisting> | 184 | </programlisting> |
192 | 185 | ||
193 | <para> | 186 | <para> |
194 | causes an ATA command, previously loaded with | 187 | causes an ATA command, previously loaded with |
195 | ->tf_load(), to be initiated in hardware. | 188 | ->tf_load(), to be initiated in hardware. |
196 | Most drivers for taskfile-based hardware use ata_exec_command() | 189 | Most drivers for taskfile-based hardware use ata_sff_exec_command() |
197 | for this hook. | 190 | for this hook. |
198 | </para> | 191 | </para> |
199 | 192 | ||
@@ -218,8 +211,8 @@ command. | |||
218 | 211 | ||
219 | <sect2><title>Read specific ATA shadow registers</title> | 212 | <sect2><title>Read specific ATA shadow registers</title> |
220 | <programlisting> | 213 | <programlisting> |
221 | u8 (*check_status)(struct ata_port *ap); | 214 | u8 (*sff_check_status)(struct ata_port *ap); |
222 | u8 (*check_altstatus)(struct ata_port *ap); | 215 | u8 (*sff_check_altstatus)(struct ata_port *ap); |
223 | </programlisting> | 216 | </programlisting> |
224 | 217 | ||
225 | <para> | 218 | <para> |
@@ -227,20 +220,26 @@ u8 (*check_altstatus)(struct ata_port *ap); | |||
227 | hardware. On some hardware, reading the Status register has | 220 | hardware. On some hardware, reading the Status register has |
228 | the side effect of clearing the interrupt condition. | 221 | the side effect of clearing the interrupt condition. |
229 | Most drivers for taskfile-based hardware use | 222 | Most drivers for taskfile-based hardware use |
230 | ata_check_status() for this hook. | 223 | ata_sff_check_status() for this hook. |
231 | </para> | 224 | </para> |
225 | |||
226 | </sect2> | ||
227 | |||
228 | <sect2><title>Write specific ATA shadow register</title> | ||
229 | <programlisting> | ||
230 | void (*sff_set_devctl)(struct ata_port *ap, u8 ctl); | ||
231 | </programlisting> | ||
232 | |||
232 | <para> | 233 | <para> |
233 | Note that because this is called from ata_device_add(), at | 234 | Write the device control ATA shadow register to the hardware. |
234 | least a dummy function that clears device interrupts must be | 235 | Most drivers don't need to define this. |
235 | provided for all drivers, even if the controller doesn't | ||
236 | actually have a taskfile status register. | ||
237 | </para> | 236 | </para> |
238 | 237 | ||
239 | </sect2> | 238 | </sect2> |
240 | 239 | ||
241 | <sect2><title>Select ATA device on bus</title> | 240 | <sect2><title>Select ATA device on bus</title> |
242 | <programlisting> | 241 | <programlisting> |
243 | void (*dev_select)(struct ata_port *ap, unsigned int device); | 242 | void (*sff_dev_select)(struct ata_port *ap, unsigned int device); |
244 | </programlisting> | 243 | </programlisting> |
245 | 244 | ||
246 | <para> | 245 | <para> |
@@ -251,9 +250,7 @@ void (*dev_select)(struct ata_port *ap, unsigned int device); | |||
251 | </para> | 250 | </para> |
252 | <para> | 251 | <para> |
253 | Most drivers for taskfile-based hardware use | 252 | Most drivers for taskfile-based hardware use |
254 | ata_std_dev_select() for this hook. Controllers which do not | 253 | ata_sff_dev_select() for this hook. |
255 | support second drives on a port (such as SATA contollers) will | ||
256 | use ata_noop_dev_select(). | ||
257 | </para> | 254 | </para> |
258 | 255 | ||
259 | </sect2> | 256 | </sect2> |
@@ -441,13 +438,13 @@ void (*irq_clear) (struct ata_port *); | |||
441 | to struct ata_host_set. | 438 | to struct ata_host_set. |
442 | </para> | 439 | </para> |
443 | <para> | 440 | <para> |
444 | Most legacy IDE drivers use ata_interrupt() for the | 441 | Most legacy IDE drivers use ata_sff_interrupt() for the |
445 | irq_handler hook, which scans all ports in the host_set, | 442 | irq_handler hook, which scans all ports in the host_set, |
446 | determines which queued command was active (if any), and calls | 443 | determines which queued command was active (if any), and calls |
447 | ata_host_intr(ap,qc). | 444 | ata_sff_host_intr(ap,qc). |
448 | </para> | 445 | </para> |
449 | <para> | 446 | <para> |
450 | Most legacy IDE drivers use ata_bmdma_irq_clear() for the | 447 | Most legacy IDE drivers use ata_sff_irq_clear() for the |
451 | irq_clear() hook, which simply clears the interrupt and error | 448 | irq_clear() hook, which simply clears the interrupt and error |
452 | flags in the DMA status register. | 449 | flags in the DMA status register. |
453 | </para> | 450 | </para> |
@@ -490,16 +487,12 @@ void (*host_stop) (struct ata_host_set *host_set); | |||
490 | allocates space for a legacy IDE PRD table and returns. | 487 | allocates space for a legacy IDE PRD table and returns. |
491 | </para> | 488 | </para> |
492 | <para> | 489 | <para> |
493 | ->port_stop() is called after ->host_stop(). It's sole function | 490 | ->port_stop() is called after ->host_stop(). Its sole function |
494 | is to release DMA/memory resources, now that they are no longer | 491 | is to release DMA/memory resources, now that they are no longer |
495 | actively being used. Many drivers also free driver-private | 492 | actively being used. Many drivers also free driver-private |
496 | data from port at this time. | 493 | data from port at this time. |
497 | </para> | 494 | </para> |
498 | <para> | 495 | <para> |
499 | Many drivers use ata_port_stop() as this hook, which frees the | ||
500 | PRD table. | ||
501 | </para> | ||
502 | <para> | ||
503 | ->host_stop() is called after all ->port_stop() calls | 496 | ->host_stop() is called after all ->port_stop() calls |
504 | have completed. The hook must finalize hardware shutdown, release DMA | 497 | have completed. The hook must finalize hardware shutdown, release DMA |
505 | and other resources, etc. | 498 | and other resources, etc. |
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl index f3f37f141dbd..affb15a344a1 100644 --- a/Documentation/DocBook/mac80211.tmpl +++ b/Documentation/DocBook/mac80211.tmpl | |||
@@ -144,7 +144,7 @@ usage should require reading the full document. | |||
144 | this though and the recommendation to allow only a single | 144 | this though and the recommendation to allow only a single |
145 | interface in STA mode at first! | 145 | interface in STA mode at first! |
146 | </para> | 146 | </para> |
147 | !Finclude/net/mac80211.h ieee80211_if_init_conf | 147 | !Finclude/net/mac80211.h ieee80211_vif |
148 | </chapter> | 148 | </chapter> |
149 | 149 | ||
150 | <chapter id="rx-tx"> | 150 | <chapter id="rx-tx"> |
@@ -234,7 +234,6 @@ usage should require reading the full document. | |||
234 | <title>Multiple queues and QoS support</title> | 234 | <title>Multiple queues and QoS support</title> |
235 | <para>TBD</para> | 235 | <para>TBD</para> |
236 | !Finclude/net/mac80211.h ieee80211_tx_queue_params | 236 | !Finclude/net/mac80211.h ieee80211_tx_queue_params |
237 | !Finclude/net/mac80211.h ieee80211_tx_queue_stats | ||
238 | </chapter> | 237 | </chapter> |
239 | 238 | ||
240 | <chapter id="AP"> | 239 | <chapter id="AP"> |
diff --git a/Documentation/DocBook/media-entities.tmpl b/Documentation/DocBook/media-entities.tmpl index c725cb852c54..5d4d40f429a5 100644 --- a/Documentation/DocBook/media-entities.tmpl +++ b/Documentation/DocBook/media-entities.tmpl | |||
@@ -17,6 +17,7 @@ | |||
17 | <!ENTITY VIDIOC-DBG-G-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_G_REGISTER</constant></link>"> | 17 | <!ENTITY VIDIOC-DBG-G-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_G_REGISTER</constant></link>"> |
18 | <!ENTITY VIDIOC-DBG-S-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_S_REGISTER</constant></link>"> | 18 | <!ENTITY VIDIOC-DBG-S-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_S_REGISTER</constant></link>"> |
19 | <!ENTITY VIDIOC-DQBUF "<link linkend='vidioc-qbuf'><constant>VIDIOC_DQBUF</constant></link>"> | 19 | <!ENTITY VIDIOC-DQBUF "<link linkend='vidioc-qbuf'><constant>VIDIOC_DQBUF</constant></link>"> |
20 | <!ENTITY VIDIOC-DQEVENT "<link linkend='vidioc-dqevent'><constant>VIDIOC_DQEVENT</constant></link>"> | ||
20 | <!ENTITY VIDIOC-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_ENCODER_CMD</constant></link>"> | 21 | <!ENTITY VIDIOC-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_ENCODER_CMD</constant></link>"> |
21 | <!ENTITY VIDIOC-ENUMAUDIO "<link linkend='vidioc-enumaudio'><constant>VIDIOC_ENUMAUDIO</constant></link>"> | 22 | <!ENTITY VIDIOC-ENUMAUDIO "<link linkend='vidioc-enumaudio'><constant>VIDIOC_ENUMAUDIO</constant></link>"> |
22 | <!ENTITY VIDIOC-ENUMAUDOUT "<link linkend='vidioc-enumaudioout'><constant>VIDIOC_ENUMAUDOUT</constant></link>"> | 23 | <!ENTITY VIDIOC-ENUMAUDOUT "<link linkend='vidioc-enumaudioout'><constant>VIDIOC_ENUMAUDOUT</constant></link>"> |
@@ -60,6 +61,7 @@ | |||
60 | <!ENTITY VIDIOC-REQBUFS "<link linkend='vidioc-reqbufs'><constant>VIDIOC_REQBUFS</constant></link>"> | 61 | <!ENTITY VIDIOC-REQBUFS "<link linkend='vidioc-reqbufs'><constant>VIDIOC_REQBUFS</constant></link>"> |
61 | <!ENTITY VIDIOC-STREAMOFF "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMOFF</constant></link>"> | 62 | <!ENTITY VIDIOC-STREAMOFF "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMOFF</constant></link>"> |
62 | <!ENTITY VIDIOC-STREAMON "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMON</constant></link>"> | 63 | <!ENTITY VIDIOC-STREAMON "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMON</constant></link>"> |
64 | <!ENTITY VIDIOC-SUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_SUBSCRIBE_EVENT</constant></link>"> | ||
63 | <!ENTITY VIDIOC-S-AUDIO "<link linkend='vidioc-g-audio'><constant>VIDIOC_S_AUDIO</constant></link>"> | 65 | <!ENTITY VIDIOC-S-AUDIO "<link linkend='vidioc-g-audio'><constant>VIDIOC_S_AUDIO</constant></link>"> |
64 | <!ENTITY VIDIOC-S-AUDOUT "<link linkend='vidioc-g-audioout'><constant>VIDIOC_S_AUDOUT</constant></link>"> | 66 | <!ENTITY VIDIOC-S-AUDOUT "<link linkend='vidioc-g-audioout'><constant>VIDIOC_S_AUDOUT</constant></link>"> |
65 | <!ENTITY VIDIOC-S-CROP "<link linkend='vidioc-g-crop'><constant>VIDIOC_S_CROP</constant></link>"> | 67 | <!ENTITY VIDIOC-S-CROP "<link linkend='vidioc-g-crop'><constant>VIDIOC_S_CROP</constant></link>"> |
@@ -83,6 +85,7 @@ | |||
83 | <!ENTITY VIDIOC-TRY-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_TRY_ENCODER_CMD</constant></link>"> | 85 | <!ENTITY VIDIOC-TRY-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_TRY_ENCODER_CMD</constant></link>"> |
84 | <!ENTITY VIDIOC-TRY-EXT-CTRLS "<link linkend='vidioc-g-ext-ctrls'><constant>VIDIOC_TRY_EXT_CTRLS</constant></link>"> | 86 | <!ENTITY VIDIOC-TRY-EXT-CTRLS "<link linkend='vidioc-g-ext-ctrls'><constant>VIDIOC_TRY_EXT_CTRLS</constant></link>"> |
85 | <!ENTITY VIDIOC-TRY-FMT "<link linkend='vidioc-g-fmt'><constant>VIDIOC_TRY_FMT</constant></link>"> | 87 | <!ENTITY VIDIOC-TRY-FMT "<link linkend='vidioc-g-fmt'><constant>VIDIOC_TRY_FMT</constant></link>"> |
88 | <!ENTITY VIDIOC-UNSUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_UNSUBSCRIBE_EVENT</constant></link>"> | ||
86 | 89 | ||
87 | <!-- Types --> | 90 | <!-- Types --> |
88 | <!ENTITY v4l2-std-id "<link linkend='v4l2-std-id'>v4l2_std_id</link>"> | 91 | <!ENTITY v4l2-std-id "<link linkend='v4l2-std-id'>v4l2_std_id</link>"> |
@@ -141,6 +144,9 @@ | |||
141 | <!ENTITY v4l2-enc-idx "struct <link linkend='v4l2-enc-idx'>v4l2_enc_idx</link>"> | 144 | <!ENTITY v4l2-enc-idx "struct <link linkend='v4l2-enc-idx'>v4l2_enc_idx</link>"> |
142 | <!ENTITY v4l2-enc-idx-entry "struct <link linkend='v4l2-enc-idx-entry'>v4l2_enc_idx_entry</link>"> | 145 | <!ENTITY v4l2-enc-idx-entry "struct <link linkend='v4l2-enc-idx-entry'>v4l2_enc_idx_entry</link>"> |
143 | <!ENTITY v4l2-encoder-cmd "struct <link linkend='v4l2-encoder-cmd'>v4l2_encoder_cmd</link>"> | 146 | <!ENTITY v4l2-encoder-cmd "struct <link linkend='v4l2-encoder-cmd'>v4l2_encoder_cmd</link>"> |
147 | <!ENTITY v4l2-event "struct <link linkend='v4l2-event'>v4l2_event</link>"> | ||
148 | <!ENTITY v4l2-event-subscription "struct <link linkend='v4l2-event-subscription'>v4l2_event_subscription</link>"> | ||
149 | <!ENTITY v4l2-event-vsync "struct <link linkend='v4l2-event-vsync'>v4l2_event_vsync</link>"> | ||
144 | <!ENTITY v4l2-ext-control "struct <link linkend='v4l2-ext-control'>v4l2_ext_control</link>"> | 150 | <!ENTITY v4l2-ext-control "struct <link linkend='v4l2-ext-control'>v4l2_ext_control</link>"> |
145 | <!ENTITY v4l2-ext-controls "struct <link linkend='v4l2-ext-controls'>v4l2_ext_controls</link>"> | 151 | <!ENTITY v4l2-ext-controls "struct <link linkend='v4l2-ext-controls'>v4l2_ext_controls</link>"> |
146 | <!ENTITY v4l2-fmtdesc "struct <link linkend='v4l2-fmtdesc'>v4l2_fmtdesc</link>"> | 152 | <!ENTITY v4l2-fmtdesc "struct <link linkend='v4l2-fmtdesc'>v4l2_fmtdesc</link>"> |
@@ -200,6 +206,7 @@ | |||
200 | <!ENTITY sub-controls SYSTEM "v4l/controls.xml"> | 206 | <!ENTITY sub-controls SYSTEM "v4l/controls.xml"> |
201 | <!ENTITY sub-dev-capture SYSTEM "v4l/dev-capture.xml"> | 207 | <!ENTITY sub-dev-capture SYSTEM "v4l/dev-capture.xml"> |
202 | <!ENTITY sub-dev-codec SYSTEM "v4l/dev-codec.xml"> | 208 | <!ENTITY sub-dev-codec SYSTEM "v4l/dev-codec.xml"> |
209 | <!ENTITY sub-dev-event SYSTEM "v4l/dev-event.xml"> | ||
203 | <!ENTITY sub-dev-effect SYSTEM "v4l/dev-effect.xml"> | 210 | <!ENTITY sub-dev-effect SYSTEM "v4l/dev-effect.xml"> |
204 | <!ENTITY sub-dev-osd SYSTEM "v4l/dev-osd.xml"> | 211 | <!ENTITY sub-dev-osd SYSTEM "v4l/dev-osd.xml"> |
205 | <!ENTITY sub-dev-output SYSTEM "v4l/dev-output.xml"> | 212 | <!ENTITY sub-dev-output SYSTEM "v4l/dev-output.xml"> |
@@ -292,6 +299,8 @@ | |||
292 | <!ENTITY sub-v4l2grab-c SYSTEM "v4l/v4l2grab.c.xml"> | 299 | <!ENTITY sub-v4l2grab-c SYSTEM "v4l/v4l2grab.c.xml"> |
293 | <!ENTITY sub-videodev2-h SYSTEM "v4l/videodev2.h.xml"> | 300 | <!ENTITY sub-videodev2-h SYSTEM "v4l/videodev2.h.xml"> |
294 | <!ENTITY sub-v4l2 SYSTEM "v4l/v4l2.xml"> | 301 | <!ENTITY sub-v4l2 SYSTEM "v4l/v4l2.xml"> |
302 | <!ENTITY sub-dqevent SYSTEM "v4l/vidioc-dqevent.xml"> | ||
303 | <!ENTITY sub-subscribe-event SYSTEM "v4l/vidioc-subscribe-event.xml"> | ||
295 | <!ENTITY sub-intro SYSTEM "dvb/intro.xml"> | 304 | <!ENTITY sub-intro SYSTEM "dvb/intro.xml"> |
296 | <!ENTITY sub-frontend SYSTEM "dvb/frontend.xml"> | 305 | <!ENTITY sub-frontend SYSTEM "dvb/frontend.xml"> |
297 | <!ENTITY sub-dvbproperty SYSTEM "dvb/dvbproperty.xml"> | 306 | <!ENTITY sub-dvbproperty SYSTEM "dvb/dvbproperty.xml"> |
@@ -381,3 +390,5 @@ | |||
381 | <!ENTITY reqbufs SYSTEM "v4l/vidioc-reqbufs.xml"> | 390 | <!ENTITY reqbufs SYSTEM "v4l/vidioc-reqbufs.xml"> |
382 | <!ENTITY s-hw-freq-seek SYSTEM "v4l/vidioc-s-hw-freq-seek.xml"> | 391 | <!ENTITY s-hw-freq-seek SYSTEM "v4l/vidioc-s-hw-freq-seek.xml"> |
383 | <!ENTITY streamon SYSTEM "v4l/vidioc-streamon.xml"> | 392 | <!ENTITY streamon SYSTEM "v4l/vidioc-streamon.xml"> |
393 | <!ENTITY dqevent SYSTEM "v4l/vidioc-dqevent.xml"> | ||
394 | <!ENTITY subscribe_event SYSTEM "v4l/vidioc-subscribe-event.xml"> | ||
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 5e7d84b48505..020ac80d4682 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl | |||
@@ -269,7 +269,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd) | |||
269 | information about the device. | 269 | information about the device. |
270 | </para> | 270 | </para> |
271 | <programlisting> | 271 | <programlisting> |
272 | int __init board_init (void) | 272 | static int __init board_init (void) |
273 | { | 273 | { |
274 | struct nand_chip *this; | 274 | struct nand_chip *this; |
275 | int err = 0; | 275 | int err = 0; |
@@ -488,7 +488,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip) | |||
488 | The ECC bytes must be placed immidiately after the data | 488 | The ECC bytes must be placed immidiately after the data |
489 | bytes in order to make the syndrome generator work. This | 489 | bytes in order to make the syndrome generator work. This |
490 | is contrary to the usual layout used by software ECC. The | 490 | is contrary to the usual layout used by software ECC. The |
491 | seperation of data and out of band area is not longer | 491 | separation of data and out of band area is not longer |
492 | possible. The nand driver code handles this layout and | 492 | possible. The nand driver code handles this layout and |
493 | the remaining free bytes in the oob area are managed by | 493 | the remaining free bytes in the oob area are managed by |
494 | the autoplacement code. Provide a matching oob-layout | 494 | the autoplacement code. Provide a matching oob-layout |
@@ -560,7 +560,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip) | |||
560 | bad blocks. They have factory marked good blocks. The marker pattern | 560 | bad blocks. They have factory marked good blocks. The marker pattern |
561 | is erased when the block is erased to be reused. So in case of | 561 | is erased when the block is erased to be reused. So in case of |
562 | powerloss before writing the pattern back to the chip this block | 562 | powerloss before writing the pattern back to the chip this block |
563 | would be lost and added to the bad blocks. Therefor we scan the | 563 | would be lost and added to the bad blocks. Therefore we scan the |
564 | chip(s) when we detect them the first time for good blocks and | 564 | chip(s) when we detect them the first time for good blocks and |
565 | store this information in a bad block table before erasing any | 565 | store this information in a bad block table before erasing any |
566 | of the blocks. | 566 | of the blocks. |
@@ -1094,7 +1094,7 @@ in this page</entry> | |||
1094 | manufacturers specifications. This applies similar to the spare area. | 1094 | manufacturers specifications. This applies similar to the spare area. |
1095 | </para> | 1095 | </para> |
1096 | <para> | 1096 | <para> |
1097 | Therefor NAND aware filesystems must either write in page size chunks | 1097 | Therefore NAND aware filesystems must either write in page size chunks |
1098 | or hold a writebuffer to collect smaller writes until they sum up to | 1098 | or hold a writebuffer to collect smaller writes until they sum up to |
1099 | pagesize. Available NAND aware filesystems: JFFS2, YAFFS. | 1099 | pagesize. Available NAND aware filesystems: JFFS2, YAFFS. |
1100 | </para> | 1100 | </para> |
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl index 0c3dc4c69dd1..d858d92cf6d9 100644 --- a/Documentation/DocBook/sh.tmpl +++ b/Documentation/DocBook/sh.tmpl | |||
@@ -19,13 +19,17 @@ | |||
19 | </authorgroup> | 19 | </authorgroup> |
20 | 20 | ||
21 | <copyright> | 21 | <copyright> |
22 | <year>2008</year> | 22 | <year>2008-2010</year> |
23 | <holder>Paul Mundt</holder> | 23 | <holder>Paul Mundt</holder> |
24 | </copyright> | 24 | </copyright> |
25 | <copyright> | 25 | <copyright> |
26 | <year>2008</year> | 26 | <year>2008-2010</year> |
27 | <holder>Renesas Technology Corp.</holder> | 27 | <holder>Renesas Technology Corp.</holder> |
28 | </copyright> | 28 | </copyright> |
29 | <copyright> | ||
30 | <year>2010</year> | ||
31 | <holder>Renesas Electronics Corp.</holder> | ||
32 | </copyright> | ||
29 | 33 | ||
30 | <legalnotice> | 34 | <legalnotice> |
31 | <para> | 35 | <para> |
@@ -77,7 +81,7 @@ | |||
77 | </chapter> | 81 | </chapter> |
78 | <chapter id="clk"> | 82 | <chapter id="clk"> |
79 | <title>Clock Framework Extensions</title> | 83 | <title>Clock Framework Extensions</title> |
80 | !Iarch/sh/include/asm/clock.h | 84 | !Iinclude/linux/sh_clk.h |
81 | </chapter> | 85 | </chapter> |
82 | <chapter id="mach"> | 86 | <chapter id="mach"> |
83 | <title>Machine Specific Interfaces</title> | 87 | <title>Machine Specific Interfaces</title> |
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl index 8bca1d5cec09..e8473eae2a20 100644 --- a/Documentation/DocBook/tracepoint.tmpl +++ b/Documentation/DocBook/tracepoint.tmpl | |||
@@ -16,6 +16,15 @@ | |||
16 | </address> | 16 | </address> |
17 | </affiliation> | 17 | </affiliation> |
18 | </author> | 18 | </author> |
19 | <author> | ||
20 | <firstname>William</firstname> | ||
21 | <surname>Cohen</surname> | ||
22 | <affiliation> | ||
23 | <address> | ||
24 | <email>wcohen@redhat.com</email> | ||
25 | </address> | ||
26 | </affiliation> | ||
27 | </author> | ||
19 | </authorgroup> | 28 | </authorgroup> |
20 | 29 | ||
21 | <legalnotice> | 30 | <legalnotice> |
@@ -91,4 +100,8 @@ | |||
91 | !Iinclude/trace/events/signal.h | 100 | !Iinclude/trace/events/signal.h |
92 | </chapter> | 101 | </chapter> |
93 | 102 | ||
103 | <chapter id="block"> | ||
104 | <title>Block IO</title> | ||
105 | !Iinclude/trace/events/block.h | ||
106 | </chapter> | ||
94 | </book> | 107 | </book> |
diff --git a/Documentation/DocBook/v4l/common.xml b/Documentation/DocBook/v4l/common.xml index c65f0ac9b6ee..cea23e1c4fc6 100644 --- a/Documentation/DocBook/v4l/common.xml +++ b/Documentation/DocBook/v4l/common.xml | |||
@@ -1170,7 +1170,7 @@ frames per second. If less than this number of frames is to be | |||
1170 | captured or output, applications can request frame skipping or | 1170 | captured or output, applications can request frame skipping or |
1171 | duplicating on the driver side. This is especially useful when using | 1171 | duplicating on the driver side. This is especially useful when using |
1172 | the &func-read; or &func-write;, which are not augmented by timestamps | 1172 | the &func-read; or &func-write;, which are not augmented by timestamps |
1173 | or sequence counters, and to avoid unneccessary data copying.</para> | 1173 | or sequence counters, and to avoid unnecessary data copying.</para> |
1174 | 1174 | ||
1175 | <para>Finally these ioctls can be used to determine the number of | 1175 | <para>Finally these ioctls can be used to determine the number of |
1176 | buffers used internally by a driver in read/write mode. For | 1176 | buffers used internally by a driver in read/write mode. For |
diff --git a/Documentation/DocBook/v4l/compat.xml b/Documentation/DocBook/v4l/compat.xml index b9dbdf9e6d29..b42b935913cd 100644 --- a/Documentation/DocBook/v4l/compat.xml +++ b/Documentation/DocBook/v4l/compat.xml | |||
@@ -2332,15 +2332,26 @@ more information.</para> | |||
2332 | </listitem> | 2332 | </listitem> |
2333 | </orderedlist> | 2333 | </orderedlist> |
2334 | </section> | 2334 | </section> |
2335 | </section> | 2335 | <section> |
2336 | <title>V4L2 in Linux 2.6.34</title> | ||
2337 | <orderedlist> | ||
2338 | <listitem> | ||
2339 | <para>Added | ||
2340 | <constant>V4L2_CID_IRIS_ABSOLUTE</constant> and | ||
2341 | <constant>V4L2_CID_IRIS_RELATIVE</constant> controls to the | ||
2342 | <link linkend="camera-controls">Camera controls class</link>. | ||
2343 | </para> | ||
2344 | </listitem> | ||
2345 | </orderedlist> | ||
2346 | </section> | ||
2336 | 2347 | ||
2337 | <section id="other"> | 2348 | <section id="other"> |
2338 | <title>Relation of V4L2 to other Linux multimedia APIs</title> | 2349 | <title>Relation of V4L2 to other Linux multimedia APIs</title> |
2339 | 2350 | ||
2340 | <section id="xvideo"> | 2351 | <section id="xvideo"> |
2341 | <title>X Video Extension</title> | 2352 | <title>X Video Extension</title> |
2342 | 2353 | ||
2343 | <para>The X Video Extension (abbreviated XVideo or just Xv) is | 2354 | <para>The X Video Extension (abbreviated XVideo or just Xv) is |
2344 | an extension of the X Window system, implemented for example by the | 2355 | an extension of the X Window system, implemented for example by the |
2345 | XFree86 project. Its scope is similar to V4L2, an API to video capture | 2356 | XFree86 project. Its scope is similar to V4L2, an API to video capture |
2346 | and output devices for X clients. Xv allows applications to display | 2357 | and output devices for X clients. Xv allows applications to display |
@@ -2351,7 +2362,7 @@ capture or output still images in XPixmaps<footnote> | |||
2351 | extension available across many operating systems and | 2362 | extension available across many operating systems and |
2352 | architectures.</para> | 2363 | architectures.</para> |
2353 | 2364 | ||
2354 | <para>Because the driver is embedded into the X server Xv has a | 2365 | <para>Because the driver is embedded into the X server Xv has a |
2355 | number of advantages over the V4L2 <link linkend="overlay">video | 2366 | number of advantages over the V4L2 <link linkend="overlay">video |
2356 | overlay interface</link>. The driver can easily determine the overlay | 2367 | overlay interface</link>. The driver can easily determine the overlay |
2357 | target, &ie; visible graphics memory or off-screen buffers for a | 2368 | target, &ie; visible graphics memory or off-screen buffers for a |
@@ -2360,16 +2371,16 @@ overlay, scaling or color-keying, or the clipping functions of the | |||
2360 | video capture hardware, always in sync with drawing operations or | 2371 | video capture hardware, always in sync with drawing operations or |
2361 | windows moving or changing their stacking order.</para> | 2372 | windows moving or changing their stacking order.</para> |
2362 | 2373 | ||
2363 | <para>To combine the advantages of Xv and V4L a special Xv | 2374 | <para>To combine the advantages of Xv and V4L a special Xv |
2364 | driver exists in XFree86 and XOrg, just programming any overlay capable | 2375 | driver exists in XFree86 and XOrg, just programming any overlay capable |
2365 | Video4Linux device it finds. To enable it | 2376 | Video4Linux device it finds. To enable it |
2366 | <filename>/etc/X11/XF86Config</filename> must contain these lines:</para> | 2377 | <filename>/etc/X11/XF86Config</filename> must contain these lines:</para> |
2367 | <para><screen> | 2378 | <para><screen> |
2368 | Section "Module" | 2379 | Section "Module" |
2369 | Load "v4l" | 2380 | Load "v4l" |
2370 | EndSection</screen></para> | 2381 | EndSection</screen></para> |
2371 | 2382 | ||
2372 | <para>As of XFree86 4.2 this driver still supports only V4L | 2383 | <para>As of XFree86 4.2 this driver still supports only V4L |
2373 | ioctls, however it should work just fine with all V4L2 devices through | 2384 | ioctls, however it should work just fine with all V4L2 devices through |
2374 | the V4L2 backward-compatibility layer. Since V4L2 permits multiple | 2385 | the V4L2 backward-compatibility layer. Since V4L2 permits multiple |
2375 | opens it is possible (if supported by the V4L2 driver) to capture | 2386 | opens it is possible (if supported by the V4L2 driver) to capture |
@@ -2377,83 +2388,84 @@ video while an X client requested video overlay. Restrictions of | |||
2377 | simultaneous capturing and overlay are discussed in <xref | 2388 | simultaneous capturing and overlay are discussed in <xref |
2378 | linkend="overlay" /> apply.</para> | 2389 | linkend="overlay" /> apply.</para> |
2379 | 2390 | ||
2380 | <para>Only marginally related to V4L2, XFree86 extended Xv to | 2391 | <para>Only marginally related to V4L2, XFree86 extended Xv to |
2381 | support hardware YUV to RGB conversion and scaling for faster video | 2392 | support hardware YUV to RGB conversion and scaling for faster video |
2382 | playback, and added an interface to MPEG-2 decoding hardware. This API | 2393 | playback, and added an interface to MPEG-2 decoding hardware. This API |
2383 | is useful to display images captured with V4L2 devices.</para> | 2394 | is useful to display images captured with V4L2 devices.</para> |
2384 | </section> | 2395 | </section> |
2385 | 2396 | ||
2386 | <section> | 2397 | <section> |
2387 | <title>Digital Video</title> | 2398 | <title>Digital Video</title> |
2388 | 2399 | ||
2389 | <para>V4L2 does not support digital terrestrial, cable or | 2400 | <para>V4L2 does not support digital terrestrial, cable or |
2390 | satellite broadcast. A separate project aiming at digital receivers | 2401 | satellite broadcast. A separate project aiming at digital receivers |
2391 | exists. You can find its homepage at <ulink | 2402 | exists. You can find its homepage at <ulink |
2392 | url="http://linuxtv.org">http://linuxtv.org</ulink>. The Linux DVB API | 2403 | url="http://linuxtv.org">http://linuxtv.org</ulink>. The Linux DVB API |
2393 | has no connection to the V4L2 API except that drivers for hybrid | 2404 | has no connection to the V4L2 API except that drivers for hybrid |
2394 | hardware may support both.</para> | 2405 | hardware may support both.</para> |
2395 | </section> | 2406 | </section> |
2396 | 2407 | ||
2397 | <section> | 2408 | <section> |
2398 | <title>Audio Interfaces</title> | 2409 | <title>Audio Interfaces</title> |
2399 | 2410 | ||
2400 | <para>[to do - OSS/ALSA]</para> | 2411 | <para>[to do - OSS/ALSA]</para> |
2412 | </section> | ||
2401 | </section> | 2413 | </section> |
2402 | </section> | ||
2403 | 2414 | ||
2404 | <section id="experimental"> | 2415 | <section id="experimental"> |
2405 | <title>Experimental API Elements</title> | 2416 | <title>Experimental API Elements</title> |
2406 | 2417 | ||
2407 | <para>The following V4L2 API elements are currently experimental | 2418 | <para>The following V4L2 API elements are currently experimental |
2408 | and may change in the future.</para> | 2419 | and may change in the future.</para> |
2409 | 2420 | ||
2410 | <itemizedlist> | 2421 | <itemizedlist> |
2411 | <listitem> | 2422 | <listitem> |
2412 | <para>Video Output Overlay (OSD) Interface, <xref | 2423 | <para>Video Output Overlay (OSD) Interface, <xref |
2413 | linkend="osd" />.</para> | 2424 | linkend="osd" />.</para> |
2414 | </listitem> | 2425 | </listitem> |
2415 | <listitem> | 2426 | <listitem> |
2416 | <para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>, | 2427 | <para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>, |
2417 | &v4l2-buf-type;, <xref linkend="v4l2-buf-type" />.</para> | 2428 | &v4l2-buf-type;, <xref linkend="v4l2-buf-type" />.</para> |
2418 | </listitem> | 2429 | </listitem> |
2419 | <listitem> | 2430 | <listitem> |
2420 | <para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>, | 2431 | <para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>, |
2421 | &VIDIOC-QUERYCAP; ioctl, <xref linkend="device-capabilities" />.</para> | 2432 | &VIDIOC-QUERYCAP; ioctl, <xref linkend="device-capabilities" />.</para> |
2422 | </listitem> | 2433 | </listitem> |
2423 | <listitem> | 2434 | <listitem> |
2424 | <para>&VIDIOC-ENUM-FRAMESIZES; and | 2435 | <para>&VIDIOC-ENUM-FRAMESIZES; and |
2425 | &VIDIOC-ENUM-FRAMEINTERVALS; ioctls.</para> | 2436 | &VIDIOC-ENUM-FRAMEINTERVALS; ioctls.</para> |
2426 | </listitem> | 2437 | </listitem> |
2427 | <listitem> | 2438 | <listitem> |
2428 | <para>&VIDIOC-G-ENC-INDEX; ioctl.</para> | 2439 | <para>&VIDIOC-G-ENC-INDEX; ioctl.</para> |
2429 | </listitem> | 2440 | </listitem> |
2430 | <listitem> | 2441 | <listitem> |
2431 | <para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD; | 2442 | <para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD; |
2432 | ioctls.</para> | 2443 | ioctls.</para> |
2433 | </listitem> | 2444 | </listitem> |
2434 | <listitem> | 2445 | <listitem> |
2435 | <para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER; | 2446 | <para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER; |
2436 | ioctls.</para> | 2447 | ioctls.</para> |
2437 | </listitem> | 2448 | </listitem> |
2438 | <listitem> | 2449 | <listitem> |
2439 | <para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para> | 2450 | <para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para> |
2440 | </listitem> | 2451 | </listitem> |
2441 | </itemizedlist> | 2452 | </itemizedlist> |
2442 | </section> | 2453 | </section> |
2443 | 2454 | ||
2444 | <section id="obsolete"> | 2455 | <section id="obsolete"> |
2445 | <title>Obsolete API Elements</title> | 2456 | <title>Obsolete API Elements</title> |
2446 | 2457 | ||
2447 | <para>The following V4L2 API elements were superseded by new | 2458 | <para>The following V4L2 API elements were superseded by new |
2448 | interfaces and should not be implemented in new drivers.</para> | 2459 | interfaces and should not be implemented in new drivers.</para> |
2449 | 2460 | ||
2450 | <itemizedlist> | 2461 | <itemizedlist> |
2451 | <listitem> | 2462 | <listitem> |
2452 | <para><constant>VIDIOC_G_MPEGCOMP</constant> and | 2463 | <para><constant>VIDIOC_G_MPEGCOMP</constant> and |
2453 | <constant>VIDIOC_S_MPEGCOMP</constant> ioctls. Use Extended Controls, | 2464 | <constant>VIDIOC_S_MPEGCOMP</constant> ioctls. Use Extended Controls, |
2454 | <xref linkend="extended-controls" />.</para> | 2465 | <xref linkend="extended-controls" />.</para> |
2455 | </listitem> | 2466 | </listitem> |
2456 | </itemizedlist> | 2467 | </itemizedlist> |
2468 | </section> | ||
2457 | </section> | 2469 | </section> |
2458 | 2470 | ||
2459 | <!-- | 2471 | <!-- |
diff --git a/Documentation/DocBook/v4l/controls.xml b/Documentation/DocBook/v4l/controls.xml index f46450610412..8408caaee276 100644 --- a/Documentation/DocBook/v4l/controls.xml +++ b/Documentation/DocBook/v4l/controls.xml | |||
@@ -267,6 +267,12 @@ minimum value disables backlight compensation.</entry> | |||
267 | <entry>Chroma automatic gain control.</entry> | 267 | <entry>Chroma automatic gain control.</entry> |
268 | </row> | 268 | </row> |
269 | <row> | 269 | <row> |
270 | <entry><constant>V4L2_CID_CHROMA_GAIN</constant></entry> | ||
271 | <entry>integer</entry> | ||
272 | <entry>Adjusts the Chroma gain control (for use when chroma AGC | ||
273 | is disabled).</entry> | ||
274 | </row> | ||
275 | <row> | ||
270 | <entry><constant>V4L2_CID_COLOR_KILLER</constant></entry> | 276 | <entry><constant>V4L2_CID_COLOR_KILLER</constant></entry> |
271 | <entry>boolean</entry> | 277 | <entry>boolean</entry> |
272 | <entry>Enable the color killer (&ie; force a black & white image in case of a weak video signal).</entry> | 278 | <entry>Enable the color killer (&ie; force a black & white image in case of a weak video signal).</entry> |
@@ -277,8 +283,15 @@ minimum value disables backlight compensation.</entry> | |||
277 | <entry>Selects a color effect. Possible values for | 283 | <entry>Selects a color effect. Possible values for |
278 | <constant>enum v4l2_colorfx</constant> are: | 284 | <constant>enum v4l2_colorfx</constant> are: |
279 | <constant>V4L2_COLORFX_NONE</constant> (0), | 285 | <constant>V4L2_COLORFX_NONE</constant> (0), |
280 | <constant>V4L2_COLORFX_BW</constant> (1) and | 286 | <constant>V4L2_COLORFX_BW</constant> (1), |
281 | <constant>V4L2_COLORFX_SEPIA</constant> (2).</entry> | 287 | <constant>V4L2_COLORFX_SEPIA</constant> (2), |
288 | <constant>V4L2_COLORFX_NEGATIVE</constant> (3), | ||
289 | <constant>V4L2_COLORFX_EMBOSS</constant> (4), | ||
290 | <constant>V4L2_COLORFX_SKETCH</constant> (5), | ||
291 | <constant>V4L2_COLORFX_SKY_BLUE</constant> (6), | ||
292 | <constant>V4L2_COLORFX_GRASS_GREEN</constant> (7), | ||
293 | <constant>V4L2_COLORFX_SKIN_WHITEN</constant> (8) and | ||
294 | <constant>V4L2_COLORFX_VIVID</constant> (9).</entry> | ||
282 | </row> | 295 | </row> |
283 | <row> | 296 | <row> |
284 | <entry><constant>V4L2_CID_ROTATE</constant></entry> | 297 | <entry><constant>V4L2_CID_ROTATE</constant></entry> |
@@ -1825,6 +1838,25 @@ wide-angle direction. The zoom speed unit is driver-specific.</entry> | |||
1825 | <row><entry></entry></row> | 1838 | <row><entry></entry></row> |
1826 | 1839 | ||
1827 | <row> | 1840 | <row> |
1841 | <entry spanname="id"><constant>V4L2_CID_IRIS_ABSOLUTE</constant> </entry> | ||
1842 | <entry>integer</entry> | ||
1843 | </row><row><entry spanname="descr">This control sets the | ||
1844 | camera's aperture to the specified value. The unit is undefined. | ||
1845 | Larger values open the iris wider, smaller values close it.</entry> | ||
1846 | </row> | ||
1847 | <row><entry></entry></row> | ||
1848 | |||
1849 | <row> | ||
1850 | <entry spanname="id"><constant>V4L2_CID_IRIS_RELATIVE</constant> </entry> | ||
1851 | <entry>integer</entry> | ||
1852 | </row><row><entry spanname="descr">This control modifies the | ||
1853 | camera's aperture by the specified amount. The unit is undefined. | ||
1854 | Positive values open the iris one step further, negative values close | ||
1855 | it one step further. This is a write-only control.</entry> | ||
1856 | </row> | ||
1857 | <row><entry></entry></row> | ||
1858 | |||
1859 | <row> | ||
1828 | <entry spanname="id"><constant>V4L2_CID_PRIVACY</constant> </entry> | 1860 | <entry spanname="id"><constant>V4L2_CID_PRIVACY</constant> </entry> |
1829 | <entry>boolean</entry> | 1861 | <entry>boolean</entry> |
1830 | </row><row><entry spanname="descr">Prevent video from being acquired | 1862 | </row><row><entry spanname="descr">Prevent video from being acquired |
diff --git a/Documentation/DocBook/v4l/dev-event.xml b/Documentation/DocBook/v4l/dev-event.xml new file mode 100644 index 000000000000..be5a98fb4fab --- /dev/null +++ b/Documentation/DocBook/v4l/dev-event.xml | |||
@@ -0,0 +1,31 @@ | |||
1 | <title>Event Interface</title> | ||
2 | |||
3 | <para>The V4L2 event interface provides means for user to get | ||
4 | immediately notified on certain conditions taking place on a device. | ||
5 | This might include start of frame or loss of signal events, for | ||
6 | example. | ||
7 | </para> | ||
8 | |||
9 | <para>To receive events, the events the user is interested in first must | ||
10 | be subscribed using the &VIDIOC-SUBSCRIBE-EVENT; ioctl. Once an event is | ||
11 | subscribed, the events of subscribed types are dequeueable using the | ||
12 | &VIDIOC-DQEVENT; ioctl. Events may be unsubscribed using | ||
13 | VIDIOC_UNSUBSCRIBE_EVENT ioctl. The special event type V4L2_EVENT_ALL may | ||
14 | be used to unsubscribe all the events the driver supports.</para> | ||
15 | |||
16 | <para>The event subscriptions and event queues are specific to file | ||
17 | handles. Subscribing an event on one file handle does not affect | ||
18 | other file handles. | ||
19 | </para> | ||
20 | |||
21 | <para>The information on dequeueable events is obtained by using select or | ||
22 | poll system calls on video devices. The V4L2 events use POLLPRI events on | ||
23 | poll system call and exceptions on select system call. </para> | ||
24 | |||
25 | <!-- | ||
26 | Local Variables: | ||
27 | mode: sgml | ||
28 | sgml-parent-document: "v4l2.sgml" | ||
29 | indent-tabs-mode: nil | ||
30 | End: | ||
31 | --> | ||
diff --git a/Documentation/DocBook/v4l/io.xml b/Documentation/DocBook/v4l/io.xml index f92f24323b2a..d424886beda0 100644 --- a/Documentation/DocBook/v4l/io.xml +++ b/Documentation/DocBook/v4l/io.xml | |||
@@ -589,7 +589,8 @@ number of a video input as in &v4l2-input; field | |||
589 | <entry></entry> | 589 | <entry></entry> |
590 | <entry>A place holder for future extensions and custom | 590 | <entry>A place holder for future extensions and custom |
591 | (driver defined) buffer types | 591 | (driver defined) buffer types |
592 | <constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher.</entry> | 592 | <constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher. Applications |
593 | should set this to 0.</entry> | ||
593 | </row> | 594 | </row> |
594 | </tbody> | 595 | </tbody> |
595 | </tgroup> | 596 | </tgroup> |
@@ -701,6 +702,16 @@ They can be both cleared however, then the buffer is in "dequeued" | |||
701 | state, in the application domain to say so.</entry> | 702 | state, in the application domain to say so.</entry> |
702 | </row> | 703 | </row> |
703 | <row> | 704 | <row> |
705 | <entry><constant>V4L2_BUF_FLAG_ERROR</constant></entry> | ||
706 | <entry>0x0040</entry> | ||
707 | <entry>When this flag is set, the buffer has been dequeued | ||
708 | successfully, although the data might have been corrupted. | ||
709 | This is recoverable, streaming may continue as normal and | ||
710 | the buffer may be reused normally. | ||
711 | Drivers set this flag when the <constant>VIDIOC_DQBUF</constant> | ||
712 | ioctl is called.</entry> | ||
713 | </row> | ||
714 | <row> | ||
704 | <entry><constant>V4L2_BUF_FLAG_KEYFRAME</constant></entry> | 715 | <entry><constant>V4L2_BUF_FLAG_KEYFRAME</constant></entry> |
705 | <entry>0x0008</entry> | 716 | <entry>0x0008</entry> |
706 | <entry>Drivers set or clear this flag when calling the | 717 | <entry>Drivers set or clear this flag when calling the |
@@ -917,8 +928,8 @@ order</emphasis>.</para> | |||
917 | 928 | ||
918 | <para>When the driver provides or accepts images field by field | 929 | <para>When the driver provides or accepts images field by field |
919 | rather than interleaved, it is also important applications understand | 930 | rather than interleaved, it is also important applications understand |
920 | how the fields combine to frames. We distinguish between top and | 931 | how the fields combine to frames. We distinguish between top (aka odd) and |
921 | bottom fields, the <emphasis>spatial order</emphasis>: The first line | 932 | bottom (aka even) fields, the <emphasis>spatial order</emphasis>: The first line |
922 | of the top field is the first line of an interlaced frame, the first | 933 | of the top field is the first line of an interlaced frame, the first |
923 | line of the bottom field is the second line of that frame.</para> | 934 | line of the bottom field is the second line of that frame.</para> |
924 | 935 | ||
@@ -971,12 +982,12 @@ between <constant>V4L2_FIELD_TOP</constant> and | |||
971 | <row> | 982 | <row> |
972 | <entry><constant>V4L2_FIELD_TOP</constant></entry> | 983 | <entry><constant>V4L2_FIELD_TOP</constant></entry> |
973 | <entry>2</entry> | 984 | <entry>2</entry> |
974 | <entry>Images consist of the top field only.</entry> | 985 | <entry>Images consist of the top (aka odd) field only.</entry> |
975 | </row> | 986 | </row> |
976 | <row> | 987 | <row> |
977 | <entry><constant>V4L2_FIELD_BOTTOM</constant></entry> | 988 | <entry><constant>V4L2_FIELD_BOTTOM</constant></entry> |
978 | <entry>3</entry> | 989 | <entry>3</entry> |
979 | <entry>Images consist of the bottom field only. | 990 | <entry>Images consist of the bottom (aka even) field only. |
980 | Applications may wish to prevent a device from capturing interlaced | 991 | Applications may wish to prevent a device from capturing interlaced |
981 | images because they will have "comb" or "feathering" artefacts around | 992 | images because they will have "comb" or "feathering" artefacts around |
982 | moving objects.</entry> | 993 | moving objects.</entry> |
diff --git a/Documentation/DocBook/v4l/pixfmt.xml b/Documentation/DocBook/v4l/pixfmt.xml index 885968d6a2fc..c4ad0a8e42dc 100644 --- a/Documentation/DocBook/v4l/pixfmt.xml +++ b/Documentation/DocBook/v4l/pixfmt.xml | |||
@@ -792,6 +792,18 @@ http://www.thedirks.org/winnov/</ulink></para></entry> | |||
792 | <entry>'YYUV'</entry> | 792 | <entry>'YYUV'</entry> |
793 | <entry>unknown</entry> | 793 | <entry>unknown</entry> |
794 | </row> | 794 | </row> |
795 | <row id="V4L2-PIX-FMT-Y4"> | ||
796 | <entry><constant>V4L2_PIX_FMT_Y4</constant></entry> | ||
797 | <entry>'Y04 '</entry> | ||
798 | <entry>Old 4-bit greyscale format. Only the least significant 4 bits of each byte are used, | ||
799 | the other bits are set to 0.</entry> | ||
800 | </row> | ||
801 | <row id="V4L2-PIX-FMT-Y6"> | ||
802 | <entry><constant>V4L2_PIX_FMT_Y6</constant></entry> | ||
803 | <entry>'Y06 '</entry> | ||
804 | <entry>Old 6-bit greyscale format. Only the least significant 6 bits of each byte are used, | ||
805 | the other bits are set to 0.</entry> | ||
806 | </row> | ||
795 | </tbody> | 807 | </tbody> |
796 | </tgroup> | 808 | </tgroup> |
797 | </table> | 809 | </table> |
diff --git a/Documentation/DocBook/v4l/v4l2.xml b/Documentation/DocBook/v4l/v4l2.xml index 060105af49e5..9737243377a3 100644 --- a/Documentation/DocBook/v4l/v4l2.xml +++ b/Documentation/DocBook/v4l/v4l2.xml | |||
@@ -401,6 +401,7 @@ and discussions on the V4L mailing list.</revremark> | |||
401 | <section id="ttx"> &sub-dev-teletext; </section> | 401 | <section id="ttx"> &sub-dev-teletext; </section> |
402 | <section id="radio"> &sub-dev-radio; </section> | 402 | <section id="radio"> &sub-dev-radio; </section> |
403 | <section id="rds"> &sub-dev-rds; </section> | 403 | <section id="rds"> &sub-dev-rds; </section> |
404 | <section id="event"> &sub-dev-event; </section> | ||
404 | </chapter> | 405 | </chapter> |
405 | 406 | ||
406 | <chapter id="driver"> | 407 | <chapter id="driver"> |
@@ -426,6 +427,7 @@ and discussions on the V4L mailing list.</revremark> | |||
426 | &sub-cropcap; | 427 | &sub-cropcap; |
427 | &sub-dbg-g-chip-ident; | 428 | &sub-dbg-g-chip-ident; |
428 | &sub-dbg-g-register; | 429 | &sub-dbg-g-register; |
430 | &sub-dqevent; | ||
429 | &sub-encoder-cmd; | 431 | &sub-encoder-cmd; |
430 | &sub-enumaudio; | 432 | &sub-enumaudio; |
431 | &sub-enumaudioout; | 433 | &sub-enumaudioout; |
@@ -467,6 +469,7 @@ and discussions on the V4L mailing list.</revremark> | |||
467 | &sub-reqbufs; | 469 | &sub-reqbufs; |
468 | &sub-s-hw-freq-seek; | 470 | &sub-s-hw-freq-seek; |
469 | &sub-streamon; | 471 | &sub-streamon; |
472 | &sub-subscribe-event; | ||
470 | <!-- End of ioctls. --> | 473 | <!-- End of ioctls. --> |
471 | &sub-mmap; | 474 | &sub-mmap; |
472 | &sub-munmap; | 475 | &sub-munmap; |
diff --git a/Documentation/DocBook/v4l/videodev2.h.xml b/Documentation/DocBook/v4l/videodev2.h.xml index 068325940658..865b06d9e679 100644 --- a/Documentation/DocBook/v4l/videodev2.h.xml +++ b/Documentation/DocBook/v4l/videodev2.h.xml | |||
@@ -1018,6 +1018,13 @@ enum <link linkend="v4l2-colorfx">v4l2_colorfx</link> { | |||
1018 | V4L2_COLORFX_NONE = 0, | 1018 | V4L2_COLORFX_NONE = 0, |
1019 | V4L2_COLORFX_BW = 1, | 1019 | V4L2_COLORFX_BW = 1, |
1020 | V4L2_COLORFX_SEPIA = 2, | 1020 | V4L2_COLORFX_SEPIA = 2, |
1021 | V4L2_COLORFX_NEGATIVE = 3, | ||
1022 | V4L2_COLORFX_EMBOSS = 4, | ||
1023 | V4L2_COLORFX_SKETCH = 5, | ||
1024 | V4L2_COLORFX_SKY_BLUE = 6, | ||
1025 | V4L2_COLORFX_GRASS_GREEN = 7, | ||
1026 | V4L2_COLORFX_SKIN_WHITEN = 8, | ||
1027 | V4L2_COLORFX_VIVID = 9. | ||
1021 | }; | 1028 | }; |
1022 | #define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32) | 1029 | #define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32) |
1023 | #define V4L2_CID_BAND_STOP_FILTER (V4L2_CID_BASE+33) | 1030 | #define V4L2_CID_BAND_STOP_FILTER (V4L2_CID_BASE+33) |
@@ -1271,6 +1278,9 @@ enum <link linkend="v4l2-exposure-auto-type">v4l2_exposure_auto_type</link> { | |||
1271 | 1278 | ||
1272 | #define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16) | 1279 | #define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16) |
1273 | 1280 | ||
1281 | #define V4L2_CID_IRIS_ABSOLUTE (V4L2_CID_CAMERA_CLASS_BASE+17) | ||
1282 | #define V4L2_CID_IRIS_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+18) | ||
1283 | |||
1274 | /* FM Modulator class control IDs */ | 1284 | /* FM Modulator class control IDs */ |
1275 | #define V4L2_CID_FM_TX_CLASS_BASE (V4L2_CTRL_CLASS_FM_TX | 0x900) | 1285 | #define V4L2_CID_FM_TX_CLASS_BASE (V4L2_CTRL_CLASS_FM_TX | 0x900) |
1276 | #define V4L2_CID_FM_TX_CLASS (V4L2_CTRL_CLASS_FM_TX | 1) | 1286 | #define V4L2_CID_FM_TX_CLASS (V4L2_CTRL_CLASS_FM_TX | 1) |
diff --git a/Documentation/DocBook/v4l/vidioc-dqevent.xml b/Documentation/DocBook/v4l/vidioc-dqevent.xml new file mode 100644 index 000000000000..4e0a7cc30812 --- /dev/null +++ b/Documentation/DocBook/v4l/vidioc-dqevent.xml | |||
@@ -0,0 +1,131 @@ | |||
1 | <refentry id="vidioc-dqevent"> | ||
2 | <refmeta> | ||
3 | <refentrytitle>ioctl VIDIOC_DQEVENT</refentrytitle> | ||
4 | &manvol; | ||
5 | </refmeta> | ||
6 | |||
7 | <refnamediv> | ||
8 | <refname>VIDIOC_DQEVENT</refname> | ||
9 | <refpurpose>Dequeue event</refpurpose> | ||
10 | </refnamediv> | ||
11 | |||
12 | <refsynopsisdiv> | ||
13 | <funcsynopsis> | ||
14 | <funcprototype> | ||
15 | <funcdef>int <function>ioctl</function></funcdef> | ||
16 | <paramdef>int <parameter>fd</parameter></paramdef> | ||
17 | <paramdef>int <parameter>request</parameter></paramdef> | ||
18 | <paramdef>struct v4l2_event | ||
19 | *<parameter>argp</parameter></paramdef> | ||
20 | </funcprototype> | ||
21 | </funcsynopsis> | ||
22 | </refsynopsisdiv> | ||
23 | |||
24 | <refsect1> | ||
25 | <title>Arguments</title> | ||
26 | |||
27 | <variablelist> | ||
28 | <varlistentry> | ||
29 | <term><parameter>fd</parameter></term> | ||
30 | <listitem> | ||
31 | <para>&fd;</para> | ||
32 | </listitem> | ||
33 | </varlistentry> | ||
34 | <varlistentry> | ||
35 | <term><parameter>request</parameter></term> | ||
36 | <listitem> | ||
37 | <para>VIDIOC_DQEVENT</para> | ||
38 | </listitem> | ||
39 | </varlistentry> | ||
40 | <varlistentry> | ||
41 | <term><parameter>argp</parameter></term> | ||
42 | <listitem> | ||
43 | <para></para> | ||
44 | </listitem> | ||
45 | </varlistentry> | ||
46 | </variablelist> | ||
47 | </refsect1> | ||
48 | |||
49 | <refsect1> | ||
50 | <title>Description</title> | ||
51 | |||
52 | <para>Dequeue an event from a video device. No input is required | ||
53 | for this ioctl. All the fields of the &v4l2-event; structure are | ||
54 | filled by the driver. The file handle will also receive exceptions | ||
55 | which the application may get by e.g. using the select system | ||
56 | call.</para> | ||
57 | |||
58 | <table frame="none" pgwide="1" id="v4l2-event"> | ||
59 | <title>struct <structname>v4l2_event</structname></title> | ||
60 | <tgroup cols="4"> | ||
61 | &cs-str; | ||
62 | <tbody valign="top"> | ||
63 | <row> | ||
64 | <entry>__u32</entry> | ||
65 | <entry><structfield>type</structfield></entry> | ||
66 | <entry></entry> | ||
67 | <entry>Type of the event.</entry> | ||
68 | </row> | ||
69 | <row> | ||
70 | <entry>union</entry> | ||
71 | <entry><structfield>u</structfield></entry> | ||
72 | <entry></entry> | ||
73 | <entry></entry> | ||
74 | </row> | ||
75 | <row> | ||
76 | <entry></entry> | ||
77 | <entry>&v4l2-event-vsync;</entry> | ||
78 | <entry><structfield>vsync</structfield></entry> | ||
79 | <entry>Event data for event V4L2_EVENT_VSYNC. | ||
80 | </entry> | ||
81 | </row> | ||
82 | <row> | ||
83 | <entry></entry> | ||
84 | <entry>__u8</entry> | ||
85 | <entry><structfield>data</structfield>[64]</entry> | ||
86 | <entry>Event data. Defined by the event type. The union | ||
87 | should be used to define easily accessible type for | ||
88 | events.</entry> | ||
89 | </row> | ||
90 | <row> | ||
91 | <entry>__u32</entry> | ||
92 | <entry><structfield>pending</structfield></entry> | ||
93 | <entry></entry> | ||
94 | <entry>Number of pending events excluding this one.</entry> | ||
95 | </row> | ||
96 | <row> | ||
97 | <entry>__u32</entry> | ||
98 | <entry><structfield>sequence</structfield></entry> | ||
99 | <entry></entry> | ||
100 | <entry>Event sequence number. The sequence number is | ||
101 | incremented for every subscribed event that takes place. | ||
102 | If sequence numbers are not contiguous it means that | ||
103 | events have been lost. | ||
104 | </entry> | ||
105 | </row> | ||
106 | <row> | ||
107 | <entry>struct timespec</entry> | ||
108 | <entry><structfield>timestamp</structfield></entry> | ||
109 | <entry></entry> | ||
110 | <entry>Event timestamp.</entry> | ||
111 | </row> | ||
112 | <row> | ||
113 | <entry>__u32</entry> | ||
114 | <entry><structfield>reserved</structfield>[9]</entry> | ||
115 | <entry></entry> | ||
116 | <entry>Reserved for future extensions. Drivers must set | ||
117 | the array to zero.</entry> | ||
118 | </row> | ||
119 | </tbody> | ||
120 | </tgroup> | ||
121 | </table> | ||
122 | |||
123 | </refsect1> | ||
124 | </refentry> | ||
125 | <!-- | ||
126 | Local Variables: | ||
127 | mode: sgml | ||
128 | sgml-parent-document: "v4l2.sgml" | ||
129 | indent-tabs-mode: nil | ||
130 | End: | ||
131 | --> | ||
diff --git a/Documentation/DocBook/v4l/vidioc-enuminput.xml b/Documentation/DocBook/v4l/vidioc-enuminput.xml index 71b868e2fb8f..476fe1d2bba0 100644 --- a/Documentation/DocBook/v4l/vidioc-enuminput.xml +++ b/Documentation/DocBook/v4l/vidioc-enuminput.xml | |||
@@ -283,7 +283,7 @@ input/output interface to linux-media@vger.kernel.org on 19 Oct 2009. | |||
283 | <entry>This input supports setting DV presets by using VIDIOC_S_DV_PRESET.</entry> | 283 | <entry>This input supports setting DV presets by using VIDIOC_S_DV_PRESET.</entry> |
284 | </row> | 284 | </row> |
285 | <row> | 285 | <row> |
286 | <entry><constant>V4L2_OUT_CAP_CUSTOM_TIMINGS</constant></entry> | 286 | <entry><constant>V4L2_IN_CAP_CUSTOM_TIMINGS</constant></entry> |
287 | <entry>0x00000002</entry> | 287 | <entry>0x00000002</entry> |
288 | <entry>This input supports setting custom video timings by using VIDIOC_S_DV_TIMINGS.</entry> | 288 | <entry>This input supports setting custom video timings by using VIDIOC_S_DV_TIMINGS.</entry> |
289 | </row> | 289 | </row> |
diff --git a/Documentation/DocBook/v4l/vidioc-g-parm.xml b/Documentation/DocBook/v4l/vidioc-g-parm.xml index 78332d365ce9..392aa9e5571e 100644 --- a/Documentation/DocBook/v4l/vidioc-g-parm.xml +++ b/Documentation/DocBook/v4l/vidioc-g-parm.xml | |||
@@ -55,7 +55,7 @@ captured or output, applications can request frame skipping or | |||
55 | duplicating on the driver side. This is especially useful when using | 55 | duplicating on the driver side. This is especially useful when using |
56 | the <function>read()</function> or <function>write()</function>, which | 56 | the <function>read()</function> or <function>write()</function>, which |
57 | are not augmented by timestamps or sequence counters, and to avoid | 57 | are not augmented by timestamps or sequence counters, and to avoid |
58 | unneccessary data copying.</para> | 58 | unnecessary data copying.</para> |
59 | 59 | ||
60 | <para>Further these ioctls can be used to determine the number of | 60 | <para>Further these ioctls can be used to determine the number of |
61 | buffers used internally by a driver in read/write mode. For | 61 | buffers used internally by a driver in read/write mode. For |
diff --git a/Documentation/DocBook/v4l/vidioc-qbuf.xml b/Documentation/DocBook/v4l/vidioc-qbuf.xml index 187081778154..ab691ebf3b93 100644 --- a/Documentation/DocBook/v4l/vidioc-qbuf.xml +++ b/Documentation/DocBook/v4l/vidioc-qbuf.xml | |||
@@ -54,12 +54,10 @@ to enqueue an empty (capturing) or filled (output) buffer in the | |||
54 | driver's incoming queue. The semantics depend on the selected I/O | 54 | driver's incoming queue. The semantics depend on the selected I/O |
55 | method.</para> | 55 | method.</para> |
56 | 56 | ||
57 | <para>To enqueue a <link linkend="mmap">memory mapped</link> | 57 | <para>To enqueue a buffer applications set the <structfield>type</structfield> |
58 | buffer applications set the <structfield>type</structfield> field of a | 58 | field of a &v4l2-buffer; to the same buffer type as was previously used |
59 | &v4l2-buffer; to the same buffer type as previously &v4l2-format; | 59 | with &v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers; |
60 | <structfield>type</structfield> and &v4l2-requestbuffers; | 60 | <structfield>type</structfield>. Applications must also set the |
61 | <structfield>type</structfield>, the <structfield>memory</structfield> | ||
62 | field to <constant>V4L2_MEMORY_MMAP</constant> and the | ||
63 | <structfield>index</structfield> field. Valid index numbers range from | 61 | <structfield>index</structfield> field. Valid index numbers range from |
64 | zero to the number of buffers allocated with &VIDIOC-REQBUFS; | 62 | zero to the number of buffers allocated with &VIDIOC-REQBUFS; |
65 | (&v4l2-requestbuffers; <structfield>count</structfield>) minus one. The | 63 | (&v4l2-requestbuffers; <structfield>count</structfield>) minus one. The |
@@ -70,8 +68,19 @@ intended for output (<structfield>type</structfield> is | |||
70 | <constant>V4L2_BUF_TYPE_VBI_OUTPUT</constant>) applications must also | 68 | <constant>V4L2_BUF_TYPE_VBI_OUTPUT</constant>) applications must also |
71 | initialize the <structfield>bytesused</structfield>, | 69 | initialize the <structfield>bytesused</structfield>, |
72 | <structfield>field</structfield> and | 70 | <structfield>field</structfield> and |
73 | <structfield>timestamp</structfield> fields. See <xref | 71 | <structfield>timestamp</structfield> fields, see <xref |
74 | linkend="buffer" /> for details. When | 72 | linkend="buffer" /> for details. |
73 | Applications must also set <structfield>flags</structfield> to 0. If a driver | ||
74 | supports capturing from specific video inputs and you want to specify a video | ||
75 | input, then <structfield>flags</structfield> should be set to | ||
76 | <constant>V4L2_BUF_FLAG_INPUT</constant> and the field | ||
77 | <structfield>input</structfield> must be initialized to the desired input. | ||
78 | The <structfield>reserved</structfield> field must be set to 0. | ||
79 | </para> | ||
80 | |||
81 | <para>To enqueue a <link linkend="mmap">memory mapped</link> | ||
82 | buffer applications set the <structfield>memory</structfield> | ||
83 | field to <constant>V4L2_MEMORY_MMAP</constant>. When | ||
75 | <constant>VIDIOC_QBUF</constant> is called with a pointer to this | 84 | <constant>VIDIOC_QBUF</constant> is called with a pointer to this |
76 | structure the driver sets the | 85 | structure the driver sets the |
77 | <constant>V4L2_BUF_FLAG_MAPPED</constant> and | 86 | <constant>V4L2_BUF_FLAG_MAPPED</constant> and |
@@ -81,14 +90,10 @@ structure the driver sets the | |||
81 | &EINVAL;.</para> | 90 | &EINVAL;.</para> |
82 | 91 | ||
83 | <para>To enqueue a <link linkend="userp">user pointer</link> | 92 | <para>To enqueue a <link linkend="userp">user pointer</link> |
84 | buffer applications set the <structfield>type</structfield> field of a | 93 | buffer applications set the <structfield>memory</structfield> |
85 | &v4l2-buffer; to the same buffer type as previously &v4l2-format; | 94 | field to <constant>V4L2_MEMORY_USERPTR</constant>, the |
86 | <structfield>type</structfield> and &v4l2-requestbuffers; | ||
87 | <structfield>type</structfield>, the <structfield>memory</structfield> | ||
88 | field to <constant>V4L2_MEMORY_USERPTR</constant> and the | ||
89 | <structfield>m.userptr</structfield> field to the address of the | 95 | <structfield>m.userptr</structfield> field to the address of the |
90 | buffer and <structfield>length</structfield> to its size. When the | 96 | buffer and <structfield>length</structfield> to its size. |
91 | buffer is intended for output additional fields must be set as above. | ||
92 | When <constant>VIDIOC_QBUF</constant> is called with a pointer to this | 97 | When <constant>VIDIOC_QBUF</constant> is called with a pointer to this |
93 | structure the driver sets the <constant>V4L2_BUF_FLAG_QUEUED</constant> | 98 | structure the driver sets the <constant>V4L2_BUF_FLAG_QUEUED</constant> |
94 | flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and | 99 | flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and |
@@ -96,16 +101,21 @@ flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and | |||
96 | <structfield>flags</structfield> field, or it returns an error code. | 101 | <structfield>flags</structfield> field, or it returns an error code. |
97 | This ioctl locks the memory pages of the buffer in physical memory, | 102 | This ioctl locks the memory pages of the buffer in physical memory, |
98 | they cannot be swapped out to disk. Buffers remain locked until | 103 | they cannot be swapped out to disk. Buffers remain locked until |
99 | dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl are | 104 | dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl is |
100 | called, or until the device is closed.</para> | 105 | called, or until the device is closed.</para> |
101 | 106 | ||
102 | <para>Applications call the <constant>VIDIOC_DQBUF</constant> | 107 | <para>Applications call the <constant>VIDIOC_DQBUF</constant> |
103 | ioctl to dequeue a filled (capturing) or displayed (output) buffer | 108 | ioctl to dequeue a filled (capturing) or displayed (output) buffer |
104 | from the driver's outgoing queue. They just set the | 109 | from the driver's outgoing queue. They just set the |
105 | <structfield>type</structfield> and <structfield>memory</structfield> | 110 | <structfield>type</structfield>, <structfield>memory</structfield> |
111 | and <structfield>reserved</structfield> | ||
106 | fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant> | 112 | fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant> |
107 | is called with a pointer to this structure the driver fills the | 113 | is called with a pointer to this structure the driver fills the |
108 | remaining fields or returns an error code.</para> | 114 | remaining fields or returns an error code. The driver may also set |
115 | <constant>V4L2_BUF_FLAG_ERROR</constant> in the <structfield>flags</structfield> | ||
116 | field. It indicates a non-critical (recoverable) streaming error. In such case | ||
117 | the application may continue as normal, but should be aware that data in the | ||
118 | dequeued buffer might be corrupted.</para> | ||
109 | 119 | ||
110 | <para>By default <constant>VIDIOC_DQBUF</constant> blocks when no | 120 | <para>By default <constant>VIDIOC_DQBUF</constant> blocks when no |
111 | buffer is in the outgoing queue. When the | 121 | buffer is in the outgoing queue. When the |
@@ -152,7 +162,13 @@ enqueue a user pointer buffer.</para> | |||
152 | <para><constant>VIDIOC_DQBUF</constant> failed due to an | 162 | <para><constant>VIDIOC_DQBUF</constant> failed due to an |
153 | internal error. Can also indicate temporary problems like signal | 163 | internal error. Can also indicate temporary problems like signal |
154 | loss. Note the driver might dequeue an (empty) buffer despite | 164 | loss. Note the driver might dequeue an (empty) buffer despite |
155 | returning an error, or even stop capturing.</para> | 165 | returning an error, or even stop capturing. Reusing such buffer may be unsafe |
166 | though and its details (e.g. <structfield>index</structfield>) may not be | ||
167 | returned either. It is recommended that drivers indicate recoverable errors | ||
168 | by setting the <constant>V4L2_BUF_FLAG_ERROR</constant> and returning 0 instead. | ||
169 | In that case the application should be able to safely reuse the buffer and | ||
170 | continue streaming. | ||
171 | </para> | ||
156 | </listitem> | 172 | </listitem> |
157 | </varlistentry> | 173 | </varlistentry> |
158 | </variablelist> | 174 | </variablelist> |
diff --git a/Documentation/DocBook/v4l/vidioc-querybuf.xml b/Documentation/DocBook/v4l/vidioc-querybuf.xml index d834993e6191..e649805a4908 100644 --- a/Documentation/DocBook/v4l/vidioc-querybuf.xml +++ b/Documentation/DocBook/v4l/vidioc-querybuf.xml | |||
@@ -54,12 +54,13 @@ buffer at any time after buffers have been allocated with the | |||
54 | &VIDIOC-REQBUFS; ioctl.</para> | 54 | &VIDIOC-REQBUFS; ioctl.</para> |
55 | 55 | ||
56 | <para>Applications set the <structfield>type</structfield> field | 56 | <para>Applications set the <structfield>type</structfield> field |
57 | of a &v4l2-buffer; to the same buffer type as previously | 57 | of a &v4l2-buffer; to the same buffer type as was previously used with |
58 | &v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers; | 58 | &v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers; |
59 | <structfield>type</structfield>, and the <structfield>index</structfield> | 59 | <structfield>type</structfield>, and the <structfield>index</structfield> |
60 | field. Valid index numbers range from zero | 60 | field. Valid index numbers range from zero |
61 | to the number of buffers allocated with &VIDIOC-REQBUFS; | 61 | to the number of buffers allocated with &VIDIOC-REQBUFS; |
62 | (&v4l2-requestbuffers; <structfield>count</structfield>) minus one. | 62 | (&v4l2-requestbuffers; <structfield>count</structfield>) minus one. |
63 | The <structfield>reserved</structfield> field should to set to 0. | ||
63 | After calling <constant>VIDIOC_QUERYBUF</constant> with a pointer to | 64 | After calling <constant>VIDIOC_QUERYBUF</constant> with a pointer to |
64 | this structure drivers return an error code or fill the rest of | 65 | this structure drivers return an error code or fill the rest of |
65 | the structure.</para> | 66 | the structure.</para> |
@@ -68,8 +69,8 @@ the structure.</para> | |||
68 | <constant>V4L2_BUF_FLAG_MAPPED</constant>, | 69 | <constant>V4L2_BUF_FLAG_MAPPED</constant>, |
69 | <constant>V4L2_BUF_FLAG_QUEUED</constant> and | 70 | <constant>V4L2_BUF_FLAG_QUEUED</constant> and |
70 | <constant>V4L2_BUF_FLAG_DONE</constant> flags will be valid. The | 71 | <constant>V4L2_BUF_FLAG_DONE</constant> flags will be valid. The |
71 | <structfield>memory</structfield> field will be set to | 72 | <structfield>memory</structfield> field will be set to the current |
72 | <constant>V4L2_MEMORY_MMAP</constant>, the <structfield>m.offset</structfield> | 73 | I/O method, the <structfield>m.offset</structfield> |
73 | contains the offset of the buffer from the start of the device memory, | 74 | contains the offset of the buffer from the start of the device memory, |
74 | the <structfield>length</structfield> field its size. The driver may | 75 | the <structfield>length</structfield> field its size. The driver may |
75 | or may not set the remaining fields and flags, they are meaningless in | 76 | or may not set the remaining fields and flags, they are meaningless in |
diff --git a/Documentation/DocBook/v4l/vidioc-queryctrl.xml b/Documentation/DocBook/v4l/vidioc-queryctrl.xml index 4876ff1a1a04..8e0e055ac934 100644 --- a/Documentation/DocBook/v4l/vidioc-queryctrl.xml +++ b/Documentation/DocBook/v4l/vidioc-queryctrl.xml | |||
@@ -325,7 +325,7 @@ should be part of the control documentation.</entry> | |||
325 | <entry>n/a</entry> | 325 | <entry>n/a</entry> |
326 | <entry>This is not a control. When | 326 | <entry>This is not a control. When |
327 | <constant>VIDIOC_QUERYCTRL</constant> is called with a control ID | 327 | <constant>VIDIOC_QUERYCTRL</constant> is called with a control ID |
328 | equal to a control class code (see <xref linkend="ctrl-class" />), the | 328 | equal to a control class code (see <xref linkend="ctrl-class" />) + 1, the |
329 | ioctl returns the name of the control class and this control type. | 329 | ioctl returns the name of the control class and this control type. |
330 | Older drivers which do not support this feature return an | 330 | Older drivers which do not support this feature return an |
331 | &EINVAL;.</entry> | 331 | &EINVAL;.</entry> |
diff --git a/Documentation/DocBook/v4l/vidioc-reqbufs.xml b/Documentation/DocBook/v4l/vidioc-reqbufs.xml index bab38084454f..69800ae23348 100644 --- a/Documentation/DocBook/v4l/vidioc-reqbufs.xml +++ b/Documentation/DocBook/v4l/vidioc-reqbufs.xml | |||
@@ -54,23 +54,23 @@ I/O. Memory mapped buffers are located in device memory and must be | |||
54 | allocated with this ioctl before they can be mapped into the | 54 | allocated with this ioctl before they can be mapped into the |
55 | application's address space. User buffers are allocated by | 55 | application's address space. User buffers are allocated by |
56 | applications themselves, and this ioctl is merely used to switch the | 56 | applications themselves, and this ioctl is merely used to switch the |
57 | driver into user pointer I/O mode.</para> | 57 | driver into user pointer I/O mode and to setup some internal structures.</para> |
58 | 58 | ||
59 | <para>To allocate device buffers applications initialize three | 59 | <para>To allocate device buffers applications initialize all |
60 | fields of a <structname>v4l2_requestbuffers</structname> structure. | 60 | fields of the <structname>v4l2_requestbuffers</structname> structure. |
61 | They set the <structfield>type</structfield> field to the respective | 61 | They set the <structfield>type</structfield> field to the respective |
62 | stream or buffer type, the <structfield>count</structfield> field to | 62 | stream or buffer type, the <structfield>count</structfield> field to |
63 | the desired number of buffers, and <structfield>memory</structfield> | 63 | the desired number of buffers, <structfield>memory</structfield> |
64 | must be set to <constant>V4L2_MEMORY_MMAP</constant>. When the ioctl | 64 | must be set to the requested I/O method and the <structfield>reserved</structfield> array |
65 | is called with a pointer to this structure the driver attempts to | 65 | must be zeroed. When the ioctl |
66 | allocate the requested number of buffers and stores the actual number | 66 | is called with a pointer to this structure the driver will attempt to allocate |
67 | the requested number of buffers and it stores the actual number | ||
67 | allocated in the <structfield>count</structfield> field. It can be | 68 | allocated in the <structfield>count</structfield> field. It can be |
68 | smaller than the number requested, even zero, when the driver runs out | 69 | smaller than the number requested, even zero, when the driver runs out |
69 | of free memory. A larger number is possible when the driver requires | 70 | of free memory. A larger number is also possible when the driver requires |
70 | more buffers to function correctly.<footnote> | 71 | more buffers to function correctly. For example video output requires at least two buffers, |
71 | <para>For example video output requires at least two buffers, | ||
72 | one displayed and one filled by the application.</para> | 72 | one displayed and one filled by the application.</para> |
73 | </footnote> When memory mapping I/O is not supported the ioctl | 73 | <para>When the I/O method is not supported the ioctl |
74 | returns an &EINVAL;.</para> | 74 | returns an &EINVAL;.</para> |
75 | 75 | ||
76 | <para>Applications can call <constant>VIDIOC_REQBUFS</constant> | 76 | <para>Applications can call <constant>VIDIOC_REQBUFS</constant> |
@@ -81,14 +81,6 @@ in progress, an implicit &VIDIOC-STREAMOFF;. <!-- mhs: I see no | |||
81 | reason why munmap()ping one or even all buffers must imply | 81 | reason why munmap()ping one or even all buffers must imply |
82 | streamoff.--></para> | 82 | streamoff.--></para> |
83 | 83 | ||
84 | <para>To negotiate user pointer I/O, applications initialize only | ||
85 | the <structfield>type</structfield> field and set | ||
86 | <structfield>memory</structfield> to | ||
87 | <constant>V4L2_MEMORY_USERPTR</constant>. When the ioctl is called | ||
88 | with a pointer to this structure the driver prepares for user pointer | ||
89 | I/O, when this I/O method is not supported the ioctl returns an | ||
90 | &EINVAL;.</para> | ||
91 | |||
92 | <table pgwide="1" frame="none" id="v4l2-requestbuffers"> | 84 | <table pgwide="1" frame="none" id="v4l2-requestbuffers"> |
93 | <title>struct <structname>v4l2_requestbuffers</structname></title> | 85 | <title>struct <structname>v4l2_requestbuffers</structname></title> |
94 | <tgroup cols="3"> | 86 | <tgroup cols="3"> |
@@ -97,9 +89,7 @@ I/O, when this I/O method is not supported the ioctl returns an | |||
97 | <row> | 89 | <row> |
98 | <entry>__u32</entry> | 90 | <entry>__u32</entry> |
99 | <entry><structfield>count</structfield></entry> | 91 | <entry><structfield>count</structfield></entry> |
100 | <entry>The number of buffers requested or granted. This | 92 | <entry>The number of buffers requested or granted.</entry> |
101 | field is only used when <structfield>memory</structfield> is set to | ||
102 | <constant>V4L2_MEMORY_MMAP</constant>.</entry> | ||
103 | </row> | 93 | </row> |
104 | <row> | 94 | <row> |
105 | <entry>&v4l2-buf-type;</entry> | 95 | <entry>&v4l2-buf-type;</entry> |
@@ -120,7 +110,7 @@ as the &v4l2-format; <structfield>type</structfield> field. See <xref | |||
120 | <entry><structfield>reserved</structfield>[2]</entry> | 110 | <entry><structfield>reserved</structfield>[2]</entry> |
121 | <entry>A place holder for future extensions and custom | 111 | <entry>A place holder for future extensions and custom |
122 | (driver defined) buffer types <constant>V4L2_BUF_TYPE_PRIVATE</constant> and | 112 | (driver defined) buffer types <constant>V4L2_BUF_TYPE_PRIVATE</constant> and |
123 | higher.</entry> | 113 | higher. This array should be zeroed by applications.</entry> |
124 | </row> | 114 | </row> |
125 | </tbody> | 115 | </tbody> |
126 | </tgroup> | 116 | </tgroup> |
diff --git a/Documentation/DocBook/v4l/vidioc-subscribe-event.xml b/Documentation/DocBook/v4l/vidioc-subscribe-event.xml new file mode 100644 index 000000000000..8b501791aa68 --- /dev/null +++ b/Documentation/DocBook/v4l/vidioc-subscribe-event.xml | |||
@@ -0,0 +1,133 @@ | |||
1 | <refentry id="vidioc-subscribe-event"> | ||
2 | <refmeta> | ||
3 | <refentrytitle>ioctl VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refentrytitle> | ||
4 | &manvol; | ||
5 | </refmeta> | ||
6 | |||
7 | <refnamediv> | ||
8 | <refname>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refname> | ||
9 | <refpurpose>Subscribe or unsubscribe event</refpurpose> | ||
10 | </refnamediv> | ||
11 | |||
12 | <refsynopsisdiv> | ||
13 | <funcsynopsis> | ||
14 | <funcprototype> | ||
15 | <funcdef>int <function>ioctl</function></funcdef> | ||
16 | <paramdef>int <parameter>fd</parameter></paramdef> | ||
17 | <paramdef>int <parameter>request</parameter></paramdef> | ||
18 | <paramdef>struct v4l2_event_subscription | ||
19 | *<parameter>argp</parameter></paramdef> | ||
20 | </funcprototype> | ||
21 | </funcsynopsis> | ||
22 | </refsynopsisdiv> | ||
23 | |||
24 | <refsect1> | ||
25 | <title>Arguments</title> | ||
26 | |||
27 | <variablelist> | ||
28 | <varlistentry> | ||
29 | <term><parameter>fd</parameter></term> | ||
30 | <listitem> | ||
31 | <para>&fd;</para> | ||
32 | </listitem> | ||
33 | </varlistentry> | ||
34 | <varlistentry> | ||
35 | <term><parameter>request</parameter></term> | ||
36 | <listitem> | ||
37 | <para>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</para> | ||
38 | </listitem> | ||
39 | </varlistentry> | ||
40 | <varlistentry> | ||
41 | <term><parameter>argp</parameter></term> | ||
42 | <listitem> | ||
43 | <para></para> | ||
44 | </listitem> | ||
45 | </varlistentry> | ||
46 | </variablelist> | ||
47 | </refsect1> | ||
48 | |||
49 | <refsect1> | ||
50 | <title>Description</title> | ||
51 | |||
52 | <para>Subscribe or unsubscribe V4L2 event. Subscribed events are | ||
53 | dequeued by using the &VIDIOC-DQEVENT; ioctl.</para> | ||
54 | |||
55 | <table frame="none" pgwide="1" id="v4l2-event-subscription"> | ||
56 | <title>struct <structname>v4l2_event_subscription</structname></title> | ||
57 | <tgroup cols="3"> | ||
58 | &cs-str; | ||
59 | <tbody valign="top"> | ||
60 | <row> | ||
61 | <entry>__u32</entry> | ||
62 | <entry><structfield>type</structfield></entry> | ||
63 | <entry>Type of the event.</entry> | ||
64 | </row> | ||
65 | <row> | ||
66 | <entry>__u32</entry> | ||
67 | <entry><structfield>reserved</structfield>[7]</entry> | ||
68 | <entry>Reserved for future extensions. Drivers and applications | ||
69 | must set the array to zero.</entry> | ||
70 | </row> | ||
71 | </tbody> | ||
72 | </tgroup> | ||
73 | </table> | ||
74 | |||
75 | <table frame="none" pgwide="1" id="event-type"> | ||
76 | <title>Event Types</title> | ||
77 | <tgroup cols="3"> | ||
78 | &cs-def; | ||
79 | <tbody valign="top"> | ||
80 | <row> | ||
81 | <entry><constant>V4L2_EVENT_ALL</constant></entry> | ||
82 | <entry>0</entry> | ||
83 | <entry>All events. V4L2_EVENT_ALL is valid only for | ||
84 | VIDIOC_UNSUBSCRIBE_EVENT for unsubscribing all events at once. | ||
85 | </entry> | ||
86 | </row> | ||
87 | <row> | ||
88 | <entry><constant>V4L2_EVENT_VSYNC</constant></entry> | ||
89 | <entry>1</entry> | ||
90 | <entry>This event is triggered on the vertical sync. | ||
91 | This event has &v4l2-event-vsync; associated with it. | ||
92 | </entry> | ||
93 | </row> | ||
94 | <row> | ||
95 | <entry><constant>V4L2_EVENT_EOS</constant></entry> | ||
96 | <entry>2</entry> | ||
97 | <entry>This event is triggered when the end of a stream is reached. | ||
98 | This is typically used with MPEG decoders to report to the application | ||
99 | when the last of the MPEG stream has been decoded. | ||
100 | </entry> | ||
101 | </row> | ||
102 | <row> | ||
103 | <entry><constant>V4L2_EVENT_PRIVATE_START</constant></entry> | ||
104 | <entry>0x08000000</entry> | ||
105 | <entry>Base event number for driver-private events.</entry> | ||
106 | </row> | ||
107 | </tbody> | ||
108 | </tgroup> | ||
109 | </table> | ||
110 | |||
111 | <table frame="none" pgwide="1" id="v4l2-event-vsync"> | ||
112 | <title>struct <structname>v4l2_event_vsync</structname></title> | ||
113 | <tgroup cols="3"> | ||
114 | &cs-str; | ||
115 | <tbody valign="top"> | ||
116 | <row> | ||
117 | <entry>__u8</entry> | ||
118 | <entry><structfield>field</structfield></entry> | ||
119 | <entry>The upcoming field. See &v4l2-field;.</entry> | ||
120 | </row> | ||
121 | </tbody> | ||
122 | </tgroup> | ||
123 | </table> | ||
124 | |||
125 | </refsect1> | ||
126 | </refentry> | ||
127 | <!-- | ||
128 | Local Variables: | ||
129 | mode: sgml | ||
130 | sgml-parent-document: "v4l2.sgml" | ||
131 | indent-tabs-mode: nil | ||
132 | End: | ||
133 | --> | ||
diff --git a/Documentation/DocBook/writing-an-alsa-driver.tmpl b/Documentation/DocBook/writing-an-alsa-driver.tmpl index 0d0f7b4d4b1a..0ba149de2608 100644 --- a/Documentation/DocBook/writing-an-alsa-driver.tmpl +++ b/Documentation/DocBook/writing-an-alsa-driver.tmpl | |||
@@ -5518,34 +5518,41 @@ struct _snd_pcm_runtime { | |||
5518 | ]]> | 5518 | ]]> |
5519 | </programlisting> | 5519 | </programlisting> |
5520 | </informalexample> | 5520 | </informalexample> |
5521 | |||
5522 | For the raw data, <structfield>size</structfield> field must be | ||
5523 | set properly. This specifies the maximum size of the proc file access. | ||
5521 | </para> | 5524 | </para> |
5522 | 5525 | ||
5523 | <para> | 5526 | <para> |
5524 | The callback is much more complicated than the text-file | 5527 | The read/write callbacks of raw mode are more direct than the text mode. |
5525 | version. You need to use a low-level I/O functions such as | 5528 | You need to use a low-level I/O functions such as |
5526 | <function>copy_from/to_user()</function> to transfer the | 5529 | <function>copy_from/to_user()</function> to transfer the |
5527 | data. | 5530 | data. |
5528 | 5531 | ||
5529 | <informalexample> | 5532 | <informalexample> |
5530 | <programlisting> | 5533 | <programlisting> |
5531 | <![CDATA[ | 5534 | <![CDATA[ |
5532 | static long my_file_io_read(struct snd_info_entry *entry, | 5535 | static ssize_t my_file_io_read(struct snd_info_entry *entry, |
5533 | void *file_private_data, | 5536 | void *file_private_data, |
5534 | struct file *file, | 5537 | struct file *file, |
5535 | char *buf, | 5538 | char *buf, |
5536 | unsigned long count, | 5539 | size_t count, |
5537 | unsigned long pos) | 5540 | loff_t pos) |
5538 | { | 5541 | { |
5539 | long size = count; | 5542 | if (copy_to_user(buf, local_data + pos, count)) |
5540 | if (pos + size > local_max_size) | ||
5541 | size = local_max_size - pos; | ||
5542 | if (copy_to_user(buf, local_data + pos, size)) | ||
5543 | return -EFAULT; | 5543 | return -EFAULT; |
5544 | return size; | 5544 | return count; |
5545 | } | 5545 | } |
5546 | ]]> | 5546 | ]]> |
5547 | </programlisting> | 5547 | </programlisting> |
5548 | </informalexample> | 5548 | </informalexample> |
5549 | |||
5550 | If the size of the info entry has been set up properly, | ||
5551 | <structfield>count</structfield> and <structfield>pos</structfield> are | ||
5552 | guaranteed to fit within 0 and the given size. | ||
5553 | You don't have to check the range in the callbacks unless any | ||
5554 | other condition is required. | ||
5555 | |||
5549 | </para> | 5556 | </para> |
5550 | 5557 | ||
5551 | </chapter> | 5558 | </chapter> |
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl index eeff19ca831b..bd97a13fa5ae 100644 --- a/Documentation/DocBook/writing_usb_driver.tmpl +++ b/Documentation/DocBook/writing_usb_driver.tmpl | |||
@@ -342,7 +342,7 @@ static inline void skel_delete (struct usb_skel *dev) | |||
342 | { | 342 | { |
343 | kfree (dev->bulk_in_buffer); | 343 | kfree (dev->bulk_in_buffer); |
344 | if (dev->bulk_out_buffer != NULL) | 344 | if (dev->bulk_out_buffer != NULL) |
345 | usb_buffer_free (dev->udev, dev->bulk_out_size, | 345 | usb_free_coherent (dev->udev, dev->bulk_out_size, |
346 | dev->bulk_out_buffer, | 346 | dev->bulk_out_buffer, |
347 | dev->write_urb->transfer_dma); | 347 | dev->write_urb->transfer_dma); |
348 | usb_free_urb (dev->write_urb); | 348 | usb_free_urb (dev->write_urb); |
diff --git a/Documentation/HOWTO b/Documentation/HOWTO index 8495fc970391..40ada93b820a 100644 --- a/Documentation/HOWTO +++ b/Documentation/HOWTO | |||
@@ -221,8 +221,8 @@ branches. These different branches are: | |||
221 | - main 2.6.x kernel tree | 221 | - main 2.6.x kernel tree |
222 | - 2.6.x.y -stable kernel tree | 222 | - 2.6.x.y -stable kernel tree |
223 | - 2.6.x -git kernel patches | 223 | - 2.6.x -git kernel patches |
224 | - 2.6.x -mm kernel patches | ||
225 | - subsystem specific kernel trees and patches | 224 | - subsystem specific kernel trees and patches |
225 | - the 2.6.x -next kernel tree for integration tests | ||
226 | 226 | ||
227 | 2.6.x kernel tree | 227 | 2.6.x kernel tree |
228 | ----------------- | 228 | ----------------- |
@@ -232,9 +232,9 @@ process is as follows: | |||
232 | - As soon as a new kernel is released a two weeks window is open, | 232 | - As soon as a new kernel is released a two weeks window is open, |
233 | during this period of time maintainers can submit big diffs to | 233 | during this period of time maintainers can submit big diffs to |
234 | Linus, usually the patches that have already been included in the | 234 | Linus, usually the patches that have already been included in the |
235 | -mm kernel for a few weeks. The preferred way to submit big changes | 235 | -next kernel for a few weeks. The preferred way to submit big changes |
236 | is using git (the kernel's source management tool, more information | 236 | is using git (the kernel's source management tool, more information |
237 | can be found at http://git.or.cz/) but plain patches are also just | 237 | can be found at http://git-scm.com/) but plain patches are also just |
238 | fine. | 238 | fine. |
239 | - After two weeks a -rc1 kernel is released it is now possible to push | 239 | - After two weeks a -rc1 kernel is released it is now possible to push |
240 | only patches that do not include new features that could affect the | 240 | only patches that do not include new features that could affect the |
@@ -293,84 +293,43 @@ daily and represent the current state of Linus' tree. They are more | |||
293 | experimental than -rc kernels since they are generated automatically | 293 | experimental than -rc kernels since they are generated automatically |
294 | without even a cursory glance to see if they are sane. | 294 | without even a cursory glance to see if they are sane. |
295 | 295 | ||
296 | 2.6.x -mm kernel patches | ||
297 | ------------------------ | ||
298 | These are experimental kernel patches released by Andrew Morton. Andrew | ||
299 | takes all of the different subsystem kernel trees and patches and mushes | ||
300 | them together, along with a lot of patches that have been plucked from | ||
301 | the linux-kernel mailing list. This tree serves as a proving ground for | ||
302 | new features and patches. Once a patch has proved its worth in -mm for | ||
303 | a while Andrew or the subsystem maintainer pushes it on to Linus for | ||
304 | inclusion in mainline. | ||
305 | |||
306 | It is heavily encouraged that all new patches get tested in the -mm tree | ||
307 | before they are sent to Linus for inclusion in the main kernel tree. Code | ||
308 | which does not make an appearance in -mm before the opening of the merge | ||
309 | window will prove hard to merge into the mainline. | ||
310 | |||
311 | These kernels are not appropriate for use on systems that are supposed | ||
312 | to be stable and they are more risky to run than any of the other | ||
313 | branches. | ||
314 | |||
315 | If you wish to help out with the kernel development process, please test | ||
316 | and use these kernel releases and provide feedback to the linux-kernel | ||
317 | mailing list if you have any problems, and if everything works properly. | ||
318 | |||
319 | In addition to all the other experimental patches, these kernels usually | ||
320 | also contain any changes in the mainline -git kernels available at the | ||
321 | time of release. | ||
322 | |||
323 | The -mm kernels are not released on a fixed schedule, but usually a few | ||
324 | -mm kernels are released in between each -rc kernel (1 to 3 is common). | ||
325 | |||
326 | Subsystem Specific kernel trees and patches | 296 | Subsystem Specific kernel trees and patches |
327 | ------------------------------------------- | 297 | ------------------------------------------- |
328 | A number of the different kernel subsystem developers expose their | 298 | The maintainers of the various kernel subsystems --- and also many |
329 | development trees so that others can see what is happening in the | 299 | kernel subsystem developers --- expose their current state of |
330 | different areas of the kernel. These trees are pulled into the -mm | 300 | development in source repositories. That way, others can see what is |
331 | kernel releases as described above. | 301 | happening in the different areas of the kernel. In areas where |
332 | 302 | development is rapid, a developer may be asked to base his submissions | |
333 | Here is a list of some of the different kernel trees available: | 303 | onto such a subsystem kernel tree so that conflicts between the |
334 | git trees: | 304 | submission and other already ongoing work are avoided. |
335 | - Kbuild development tree, Sam Ravnborg <sam@ravnborg.org> | 305 | |
336 | git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git | 306 | Most of these repositories are git trees, but there are also other SCMs |
337 | 307 | in use, or patch queues being published as quilt series. Addresses of | |
338 | - ACPI development tree, Len Brown <len.brown@intel.com> | 308 | these subsystem repositories are listed in the MAINTAINERS file. Many |
339 | git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git | 309 | of them can be browsed at http://git.kernel.org/. |
340 | 310 | ||
341 | - Block development tree, Jens Axboe <jens.axboe@oracle.com> | 311 | Before a proposed patch is committed to such a subsystem tree, it is |
342 | git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git | 312 | subject to review which primarily happens on mailing lists (see the |
343 | 313 | respective section below). For several kernel subsystems, this review | |
344 | - DRM development tree, Dave Airlie <airlied@linux.ie> | 314 | process is tracked with the tool patchwork. Patchwork offers a web |
345 | git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git | 315 | interface which shows patch postings, any comments on a patch or |
346 | 316 | revisions to it, and maintainers can mark patches as under review, | |
347 | - ia64 development tree, Tony Luck <tony.luck@intel.com> | 317 | accepted, or rejected. Most of these patchwork sites are listed at |
348 | git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git | 318 | http://patchwork.kernel.org/ or http://patchwork.ozlabs.org/. |
349 | 319 | ||
350 | - infiniband, Roland Dreier <rolandd@cisco.com> | 320 | 2.6.x -next kernel tree for integration tests |
351 | git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git | 321 | --------------------------------------------- |
352 | 322 | Before updates from subsystem trees are merged into the mainline 2.6.x | |
353 | - libata, Jeff Garzik <jgarzik@pobox.com> | 323 | tree, they need to be integration-tested. For this purpose, a special |
354 | git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git | 324 | testing repository exists into which virtually all subsystem trees are |
355 | 325 | pulled on an almost daily basis: | |
356 | - network drivers, Jeff Garzik <jgarzik@pobox.com> | 326 | http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git |
357 | git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git | 327 | http://linux.f-seidel.de/linux-next/pmwiki/ |
358 | 328 | ||
359 | - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net> | 329 | This way, the -next kernel gives a summary outlook onto what will be |
360 | git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git | 330 | expected to go into the mainline kernel at the next merge period. |
361 | 331 | Adventurous testers are very welcome to runtime-test the -next kernel. | |
362 | - SCSI, James Bottomley <James.Bottomley@hansenpartnership.com> | ||
363 | git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git | ||
364 | |||
365 | - x86, Ingo Molnar <mingo@elte.hu> | ||
366 | git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git | ||
367 | |||
368 | quilt trees: | ||
369 | - USB, Driver Core, and I2C, Greg Kroah-Hartman <gregkh@suse.de> | ||
370 | kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ | ||
371 | 332 | ||
372 | Other kernel trees can be found listed at http://git.kernel.org/ and in | ||
373 | the MAINTAINERS file. | ||
374 | 333 | ||
375 | Bug Reporting | 334 | Bug Reporting |
376 | ------------- | 335 | ------------- |
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index bc38283379f0..69dd29ed824e 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt | |||
@@ -365,6 +365,7 @@ You can change this at module load time (for a module) with: | |||
365 | regshifts=<shift1>,<shift2>,... | 365 | regshifts=<shift1>,<shift2>,... |
366 | slave_addrs=<addr1>,<addr2>,... | 366 | slave_addrs=<addr1>,<addr2>,... |
367 | force_kipmid=<enable1>,<enable2>,... | 367 | force_kipmid=<enable1>,<enable2>,... |
368 | kipmid_max_busy_us=<ustime1>,<ustime2>,... | ||
368 | unload_when_empty=[0|1] | 369 | unload_when_empty=[0|1] |
369 | 370 | ||
370 | Each of these except si_trydefaults is a list, the first item for the | 371 | Each of these except si_trydefaults is a list, the first item for the |
@@ -433,6 +434,7 @@ kernel command line as: | |||
433 | ipmi_si.regshifts=<shift1>,<shift2>,... | 434 | ipmi_si.regshifts=<shift1>,<shift2>,... |
434 | ipmi_si.slave_addrs=<addr1>,<addr2>,... | 435 | ipmi_si.slave_addrs=<addr1>,<addr2>,... |
435 | ipmi_si.force_kipmid=<enable1>,<enable2>,... | 436 | ipmi_si.force_kipmid=<enable1>,<enable2>,... |
437 | ipmi_si.kipmid_max_busy_us=<ustime1>,<ustime2>,... | ||
436 | 438 | ||
437 | It works the same as the module parameters of the same names. | 439 | It works the same as the module parameters of the same names. |
438 | 440 | ||
@@ -450,6 +452,16 @@ force this thread on or off. If you force it off and don't have | |||
450 | interrupts, the driver will run VERY slowly. Don't blame me, | 452 | interrupts, the driver will run VERY slowly. Don't blame me, |
451 | these interfaces suck. | 453 | these interfaces suck. |
452 | 454 | ||
455 | Unfortunately, this thread can use a lot of CPU depending on the | ||
456 | interface's performance. This can waste a lot of CPU and cause | ||
457 | various issues with detecting idle CPU and using extra power. To | ||
458 | avoid this, the kipmid_max_busy_us sets the maximum amount of time, in | ||
459 | microseconds, that kipmid will spin before sleeping for a tick. This | ||
460 | value sets a balance between performance and CPU waste and needs to be | ||
461 | tuned to your needs. Maybe, someday, auto-tuning will be added, but | ||
462 | that's not a simple thing and even the auto-tuning would need to be | ||
463 | tuned to the user's desired performance. | ||
464 | |||
453 | The driver supports a hot add and remove of interfaces. This way, | 465 | The driver supports a hot add and remove of interfaces. This way, |
454 | interfaces can be added or removed after the kernel is up and running. | 466 | interfaces can be added or removed after the kernel is up and running. |
455 | This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a | 467 | This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a |
diff --git a/Documentation/Makefile b/Documentation/Makefile index 94b945733534..6fc7ea1d1f9d 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile | |||
@@ -1,3 +1,3 @@ | |||
1 | obj-m := DocBook/ accounting/ auxdisplay/ connector/ \ | 1 | obj-m := DocBook/ accounting/ auxdisplay/ connector/ \ |
2 | filesystems/configfs/ ia64/ networking/ \ | 2 | filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \ |
3 | pcmcia/ spi/ video4linux/ vm/ watchdog/src/ | 3 | pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/ |
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt index e83f2ea76415..898ded24510d 100644 --- a/Documentation/PCI/pci-error-recovery.txt +++ b/Documentation/PCI/pci-error-recovery.txt | |||
@@ -216,7 +216,7 @@ The driver should return one of the following result codes: | |||
216 | 216 | ||
217 | - PCI_ERS_RESULT_NEED_RESET | 217 | - PCI_ERS_RESULT_NEED_RESET |
218 | Driver returns this if it thinks the device is not | 218 | Driver returns this if it thinks the device is not |
219 | recoverable in it's current state and it needs a slot | 219 | recoverable in its current state and it needs a slot |
220 | reset to proceed. | 220 | reset to proceed. |
221 | 221 | ||
222 | - PCI_ERS_RESULT_DISCONNECT | 222 | - PCI_ERS_RESULT_DISCONNECT |
@@ -241,7 +241,7 @@ in working condition. | |||
241 | 241 | ||
242 | The driver is not supposed to restart normal driver I/O operations | 242 | The driver is not supposed to restart normal driver I/O operations |
243 | at this point. It should limit itself to "probing" the device to | 243 | at this point. It should limit itself to "probing" the device to |
244 | check it's recoverability status. If all is right, then the platform | 244 | check its recoverability status. If all is right, then the platform |
245 | will call resume() once all drivers have ack'd link_reset(). | 245 | will call resume() once all drivers have ack'd link_reset(). |
246 | 246 | ||
247 | Result codes: | 247 | Result codes: |
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt index be21001ab144..26d3d945c3c2 100644 --- a/Documentation/PCI/pcieaer-howto.txt +++ b/Documentation/PCI/pcieaer-howto.txt | |||
@@ -13,7 +13,7 @@ Reporting (AER) driver and provides information on how to use it, as | |||
13 | well as how to enable the drivers of endpoint devices to conform with | 13 | well as how to enable the drivers of endpoint devices to conform with |
14 | PCI Express AER driver. | 14 | PCI Express AER driver. |
15 | 15 | ||
16 | 1.2 Copyright © Intel Corporation 2006. | 16 | 1.2 Copyright (C) Intel Corporation 2006. |
17 | 17 | ||
18 | 1.3 What is the PCI Express AER Driver? | 18 | 1.3 What is the PCI Express AER Driver? |
19 | 19 | ||
@@ -71,15 +71,11 @@ console. If it's a correctable error, it is outputed as a warning. | |||
71 | Otherwise, it is printed as an error. So users could choose different | 71 | Otherwise, it is printed as an error. So users could choose different |
72 | log level to filter out correctable error messages. | 72 | log level to filter out correctable error messages. |
73 | 73 | ||
74 | Below shows an example. | 74 | Below shows an example: |
75 | +------ PCI-Express Device Error -----+ | 75 | 0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID) |
76 | Error Severity : Uncorrected (Fatal) | 76 | 0000:50:00.0: device [8086:0329] error status/mask=00100000/00000000 |
77 | PCIE Bus Error type : Transaction Layer | 77 | 0000:50:00.0: [20] Unsupported Request (First) |
78 | Unsupported Request : First | 78 | 0000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100 |
79 | Requester ID : 0500 | ||
80 | VendorID=8086h, DeviceID=0329h, Bus=05h, Device=00h, Function=00h | ||
81 | TLB Header: | ||
82 | 04000001 00200a03 05010000 00050100 | ||
83 | 79 | ||
84 | In the example, 'Requester ID' means the ID of the device who sends | 80 | In the example, 'Requester ID' means the ID of the device who sends |
85 | the error message to root port. Pls. refer to pci express specs for | 81 | the error message to root port. Pls. refer to pci express specs for |
@@ -112,7 +108,7 @@ but the PCI Express link itself is fully functional. Fatal errors, on | |||
112 | the other hand, cause the link to be unreliable. | 108 | the other hand, cause the link to be unreliable. |
113 | 109 | ||
114 | When AER is enabled, a PCI Express device will automatically send an | 110 | When AER is enabled, a PCI Express device will automatically send an |
115 | error message to the PCIE root port above it when the device captures | 111 | error message to the PCIe root port above it when the device captures |
116 | an error. The Root Port, upon receiving an error reporting message, | 112 | an error. The Root Port, upon receiving an error reporting message, |
117 | internally processes and logs the error message in its PCI Express | 113 | internally processes and logs the error message in its PCI Express |
118 | capability structure. Error information being logged includes storing | 114 | capability structure. Error information being logged includes storing |
@@ -198,8 +194,9 @@ to reset link, AER port service driver is required to provide the | |||
198 | function to reset link. Firstly, kernel looks for if the upstream | 194 | function to reset link. Firstly, kernel looks for if the upstream |
199 | component has an aer driver. If it has, kernel uses the reset_link | 195 | component has an aer driver. If it has, kernel uses the reset_link |
200 | callback of the aer driver. If the upstream component has no aer driver | 196 | callback of the aer driver. If the upstream component has no aer driver |
201 | and the port is downstream port, we will use the aer driver of the | 197 | and the port is downstream port, we will perform a hot reset as the |
202 | root port who reports the AER error. As for upstream ports, | 198 | default by setting the Secondary Bus Reset bit of the Bridge Control |
199 | register associated with the downstream port. As for upstream ports, | ||
203 | they should provide their own aer service drivers with reset_link | 200 | they should provide their own aer service drivers with reset_link |
204 | function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and | 201 | function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and |
205 | reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes | 202 | reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes |
@@ -253,11 +250,11 @@ cleanup uncorrectable status register. Pls. refer to section 3.3. | |||
253 | 250 | ||
254 | 4. Software error injection | 251 | 4. Software error injection |
255 | 252 | ||
256 | Debugging PCIE AER error recovery code is quite difficult because it | 253 | Debugging PCIe AER error recovery code is quite difficult because it |
257 | is hard to trigger real hardware errors. Software based error | 254 | is hard to trigger real hardware errors. Software based error |
258 | injection can be used to fake various kinds of PCIE errors. | 255 | injection can be used to fake various kinds of PCIe errors. |
259 | 256 | ||
260 | First you should enable PCIE AER software error injection in kernel | 257 | First you should enable PCIe AER software error injection in kernel |
261 | configuration, that is, following item should be in your .config. | 258 | configuration, that is, following item should be in your .config. |
262 | 259 | ||
263 | CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m | 260 | CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m |
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 9bb62f7b89c3..71b6f500ddb9 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX | |||
@@ -6,16 +6,22 @@ checklist.txt | |||
6 | - Review Checklist for RCU Patches | 6 | - Review Checklist for RCU Patches |
7 | listRCU.txt | 7 | listRCU.txt |
8 | - Using RCU to Protect Read-Mostly Linked Lists | 8 | - Using RCU to Protect Read-Mostly Linked Lists |
9 | lockdep.txt | ||
10 | - RCU and lockdep checking | ||
9 | NMI-RCU.txt | 11 | NMI-RCU.txt |
10 | - Using RCU to Protect Dynamic NMI Handlers | 12 | - Using RCU to Protect Dynamic NMI Handlers |
13 | rcubarrier.txt | ||
14 | - RCU and Unloadable Modules | ||
15 | rculist_nulls.txt | ||
16 | - RCU list primitives for use with SLAB_DESTROY_BY_RCU | ||
11 | rcuref.txt | 17 | rcuref.txt |
12 | - Reference-count design for elements of lists/arrays protected by RCU | 18 | - Reference-count design for elements of lists/arrays protected by RCU |
13 | rcu.txt | 19 | rcu.txt |
14 | - RCU Concepts | 20 | - RCU Concepts |
15 | rcubarrier.txt | ||
16 | - Unloading modules that use RCU callbacks | ||
17 | RTFP.txt | 21 | RTFP.txt |
18 | - List of RCU papers (bibliography) going back to 1980. | 22 | - List of RCU papers (bibliography) going back to 1980. |
23 | stallwarn.txt | ||
24 | - RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR) | ||
19 | torture.txt | 25 | torture.txt |
20 | - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) | 26 | - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) |
21 | trace.txt | 27 | trace.txt |
diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt index a6d32e65d222..a8536cb88091 100644 --- a/Documentation/RCU/NMI-RCU.txt +++ b/Documentation/RCU/NMI-RCU.txt | |||
@@ -34,7 +34,7 @@ NMI handler. | |||
34 | cpu = smp_processor_id(); | 34 | cpu = smp_processor_id(); |
35 | ++nmi_count(cpu); | 35 | ++nmi_count(cpu); |
36 | 36 | ||
37 | if (!rcu_dereference(nmi_callback)(regs, cpu)) | 37 | if (!rcu_dereference_sched(nmi_callback)(regs, cpu)) |
38 | default_do_nmi(regs); | 38 | default_do_nmi(regs); |
39 | 39 | ||
40 | nmi_exit(); | 40 | nmi_exit(); |
@@ -47,12 +47,13 @@ function pointer. If this handler returns zero, do_nmi() invokes the | |||
47 | default_do_nmi() function to handle a machine-specific NMI. Finally, | 47 | default_do_nmi() function to handle a machine-specific NMI. Finally, |
48 | preemption is restored. | 48 | preemption is restored. |
49 | 49 | ||
50 | Strictly speaking, rcu_dereference() is not needed, since this code runs | 50 | In theory, rcu_dereference_sched() is not needed, since this code runs |
51 | only on i386, which does not need rcu_dereference() anyway. However, | 51 | only on i386, which in theory does not need rcu_dereference_sched() |
52 | it is a good documentation aid, particularly for anyone attempting to | 52 | anyway. However, in practice it is a good documentation aid, particularly |
53 | do something similar on Alpha. | 53 | for anyone attempting to do something similar on Alpha or on systems |
54 | with aggressive optimizing compilers. | ||
54 | 55 | ||
55 | Quick Quiz: Why might the rcu_dereference() be necessary on Alpha, | 56 | Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha, |
56 | given that the code referenced by the pointer is read-only? | 57 | given that the code referenced by the pointer is read-only? |
57 | 58 | ||
58 | 59 | ||
@@ -99,17 +100,21 @@ invoke irq_enter() and irq_exit() on NMI entry and exit, respectively. | |||
99 | 100 | ||
100 | Answer to Quick Quiz | 101 | Answer to Quick Quiz |
101 | 102 | ||
102 | Why might the rcu_dereference() be necessary on Alpha, given | 103 | Why might the rcu_dereference_sched() be necessary on Alpha, given |
103 | that the code referenced by the pointer is read-only? | 104 | that the code referenced by the pointer is read-only? |
104 | 105 | ||
105 | Answer: The caller to set_nmi_callback() might well have | 106 | Answer: The caller to set_nmi_callback() might well have |
106 | initialized some data that is to be used by the | 107 | initialized some data that is to be used by the new NMI |
107 | new NMI handler. In this case, the rcu_dereference() | 108 | handler. In this case, the rcu_dereference_sched() would |
108 | would be needed, because otherwise a CPU that received | 109 | be needed, because otherwise a CPU that received an NMI |
109 | an NMI just after the new handler was set might see | 110 | just after the new handler was set might see the pointer |
110 | the pointer to the new NMI handler, but the old | 111 | to the new NMI handler, but the old pre-initialized |
111 | pre-initialized version of the handler's data. | 112 | version of the handler's data. |
112 | 113 | ||
113 | More important, the rcu_dereference() makes it clear | 114 | This same sad story can happen on other CPUs when using |
114 | to someone reading the code that the pointer is being | 115 | a compiler with aggressive pointer-value speculation |
115 | protected by RCU. | 116 | optimizations. |
117 | |||
118 | More important, the rcu_dereference_sched() makes it | ||
119 | clear to someone reading the code that the pointer is | ||
120 | being protected by RCU-sched. | ||
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index d2b85237c76e..5aea459e3dd6 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt | |||
@@ -25,10 +25,10 @@ to be referencing the data structure. However, this mechanism was not | |||
25 | optimized for modern computer systems, which is not surprising given | 25 | optimized for modern computer systems, which is not surprising given |
26 | that these overheads were not so expensive in the mid-80s. Nonetheless, | 26 | that these overheads were not so expensive in the mid-80s. Nonetheless, |
27 | passive serialization appears to be the first deferred-destruction | 27 | passive serialization appears to be the first deferred-destruction |
28 | mechanism to be used in production. Furthermore, the relevant patent has | 28 | mechanism to be used in production. Furthermore, the relevant patent |
29 | lapsed, so this approach may be used in non-GPL software, if desired. | 29 | has lapsed, so this approach may be used in non-GPL software, if desired. |
30 | (In contrast, use of RCU is permitted only in software licensed under | 30 | (In contrast, implementation of RCU is permitted only in software licensed |
31 | GPL. Sorry!!!) | 31 | under either GPL or LGPL. Sorry!!!) |
32 | 32 | ||
33 | In 1990, Pugh [Pugh90] noted that explicitly tracking which threads | 33 | In 1990, Pugh [Pugh90] noted that explicitly tracking which threads |
34 | were reading a given data structure permitted deferred free to operate | 34 | were reading a given data structure permitted deferred free to operate |
@@ -150,6 +150,18 @@ preemptible RCU [PaulEMcKenney2007PreemptibleRCU], and the three-part | |||
150 | LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally, | 150 | LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally, |
151 | PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI]. | 151 | PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI]. |
152 | 152 | ||
153 | 2008 saw a journal paper on real-time RCU [DinakarGuniguntala2008IBMSysJ], | ||
154 | a history of how Linux changed RCU more than RCU changed Linux | ||
155 | [PaulEMcKenney2008RCUOSR], and a design overview of hierarchical RCU | ||
156 | [PaulEMcKenney2008HierarchicalRCU]. | ||
157 | |||
158 | 2009 introduced user-level RCU algorithms [PaulEMcKenney2009MaliciousURCU], | ||
159 | which Mathieu Desnoyers is now maintaining [MathieuDesnoyers2009URCU] | ||
160 | [MathieuDesnoyersPhD]. TINY_RCU [PaulEMcKenney2009BloatWatchRCU] made | ||
161 | its appearance, as did expedited RCU [PaulEMcKenney2009expeditedRCU]. | ||
162 | The problem of resizeable RCU-protected hash tables may now be on a path | ||
163 | to a solution [JoshTriplett2009RPHash]. | ||
164 | |||
153 | Bibtex Entries | 165 | Bibtex Entries |
154 | 166 | ||
155 | @article{Kung80 | 167 | @article{Kung80 |
@@ -730,6 +742,11 @@ Revised: | |||
730 | " | 742 | " |
731 | } | 743 | } |
732 | 744 | ||
745 | # | ||
746 | # "What is RCU?" LWN series. | ||
747 | # | ||
748 | ######################################################################## | ||
749 | |||
733 | @article{DinakarGuniguntala2008IBMSysJ | 750 | @article{DinakarGuniguntala2008IBMSysJ |
734 | ,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole" | 751 | ,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole" |
735 | ,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}" | 752 | ,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}" |
@@ -820,3 +837,39 @@ Revised: | |||
820 | Uniprocessor assumptions allow simplified RCU implementation. | 837 | Uniprocessor assumptions allow simplified RCU implementation. |
821 | " | 838 | " |
822 | } | 839 | } |
840 | |||
841 | @unpublished{PaulEMcKenney2009expeditedRCU | ||
842 | ,Author="Paul E. McKenney" | ||
843 | ,Title="[{PATCH} -tip 0/3] expedited 'big hammer' {RCU} grace periods" | ||
844 | ,month="June" | ||
845 | ,day="25" | ||
846 | ,year="2009" | ||
847 | ,note="Available: | ||
848 | \url{http://lkml.org/lkml/2009/6/25/306} | ||
849 | [Viewed August 16, 2009]" | ||
850 | ,annotation=" | ||
851 | First posting of expedited RCU to be accepted into -tip. | ||
852 | " | ||
853 | } | ||
854 | |||
855 | @unpublished{JoshTriplett2009RPHash | ||
856 | ,Author="Josh Triplett" | ||
857 | ,Title="Scalable concurrent hash tables via relativistic programming" | ||
858 | ,month="September" | ||
859 | ,year="2009" | ||
860 | ,note="Linux Plumbers Conference presentation" | ||
861 | ,annotation=" | ||
862 | RP fun with hash tables. | ||
863 | " | ||
864 | } | ||
865 | |||
866 | @phdthesis{MathieuDesnoyersPhD | ||
867 | , title = "Low-Impact Operating System Tracing" | ||
868 | , author = "Mathieu Desnoyers" | ||
869 | , school = "Ecole Polytechnique de Montr\'{e}al" | ||
870 | , month = "December" | ||
871 | , year = 2009 | ||
872 | ,note="Available: | ||
873 | \url{http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf} | ||
874 | [Viewed December 9, 2009]" | ||
875 | } | ||
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 51525a30e8b4..790d1a812376 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt | |||
@@ -8,13 +8,12 @@ would cause. This list is based on experiences reviewing such patches | |||
8 | over a rather long period of time, but improvements are always welcome! | 8 | over a rather long period of time, but improvements are always welcome! |
9 | 9 | ||
10 | 0. Is RCU being applied to a read-mostly situation? If the data | 10 | 0. Is RCU being applied to a read-mostly situation? If the data |
11 | structure is updated more than about 10% of the time, then | 11 | structure is updated more than about 10% of the time, then you |
12 | you should strongly consider some other approach, unless | 12 | should strongly consider some other approach, unless detailed |
13 | detailed performance measurements show that RCU is nonetheless | 13 | performance measurements show that RCU is nonetheless the right |
14 | the right tool for the job. Yes, you might think of RCU | 14 | tool for the job. Yes, RCU does reduce read-side overhead by |
15 | as simply cutting overhead off of the readers and imposing it | 15 | increasing write-side overhead, which is exactly why normal uses |
16 | on the writers. That is exactly why normal uses of RCU will | 16 | of RCU will do much more reading than updating. |
17 | do much more reading than updating. | ||
18 | 17 | ||
19 | Another exception is where performance is not an issue, and RCU | 18 | Another exception is where performance is not an issue, and RCU |
20 | provides a simpler implementation. An example of this situation | 19 | provides a simpler implementation. An example of this situation |
@@ -35,13 +34,13 @@ over a rather long period of time, but improvements are always welcome! | |||
35 | 34 | ||
36 | If you choose #b, be prepared to describe how you have handled | 35 | If you choose #b, be prepared to describe how you have handled |
37 | memory barriers on weakly ordered machines (pretty much all of | 36 | memory barriers on weakly ordered machines (pretty much all of |
38 | them -- even x86 allows reads to be reordered), and be prepared | 37 | them -- even x86 allows later loads to be reordered to precede |
39 | to explain why this added complexity is worthwhile. If you | 38 | earlier stores), and be prepared to explain why this added |
40 | choose #c, be prepared to explain how this single task does not | 39 | complexity is worthwhile. If you choose #c, be prepared to |
41 | become a major bottleneck on big multiprocessor machines (for | 40 | explain how this single task does not become a major bottleneck on |
42 | example, if the task is updating information relating to itself | 41 | big multiprocessor machines (for example, if the task is updating |
43 | that other tasks can read, there by definition can be no | 42 | information relating to itself that other tasks can read, there |
44 | bottleneck). | 43 | by definition can be no bottleneck). |
45 | 44 | ||
46 | 2. Do the RCU read-side critical sections make proper use of | 45 | 2. Do the RCU read-side critical sections make proper use of |
47 | rcu_read_lock() and friends? These primitives are needed | 46 | rcu_read_lock() and friends? These primitives are needed |
@@ -51,8 +50,10 @@ over a rather long period of time, but improvements are always welcome! | |||
51 | actuarial risk of your kernel. | 50 | actuarial risk of your kernel. |
52 | 51 | ||
53 | As a rough rule of thumb, any dereference of an RCU-protected | 52 | As a rough rule of thumb, any dereference of an RCU-protected |
54 | pointer must be covered by rcu_read_lock() or rcu_read_lock_bh() | 53 | pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(), |
55 | or by the appropriate update-side lock. | 54 | rcu_read_lock_sched(), or by the appropriate update-side lock. |
55 | Disabling of preemption can serve as rcu_read_lock_sched(), but | ||
56 | is less readable. | ||
56 | 57 | ||
57 | 3. Does the update code tolerate concurrent accesses? | 58 | 3. Does the update code tolerate concurrent accesses? |
58 | 59 | ||
@@ -62,25 +63,27 @@ over a rather long period of time, but improvements are always welcome! | |||
62 | of ways to handle this concurrency, depending on the situation: | 63 | of ways to handle this concurrency, depending on the situation: |
63 | 64 | ||
64 | a. Use the RCU variants of the list and hlist update | 65 | a. Use the RCU variants of the list and hlist update |
65 | primitives to add, remove, and replace elements on an | 66 | primitives to add, remove, and replace elements on |
66 | RCU-protected list. Alternatively, use the RCU-protected | 67 | an RCU-protected list. Alternatively, use the other |
67 | trees that have been added to the Linux kernel. | 68 | RCU-protected data structures that have been added to |
69 | the Linux kernel. | ||
68 | 70 | ||
69 | This is almost always the best approach. | 71 | This is almost always the best approach. |
70 | 72 | ||
71 | b. Proceed as in (a) above, but also maintain per-element | 73 | b. Proceed as in (a) above, but also maintain per-element |
72 | locks (that are acquired by both readers and writers) | 74 | locks (that are acquired by both readers and writers) |
73 | that guard per-element state. Of course, fields that | 75 | that guard per-element state. Of course, fields that |
74 | the readers refrain from accessing can be guarded by the | 76 | the readers refrain from accessing can be guarded by |
75 | update-side lock. | 77 | some other lock acquired only by updaters, if desired. |
76 | 78 | ||
77 | This works quite well, also. | 79 | This works quite well, also. |
78 | 80 | ||
79 | c. Make updates appear atomic to readers. For example, | 81 | c. Make updates appear atomic to readers. For example, |
80 | pointer updates to properly aligned fields will appear | 82 | pointer updates to properly aligned fields will |
81 | atomic, as will individual atomic primitives. Operations | 83 | appear atomic, as will individual atomic primitives. |
82 | performed under a lock and sequences of multiple atomic | 84 | Sequences of perations performed under a lock will -not- |
83 | primitives will -not- appear to be atomic. | 85 | appear to be atomic to RCU readers, nor will sequences |
86 | of multiple atomic primitives. | ||
84 | 87 | ||
85 | This can work, but is starting to get a bit tricky. | 88 | This can work, but is starting to get a bit tricky. |
86 | 89 | ||
@@ -98,9 +101,9 @@ over a rather long period of time, but improvements are always welcome! | |||
98 | a new structure containing updated values. | 101 | a new structure containing updated values. |
99 | 102 | ||
100 | 4. Weakly ordered CPUs pose special challenges. Almost all CPUs | 103 | 4. Weakly ordered CPUs pose special challenges. Almost all CPUs |
101 | are weakly ordered -- even i386 CPUs allow reads to be reordered. | 104 | are weakly ordered -- even x86 CPUs allow later loads to be |
102 | RCU code must take all of the following measures to prevent | 105 | reordered to precede earlier stores. RCU code must take all of |
103 | memory-corruption problems: | 106 | the following measures to prevent memory-corruption problems: |
104 | 107 | ||
105 | a. Readers must maintain proper ordering of their memory | 108 | a. Readers must maintain proper ordering of their memory |
106 | accesses. The rcu_dereference() primitive ensures that | 109 | accesses. The rcu_dereference() primitive ensures that |
@@ -113,14 +116,25 @@ over a rather long period of time, but improvements are always welcome! | |||
113 | The rcu_dereference() primitive is also an excellent | 116 | The rcu_dereference() primitive is also an excellent |
114 | documentation aid, letting the person reading the code | 117 | documentation aid, letting the person reading the code |
115 | know exactly which pointers are protected by RCU. | 118 | know exactly which pointers are protected by RCU. |
116 | 119 | Please note that compilers can also reorder code, and | |
117 | The rcu_dereference() primitive is used by the various | 120 | they are becoming increasingly aggressive about doing |
118 | "_rcu()" list-traversal primitives, such as the | 121 | just that. The rcu_dereference() primitive therefore |
119 | list_for_each_entry_rcu(). Note that it is perfectly | 122 | also prevents destructive compiler optimizations. |
120 | legal (if redundant) for update-side code to use | 123 | |
121 | rcu_dereference() and the "_rcu()" list-traversal | 124 | The rcu_dereference() primitive is used by the |
122 | primitives. This is particularly useful in code | 125 | various "_rcu()" list-traversal primitives, such |
123 | that is common to readers and updaters. | 126 | as the list_for_each_entry_rcu(). Note that it is |
127 | perfectly legal (if redundant) for update-side code to | ||
128 | use rcu_dereference() and the "_rcu()" list-traversal | ||
129 | primitives. This is particularly useful in code that | ||
130 | is common to readers and updaters. However, lockdep | ||
131 | will complain if you access rcu_dereference() outside | ||
132 | of an RCU read-side critical section. See lockdep.txt | ||
133 | to learn what to do about this. | ||
134 | |||
135 | Of course, neither rcu_dereference() nor the "_rcu()" | ||
136 | list-traversal primitives can substitute for a good | ||
137 | concurrency design coordinating among multiple updaters. | ||
124 | 138 | ||
125 | b. If the list macros are being used, the list_add_tail_rcu() | 139 | b. If the list macros are being used, the list_add_tail_rcu() |
126 | and list_add_rcu() primitives must be used in order | 140 | and list_add_rcu() primitives must be used in order |
@@ -135,11 +149,14 @@ over a rather long period of time, but improvements are always welcome! | |||
135 | readers. Similarly, if the hlist macros are being used, | 149 | readers. Similarly, if the hlist macros are being used, |
136 | the hlist_del_rcu() primitive is required. | 150 | the hlist_del_rcu() primitive is required. |
137 | 151 | ||
138 | The list_replace_rcu() primitive may be used to | 152 | The list_replace_rcu() and hlist_replace_rcu() primitives |
139 | replace an old structure with a new one in an | 153 | may be used to replace an old structure with a new one |
140 | RCU-protected list. | 154 | in their respective types of RCU-protected lists. |
155 | |||
156 | d. Rules similar to (4b) and (4c) apply to the "hlist_nulls" | ||
157 | type of RCU-protected linked lists. | ||
141 | 158 | ||
142 | d. Updates must ensure that initialization of a given | 159 | e. Updates must ensure that initialization of a given |
143 | structure happens before pointers to that structure are | 160 | structure happens before pointers to that structure are |
144 | publicized. Use the rcu_assign_pointer() primitive | 161 | publicized. Use the rcu_assign_pointer() primitive |
145 | when publicizing a pointer to a structure that can | 162 | when publicizing a pointer to a structure that can |
@@ -151,16 +168,31 @@ over a rather long period of time, but improvements are always welcome! | |||
151 | it cannot block. | 168 | it cannot block. |
152 | 169 | ||
153 | 6. Since synchronize_rcu() can block, it cannot be called from | 170 | 6. Since synchronize_rcu() can block, it cannot be called from |
154 | any sort of irq context. Ditto for synchronize_sched() and | 171 | any sort of irq context. The same rule applies for |
155 | synchronize_srcu(). | 172 | synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(), |
156 | 173 | synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(), | |
157 | 7. If the updater uses call_rcu(), then the corresponding readers | 174 | synchronize_sched_expedite(), and synchronize_srcu_expedited(). |
158 | must use rcu_read_lock() and rcu_read_unlock(). If the updater | 175 | |
159 | uses call_rcu_bh(), then the corresponding readers must use | 176 | The expedited forms of these primitives have the same semantics |
160 | rcu_read_lock_bh() and rcu_read_unlock_bh(). If the updater | 177 | as the non-expedited forms, but expediting is both expensive |
161 | uses call_rcu_sched(), then the corresponding readers must | 178 | and unfriendly to real-time workloads. Use of the expedited |
162 | disable preemption. Mixing things up will result in confusion | 179 | primitives should be restricted to rare configuration-change |
163 | and broken kernels. | 180 | operations that would not normally be undertaken while a real-time |
181 | workload is running. | ||
182 | |||
183 | 7. If the updater uses call_rcu() or synchronize_rcu(), then the | ||
184 | corresponding readers must use rcu_read_lock() and | ||
185 | rcu_read_unlock(). If the updater uses call_rcu_bh() or | ||
186 | synchronize_rcu_bh(), then the corresponding readers must | ||
187 | use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the | ||
188 | updater uses call_rcu_sched() or synchronize_sched(), then | ||
189 | the corresponding readers must disable preemption, possibly | ||
190 | by calling rcu_read_lock_sched() and rcu_read_unlock_sched(). | ||
191 | If the updater uses synchronize_srcu(), the the corresponding | ||
192 | readers must use srcu_read_lock() and srcu_read_unlock(), | ||
193 | and with the same srcu_struct. The rules for the expedited | ||
194 | primitives are the same as for their non-expedited counterparts. | ||
195 | Mixing things up will result in confusion and broken kernels. | ||
164 | 196 | ||
165 | One exception to this rule: rcu_read_lock() and rcu_read_unlock() | 197 | One exception to this rule: rcu_read_lock() and rcu_read_unlock() |
166 | may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() | 198 | may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() |
@@ -212,6 +244,8 @@ over a rather long period of time, but improvements are always welcome! | |||
212 | e. Periodically invoke synchronize_rcu(), permitting a limited | 244 | e. Periodically invoke synchronize_rcu(), permitting a limited |
213 | number of updates per grace period. | 245 | number of updates per grace period. |
214 | 246 | ||
247 | The same cautions apply to call_rcu_bh() and call_rcu_sched(). | ||
248 | |||
215 | 9. All RCU list-traversal primitives, which include | 249 | 9. All RCU list-traversal primitives, which include |
216 | rcu_dereference(), list_for_each_entry_rcu(), | 250 | rcu_dereference(), list_for_each_entry_rcu(), |
217 | list_for_each_continue_rcu(), and list_for_each_safe_rcu(), | 251 | list_for_each_continue_rcu(), and list_for_each_safe_rcu(), |
@@ -219,17 +253,21 @@ over a rather long period of time, but improvements are always welcome! | |||
219 | must be protected by appropriate update-side locks. RCU | 253 | must be protected by appropriate update-side locks. RCU |
220 | read-side critical sections are delimited by rcu_read_lock() | 254 | read-side critical sections are delimited by rcu_read_lock() |
221 | and rcu_read_unlock(), or by similar primitives such as | 255 | and rcu_read_unlock(), or by similar primitives such as |
222 | rcu_read_lock_bh() and rcu_read_unlock_bh(). | 256 | rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case |
257 | the matching rcu_dereference() primitive must be used in order | ||
258 | to keep lockdep happy, in this case, rcu_dereference_bh(). | ||
223 | 259 | ||
224 | The reason that it is permissible to use RCU list-traversal | 260 | The reason that it is permissible to use RCU list-traversal |
225 | primitives when the update-side lock is held is that doing so | 261 | primitives when the update-side lock is held is that doing so |
226 | can be quite helpful in reducing code bloat when common code is | 262 | can be quite helpful in reducing code bloat when common code is |
227 | shared between readers and updaters. | 263 | shared between readers and updaters. Additional primitives |
264 | are provided for this case, as discussed in lockdep.txt. | ||
228 | 265 | ||
229 | 10. Conversely, if you are in an RCU read-side critical section, | 266 | 10. Conversely, if you are in an RCU read-side critical section, |
230 | and you don't hold the appropriate update-side lock, you -must- | 267 | and you don't hold the appropriate update-side lock, you -must- |
231 | use the "_rcu()" variants of the list macros. Failing to do so | 268 | use the "_rcu()" variants of the list macros. Failing to do so |
232 | will break Alpha and confuse people reading your code. | 269 | will break Alpha, cause aggressive compilers to generate bad code, |
270 | and confuse people trying to read your code. | ||
233 | 271 | ||
234 | 11. Note that synchronize_rcu() -only- guarantees to wait until | 272 | 11. Note that synchronize_rcu() -only- guarantees to wait until |
235 | all currently executing rcu_read_lock()-protected RCU read-side | 273 | all currently executing rcu_read_lock()-protected RCU read-side |
@@ -239,15 +277,21 @@ over a rather long period of time, but improvements are always welcome! | |||
239 | rcu_read_lock()-protected read-side critical sections, do -not- | 277 | rcu_read_lock()-protected read-side critical sections, do -not- |
240 | use synchronize_rcu(). | 278 | use synchronize_rcu(). |
241 | 279 | ||
242 | If you want to wait for some of these other things, you might | 280 | Similarly, disabling preemption is not an acceptable substitute |
243 | instead need to use synchronize_irq() or synchronize_sched(). | 281 | for rcu_read_lock(). Code that attempts to use preemption |
282 | disabling where it should be using rcu_read_lock() will break | ||
283 | in real-time kernel builds. | ||
284 | |||
285 | If you want to wait for interrupt handlers, NMI handlers, and | ||
286 | code under the influence of preempt_disable(), you instead | ||
287 | need to use synchronize_irq() or synchronize_sched(). | ||
244 | 288 | ||
245 | 12. Any lock acquired by an RCU callback must be acquired elsewhere | 289 | 12. Any lock acquired by an RCU callback must be acquired elsewhere |
246 | with softirq disabled, e.g., via spin_lock_irqsave(), | 290 | with softirq disabled, e.g., via spin_lock_irqsave(), |
247 | spin_lock_bh(), etc. Failing to disable irq on a given | 291 | spin_lock_bh(), etc. Failing to disable irq on a given |
248 | acquisition of that lock will result in deadlock as soon as the | 292 | acquisition of that lock will result in deadlock as soon as |
249 | RCU callback happens to interrupt that acquisition's critical | 293 | the RCU softirq handler happens to run your RCU callback while |
250 | section. | 294 | interrupting that acquisition's critical section. |
251 | 295 | ||
252 | 13. RCU callbacks can be and are executed in parallel. In many cases, | 296 | 13. RCU callbacks can be and are executed in parallel. In many cases, |
253 | the callback code simply wrappers around kfree(), so that this | 297 | the callback code simply wrappers around kfree(), so that this |
@@ -265,29 +309,30 @@ over a rather long period of time, but improvements are always welcome! | |||
265 | not the case, a self-spawning RCU callback would prevent the | 309 | not the case, a self-spawning RCU callback would prevent the |
266 | victim CPU from ever going offline.) | 310 | victim CPU from ever going offline.) |
267 | 311 | ||
268 | 14. SRCU (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu()) | 312 | 14. SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(), |
269 | may only be invoked from process context. Unlike other forms of | 313 | synchronize_srcu(), and synchronize_srcu_expedited()) may only |
270 | RCU, it -is- permissible to block in an SRCU read-side critical | 314 | be invoked from process context. Unlike other forms of RCU, it |
271 | section (demarked by srcu_read_lock() and srcu_read_unlock()), | 315 | -is- permissible to block in an SRCU read-side critical section |
272 | hence the "SRCU": "sleepable RCU". Please note that if you | 316 | (demarked by srcu_read_lock() and srcu_read_unlock()), hence the |
273 | don't need to sleep in read-side critical sections, you should | 317 | "SRCU": "sleepable RCU". Please note that if you don't need |
274 | be using RCU rather than SRCU, because RCU is almost always | 318 | to sleep in read-side critical sections, you should be using |
275 | faster and easier to use than is SRCU. | 319 | RCU rather than SRCU, because RCU is almost always faster and |
320 | easier to use than is SRCU. | ||
276 | 321 | ||
277 | Also unlike other forms of RCU, explicit initialization | 322 | Also unlike other forms of RCU, explicit initialization |
278 | and cleanup is required via init_srcu_struct() and | 323 | and cleanup is required via init_srcu_struct() and |
279 | cleanup_srcu_struct(). These are passed a "struct srcu_struct" | 324 | cleanup_srcu_struct(). These are passed a "struct srcu_struct" |
280 | that defines the scope of a given SRCU domain. Once initialized, | 325 | that defines the scope of a given SRCU domain. Once initialized, |
281 | the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock() | 326 | the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock() |
282 | and synchronize_srcu(). A given synchronize_srcu() waits only | 327 | synchronize_srcu(), and synchronize_srcu_expedited(). A given |
283 | for SRCU read-side critical sections governed by srcu_read_lock() | 328 | synchronize_srcu() waits only for SRCU read-side critical |
284 | and srcu_read_unlock() calls that have been passd the same | 329 | sections governed by srcu_read_lock() and srcu_read_unlock() |
285 | srcu_struct. This property is what makes sleeping read-side | 330 | calls that have been passed the same srcu_struct. This property |
286 | critical sections tolerable -- a given subsystem delays only | 331 | is what makes sleeping read-side critical sections tolerable -- |
287 | its own updates, not those of other subsystems using SRCU. | 332 | a given subsystem delays only its own updates, not those of other |
288 | Therefore, SRCU is less prone to OOM the system than RCU would | 333 | subsystems using SRCU. Therefore, SRCU is less prone to OOM the |
289 | be if RCU's read-side critical sections were permitted to | 334 | system than RCU would be if RCU's read-side critical sections |
290 | sleep. | 335 | were permitted to sleep. |
291 | 336 | ||
292 | The ability to sleep in read-side critical sections does not | 337 | The ability to sleep in read-side critical sections does not |
293 | come for free. First, corresponding srcu_read_lock() and | 338 | come for free. First, corresponding srcu_read_lock() and |
@@ -300,8 +345,8 @@ over a rather long period of time, but improvements are always welcome! | |||
300 | requiring SRCU's read-side deadlock immunity or low read-side | 345 | requiring SRCU's read-side deadlock immunity or low read-side |
301 | realtime latency. | 346 | realtime latency. |
302 | 347 | ||
303 | Note that, rcu_assign_pointer() and rcu_dereference() relate to | 348 | Note that, rcu_assign_pointer() relates to SRCU just as they do |
304 | SRCU just as they do to other forms of RCU. | 349 | to other forms of RCU. |
305 | 350 | ||
306 | 15. The whole point of call_rcu(), synchronize_rcu(), and friends | 351 | 15. The whole point of call_rcu(), synchronize_rcu(), and friends |
307 | is to wait until all pre-existing readers have finished before | 352 | is to wait until all pre-existing readers have finished before |
@@ -311,12 +356,12 @@ over a rather long period of time, but improvements are always welcome! | |||
311 | destructive operation, and -only- -then- invoke call_rcu(), | 356 | destructive operation, and -only- -then- invoke call_rcu(), |
312 | synchronize_rcu(), or friends. | 357 | synchronize_rcu(), or friends. |
313 | 358 | ||
314 | Because these primitives only wait for pre-existing readers, | 359 | Because these primitives only wait for pre-existing readers, it |
315 | it is the caller's responsibility to guarantee safety to | 360 | is the caller's responsibility to guarantee that any subsequent |
316 | any subsequent readers. | 361 | readers will execute safely. |
317 | 362 | ||
318 | 16. The various RCU read-side primitives do -not- contain memory | 363 | 16. The various RCU read-side primitives do -not- necessarily contain |
319 | barriers. The CPU (and in some cases, the compiler) is free | 364 | memory barriers. You should therefore plan for the CPU |
320 | to reorder code into and out of RCU read-side critical sections. | 365 | and the compiler to freely reorder code into and out of RCU |
321 | It is the responsibility of the RCU update-side primitives to | 366 | read-side critical sections. It is the responsibility of the |
322 | deal with this. | 367 | RCU update-side primitives to deal with this. |
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt new file mode 100644 index 000000000000..d7a49b2f6994 --- /dev/null +++ b/Documentation/RCU/lockdep.txt | |||
@@ -0,0 +1,91 @@ | |||
1 | RCU and lockdep checking | ||
2 | |||
3 | All flavors of RCU have lockdep checking available, so that lockdep is | ||
4 | aware of when each task enters and leaves any flavor of RCU read-side | ||
5 | critical section. Each flavor of RCU is tracked separately (but note | ||
6 | that this is not the case in 2.6.32 and earlier). This allows lockdep's | ||
7 | tracking to include RCU state, which can sometimes help when debugging | ||
8 | deadlocks and the like. | ||
9 | |||
10 | In addition, RCU provides the following primitives that check lockdep's | ||
11 | state: | ||
12 | |||
13 | rcu_read_lock_held() for normal RCU. | ||
14 | rcu_read_lock_bh_held() for RCU-bh. | ||
15 | rcu_read_lock_sched_held() for RCU-sched. | ||
16 | srcu_read_lock_held() for SRCU. | ||
17 | |||
18 | These functions are conservative, and will therefore return 1 if they | ||
19 | aren't certain (for example, if CONFIG_DEBUG_LOCK_ALLOC is not set). | ||
20 | This prevents things like WARN_ON(!rcu_read_lock_held()) from giving false | ||
21 | positives when lockdep is disabled. | ||
22 | |||
23 | In addition, a separate kernel config parameter CONFIG_PROVE_RCU enables | ||
24 | checking of rcu_dereference() primitives: | ||
25 | |||
26 | rcu_dereference(p): | ||
27 | Check for RCU read-side critical section. | ||
28 | rcu_dereference_bh(p): | ||
29 | Check for RCU-bh read-side critical section. | ||
30 | rcu_dereference_sched(p): | ||
31 | Check for RCU-sched read-side critical section. | ||
32 | srcu_dereference(p, sp): | ||
33 | Check for SRCU read-side critical section. | ||
34 | rcu_dereference_check(p, c): | ||
35 | Use explicit check expression "c". This is useful in | ||
36 | code that is invoked by both readers and updaters. | ||
37 | rcu_dereference_raw(p) | ||
38 | Don't check. (Use sparingly, if at all.) | ||
39 | rcu_dereference_protected(p, c): | ||
40 | Use explicit check expression "c", and omit all barriers | ||
41 | and compiler constraints. This is useful when the data | ||
42 | structure cannot change, for example, in code that is | ||
43 | invoked only by updaters. | ||
44 | rcu_access_pointer(p): | ||
45 | Return the value of the pointer and omit all barriers, | ||
46 | but retain the compiler constraints that prevent duplicating | ||
47 | or coalescsing. This is useful when when testing the | ||
48 | value of the pointer itself, for example, against NULL. | ||
49 | |||
50 | The rcu_dereference_check() check expression can be any boolean | ||
51 | expression, but would normally include one of the rcu_read_lock_held() | ||
52 | family of functions and a lockdep expression. However, any boolean | ||
53 | expression can be used. For a moderately ornate example, consider | ||
54 | the following: | ||
55 | |||
56 | file = rcu_dereference_check(fdt->fd[fd], | ||
57 | rcu_read_lock_held() || | ||
58 | lockdep_is_held(&files->file_lock) || | ||
59 | atomic_read(&files->count) == 1); | ||
60 | |||
61 | This expression picks up the pointer "fdt->fd[fd]" in an RCU-safe manner, | ||
62 | and, if CONFIG_PROVE_RCU is configured, verifies that this expression | ||
63 | is used in: | ||
64 | |||
65 | 1. An RCU read-side critical section, or | ||
66 | 2. with files->file_lock held, or | ||
67 | 3. on an unshared files_struct. | ||
68 | |||
69 | In case (1), the pointer is picked up in an RCU-safe manner for vanilla | ||
70 | RCU read-side critical sections, in case (2) the ->file_lock prevents | ||
71 | any change from taking place, and finally, in case (3) the current task | ||
72 | is the only task accessing the file_struct, again preventing any change | ||
73 | from taking place. If the above statement was invoked only from updater | ||
74 | code, it could instead be written as follows: | ||
75 | |||
76 | file = rcu_dereference_protected(fdt->fd[fd], | ||
77 | lockdep_is_held(&files->file_lock) || | ||
78 | atomic_read(&files->count) == 1); | ||
79 | |||
80 | This would verify cases #2 and #3 above, and furthermore lockdep would | ||
81 | complain if this was used in an RCU read-side critical section unless one | ||
82 | of these two cases held. Because rcu_dereference_protected() omits all | ||
83 | barriers and compiler constraints, it generates better code than do the | ||
84 | other flavors of rcu_dereference(). On the other hand, it is illegal | ||
85 | to use rcu_dereference_protected() if either the RCU-protected pointer | ||
86 | or the RCU-protected data that it points to can change concurrently. | ||
87 | |||
88 | There are currently only "universal" versions of the rcu_assign_pointer() | ||
89 | and RCU list-/tree-traversal primitives, which do not (yet) check for | ||
90 | being in an RCU read-side critical section. In the future, separate | ||
91 | versions of these primitives might be created. | ||
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 2a23523ce471..31852705b586 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt | |||
@@ -75,6 +75,8 @@ o I hear that RCU is patented? What is with that? | |||
75 | search for the string "Patent" in RTFP.txt to find them. | 75 | search for the string "Patent" in RTFP.txt to find them. |
76 | Of these, one was allowed to lapse by the assignee, and the | 76 | Of these, one was allowed to lapse by the assignee, and the |
77 | others have been contributed to the Linux kernel under GPL. | 77 | others have been contributed to the Linux kernel under GPL. |
78 | There are now also LGPL implementations of user-level RCU | ||
79 | available (http://lttng.org/?q=node/18). | ||
78 | 80 | ||
79 | o I hear that RCU needs work in order to support realtime kernels? | 81 | o I hear that RCU needs work in order to support realtime kernels? |
80 | 82 | ||
@@ -91,48 +93,4 @@ o Where can I find more information on RCU? | |||
91 | 93 | ||
92 | o What are all these files in this directory? | 94 | o What are all these files in this directory? |
93 | 95 | ||
94 | 96 | See 00-INDEX for the list. | |
95 | NMI-RCU.txt | ||
96 | |||
97 | Describes how to use RCU to implement dynamic | ||
98 | NMI handlers, which can be revectored on the fly, | ||
99 | without rebooting. | ||
100 | |||
101 | RTFP.txt | ||
102 | |||
103 | List of RCU-related publications and web sites. | ||
104 | |||
105 | UP.txt | ||
106 | |||
107 | Discussion of RCU usage in UP kernels. | ||
108 | |||
109 | arrayRCU.txt | ||
110 | |||
111 | Describes how to use RCU to protect arrays, with | ||
112 | resizeable arrays whose elements reference other | ||
113 | data structures being of the most interest. | ||
114 | |||
115 | checklist.txt | ||
116 | |||
117 | Lists things to check for when inspecting code that | ||
118 | uses RCU. | ||
119 | |||
120 | listRCU.txt | ||
121 | |||
122 | Describes how to use RCU to protect linked lists. | ||
123 | This is the simplest and most common use of RCU | ||
124 | in the Linux kernel. | ||
125 | |||
126 | rcu.txt | ||
127 | |||
128 | You are reading it! | ||
129 | |||
130 | rcuref.txt | ||
131 | |||
132 | Describes how to combine use of reference counts | ||
133 | with RCU. | ||
134 | |||
135 | whatisRCU.txt | ||
136 | |||
137 | Overview of how the RCU implementation works. Along | ||
138 | the way, presents a conceptual view of RCU. | ||
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt new file mode 100644 index 000000000000..44c6dcc93d6d --- /dev/null +++ b/Documentation/RCU/stallwarn.txt | |||
@@ -0,0 +1,106 @@ | |||
1 | Using RCU's CPU Stall Detector | ||
2 | |||
3 | The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables | ||
4 | RCU's CPU stall detector, which detects conditions that unduly delay | ||
5 | RCU grace periods. The stall detector's idea of what constitutes | ||
6 | "unduly delayed" is controlled by a set of C preprocessor macros: | ||
7 | |||
8 | RCU_SECONDS_TILL_STALL_CHECK | ||
9 | |||
10 | This macro defines the period of time that RCU will wait from | ||
11 | the beginning of a grace period until it issues an RCU CPU | ||
12 | stall warning. This time period is normally ten seconds. | ||
13 | |||
14 | RCU_SECONDS_TILL_STALL_RECHECK | ||
15 | |||
16 | This macro defines the period of time that RCU will wait after | ||
17 | issuing a stall warning until it issues another stall warning | ||
18 | for the same stall. This time period is normally set to thirty | ||
19 | seconds. | ||
20 | |||
21 | RCU_STALL_RAT_DELAY | ||
22 | |||
23 | The CPU stall detector tries to make the offending CPU print its | ||
24 | own warnings, as this often gives better-quality stack traces. | ||
25 | However, if the offending CPU does not detect its own stall in | ||
26 | the number of jiffies specified by RCU_STALL_RAT_DELAY, then | ||
27 | some other CPU will complain. This delay is normally set to | ||
28 | two jiffies. | ||
29 | |||
30 | When a CPU detects that it is stalling, it will print a message similar | ||
31 | to the following: | ||
32 | |||
33 | INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies) | ||
34 | |||
35 | This message indicates that CPU 5 detected that it was causing a stall, | ||
36 | and that the stall was affecting RCU-sched. This message will normally be | ||
37 | followed by a stack dump of the offending CPU. On TREE_RCU kernel builds, | ||
38 | RCU and RCU-sched are implemented by the same underlying mechanism, | ||
39 | while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented | ||
40 | by rcu_preempt_state. | ||
41 | |||
42 | On the other hand, if the offending CPU fails to print out a stall-warning | ||
43 | message quickly enough, some other CPU will print a message similar to | ||
44 | the following: | ||
45 | |||
46 | INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies) | ||
47 | |||
48 | This message indicates that CPU 2 detected that CPUs 3 and 5 were both | ||
49 | causing stalls, and that the stall was affecting RCU-bh. This message | ||
50 | will normally be followed by stack dumps for each CPU. Please note that | ||
51 | TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, | ||
52 | and that the tasks will be indicated by PID, for example, "P3421". | ||
53 | It is even possible for a rcu_preempt_state stall to be caused by both | ||
54 | CPUs -and- tasks, in which case the offending CPUs and tasks will all | ||
55 | be called out in the list. | ||
56 | |||
57 | Finally, if the grace period ends just as the stall warning starts | ||
58 | printing, there will be a spurious stall-warning message: | ||
59 | |||
60 | INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies) | ||
61 | |||
62 | This is rare, but does happen from time to time in real life. | ||
63 | |||
64 | So your kernel printed an RCU CPU stall warning. The next question is | ||
65 | "What caused it?" The following problems can result in RCU CPU stall | ||
66 | warnings: | ||
67 | |||
68 | o A CPU looping in an RCU read-side critical section. | ||
69 | |||
70 | o A CPU looping with interrupts disabled. This condition can | ||
71 | result in RCU-sched and RCU-bh stalls. | ||
72 | |||
73 | o A CPU looping with preemption disabled. This condition can | ||
74 | result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh | ||
75 | stalls. | ||
76 | |||
77 | o A CPU looping with bottom halves disabled. This condition can | ||
78 | result in RCU-sched and RCU-bh stalls. | ||
79 | |||
80 | o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel | ||
81 | without invoking schedule(). | ||
82 | |||
83 | o A bug in the RCU implementation. | ||
84 | |||
85 | o A hardware failure. This is quite unlikely, but has occurred | ||
86 | at least once in real life. A CPU failed in a running system, | ||
87 | becoming unresponsive, but not causing an immediate crash. | ||
88 | This resulted in a series of RCU CPU stall warnings, eventually | ||
89 | leading the realization that the CPU had failed. | ||
90 | |||
91 | The RCU, RCU-sched, and RCU-bh implementations have CPU stall | ||
92 | warning. SRCU does not have its own CPU stall warnings, but its | ||
93 | calls to synchronize_sched() will result in RCU-sched detecting | ||
94 | RCU-sched-related CPU stalls. Please note that RCU only detects | ||
95 | CPU stalls when there is a grace period in progress. No grace period, | ||
96 | no CPU stall warnings. | ||
97 | |||
98 | To diagnose the cause of the stall, inspect the stack traces. | ||
99 | The offending function will usually be near the top of the stack. | ||
100 | If you have a series of stall warnings from a single extended stall, | ||
101 | comparing the stack traces can often help determine where the stall | ||
102 | is occurring, which will usually be in the function nearest the top of | ||
103 | that portion of the stack which remains the same from trace to trace. | ||
104 | If you can reliably trigger the stall, ftrace can be quite helpful. | ||
105 | |||
106 | RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. | ||
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 9dba3bb90e60..5d9016795fd8 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt | |||
@@ -30,6 +30,18 @@ MODULE PARAMETERS | |||
30 | 30 | ||
31 | This module has the following parameters: | 31 | This module has the following parameters: |
32 | 32 | ||
33 | fqs_duration Duration (in microseconds) of artificially induced bursts | ||
34 | of force_quiescent_state() invocations. In RCU | ||
35 | implementations having force_quiescent_state(), these | ||
36 | bursts help force races between forcing a given grace | ||
37 | period and that grace period ending on its own. | ||
38 | |||
39 | fqs_holdoff Holdoff time (in microseconds) between consecutive calls | ||
40 | to force_quiescent_state() within a burst. | ||
41 | |||
42 | fqs_stutter Wait time (in seconds) between consecutive bursts | ||
43 | of calls to force_quiescent_state(). | ||
44 | |||
33 | irqreaders Says to invoke RCU readers from irq level. This is currently | 45 | irqreaders Says to invoke RCU readers from irq level. This is currently |
34 | done via timers. Defaults to "1" for variants of RCU that | 46 | done via timers. Defaults to "1" for variants of RCU that |
35 | permit this. (Or, more accurately, variants of RCU that do | 47 | permit this. (Or, more accurately, variants of RCU that do |
@@ -170,16 +182,6 @@ Similarly, sched_expedited RCU provides the following: | |||
170 | sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 | 182 | sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 |
171 | sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 | 183 | sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 |
172 | sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 | 184 | sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 |
173 | state: -1 / 0:0 3:0 4:0 | ||
174 | |||
175 | As before, the first four lines are similar to those for RCU. | ||
176 | The last line shows the task-migration state. The first number is | ||
177 | -1 if synchronize_sched_expedited() is idle, -2 if in the process of | ||
178 | posting wakeups to the migration kthreads, and N when waiting on CPU N. | ||
179 | Each of the colon-separated fields following the "/" is a CPU:state pair. | ||
180 | Valid states are "0" for idle, "1" for waiting for quiescent state, | ||
181 | "2" for passed through quiescent state, and "3" when a race with a | ||
182 | CPU-hotplug event forces use of the synchronize_sched() primitive. | ||
183 | 185 | ||
184 | 186 | ||
185 | USAGE | 187 | USAGE |
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 8608fd85e921..efd8cc95c06b 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt | |||
@@ -256,23 +256,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct | |||
256 | The output of "cat rcu/rcu_pending" looks as follows: | 256 | The output of "cat rcu/rcu_pending" looks as follows: |
257 | 257 | ||
258 | rcu_sched: | 258 | rcu_sched: |
259 | 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 | 259 | 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 |
260 | 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 | 260 | 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 |
261 | 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 | 261 | 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 |
262 | 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 | 262 | 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 |
263 | 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 | 263 | 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 |
264 | 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 | 264 | 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 |
265 | 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 | 265 | 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 |
266 | 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 | 266 | 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 |
267 | rcu_bh: | 267 | rcu_bh: |
268 | 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 | 268 | 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 |
269 | 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 | 269 | 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 |
270 | 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 | 270 | 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 |
271 | 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 | 271 | 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 |
272 | 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 | 272 | 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 |
273 | 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 | 273 | 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 |
274 | 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 | 274 | 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 |
275 | 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 | 275 | 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 |
276 | 276 | ||
277 | As always, this is once again split into "rcu_sched" and "rcu_bh" | 277 | As always, this is once again split into "rcu_sched" and "rcu_bh" |
278 | portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional | 278 | portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional |
@@ -284,6 +284,9 @@ o "np" is the number of times that __rcu_pending() has been invoked | |||
284 | o "qsp" is the number of times that the RCU was waiting for a | 284 | o "qsp" is the number of times that the RCU was waiting for a |
285 | quiescent state from this CPU. | 285 | quiescent state from this CPU. |
286 | 286 | ||
287 | o "rpq" is the number of times that the CPU had passed through | ||
288 | a quiescent state, but not yet reported it to RCU. | ||
289 | |||
287 | o "cbr" is the number of times that this CPU had RCU callbacks | 290 | o "cbr" is the number of times that this CPU had RCU callbacks |
288 | that had passed through a grace period, and were thus ready | 291 | that had passed through a grace period, and were thus ready |
289 | to be invoked. | 292 | to be invoked. |
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index d542ca243b80..cfaac34c4557 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt | |||
@@ -323,14 +323,17 @@ used as follows: | |||
323 | Defer Protect | 323 | Defer Protect |
324 | 324 | ||
325 | a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() | 325 | a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() |
326 | call_rcu() | 326 | call_rcu() rcu_dereference() |
327 | 327 | ||
328 | b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() | 328 | b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() |
329 | rcu_dereference_bh() | ||
329 | 330 | ||
330 | c. synchronize_sched() preempt_disable() / preempt_enable() | 331 | c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched() |
332 | preempt_disable() / preempt_enable() | ||
331 | local_irq_save() / local_irq_restore() | 333 | local_irq_save() / local_irq_restore() |
332 | hardirq enter / hardirq exit | 334 | hardirq enter / hardirq exit |
333 | NMI enter / NMI exit | 335 | NMI enter / NMI exit |
336 | rcu_dereference_sched() | ||
334 | 337 | ||
335 | These three mechanisms are used as follows: | 338 | These three mechanisms are used as follows: |
336 | 339 | ||
@@ -780,9 +783,8 @@ Linux-kernel source code, but it helps to have a full list of the | |||
780 | APIs, since there does not appear to be a way to categorize them | 783 | APIs, since there does not appear to be a way to categorize them |
781 | in docbook. Here is the list, by category. | 784 | in docbook. Here is the list, by category. |
782 | 785 | ||
783 | RCU pointer/list traversal: | 786 | RCU list traversal: |
784 | 787 | ||
785 | rcu_dereference | ||
786 | list_for_each_entry_rcu | 788 | list_for_each_entry_rcu |
787 | hlist_for_each_entry_rcu | 789 | hlist_for_each_entry_rcu |
788 | hlist_nulls_for_each_entry_rcu | 790 | hlist_nulls_for_each_entry_rcu |
@@ -808,7 +810,7 @@ RCU: Critical sections Grace period Barrier | |||
808 | 810 | ||
809 | rcu_read_lock synchronize_net rcu_barrier | 811 | rcu_read_lock synchronize_net rcu_barrier |
810 | rcu_read_unlock synchronize_rcu | 812 | rcu_read_unlock synchronize_rcu |
811 | synchronize_rcu_expedited | 813 | rcu_dereference synchronize_rcu_expedited |
812 | call_rcu | 814 | call_rcu |
813 | 815 | ||
814 | 816 | ||
@@ -816,7 +818,7 @@ bh: Critical sections Grace period Barrier | |||
816 | 818 | ||
817 | rcu_read_lock_bh call_rcu_bh rcu_barrier_bh | 819 | rcu_read_lock_bh call_rcu_bh rcu_barrier_bh |
818 | rcu_read_unlock_bh synchronize_rcu_bh | 820 | rcu_read_unlock_bh synchronize_rcu_bh |
819 | synchronize_rcu_bh_expedited | 821 | rcu_dereference_bh synchronize_rcu_bh_expedited |
820 | 822 | ||
821 | 823 | ||
822 | sched: Critical sections Grace period Barrier | 824 | sched: Critical sections Grace period Barrier |
@@ -825,17 +827,25 @@ sched: Critical sections Grace period Barrier | |||
825 | rcu_read_unlock_sched call_rcu_sched | 827 | rcu_read_unlock_sched call_rcu_sched |
826 | [preempt_disable] synchronize_sched_expedited | 828 | [preempt_disable] synchronize_sched_expedited |
827 | [and friends] | 829 | [and friends] |
830 | rcu_dereference_sched | ||
828 | 831 | ||
829 | 832 | ||
830 | SRCU: Critical sections Grace period Barrier | 833 | SRCU: Critical sections Grace period Barrier |
831 | 834 | ||
832 | srcu_read_lock synchronize_srcu N/A | 835 | srcu_read_lock synchronize_srcu N/A |
833 | srcu_read_unlock synchronize_srcu_expedited | 836 | srcu_read_unlock synchronize_srcu_expedited |
837 | srcu_dereference | ||
834 | 838 | ||
835 | SRCU: Initialization/cleanup | 839 | SRCU: Initialization/cleanup |
836 | init_srcu_struct | 840 | init_srcu_struct |
837 | cleanup_srcu_struct | 841 | cleanup_srcu_struct |
838 | 842 | ||
843 | All: lockdep-checked RCU-protected pointer access | ||
844 | |||
845 | rcu_dereference_check | ||
846 | rcu_dereference_protected | ||
847 | rcu_access_pointer | ||
848 | |||
839 | See the comment headers in the source code (or the docbook generated | 849 | See the comment headers in the source code (or the docbook generated |
840 | from them) for more information. | 850 | from them) for more information. |
841 | 851 | ||
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt index 34614b4c708e..e9dab41c0fe0 100644 --- a/Documentation/Smack.txt +++ b/Documentation/Smack.txt | |||
@@ -73,7 +73,7 @@ NOTE: Smack labels are limited to 23 characters. The attr command | |||
73 | If you don't do anything special all users will get the floor ("_") | 73 | If you don't do anything special all users will get the floor ("_") |
74 | label when they log in. If you do want to log in via the hacked ssh | 74 | label when they log in. If you do want to log in via the hacked ssh |
75 | at other labels use the attr command to set the smack value on the | 75 | at other labels use the attr command to set the smack value on the |
76 | home directory and it's contents. | 76 | home directory and its contents. |
77 | 77 | ||
78 | You can add access rules in /etc/smack/accesses. They take the form: | 78 | You can add access rules in /etc/smack/accesses. They take the form: |
79 | 79 | ||
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index 1053a56be3b1..da0382daa395 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist | |||
@@ -9,10 +9,16 @@ Documentation/SubmittingPatches and elsewhere regarding submitting Linux | |||
9 | kernel patches. | 9 | kernel patches. |
10 | 10 | ||
11 | 11 | ||
12 | 1: Builds cleanly with applicable or modified CONFIG options =y, =m, and | 12 | 1: If you use a facility then #include the file that defines/declares |
13 | that facility. Don't depend on other header files pulling in ones | ||
14 | that you use. | ||
15 | |||
16 | 2: Builds cleanly with applicable or modified CONFIG options =y, =m, and | ||
13 | =n. No gcc warnings/errors, no linker warnings/errors. | 17 | =n. No gcc warnings/errors, no linker warnings/errors. |
14 | 18 | ||
15 | 2: Passes allnoconfig, allmodconfig | 19 | 2b: Passes allnoconfig, allmodconfig |
20 | |||
21 | 2c: Builds successfully when using O=builddir | ||
16 | 22 | ||
17 | 3: Builds on multiple CPU architectures by using local cross-compile tools | 23 | 3: Builds on multiple CPU architectures by using local cross-compile tools |
18 | or some other build farm. | 24 | or some other build farm. |
@@ -91,3 +97,13 @@ kernel patches. | |||
91 | 97 | ||
92 | 25: If any ioctl's are added by the patch, then also update | 98 | 25: If any ioctl's are added by the patch, then also update |
93 | Documentation/ioctl/ioctl-number.txt. | 99 | Documentation/ioctl/ioctl-number.txt. |
100 | |||
101 | 26: If your modified source code depends on or uses any of the kernel | ||
102 | APIs or features that are related to the following kconfig symbols, | ||
103 | then test multiple builds with the related kconfig symbols disabled | ||
104 | and/or =m (if that option is available) [not all of these at the | ||
105 | same time, just various/random combinations of them]: | ||
106 | |||
107 | CONFIG_SMP, CONFIG_SYSFS, CONFIG_PROC_FS, CONFIG_INPUT, CONFIG_PCI, | ||
108 | CONFIG_BLOCK, CONFIG_PM, CONFIG_HOTPLUG, CONFIG_MAGIC_SYSRQ, | ||
109 | CONFIG_NET, CONFIG_INET=n (but latter with CONFIG_NET=y) | ||
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers index 99e72a81fa2f..4947fd8fb182 100644 --- a/Documentation/SubmittingDrivers +++ b/Documentation/SubmittingDrivers | |||
@@ -130,6 +130,8 @@ Linux kernel master tree: | |||
130 | ftp.??.kernel.org:/pub/linux/kernel/... | 130 | ftp.??.kernel.org:/pub/linux/kernel/... |
131 | ?? == your country code, such as "us", "uk", "fr", etc. | 131 | ?? == your country code, such as "us", "uk", "fr", etc. |
132 | 132 | ||
133 | http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git | ||
134 | |||
133 | Linux kernel mailing list: | 135 | Linux kernel mailing list: |
134 | linux-kernel@vger.kernel.org | 136 | linux-kernel@vger.kernel.org |
135 | [mail majordomo@vger.kernel.org to subscribe] | 137 | [mail majordomo@vger.kernel.org to subscribe] |
@@ -160,3 +162,6 @@ How to NOT write kernel driver by Arjan van de Ven: | |||
160 | 162 | ||
161 | Kernel Janitor: | 163 | Kernel Janitor: |
162 | http://janitor.kernelnewbies.org/ | 164 | http://janitor.kernelnewbies.org/ |
165 | |||
166 | GIT, Fast Version Control System: | ||
167 | http://git-scm.com/ | ||
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX index 82e418d648d0..7f5fc3ba9c91 100644 --- a/Documentation/arm/00-INDEX +++ b/Documentation/arm/00-INDEX | |||
@@ -20,6 +20,8 @@ Samsung-S3C24XX | |||
20 | - S3C24XX ARM Linux Overview | 20 | - S3C24XX ARM Linux Overview |
21 | Sharp-LH | 21 | Sharp-LH |
22 | - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC) | 22 | - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC) |
23 | SPEAr | ||
24 | - ST SPEAr platform Linux Overview | ||
23 | VFP/ | 25 | VFP/ |
24 | - Release notes for Linux Kernel Vector Floating Point support code | 26 | - Release notes for Linux Kernel Vector Floating Point support code |
25 | empeg/ | 27 | empeg/ |
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy index 7197a9e958ee..f9f62e8c0719 100644 --- a/Documentation/arm/SA1100/ADSBitsy +++ b/Documentation/arm/SA1100/ADSBitsy | |||
@@ -32,7 +32,7 @@ Notes: | |||
32 | 32 | ||
33 | - The flash on board is divided into 3 partitions. | 33 | - The flash on board is divided into 3 partitions. |
34 | You should be careful to use flash on board. | 34 | You should be careful to use flash on board. |
35 | It's partition is different from GraphicsClient Plus and GraphicsMaster | 35 | Its partition is different from GraphicsClient Plus and GraphicsMaster |
36 | 36 | ||
37 | - 16bpp mode requires a different cable than what ships with the board. | 37 | - 16bpp mode requires a different cable than what ships with the board. |
38 | Contact ADS or look through the manual to wire your own. Currently, | 38 | Contact ADS or look through the manual to wire your own. Currently, |
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt new file mode 100644 index 000000000000..253a35c6f782 --- /dev/null +++ b/Documentation/arm/SPEAr/overview.txt | |||
@@ -0,0 +1,60 @@ | |||
1 | SPEAr ARM Linux Overview | ||
2 | ========================== | ||
3 | |||
4 | Introduction | ||
5 | ------------ | ||
6 | |||
7 | SPEAr (Structured Processor Enhanced Architecture). | ||
8 | weblink : http://www.st.com/spear | ||
9 | |||
10 | The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are | ||
11 | supported by the 'spear' platform of ARM Linux. Currently SPEAr300, | ||
12 | SPEAr310, SPEAr320 and SPEAr600 SOCs are supported. Support for the SPEAr13XX | ||
13 | series is in progress. | ||
14 | |||
15 | Hierarchy in SPEAr is as follows: | ||
16 | |||
17 | SPEAr (Platform) | ||
18 | - SPEAr3XX (3XX SOC series, based on ARM9) | ||
19 | - SPEAr300 (SOC) | ||
20 | - SPEAr300_EVB (Evaluation Board) | ||
21 | - SPEAr310 (SOC) | ||
22 | - SPEAr310_EVB (Evaluation Board) | ||
23 | - SPEAr320 (SOC) | ||
24 | - SPEAr320_EVB (Evaluation Board) | ||
25 | - SPEAr6XX (6XX SOC series, based on ARM9) | ||
26 | - SPEAr600 (SOC) | ||
27 | - SPEAr600_EVB (Evaluation Board) | ||
28 | - SPEAr13XX (13XX SOC series, based on ARM CORTEXA9) | ||
29 | - SPEAr1300 (SOC) | ||
30 | |||
31 | Configuration | ||
32 | ------------- | ||
33 | |||
34 | A generic configuration is provided for each machine, and can be used as the | ||
35 | default by | ||
36 | make spear600_defconfig | ||
37 | make spear300_defconfig | ||
38 | make spear310_defconfig | ||
39 | make spear320_defconfig | ||
40 | |||
41 | Layout | ||
42 | ------ | ||
43 | |||
44 | The common files for multiple machine families (SPEAr3XX, SPEAr6XX and | ||
45 | SPEAr13XX) are located in the platform code contained in arch/arm/plat-spear | ||
46 | with headers in plat/. | ||
47 | |||
48 | Each machine series have a directory with name arch/arm/mach-spear followed by | ||
49 | series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx. | ||
50 | |||
51 | Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c and for | ||
52 | spear6xx is mach-spear6xx/spear6xx.c. mach-spear* also contain soc/machine | ||
53 | specific files, like spear300.c, spear310.c, spear320.c and spear600.c. | ||
54 | mach-spear* also contains board specific files for each machine type. | ||
55 | |||
56 | |||
57 | Document Author | ||
58 | --------------- | ||
59 | |||
60 | Viresh Kumar, (c) 2010 ST Microelectronics | ||
diff --git a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt index 76b3a11e90be..fa968aa99d67 100644 --- a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt +++ b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt | |||
@@ -14,8 +14,8 @@ Introduction | |||
14 | how the clocks are arranged. The first implementation used as single | 14 | how the clocks are arranged. The first implementation used as single |
15 | PLL to feed the ARM, memory and peripherals via a series of dividers | 15 | PLL to feed the ARM, memory and peripherals via a series of dividers |
16 | and muxes and this is the implementation that is documented here. A | 16 | and muxes and this is the implementation that is documented here. A |
17 | newer version where there is a seperate PLL and clock divider for the | 17 | newer version where there is a separate PLL and clock divider for the |
18 | ARM core is available as a seperate driver. | 18 | ARM core is available as a separate driver. |
19 | 19 | ||
20 | 20 | ||
21 | Layout | 21 | Layout |
diff --git a/Documentation/arm/Samsung/Overview.txt b/Documentation/arm/Samsung/Overview.txt new file mode 100644 index 000000000000..7cced1fea9c3 --- /dev/null +++ b/Documentation/arm/Samsung/Overview.txt | |||
@@ -0,0 +1,86 @@ | |||
1 | Samsung ARM Linux Overview | ||
2 | ========================== | ||
3 | |||
4 | Introduction | ||
5 | ------------ | ||
6 | |||
7 | The Samsung range of ARM SoCs spans many similar devices, from the initial | ||
8 | ARM9 through to the newest ARM cores. This document shows an overview of | ||
9 | the current kernel support, how to use it and where to find the code | ||
10 | that supports this. | ||
11 | |||
12 | The currently supported SoCs are: | ||
13 | |||
14 | - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list | ||
15 | - S3C64XX: S3C6400 and S3C6410 | ||
16 | - S5PC6440 | ||
17 | |||
18 | S5PC100 and S5PC110 support is currently being merged | ||
19 | |||
20 | |||
21 | S3C24XX Systems | ||
22 | --------------- | ||
23 | |||
24 | There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which | ||
25 | deals with the architecture and drivers specific to these devices. | ||
26 | |||
27 | See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information | ||
28 | on the implementation details and specific support. | ||
29 | |||
30 | |||
31 | Configuration | ||
32 | ------------- | ||
33 | |||
34 | A number of configurations are supplied, as there is no current way of | ||
35 | unifying all the SoCs into one kernel. | ||
36 | |||
37 | s5p6440_defconfig - S5P6440 specific default configuration | ||
38 | s5pc100_defconfig - S5PC100 specific default configuration | ||
39 | |||
40 | |||
41 | Layout | ||
42 | ------ | ||
43 | |||
44 | The directory layout is currently being restructured, and consists of | ||
45 | several platform directories and then the machine specific directories | ||
46 | of the CPUs being built for. | ||
47 | |||
48 | plat-samsung provides the base for all the implementations, and is the | ||
49 | last in the line of include directories that are processed for the build | ||
50 | specific information. It contains the base clock, GPIO and device definitions | ||
51 | to get the system running. | ||
52 | |||
53 | plat-s3c is the s3c24xx/s3c64xx platform directory, although it is currently | ||
54 | involved in other builds this will be phased out once the relevant code is | ||
55 | moved elsewhere. | ||
56 | |||
57 | plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs. | ||
58 | |||
59 | plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs. | ||
60 | |||
61 | plat-s5p is for s5p specific builds, more to be added. | ||
62 | |||
63 | |||
64 | [ to finish ] | ||
65 | |||
66 | |||
67 | Port Contributors | ||
68 | ----------------- | ||
69 | |||
70 | Ben Dooks (BJD) | ||
71 | Vincent Sanders | ||
72 | Herbert Potzl | ||
73 | Arnaud Patard (RTP) | ||
74 | Roc Wu | ||
75 | Klaus Fetscher | ||
76 | Dimitry Andric | ||
77 | Shannon Holland | ||
78 | Guillaume Gourat (NexVision) | ||
79 | Christer Weinigel (wingel) (Acer N30) | ||
80 | Lucas Correia Villa Real (S3C2400 port) | ||
81 | |||
82 | |||
83 | Document Author | ||
84 | --------------- | ||
85 | |||
86 | Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org> | ||
diff --git a/Documentation/arm/Samsung/clksrc-change-registers.awk b/Documentation/arm/Samsung/clksrc-change-registers.awk new file mode 100755 index 000000000000..0c50220851fb --- /dev/null +++ b/Documentation/arm/Samsung/clksrc-change-registers.awk | |||
@@ -0,0 +1,167 @@ | |||
1 | #!/usr/bin/awk -f | ||
2 | # | ||
3 | # Copyright 2010 Ben Dooks <ben-linux@fluff.org> | ||
4 | # | ||
5 | # Released under GPLv2 | ||
6 | |||
7 | # example usage | ||
8 | # ./clksrc-change-registers.awk arch/arm/plat-s5pc1xx/include/plat/regs-clock.h < src > dst | ||
9 | |||
10 | function extract_value(s) | ||
11 | { | ||
12 | eqat = index(s, "=") | ||
13 | comat = index(s, ",") | ||
14 | return substr(s, eqat+2, (comat-eqat)-2) | ||
15 | } | ||
16 | |||
17 | function remove_brackets(b) | ||
18 | { | ||
19 | return substr(b, 2, length(b)-2) | ||
20 | } | ||
21 | |||
22 | function splitdefine(l, p) | ||
23 | { | ||
24 | r = split(l, tp) | ||
25 | |||
26 | p[0] = tp[2] | ||
27 | p[1] = remove_brackets(tp[3]) | ||
28 | } | ||
29 | |||
30 | function find_length(f) | ||
31 | { | ||
32 | if (0) | ||
33 | printf "find_length " f "\n" > "/dev/stderr" | ||
34 | |||
35 | if (f ~ /0x1/) | ||
36 | return 1 | ||
37 | else if (f ~ /0x3/) | ||
38 | return 2 | ||
39 | else if (f ~ /0x7/) | ||
40 | return 3 | ||
41 | else if (f ~ /0xf/) | ||
42 | return 4 | ||
43 | |||
44 | printf "unknown legnth " f "\n" > "/dev/stderr" | ||
45 | exit | ||
46 | } | ||
47 | |||
48 | function find_shift(s) | ||
49 | { | ||
50 | id = index(s, "<") | ||
51 | if (id <= 0) { | ||
52 | printf "cannot find shift " s "\n" > "/dev/stderr" | ||
53 | exit | ||
54 | } | ||
55 | |||
56 | return substr(s, id+2) | ||
57 | } | ||
58 | |||
59 | |||
60 | BEGIN { | ||
61 | if (ARGC < 2) { | ||
62 | print "too few arguments" > "/dev/stderr" | ||
63 | exit | ||
64 | } | ||
65 | |||
66 | # read the header file and find the mask values that we will need | ||
67 | # to replace and create an associative array of values | ||
68 | |||
69 | while (getline line < ARGV[1] > 0) { | ||
70 | if (line ~ /\#define.*_MASK/ && | ||
71 | !(line ~ /S5PC100_EPLL_MASK/) && | ||
72 | !(line ~ /USB_SIG_MASK/)) { | ||
73 | splitdefine(line, fields) | ||
74 | name = fields[0] | ||
75 | if (0) | ||
76 | printf "MASK " line "\n" > "/dev/stderr" | ||
77 | dmask[name,0] = find_length(fields[1]) | ||
78 | dmask[name,1] = find_shift(fields[1]) | ||
79 | if (0) | ||
80 | printf "=> '" name "' LENGTH=" dmask[name,0] " SHIFT=" dmask[name,1] "\n" > "/dev/stderr" | ||
81 | } else { | ||
82 | } | ||
83 | } | ||
84 | |||
85 | delete ARGV[1] | ||
86 | } | ||
87 | |||
88 | /clksrc_clk.*=.*{/ { | ||
89 | shift="" | ||
90 | mask="" | ||
91 | divshift="" | ||
92 | reg_div="" | ||
93 | reg_src="" | ||
94 | indent=1 | ||
95 | |||
96 | print $0 | ||
97 | |||
98 | for(; indent >= 1;) { | ||
99 | if ((getline line) <= 0) { | ||
100 | printf "unexpected end of file" > "/dev/stderr" | ||
101 | exit 1; | ||
102 | } | ||
103 | |||
104 | if (line ~ /\.shift/) { | ||
105 | shift = extract_value(line) | ||
106 | } else if (line ~ /\.mask/) { | ||
107 | mask = extract_value(line) | ||
108 | } else if (line ~ /\.reg_divider/) { | ||
109 | reg_div = extract_value(line) | ||
110 | } else if (line ~ /\.reg_source/) { | ||
111 | reg_src = extract_value(line) | ||
112 | } else if (line ~ /\.divider_shift/) { | ||
113 | divshift = extract_value(line) | ||
114 | } else if (line ~ /{/) { | ||
115 | indent++ | ||
116 | print line | ||
117 | } else if (line ~ /}/) { | ||
118 | indent-- | ||
119 | |||
120 | if (indent == 0) { | ||
121 | if (0) { | ||
122 | printf "shift '" shift "' ='" dmask[shift,0] "'\n" > "/dev/stderr" | ||
123 | printf "mask '" mask "'\n" > "/dev/stderr" | ||
124 | printf "dshft '" divshift "'\n" > "/dev/stderr" | ||
125 | printf "rdiv '" reg_div "'\n" > "/dev/stderr" | ||
126 | printf "rsrc '" reg_src "'\n" > "/dev/stderr" | ||
127 | } | ||
128 | |||
129 | generated = mask | ||
130 | sub(reg_src, reg_div, generated) | ||
131 | |||
132 | if (0) { | ||
133 | printf "/* rsrc " reg_src " */\n" | ||
134 | printf "/* rdiv " reg_div " */\n" | ||
135 | printf "/* shift " shift " */\n" | ||
136 | printf "/* mask " mask " */\n" | ||
137 | printf "/* generated " generated " */\n" | ||
138 | } | ||
139 | |||
140 | if (reg_div != "") { | ||
141 | printf "\t.reg_div = { " | ||
142 | printf ".reg = " reg_div ", " | ||
143 | printf ".shift = " dmask[generated,1] ", " | ||
144 | printf ".size = " dmask[generated,0] ", " | ||
145 | printf "},\n" | ||
146 | } | ||
147 | |||
148 | printf "\t.reg_src = { " | ||
149 | printf ".reg = " reg_src ", " | ||
150 | printf ".shift = " dmask[mask,1] ", " | ||
151 | printf ".size = " dmask[mask,0] ", " | ||
152 | |||
153 | printf "},\n" | ||
154 | |||
155 | } | ||
156 | |||
157 | print line | ||
158 | } else { | ||
159 | print line | ||
160 | } | ||
161 | |||
162 | if (0) | ||
163 | printf indent ":" line "\n" > "/dev/stderr" | ||
164 | } | ||
165 | } | ||
166 | |||
167 | // && ! /clksrc_clk.*=.*{/ { print $0 } | ||
diff --git a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen index 1e6a23fdf2fc..dc460f055647 100644 --- a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen +++ b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen | |||
@@ -7,7 +7,7 @@ The driver only implements a four-wire touch panel protocol. | |||
7 | 7 | ||
8 | The touchscreen driver is maintenance free except for the pen-down or | 8 | The touchscreen driver is maintenance free except for the pen-down or |
9 | touch threshold. Some resistive displays and board combinations may | 9 | touch threshold. Some resistive displays and board combinations may |
10 | require tuning of this threshold. The driver exposes some of it's | 10 | require tuning of this threshold. The driver exposes some of its |
11 | internal state in the sys filesystem. If the kernel is configured | 11 | internal state in the sys filesystem. If the kernel is configured |
12 | with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a | 12 | with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a |
13 | directory | 13 | directory |
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt index 9d58c7c5eddd..eb0fae18ffb1 100644 --- a/Documentation/arm/memory.txt +++ b/Documentation/arm/memory.txt | |||
@@ -59,7 +59,11 @@ PAGE_OFFSET high_memory-1 Kernel direct-mapped RAM region. | |||
59 | This maps the platforms RAM, and typically | 59 | This maps the platforms RAM, and typically |
60 | maps all platform RAM in a 1:1 relationship. | 60 | maps all platform RAM in a 1:1 relationship. |
61 | 61 | ||
62 | TASK_SIZE PAGE_OFFSET-1 Kernel module space | 62 | PKMAP_BASE PAGE_OFFSET-1 Permanent kernel mappings |
63 | One way of mapping HIGHMEM pages into kernel | ||
64 | space. | ||
65 | |||
66 | MODULES_VADDR MODULES_END-1 Kernel module space | ||
63 | Kernel modules inserted via insmod are | 67 | Kernel modules inserted via insmod are |
64 | placed here using dynamic mappings. | 68 | placed here using dynamic mappings. |
65 | 69 | ||
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt index 396bec3b74ed..ac4d47187122 100644 --- a/Documentation/atomic_ops.txt +++ b/Documentation/atomic_ops.txt | |||
@@ -320,7 +320,7 @@ counter decrement would not become globally visible until the | |||
320 | obj->active update does. | 320 | obj->active update does. |
321 | 321 | ||
322 | As a historical note, 32-bit Sparc used to only allow usage of | 322 | As a historical note, 32-bit Sparc used to only allow usage of |
323 | 24-bits of it's atomic_t type. This was because it used 8 bits | 323 | 24-bits of its atomic_t type. This was because it used 8 bits |
324 | as a spinlock for SMP safety. Sparc32 lacked a "compare and swap" | 324 | as a spinlock for SMP safety. Sparc32 lacked a "compare and swap" |
325 | type instruction. However, 32-bit Sparc has since been moved over | 325 | type instruction. However, 32-bit Sparc has since been moved over |
326 | to a "hash table of spinlocks" scheme, that allows the full 32-bit | 326 | to a "hash table of spinlocks" scheme, that allows the full 32-bit |
diff --git a/Documentation/blackfin/bfin-gpio-notes.txt b/Documentation/blackfin/bfin-gpio-notes.txt index 9898c7ded7d3..f731c1e56475 100644 --- a/Documentation/blackfin/bfin-gpio-notes.txt +++ b/Documentation/blackfin/bfin-gpio-notes.txt | |||
@@ -43,7 +43,7 @@ | |||
43 | void bfin_gpio_irq_free(unsigned gpio); | 43 | void bfin_gpio_irq_free(unsigned gpio); |
44 | 44 | ||
45 | The request functions will record the function state for a certain pin, | 45 | The request functions will record the function state for a certain pin, |
46 | the free functions will clear it's function state. | 46 | the free functions will clear its function state. |
47 | Once a pin is requested, it can't be requested again before it is freed by | 47 | Once a pin is requested, it can't be requested again before it is freed by |
48 | previous caller, otherwise kernel will dump stacks, and the request | 48 | previous caller, otherwise kernel will dump stacks, and the request |
49 | function fail. | 49 | function fail. |
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 6fab97ea7e6b..508b5b2b0289 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt | |||
@@ -1162,8 +1162,8 @@ where a driver received a request ala this before: | |||
1162 | 1162 | ||
1163 | As mentioned, there is no virtual mapping of a bio. For DMA, this is | 1163 | As mentioned, there is no virtual mapping of a bio. For DMA, this is |
1164 | not a problem as the driver probably never will need a virtual mapping. | 1164 | not a problem as the driver probably never will need a virtual mapping. |
1165 | Instead it needs a bus mapping (pci_map_page for a single segment or | 1165 | Instead it needs a bus mapping (dma_map_page for a single segment or |
1166 | use blk_rq_map_sg for scatter gather) to be able to ship it to the driver. For | 1166 | use dma_map_sg for scatter gather) to be able to ship it to the driver. For |
1167 | PIO drivers (or drivers that need to revert to PIO transfer once in a | 1167 | PIO drivers (or drivers that need to revert to PIO transfer once in a |
1168 | while (IDE for example)), where the CPU is doing the actual data | 1168 | while (IDE for example)), where the CPU is doing the actual data |
1169 | transfer a virtual mapping is needed. If the driver supports highmem I/O, | 1169 | transfer a virtual mapping is needed. If the driver supports highmem I/O, |
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index e164403f60e1..f65274081c8d 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt | |||
@@ -25,11 +25,11 @@ size allowed by the hardware. | |||
25 | 25 | ||
26 | nomerges (RW) | 26 | nomerges (RW) |
27 | ------------- | 27 | ------------- |
28 | This enables the user to disable the lookup logic involved with IO merging | 28 | This enables the user to disable the lookup logic involved with IO |
29 | requests in the block layer. Merging may still occur through a direct | 29 | merging requests in the block layer. By default (0) all merges are |
30 | 1-hit cache, since that comes for (almost) free. The IO scheduler will not | 30 | enabled. When set to 1 only simple one-hit merges will be tried. When |
31 | waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults | 31 | set to 2 no merge algorithms will be tried (including one-hit or more |
32 | to 0, enabling all merges. | 32 | complex tree/hash lookups). |
33 | 33 | ||
34 | nr_requests (RW) | 34 | nr_requests (RW) |
35 | ---------------- | 35 | ---------------- |
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index da42ab414c48..9164ae3b83bc 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt | |||
@@ -5,7 +5,7 @@ | |||
5 | 5 | ||
6 | This document describes the cache/tlb flushing interfaces called | 6 | This document describes the cache/tlb flushing interfaces called |
7 | by the Linux VM subsystem. It enumerates over each interface, | 7 | by the Linux VM subsystem. It enumerates over each interface, |
8 | describes it's intended purpose, and what side effect is expected | 8 | describes its intended purpose, and what side effect is expected |
9 | after the interface is invoked. | 9 | after the interface is invoked. |
10 | 10 | ||
11 | The side effects described below are stated for a uniprocessor | 11 | The side effects described below are stated for a uniprocessor |
@@ -88,12 +88,12 @@ changes occur: | |||
88 | This is used primarily during fault processing. | 88 | This is used primarily during fault processing. |
89 | 89 | ||
90 | 5) void update_mmu_cache(struct vm_area_struct *vma, | 90 | 5) void update_mmu_cache(struct vm_area_struct *vma, |
91 | unsigned long address, pte_t pte) | 91 | unsigned long address, pte_t *ptep) |
92 | 92 | ||
93 | At the end of every page fault, this routine is invoked to | 93 | At the end of every page fault, this routine is invoked to |
94 | tell the architecture specific code that a translation | 94 | tell the architecture specific code that a translation |
95 | described by "pte" now exists at virtual address "address" | 95 | now exists at virtual address "address" for address space |
96 | for address space "vma->vm_mm", in the software page tables. | 96 | "vma->vm_mm", in the software page tables. |
97 | 97 | ||
98 | A port may use this information in any way it so chooses. | 98 | A port may use this information in any way it so chooses. |
99 | For example, it could use this event to pre-load TLB | 99 | For example, it could use this event to pre-load TLB |
@@ -231,7 +231,7 @@ require a whole different set of interfaces to handle properly. | |||
231 | The biggest problem is that of virtual aliasing in the data cache | 231 | The biggest problem is that of virtual aliasing in the data cache |
232 | of a processor. | 232 | of a processor. |
233 | 233 | ||
234 | Is your port susceptible to virtual aliasing in it's D-cache? | 234 | Is your port susceptible to virtual aliasing in its D-cache? |
235 | Well, if your D-cache is virtually indexed, is larger in size than | 235 | Well, if your D-cache is virtually indexed, is larger in size than |
236 | PAGE_SIZE, and does not prevent multiple cache lines for the same | 236 | PAGE_SIZE, and does not prevent multiple cache lines for the same |
237 | physical address from existing at once, you have this problem. | 237 | physical address from existing at once, you have this problem. |
@@ -249,7 +249,7 @@ one way to solve this (in particular SPARC_FLAG_MMAPSHARED). | |||
249 | Next, you have to solve the D-cache aliasing issue for all | 249 | Next, you have to solve the D-cache aliasing issue for all |
250 | other cases. Please keep in mind that fact that, for a given page | 250 | other cases. Please keep in mind that fact that, for a given page |
251 | mapped into some user address space, there is always at least one more | 251 | mapped into some user address space, there is always at least one more |
252 | mapping, that of the kernel in it's linear mapping starting at | 252 | mapping, that of the kernel in its linear mapping starting at |
253 | PAGE_OFFSET. So immediately, once the first user maps a given | 253 | PAGE_OFFSET. So immediately, once the first user maps a given |
254 | physical page into its address space, by implication the D-cache | 254 | physical page into its address space, by implication the D-cache |
255 | aliasing problem has the potential to exist since the kernel already | 255 | aliasing problem has the potential to exist since the kernel already |
@@ -377,3 +377,27 @@ maps this page at its virtual address. | |||
377 | All the functionality of flush_icache_page can be implemented in | 377 | All the functionality of flush_icache_page can be implemented in |
378 | flush_dcache_page and update_mmu_cache. In 2.7 the hope is to | 378 | flush_dcache_page and update_mmu_cache. In 2.7 the hope is to |
379 | remove this interface completely. | 379 | remove this interface completely. |
380 | |||
381 | The final category of APIs is for I/O to deliberately aliased address | ||
382 | ranges inside the kernel. Such aliases are set up by use of the | ||
383 | vmap/vmalloc API. Since kernel I/O goes via physical pages, the I/O | ||
384 | subsystem assumes that the user mapping and kernel offset mapping are | ||
385 | the only aliases. This isn't true for vmap aliases, so anything in | ||
386 | the kernel trying to do I/O to vmap areas must manually manage | ||
387 | coherency. It must do this by flushing the vmap range before doing | ||
388 | I/O and invalidating it after the I/O returns. | ||
389 | |||
390 | void flush_kernel_vmap_range(void *vaddr, int size) | ||
391 | flushes the kernel cache for a given virtual address range in | ||
392 | the vmap area. This is to make sure that any data the kernel | ||
393 | modified in the vmap range is made visible to the physical | ||
394 | page. The design is to make this area safe to perform I/O on. | ||
395 | Note that this API does *not* also flush the offset map alias | ||
396 | of the area. | ||
397 | |||
398 | void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates | ||
399 | the cache for a given virtual address range in the vmap area | ||
400 | which prevents the processor from making the cache stale by | ||
401 | speculatively reading data while the I/O was occurring to the | ||
402 | physical pages. This is only necessary for data reads into the | ||
403 | vmap area. | ||
diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd index 2c558cd6c1ef..f4dc9de2694e 100644 --- a/Documentation/cdrom/ide-cd +++ b/Documentation/cdrom/ide-cd | |||
@@ -159,42 +159,7 @@ two arguments: the CDROM device, and the slot number to which you wish | |||
159 | to change. If the slot number is -1, the drive is unloaded. | 159 | to change. If the slot number is -1, the drive is unloaded. |
160 | 160 | ||
161 | 161 | ||
162 | 4. Compilation options | 162 | 4. Common problems |
163 | ---------------------- | ||
164 | |||
165 | There are a few additional options which can be set when compiling the | ||
166 | driver. Most people should not need to mess with any of these; they | ||
167 | are listed here simply for completeness. A compilation option can be | ||
168 | enabled by adding a line of the form `#define <option> 1' to the top | ||
169 | of ide-cd.c. All these options are disabled by default. | ||
170 | |||
171 | VERBOSE_IDE_CD_ERRORS | ||
172 | If this is set, ATAPI error codes will be translated into textual | ||
173 | descriptions. In addition, a dump is made of the command which | ||
174 | provoked the error. This is off by default to save the memory used | ||
175 | by the (somewhat long) table of error descriptions. | ||
176 | |||
177 | STANDARD_ATAPI | ||
178 | If this is set, the code needed to deal with certain drives which do | ||
179 | not properly implement the ATAPI spec will be disabled. If you know | ||
180 | your drive implements ATAPI properly, you can turn this on to get a | ||
181 | slightly smaller kernel. | ||
182 | |||
183 | NO_DOOR_LOCKING | ||
184 | If this is set, the driver will never attempt to lock the door of | ||
185 | the drive. | ||
186 | |||
187 | CDROM_NBLOCKS_BUFFER | ||
188 | This sets the size of the buffer to be used for a CDROMREADAUDIO | ||
189 | ioctl. The default is 8. | ||
190 | |||
191 | TEST | ||
192 | This currently enables an additional ioctl which enables a user-mode | ||
193 | program to execute an arbitrary packet command. See the source for | ||
194 | details. This should be left off unless you know what you're doing. | ||
195 | |||
196 | |||
197 | 5. Common problems | ||
198 | ------------------ | 163 | ------------------ |
199 | 164 | ||
200 | This section discusses some common problems encountered when trying to | 165 | This section discusses some common problems encountered when trying to |
@@ -371,7 +336,7 @@ f. Data corruption. | |||
371 | expense of low system performance. | 336 | expense of low system performance. |
372 | 337 | ||
373 | 338 | ||
374 | 6. cdchange.c | 339 | 5. cdchange.c |
375 | ------------- | 340 | ------------- |
376 | 341 | ||
377 | /* | 342 | /* |
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 630879cd9a42..48e0b21b0059 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt | |||
@@ -17,6 +17,9 @@ HOWTO | |||
17 | You can do a very simple testing of running two dd threads in two different | 17 | You can do a very simple testing of running two dd threads in two different |
18 | cgroups. Here is what you can do. | 18 | cgroups. Here is what you can do. |
19 | 19 | ||
20 | - Enable Block IO controller | ||
21 | CONFIG_BLK_CGROUP=y | ||
22 | |||
20 | - Enable group scheduling in CFQ | 23 | - Enable group scheduling in CFQ |
21 | CONFIG_CFQ_GROUP_IOSCHED=y | 24 | CONFIG_CFQ_GROUP_IOSCHED=y |
22 | 25 | ||
@@ -54,32 +57,52 @@ cgroups. Here is what you can do. | |||
54 | 57 | ||
55 | Various user visible config options | 58 | Various user visible config options |
56 | =================================== | 59 | =================================== |
57 | CONFIG_CFQ_GROUP_IOSCHED | ||
58 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
59 | creation is allowed. | ||
60 | |||
61 | CONFIG_DEBUG_CFQ_IOSCHED | ||
62 | - Enables some debugging messages in blktrace. Also creates extra | ||
63 | cgroup file blkio.dequeue. | ||
64 | |||
65 | Config options selected automatically | ||
66 | ===================================== | ||
67 | These config options are not user visible and are selected/deselected | ||
68 | automatically based on IO scheduler configuration. | ||
69 | |||
70 | CONFIG_BLK_CGROUP | 60 | CONFIG_BLK_CGROUP |
71 | - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. | 61 | - Block IO controller. |
72 | 62 | ||
73 | CONFIG_DEBUG_BLK_CGROUP | 63 | CONFIG_DEBUG_BLK_CGROUP |
74 | - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. | 64 | - Debug help. Right now some additional stats file show up in cgroup |
65 | if this option is enabled. | ||
66 | |||
67 | CONFIG_CFQ_GROUP_IOSCHED | ||
68 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
69 | creation is allowed. | ||
75 | 70 | ||
76 | Details of cgroup files | 71 | Details of cgroup files |
77 | ======================= | 72 | ======================= |
78 | - blkio.weight | 73 | - blkio.weight |
79 | - Specifies per cgroup weight. | 74 | - Specifies per cgroup weight. This is default weight of the group |
80 | 75 | on all the devices until and unless overridden by per device rule. | |
76 | (See blkio.weight_device). | ||
81 | Currently allowed range of weights is from 100 to 1000. | 77 | Currently allowed range of weights is from 100 to 1000. |
82 | 78 | ||
79 | - blkio.weight_device | ||
80 | - One can specify per cgroup per device rules using this interface. | ||
81 | These rules override the default value of group weight as specified | ||
82 | by blkio.weight. | ||
83 | |||
84 | Following is the format. | ||
85 | |||
86 | #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device | ||
87 | Configure weight=300 on /dev/sdb (8:16) in this cgroup | ||
88 | # echo 8:16 300 > blkio.weight_device | ||
89 | # cat blkio.weight_device | ||
90 | dev weight | ||
91 | 8:16 300 | ||
92 | |||
93 | Configure weight=500 on /dev/sda (8:0) in this cgroup | ||
94 | # echo 8:0 500 > blkio.weight_device | ||
95 | # cat blkio.weight_device | ||
96 | dev weight | ||
97 | 8:0 500 | ||
98 | 8:16 300 | ||
99 | |||
100 | Remove specific weight for /dev/sda in this cgroup | ||
101 | # echo 8:0 0 > blkio.weight_device | ||
102 | # cat blkio.weight_device | ||
103 | dev weight | ||
104 | 8:16 300 | ||
105 | |||
83 | - blkio.time | 106 | - blkio.time |
84 | - disk time allocated to cgroup per device in milliseconds. First | 107 | - disk time allocated to cgroup per device in milliseconds. First |
85 | two fields specify the major and minor number of the device and | 108 | two fields specify the major and minor number of the device and |
@@ -92,13 +115,105 @@ Details of cgroup files | |||
92 | third field specifies the number of sectors transferred by the | 115 | third field specifies the number of sectors transferred by the |
93 | group to/from the device. | 116 | group to/from the device. |
94 | 117 | ||
118 | - blkio.io_service_bytes | ||
119 | - Number of bytes transferred to/from the disk by the group. These | ||
120 | are further divided by the type of operation - read or write, sync | ||
121 | or async. First two fields specify the major and minor number of the | ||
122 | device, third field specifies the operation type and the fourth field | ||
123 | specifies the number of bytes. | ||
124 | |||
125 | - blkio.io_serviced | ||
126 | - Number of IOs completed to/from the disk by the group. These | ||
127 | are further divided by the type of operation - read or write, sync | ||
128 | or async. First two fields specify the major and minor number of the | ||
129 | device, third field specifies the operation type and the fourth field | ||
130 | specifies the number of IOs. | ||
131 | |||
132 | - blkio.io_service_time | ||
133 | - Total amount of time between request dispatch and request completion | ||
134 | for the IOs done by this cgroup. This is in nanoseconds to make it | ||
135 | meaningful for flash devices too. For devices with queue depth of 1, | ||
136 | this time represents the actual service time. When queue_depth > 1, | ||
137 | that is no longer true as requests may be served out of order. This | ||
138 | may cause the service time for a given IO to include the service time | ||
139 | of multiple IOs when served out of order which may result in total | ||
140 | io_service_time > actual time elapsed. This time is further divided by | ||
141 | the type of operation - read or write, sync or async. First two fields | ||
142 | specify the major and minor number of the device, third field | ||
143 | specifies the operation type and the fourth field specifies the | ||
144 | io_service_time in ns. | ||
145 | |||
146 | - blkio.io_wait_time | ||
147 | - Total amount of time the IOs for this cgroup spent waiting in the | ||
148 | scheduler queues for service. This can be greater than the total time | ||
149 | elapsed since it is cumulative io_wait_time for all IOs. It is not a | ||
150 | measure of total time the cgroup spent waiting but rather a measure of | ||
151 | the wait_time for its individual IOs. For devices with queue_depth > 1 | ||
152 | this metric does not include the time spent waiting for service once | ||
153 | the IO is dispatched to the device but till it actually gets serviced | ||
154 | (there might be a time lag here due to re-ordering of requests by the | ||
155 | device). This is in nanoseconds to make it meaningful for flash | ||
156 | devices too. This time is further divided by the type of operation - | ||
157 | read or write, sync or async. First two fields specify the major and | ||
158 | minor number of the device, third field specifies the operation type | ||
159 | and the fourth field specifies the io_wait_time in ns. | ||
160 | |||
161 | - blkio.io_merged | ||
162 | - Total number of bios/requests merged into requests belonging to this | ||
163 | cgroup. This is further divided by the type of operation - read or | ||
164 | write, sync or async. | ||
165 | |||
166 | - blkio.io_queued | ||
167 | - Total number of requests queued up at any given instant for this | ||
168 | cgroup. This is further divided by the type of operation - read or | ||
169 | write, sync or async. | ||
170 | |||
171 | - blkio.avg_queue_size | ||
172 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
173 | The average queue size for this cgroup over the entire time of this | ||
174 | cgroup's existence. Queue size samples are taken each time one of the | ||
175 | queues of this cgroup gets a timeslice. | ||
176 | |||
177 | - blkio.group_wait_time | ||
178 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
179 | This is the amount of time the cgroup had to wait since it became busy | ||
180 | (i.e., went from 0 to 1 request queued) to get a timeslice for one of | ||
181 | its queues. This is different from the io_wait_time which is the | ||
182 | cumulative total of the amount of time spent by each IO in that cgroup | ||
183 | waiting in the scheduler queue. This is in nanoseconds. If this is | ||
184 | read when the cgroup is in a waiting (for timeslice) state, the stat | ||
185 | will only report the group_wait_time accumulated till the last time it | ||
186 | got a timeslice and will not include the current delta. | ||
187 | |||
188 | - blkio.empty_time | ||
189 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
190 | This is the amount of time a cgroup spends without any pending | ||
191 | requests when not being served, i.e., it does not include any time | ||
192 | spent idling for one of the queues of the cgroup. This is in | ||
193 | nanoseconds. If this is read when the cgroup is in an empty state, | ||
194 | the stat will only report the empty_time accumulated till the last | ||
195 | time it had a pending request and will not include the current delta. | ||
196 | |||
197 | - blkio.idle_time | ||
198 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
199 | This is the amount of time spent by the IO scheduler idling for a | ||
200 | given cgroup in anticipation of a better request than the exising ones | ||
201 | from other queues/cgroups. This is in nanoseconds. If this is read | ||
202 | when the cgroup is in an idling state, the stat will only report the | ||
203 | idle_time accumulated till the last idle period and will not include | ||
204 | the current delta. | ||
205 | |||
95 | - blkio.dequeue | 206 | - blkio.dequeue |
96 | - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This | 207 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This |
97 | gives the statistics about how many a times a group was dequeued | 208 | gives the statistics about how many a times a group was dequeued |
98 | from service tree of the device. First two fields specify the major | 209 | from service tree of the device. First two fields specify the major |
99 | and minor number of the device and third field specifies the number | 210 | and minor number of the device and third field specifies the number |
100 | of times a group was dequeued from a particular device. | 211 | of times a group was dequeued from a particular device. |
101 | 212 | ||
213 | - blkio.reset_stats | ||
214 | - Writing an int to this file will result in resetting all the stats | ||
215 | for that cgroup. | ||
216 | |||
102 | CFQ sysfs tunable | 217 | CFQ sysfs tunable |
103 | ================= | 218 | ================= |
104 | /sys/block/<disk>/queue/iosched/group_isolation | 219 | /sys/block/<disk>/queue/iosched/group_isolation |
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c new file mode 100644 index 000000000000..8c2bfc4a6358 --- /dev/null +++ b/Documentation/cgroups/cgroup_event_listener.c | |||
@@ -0,0 +1,110 @@ | |||
1 | /* | ||
2 | * cgroup_event_listener.c - Simple listener of cgroup events | ||
3 | * | ||
4 | * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name> | ||
5 | */ | ||
6 | |||
7 | #include <assert.h> | ||
8 | #include <errno.h> | ||
9 | #include <fcntl.h> | ||
10 | #include <libgen.h> | ||
11 | #include <limits.h> | ||
12 | #include <stdio.h> | ||
13 | #include <string.h> | ||
14 | #include <unistd.h> | ||
15 | |||
16 | #include <sys/eventfd.h> | ||
17 | |||
18 | #define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n" | ||
19 | |||
20 | int main(int argc, char **argv) | ||
21 | { | ||
22 | int efd = -1; | ||
23 | int cfd = -1; | ||
24 | int event_control = -1; | ||
25 | char event_control_path[PATH_MAX]; | ||
26 | char line[LINE_MAX]; | ||
27 | int ret; | ||
28 | |||
29 | if (argc != 3) { | ||
30 | fputs(USAGE_STR, stderr); | ||
31 | return 1; | ||
32 | } | ||
33 | |||
34 | cfd = open(argv[1], O_RDONLY); | ||
35 | if (cfd == -1) { | ||
36 | fprintf(stderr, "Cannot open %s: %s\n", argv[1], | ||
37 | strerror(errno)); | ||
38 | goto out; | ||
39 | } | ||
40 | |||
41 | ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control", | ||
42 | dirname(argv[1])); | ||
43 | if (ret >= PATH_MAX) { | ||
44 | fputs("Path to cgroup.event_control is too long\n", stderr); | ||
45 | goto out; | ||
46 | } | ||
47 | |||
48 | event_control = open(event_control_path, O_WRONLY); | ||
49 | if (event_control == -1) { | ||
50 | fprintf(stderr, "Cannot open %s: %s\n", event_control_path, | ||
51 | strerror(errno)); | ||
52 | goto out; | ||
53 | } | ||
54 | |||
55 | efd = eventfd(0, 0); | ||
56 | if (efd == -1) { | ||
57 | perror("eventfd() failed"); | ||
58 | goto out; | ||
59 | } | ||
60 | |||
61 | ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]); | ||
62 | if (ret >= LINE_MAX) { | ||
63 | fputs("Arguments string is too long\n", stderr); | ||
64 | goto out; | ||
65 | } | ||
66 | |||
67 | ret = write(event_control, line, strlen(line) + 1); | ||
68 | if (ret == -1) { | ||
69 | perror("Cannot write to cgroup.event_control"); | ||
70 | goto out; | ||
71 | } | ||
72 | |||
73 | while (1) { | ||
74 | uint64_t result; | ||
75 | |||
76 | ret = read(efd, &result, sizeof(result)); | ||
77 | if (ret == -1) { | ||
78 | if (errno == EINTR) | ||
79 | continue; | ||
80 | perror("Cannot read from eventfd"); | ||
81 | break; | ||
82 | } | ||
83 | assert(ret == sizeof(result)); | ||
84 | |||
85 | ret = access(event_control_path, W_OK); | ||
86 | if ((ret == -1) && (errno == ENOENT)) { | ||
87 | puts("The cgroup seems to have removed."); | ||
88 | ret = 0; | ||
89 | break; | ||
90 | } | ||
91 | |||
92 | if (ret == -1) { | ||
93 | perror("cgroup.event_control " | ||
94 | "is not accessable any more"); | ||
95 | break; | ||
96 | } | ||
97 | |||
98 | printf("%s %s: crossed\n", argv[1], argv[2]); | ||
99 | } | ||
100 | |||
101 | out: | ||
102 | if (efd >= 0) | ||
103 | close(efd); | ||
104 | if (event_control >= 0) | ||
105 | close(event_control); | ||
106 | if (cfd >= 0) | ||
107 | close(cfd); | ||
108 | |||
109 | return (ret != 0); | ||
110 | } | ||
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 0b33bfe7dde9..b34823ff1646 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
@@ -22,6 +22,8 @@ CONTENTS: | |||
22 | 2. Usage Examples and Syntax | 22 | 2. Usage Examples and Syntax |
23 | 2.1 Basic Usage | 23 | 2.1 Basic Usage |
24 | 2.2 Attaching processes | 24 | 2.2 Attaching processes |
25 | 2.3 Mounting hierarchies by name | ||
26 | 2.4 Notification API | ||
25 | 3. Kernel API | 27 | 3. Kernel API |
26 | 3.1 Overview | 28 | 3.1 Overview |
27 | 3.2 Synchronization | 29 | 3.2 Synchronization |
@@ -233,8 +235,7 @@ containing the following files describing that cgroup: | |||
233 | - cgroup.procs: list of tgids in the cgroup. This list is not | 235 | - cgroup.procs: list of tgids in the cgroup. This list is not |
234 | guaranteed to be sorted or free of duplicate tgids, and userspace | 236 | guaranteed to be sorted or free of duplicate tgids, and userspace |
235 | should sort/uniquify the list if this property is required. | 237 | should sort/uniquify the list if this property is required. |
236 | Writing a tgid into this file moves all threads with that tgid into | 238 | This is a read-only file, for now. |
237 | this cgroup. | ||
238 | - notify_on_release flag: run the release agent on exit? | 239 | - notify_on_release flag: run the release agent on exit? |
239 | - release_agent: the path to use for release notifications (this file | 240 | - release_agent: the path to use for release notifications (this file |
240 | exists in the top cgroup only) | 241 | exists in the top cgroup only) |
@@ -338,7 +339,7 @@ To mount a cgroup hierarchy with all available subsystems, type: | |||
338 | The "xxx" is not interpreted by the cgroup code, but will appear in | 339 | The "xxx" is not interpreted by the cgroup code, but will appear in |
339 | /proc/mounts so may be any useful identifying string that you like. | 340 | /proc/mounts so may be any useful identifying string that you like. |
340 | 341 | ||
341 | To mount a cgroup hierarchy with just the cpuset and numtasks | 342 | To mount a cgroup hierarchy with just the cpuset and memory |
342 | subsystems, type: | 343 | subsystems, type: |
343 | # mount -t cgroup -o cpuset,memory hier1 /dev/cgroup | 344 | # mount -t cgroup -o cpuset,memory hier1 /dev/cgroup |
344 | 345 | ||
@@ -434,6 +435,25 @@ you give a subsystem a name. | |||
434 | The name of the subsystem appears as part of the hierarchy description | 435 | The name of the subsystem appears as part of the hierarchy description |
435 | in /proc/mounts and /proc/<pid>/cgroups. | 436 | in /proc/mounts and /proc/<pid>/cgroups. |
436 | 437 | ||
438 | 2.4 Notification API | ||
439 | -------------------- | ||
440 | |||
441 | There is mechanism which allows to get notifications about changing | ||
442 | status of a cgroup. | ||
443 | |||
444 | To register new notification handler you need: | ||
445 | - create a file descriptor for event notification using eventfd(2); | ||
446 | - open a control file to be monitored (e.g. memory.usage_in_bytes); | ||
447 | - write "<event_fd> <control_fd> <args>" to cgroup.event_control. | ||
448 | Interpretation of args is defined by control file implementation; | ||
449 | |||
450 | eventfd will be woken up by control file implementation or when the | ||
451 | cgroup is removed. | ||
452 | |||
453 | To unregister notification handler just close eventfd. | ||
454 | |||
455 | NOTE: Support of notifications should be implemented for the control | ||
456 | file. See documentation for the subsystem. | ||
437 | 457 | ||
438 | 3. Kernel API | 458 | 3. Kernel API |
439 | ============= | 459 | ============= |
@@ -488,6 +508,11 @@ Each subsystem should: | |||
488 | - add an entry in linux/cgroup_subsys.h | 508 | - add an entry in linux/cgroup_subsys.h |
489 | - define a cgroup_subsys object called <name>_subsys | 509 | - define a cgroup_subsys object called <name>_subsys |
490 | 510 | ||
511 | If a subsystem can be compiled as a module, it should also have in its | ||
512 | module initcall a call to cgroup_load_subsys(), and in its exitcall a | ||
513 | call to cgroup_unload_subsys(). It should also set its_subsys.module = | ||
514 | THIS_MODULE in its .c file. | ||
515 | |||
491 | Each subsystem may export the following methods. The only mandatory | 516 | Each subsystem may export the following methods. The only mandatory |
492 | methods are create/destroy. Any others that are null are presumed to | 517 | methods are create/destroy. Any others that are null are presumed to |
493 | be successful no-ops. | 518 | be successful no-ops. |
@@ -536,10 +561,21 @@ returns an error, this will abort the attach operation. If a NULL | |||
536 | task is passed, then a successful result indicates that *any* | 561 | task is passed, then a successful result indicates that *any* |
537 | unspecified task can be moved into the cgroup. Note that this isn't | 562 | unspecified task can be moved into the cgroup. Note that this isn't |
538 | called on a fork. If this method returns 0 (success) then this should | 563 | called on a fork. If this method returns 0 (success) then this should |
539 | remain valid while the caller holds cgroup_mutex. If threadgroup is | 564 | remain valid while the caller holds cgroup_mutex and it is ensured that either |
565 | attach() or cancel_attach() will be called in future. If threadgroup is | ||
540 | true, then a successful result indicates that all threads in the given | 566 | true, then a successful result indicates that all threads in the given |
541 | thread's threadgroup can be moved together. | 567 | thread's threadgroup can be moved together. |
542 | 568 | ||
569 | void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
570 | struct task_struct *task, bool threadgroup) | ||
571 | (cgroup_mutex held by caller) | ||
572 | |||
573 | Called when a task attach operation has failed after can_attach() has succeeded. | ||
574 | A subsystem whose can_attach() has some side-effects should provide this | ||
575 | function, so that the subsystem can implement a rollback. If not, not necessary. | ||
576 | This will be called only about subsystems whose can_attach() operation have | ||
577 | succeeded. | ||
578 | |||
543 | void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 579 | void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
544 | struct cgroup *old_cgrp, struct task_struct *task, | 580 | struct cgroup *old_cgrp, struct task_struct *task, |
545 | bool threadgroup) | 581 | bool threadgroup) |
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 1d7e9784439a..51682ab2dd1a 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt | |||
@@ -42,7 +42,7 @@ Nodes to a set of tasks. In this document "Memory Node" refers to | |||
42 | an on-line node that contains memory. | 42 | an on-line node that contains memory. |
43 | 43 | ||
44 | Cpusets constrain the CPU and Memory placement of tasks to only | 44 | Cpusets constrain the CPU and Memory placement of tasks to only |
45 | the resources within a tasks current cpuset. They form a nested | 45 | the resources within a task's current cpuset. They form a nested |
46 | hierarchy visible in a virtual file system. These are the essential | 46 | hierarchy visible in a virtual file system. These are the essential |
47 | hooks, beyond what is already present, required to manage dynamic | 47 | hooks, beyond what is already present, required to manage dynamic |
48 | job placement on large systems. | 48 | job placement on large systems. |
@@ -53,11 +53,11 @@ Documentation/cgroups/cgroups.txt. | |||
53 | Requests by a task, using the sched_setaffinity(2) system call to | 53 | Requests by a task, using the sched_setaffinity(2) system call to |
54 | include CPUs in its CPU affinity mask, and using the mbind(2) and | 54 | include CPUs in its CPU affinity mask, and using the mbind(2) and |
55 | set_mempolicy(2) system calls to include Memory Nodes in its memory | 55 | set_mempolicy(2) system calls to include Memory Nodes in its memory |
56 | policy, are both filtered through that tasks cpuset, filtering out any | 56 | policy, are both filtered through that task's cpuset, filtering out any |
57 | CPUs or Memory Nodes not in that cpuset. The scheduler will not | 57 | CPUs or Memory Nodes not in that cpuset. The scheduler will not |
58 | schedule a task on a CPU that is not allowed in its cpus_allowed | 58 | schedule a task on a CPU that is not allowed in its cpus_allowed |
59 | vector, and the kernel page allocator will not allocate a page on a | 59 | vector, and the kernel page allocator will not allocate a page on a |
60 | node that is not allowed in the requesting tasks mems_allowed vector. | 60 | node that is not allowed in the requesting task's mems_allowed vector. |
61 | 61 | ||
62 | User level code may create and destroy cpusets by name in the cgroup | 62 | User level code may create and destroy cpusets by name in the cgroup |
63 | virtual file system, manage the attributes and permissions of these | 63 | virtual file system, manage the attributes and permissions of these |
@@ -121,9 +121,9 @@ Cpusets extends these two mechanisms as follows: | |||
121 | - Each task in the system is attached to a cpuset, via a pointer | 121 | - Each task in the system is attached to a cpuset, via a pointer |
122 | in the task structure to a reference counted cgroup structure. | 122 | in the task structure to a reference counted cgroup structure. |
123 | - Calls to sched_setaffinity are filtered to just those CPUs | 123 | - Calls to sched_setaffinity are filtered to just those CPUs |
124 | allowed in that tasks cpuset. | 124 | allowed in that task's cpuset. |
125 | - Calls to mbind and set_mempolicy are filtered to just | 125 | - Calls to mbind and set_mempolicy are filtered to just |
126 | those Memory Nodes allowed in that tasks cpuset. | 126 | those Memory Nodes allowed in that task's cpuset. |
127 | - The root cpuset contains all the systems CPUs and Memory | 127 | - The root cpuset contains all the systems CPUs and Memory |
128 | Nodes. | 128 | Nodes. |
129 | - For any cpuset, one can define child cpusets containing a subset | 129 | - For any cpuset, one can define child cpusets containing a subset |
@@ -141,11 +141,11 @@ into the rest of the kernel, none in performance critical paths: | |||
141 | - in init/main.c, to initialize the root cpuset at system boot. | 141 | - in init/main.c, to initialize the root cpuset at system boot. |
142 | - in fork and exit, to attach and detach a task from its cpuset. | 142 | - in fork and exit, to attach and detach a task from its cpuset. |
143 | - in sched_setaffinity, to mask the requested CPUs by what's | 143 | - in sched_setaffinity, to mask the requested CPUs by what's |
144 | allowed in that tasks cpuset. | 144 | allowed in that task's cpuset. |
145 | - in sched.c migrate_live_tasks(), to keep migrating tasks within | 145 | - in sched.c migrate_live_tasks(), to keep migrating tasks within |
146 | the CPUs allowed by their cpuset, if possible. | 146 | the CPUs allowed by their cpuset, if possible. |
147 | - in the mbind and set_mempolicy system calls, to mask the requested | 147 | - in the mbind and set_mempolicy system calls, to mask the requested |
148 | Memory Nodes by what's allowed in that tasks cpuset. | 148 | Memory Nodes by what's allowed in that task's cpuset. |
149 | - in page_alloc.c, to restrict memory to allowed nodes. | 149 | - in page_alloc.c, to restrict memory to allowed nodes. |
150 | - in vmscan.c, to restrict page recovery to the current cpuset. | 150 | - in vmscan.c, to restrict page recovery to the current cpuset. |
151 | 151 | ||
@@ -155,7 +155,7 @@ new system calls are added for cpusets - all support for querying and | |||
155 | modifying cpusets is via this cpuset file system. | 155 | modifying cpusets is via this cpuset file system. |
156 | 156 | ||
157 | The /proc/<pid>/status file for each task has four added lines, | 157 | The /proc/<pid>/status file for each task has four added lines, |
158 | displaying the tasks cpus_allowed (on which CPUs it may be scheduled) | 158 | displaying the task's cpus_allowed (on which CPUs it may be scheduled) |
159 | and mems_allowed (on which Memory Nodes it may obtain memory), | 159 | and mems_allowed (on which Memory Nodes it may obtain memory), |
160 | in the two formats seen in the following example: | 160 | in the two formats seen in the following example: |
161 | 161 | ||
@@ -168,20 +168,20 @@ Each cpuset is represented by a directory in the cgroup file system | |||
168 | containing (on top of the standard cgroup files) the following | 168 | containing (on top of the standard cgroup files) the following |
169 | files describing that cpuset: | 169 | files describing that cpuset: |
170 | 170 | ||
171 | - cpus: list of CPUs in that cpuset | 171 | - cpuset.cpus: list of CPUs in that cpuset |
172 | - mems: list of Memory Nodes in that cpuset | 172 | - cpuset.mems: list of Memory Nodes in that cpuset |
173 | - memory_migrate flag: if set, move pages to cpusets nodes | 173 | - cpuset.memory_migrate flag: if set, move pages to cpusets nodes |
174 | - cpu_exclusive flag: is cpu placement exclusive? | 174 | - cpuset.cpu_exclusive flag: is cpu placement exclusive? |
175 | - mem_exclusive flag: is memory placement exclusive? | 175 | - cpuset.mem_exclusive flag: is memory placement exclusive? |
176 | - mem_hardwall flag: is memory allocation hardwalled | 176 | - cpuset.mem_hardwall flag: is memory allocation hardwalled |
177 | - memory_pressure: measure of how much paging pressure in cpuset | 177 | - cpuset.memory_pressure: measure of how much paging pressure in cpuset |
178 | - memory_spread_page flag: if set, spread page cache evenly on allowed nodes | 178 | - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes |
179 | - memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes | 179 | - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes |
180 | - sched_load_balance flag: if set, load balance within CPUs on that cpuset | 180 | - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset |
181 | - sched_relax_domain_level: the searching range when migrating tasks | 181 | - cpuset.sched_relax_domain_level: the searching range when migrating tasks |
182 | 182 | ||
183 | In addition, the root cpuset only has the following file: | 183 | In addition, the root cpuset only has the following file: |
184 | - memory_pressure_enabled flag: compute memory_pressure? | 184 | - cpuset.memory_pressure_enabled flag: compute memory_pressure? |
185 | 185 | ||
186 | New cpusets are created using the mkdir system call or shell | 186 | New cpusets are created using the mkdir system call or shell |
187 | command. The properties of a cpuset, such as its flags, allowed | 187 | command. The properties of a cpuset, such as its flags, allowed |
@@ -229,7 +229,7 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than | |||
229 | a direct ancestor or descendant, may share any of the same CPUs or | 229 | a direct ancestor or descendant, may share any of the same CPUs or |
230 | Memory Nodes. | 230 | Memory Nodes. |
231 | 231 | ||
232 | A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", | 232 | A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", |
233 | i.e. it restricts kernel allocations for page, buffer and other data | 233 | i.e. it restricts kernel allocations for page, buffer and other data |
234 | commonly shared by the kernel across multiple users. All cpusets, | 234 | commonly shared by the kernel across multiple users. All cpusets, |
235 | whether hardwalled or not, restrict allocations of memory for user | 235 | whether hardwalled or not, restrict allocations of memory for user |
@@ -304,15 +304,15 @@ times 1000. | |||
304 | --------------------------- | 304 | --------------------------- |
305 | There are two boolean flag files per cpuset that control where the | 305 | There are two boolean flag files per cpuset that control where the |
306 | kernel allocates pages for the file system buffers and related in | 306 | kernel allocates pages for the file system buffers and related in |
307 | kernel data structures. They are called 'memory_spread_page' and | 307 | kernel data structures. They are called 'cpuset.memory_spread_page' and |
308 | 'memory_spread_slab'. | 308 | 'cpuset.memory_spread_slab'. |
309 | 309 | ||
310 | If the per-cpuset boolean flag file 'memory_spread_page' is set, then | 310 | If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then |
311 | the kernel will spread the file system buffers (page cache) evenly | 311 | the kernel will spread the file system buffers (page cache) evenly |
312 | over all the nodes that the faulting task is allowed to use, instead | 312 | over all the nodes that the faulting task is allowed to use, instead |
313 | of preferring to put those pages on the node where the task is running. | 313 | of preferring to put those pages on the node where the task is running. |
314 | 314 | ||
315 | If the per-cpuset boolean flag file 'memory_spread_slab' is set, | 315 | If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, |
316 | then the kernel will spread some file system related slab caches, | 316 | then the kernel will spread some file system related slab caches, |
317 | such as for inodes and dentries evenly over all the nodes that the | 317 | such as for inodes and dentries evenly over all the nodes that the |
318 | faulting task is allowed to use, instead of preferring to put those | 318 | faulting task is allowed to use, instead of preferring to put those |
@@ -323,41 +323,41 @@ stack segment pages of a task. | |||
323 | 323 | ||
324 | By default, both kinds of memory spreading are off, and memory | 324 | By default, both kinds of memory spreading are off, and memory |
325 | pages are allocated on the node local to where the task is running, | 325 | pages are allocated on the node local to where the task is running, |
326 | except perhaps as modified by the tasks NUMA mempolicy or cpuset | 326 | except perhaps as modified by the task's NUMA mempolicy or cpuset |
327 | configuration, so long as sufficient free memory pages are available. | 327 | configuration, so long as sufficient free memory pages are available. |
328 | 328 | ||
329 | When new cpusets are created, they inherit the memory spread settings | 329 | When new cpusets are created, they inherit the memory spread settings |
330 | of their parent. | 330 | of their parent. |
331 | 331 | ||
332 | Setting memory spreading causes allocations for the affected page | 332 | Setting memory spreading causes allocations for the affected page |
333 | or slab caches to ignore the tasks NUMA mempolicy and be spread | 333 | or slab caches to ignore the task's NUMA mempolicy and be spread |
334 | instead. Tasks using mbind() or set_mempolicy() calls to set NUMA | 334 | instead. Tasks using mbind() or set_mempolicy() calls to set NUMA |
335 | mempolicies will not notice any change in these calls as a result of | 335 | mempolicies will not notice any change in these calls as a result of |
336 | their containing tasks memory spread settings. If memory spreading | 336 | their containing task's memory spread settings. If memory spreading |
337 | is turned off, then the currently specified NUMA mempolicy once again | 337 | is turned off, then the currently specified NUMA mempolicy once again |
338 | applies to memory page allocations. | 338 | applies to memory page allocations. |
339 | 339 | ||
340 | Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag | 340 | Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag |
341 | files. By default they contain "0", meaning that the feature is off | 341 | files. By default they contain "0", meaning that the feature is off |
342 | for that cpuset. If a "1" is written to that file, then that turns | 342 | for that cpuset. If a "1" is written to that file, then that turns |
343 | the named feature on. | 343 | the named feature on. |
344 | 344 | ||
345 | The implementation is simple. | 345 | The implementation is simple. |
346 | 346 | ||
347 | Setting the flag 'memory_spread_page' turns on a per-process flag | 347 | Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag |
348 | PF_SPREAD_PAGE for each task that is in that cpuset or subsequently | 348 | PF_SPREAD_PAGE for each task that is in that cpuset or subsequently |
349 | joins that cpuset. The page allocation calls for the page cache | 349 | joins that cpuset. The page allocation calls for the page cache |
350 | is modified to perform an inline check for this PF_SPREAD_PAGE task | 350 | is modified to perform an inline check for this PF_SPREAD_PAGE task |
351 | flag, and if set, a call to a new routine cpuset_mem_spread_node() | 351 | flag, and if set, a call to a new routine cpuset_mem_spread_node() |
352 | returns the node to prefer for the allocation. | 352 | returns the node to prefer for the allocation. |
353 | 353 | ||
354 | Similarly, setting 'memory_spread_slab' turns on the flag | 354 | Similarly, setting 'cpuset.memory_spread_slab' turns on the flag |
355 | PF_SPREAD_SLAB, and appropriately marked slab caches will allocate | 355 | PF_SPREAD_SLAB, and appropriately marked slab caches will allocate |
356 | pages from the node returned by cpuset_mem_spread_node(). | 356 | pages from the node returned by cpuset_mem_spread_node(). |
357 | 357 | ||
358 | The cpuset_mem_spread_node() routine is also simple. It uses the | 358 | The cpuset_mem_spread_node() routine is also simple. It uses the |
359 | value of a per-task rotor cpuset_mem_spread_rotor to select the next | 359 | value of a per-task rotor cpuset_mem_spread_rotor to select the next |
360 | node in the current tasks mems_allowed to prefer for the allocation. | 360 | node in the current task's mems_allowed to prefer for the allocation. |
361 | 361 | ||
362 | This memory placement policy is also known (in other contexts) as | 362 | This memory placement policy is also known (in other contexts) as |
363 | round-robin or interleave. | 363 | round-robin or interleave. |
@@ -404,24 +404,24 @@ the following two situations: | |||
404 | system overhead on those CPUs, including avoiding task load | 404 | system overhead on those CPUs, including avoiding task load |
405 | balancing if that is not needed. | 405 | balancing if that is not needed. |
406 | 406 | ||
407 | When the per-cpuset flag "sched_load_balance" is enabled (the default | 407 | When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default |
408 | setting), it requests that all the CPUs in that cpusets allowed 'cpus' | 408 | setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus' |
409 | be contained in a single sched domain, ensuring that load balancing | 409 | be contained in a single sched domain, ensuring that load balancing |
410 | can move a task (not otherwised pinned, as by sched_setaffinity) | 410 | can move a task (not otherwised pinned, as by sched_setaffinity) |
411 | from any CPU in that cpuset to any other. | 411 | from any CPU in that cpuset to any other. |
412 | 412 | ||
413 | When the per-cpuset flag "sched_load_balance" is disabled, then the | 413 | When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the |
414 | scheduler will avoid load balancing across the CPUs in that cpuset, | 414 | scheduler will avoid load balancing across the CPUs in that cpuset, |
415 | --except-- in so far as is necessary because some overlapping cpuset | 415 | --except-- in so far as is necessary because some overlapping cpuset |
416 | has "sched_load_balance" enabled. | 416 | has "sched_load_balance" enabled. |
417 | 417 | ||
418 | So, for example, if the top cpuset has the flag "sched_load_balance" | 418 | So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" |
419 | enabled, then the scheduler will have one sched domain covering all | 419 | enabled, then the scheduler will have one sched domain covering all |
420 | CPUs, and the setting of the "sched_load_balance" flag in any other | 420 | CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other |
421 | cpusets won't matter, as we're already fully load balancing. | 421 | cpusets won't matter, as we're already fully load balancing. |
422 | 422 | ||
423 | Therefore in the above two situations, the top cpuset flag | 423 | Therefore in the above two situations, the top cpuset flag |
424 | "sched_load_balance" should be disabled, and only some of the smaller, | 424 | "cpuset.sched_load_balance" should be disabled, and only some of the smaller, |
425 | child cpusets have this flag enabled. | 425 | child cpusets have this flag enabled. |
426 | 426 | ||
427 | When doing this, you don't usually want to leave any unpinned tasks in | 427 | When doing this, you don't usually want to leave any unpinned tasks in |
@@ -433,7 +433,7 @@ scheduler might not consider the possibility of load balancing that | |||
433 | task to that underused CPU. | 433 | task to that underused CPU. |
434 | 434 | ||
435 | Of course, tasks pinned to a particular CPU can be left in a cpuset | 435 | Of course, tasks pinned to a particular CPU can be left in a cpuset |
436 | that disables "sched_load_balance" as those tasks aren't going anywhere | 436 | that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere |
437 | else anyway. | 437 | else anyway. |
438 | 438 | ||
439 | There is an impedance mismatch here, between cpusets and sched domains. | 439 | There is an impedance mismatch here, between cpusets and sched domains. |
@@ -443,19 +443,19 @@ overlap and each CPU is in at most one sched domain. | |||
443 | It is necessary for sched domains to be flat because load balancing | 443 | It is necessary for sched domains to be flat because load balancing |
444 | across partially overlapping sets of CPUs would risk unstable dynamics | 444 | across partially overlapping sets of CPUs would risk unstable dynamics |
445 | that would be beyond our understanding. So if each of two partially | 445 | that would be beyond our understanding. So if each of two partially |
446 | overlapping cpusets enables the flag 'sched_load_balance', then we | 446 | overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we |
447 | form a single sched domain that is a superset of both. We won't move | 447 | form a single sched domain that is a superset of both. We won't move |
448 | a task to a CPU outside it cpuset, but the scheduler load balancing | 448 | a task to a CPU outside it cpuset, but the scheduler load balancing |
449 | code might waste some compute cycles considering that possibility. | 449 | code might waste some compute cycles considering that possibility. |
450 | 450 | ||
451 | This mismatch is why there is not a simple one-to-one relation | 451 | This mismatch is why there is not a simple one-to-one relation |
452 | between which cpusets have the flag "sched_load_balance" enabled, | 452 | between which cpusets have the flag "cpuset.sched_load_balance" enabled, |
453 | and the sched domain configuration. If a cpuset enables the flag, it | 453 | and the sched domain configuration. If a cpuset enables the flag, it |
454 | will get balancing across all its CPUs, but if it disables the flag, | 454 | will get balancing across all its CPUs, but if it disables the flag, |
455 | it will only be assured of no load balancing if no other overlapping | 455 | it will only be assured of no load balancing if no other overlapping |
456 | cpuset enables the flag. | 456 | cpuset enables the flag. |
457 | 457 | ||
458 | If two cpusets have partially overlapping 'cpus' allowed, and only | 458 | If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only |
459 | one of them has this flag enabled, then the other may find its | 459 | one of them has this flag enabled, then the other may find its |
460 | tasks only partially load balanced, just on the overlapping CPUs. | 460 | tasks only partially load balanced, just on the overlapping CPUs. |
461 | This is just the general case of the top_cpuset example given a few | 461 | This is just the general case of the top_cpuset example given a few |
@@ -468,23 +468,23 @@ load balancing to the other CPUs. | |||
468 | 1.7.1 sched_load_balance implementation details. | 468 | 1.7.1 sched_load_balance implementation details. |
469 | ------------------------------------------------ | 469 | ------------------------------------------------ |
470 | 470 | ||
471 | The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary | 471 | The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary |
472 | to most cpuset flags.) When enabled for a cpuset, the kernel will | 472 | to most cpuset flags.) When enabled for a cpuset, the kernel will |
473 | ensure that it can load balance across all the CPUs in that cpuset | 473 | ensure that it can load balance across all the CPUs in that cpuset |
474 | (makes sure that all the CPUs in the cpus_allowed of that cpuset are | 474 | (makes sure that all the CPUs in the cpus_allowed of that cpuset are |
475 | in the same sched domain.) | 475 | in the same sched domain.) |
476 | 476 | ||
477 | If two overlapping cpusets both have 'sched_load_balance' enabled, | 477 | If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, |
478 | then they will be (must be) both in the same sched domain. | 478 | then they will be (must be) both in the same sched domain. |
479 | 479 | ||
480 | If, as is the default, the top cpuset has 'sched_load_balance' enabled, | 480 | If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, |
481 | then by the above that means there is a single sched domain covering | 481 | then by the above that means there is a single sched domain covering |
482 | the whole system, regardless of any other cpuset settings. | 482 | the whole system, regardless of any other cpuset settings. |
483 | 483 | ||
484 | The kernel commits to user space that it will avoid load balancing | 484 | The kernel commits to user space that it will avoid load balancing |
485 | where it can. It will pick as fine a granularity partition of sched | 485 | where it can. It will pick as fine a granularity partition of sched |
486 | domains as it can while still providing load balancing for any set | 486 | domains as it can while still providing load balancing for any set |
487 | of CPUs allowed to a cpuset having 'sched_load_balance' enabled. | 487 | of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. |
488 | 488 | ||
489 | The internal kernel cpuset to scheduler interface passes from the | 489 | The internal kernel cpuset to scheduler interface passes from the |
490 | cpuset code to the scheduler code a partition of the load balanced | 490 | cpuset code to the scheduler code a partition of the load balanced |
@@ -495,9 +495,9 @@ all the CPUs that must be load balanced. | |||
495 | The cpuset code builds a new such partition and passes it to the | 495 | The cpuset code builds a new such partition and passes it to the |
496 | scheduler sched domain setup code, to have the sched domains rebuilt | 496 | scheduler sched domain setup code, to have the sched domains rebuilt |
497 | as necessary, whenever: | 497 | as necessary, whenever: |
498 | - the 'sched_load_balance' flag of a cpuset with non-empty CPUs changes, | 498 | - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, |
499 | - or CPUs come or go from a cpuset with this flag enabled, | 499 | - or CPUs come or go from a cpuset with this flag enabled, |
500 | - or 'sched_relax_domain_level' value of a cpuset with non-empty CPUs | 500 | - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs |
501 | and with this flag enabled changes, | 501 | and with this flag enabled changes, |
502 | - or a cpuset with non-empty CPUs and with this flag enabled is removed, | 502 | - or a cpuset with non-empty CPUs and with this flag enabled is removed, |
503 | - or a cpu is offlined/onlined. | 503 | - or a cpu is offlined/onlined. |
@@ -542,7 +542,7 @@ As the result, task B on CPU X need to wait task A or wait load balance | |||
542 | on the next tick. For some applications in special situation, waiting | 542 | on the next tick. For some applications in special situation, waiting |
543 | 1 tick may be too long. | 543 | 1 tick may be too long. |
544 | 544 | ||
545 | The 'sched_relax_domain_level' file allows you to request changing | 545 | The 'cpuset.sched_relax_domain_level' file allows you to request changing |
546 | this searching range as you like. This file takes int value which | 546 | this searching range as you like. This file takes int value which |
547 | indicates size of searching range in levels ideally as follows, | 547 | indicates size of searching range in levels ideally as follows, |
548 | otherwise initial value -1 that indicates the cpuset has no request. | 548 | otherwise initial value -1 that indicates the cpuset has no request. |
@@ -559,8 +559,8 @@ The system default is architecture dependent. The system default | |||
559 | can be changed using the relax_domain_level= boot parameter. | 559 | can be changed using the relax_domain_level= boot parameter. |
560 | 560 | ||
561 | This file is per-cpuset and affect the sched domain where the cpuset | 561 | This file is per-cpuset and affect the sched domain where the cpuset |
562 | belongs to. Therefore if the flag 'sched_load_balance' of a cpuset | 562 | belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset |
563 | is disabled, then 'sched_relax_domain_level' have no effect since | 563 | is disabled, then 'cpuset.sched_relax_domain_level' have no effect since |
564 | there is no sched domain belonging the cpuset. | 564 | there is no sched domain belonging the cpuset. |
565 | 565 | ||
566 | If multiple cpusets are overlapping and hence they form a single sched | 566 | If multiple cpusets are overlapping and hence they form a single sched |
@@ -594,7 +594,7 @@ is attached, is subtle. | |||
594 | If a cpuset has its Memory Nodes modified, then for each task attached | 594 | If a cpuset has its Memory Nodes modified, then for each task attached |
595 | to that cpuset, the next time that the kernel attempts to allocate | 595 | to that cpuset, the next time that the kernel attempts to allocate |
596 | a page of memory for that task, the kernel will notice the change | 596 | a page of memory for that task, the kernel will notice the change |
597 | in the tasks cpuset, and update its per-task memory placement to | 597 | in the task's cpuset, and update its per-task memory placement to |
598 | remain within the new cpusets memory placement. If the task was using | 598 | remain within the new cpusets memory placement. If the task was using |
599 | mempolicy MPOL_BIND, and the nodes to which it was bound overlap with | 599 | mempolicy MPOL_BIND, and the nodes to which it was bound overlap with |
600 | its new cpuset, then the task will continue to use whatever subset | 600 | its new cpuset, then the task will continue to use whatever subset |
@@ -603,13 +603,13 @@ was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed | |||
603 | in the new cpuset, then the task will be essentially treated as if it | 603 | in the new cpuset, then the task will be essentially treated as if it |
604 | was MPOL_BIND bound to the new cpuset (even though its NUMA placement, | 604 | was MPOL_BIND bound to the new cpuset (even though its NUMA placement, |
605 | as queried by get_mempolicy(), doesn't change). If a task is moved | 605 | as queried by get_mempolicy(), doesn't change). If a task is moved |
606 | from one cpuset to another, then the kernel will adjust the tasks | 606 | from one cpuset to another, then the kernel will adjust the task's |
607 | memory placement, as above, the next time that the kernel attempts | 607 | memory placement, as above, the next time that the kernel attempts |
608 | to allocate a page of memory for that task. | 608 | to allocate a page of memory for that task. |
609 | 609 | ||
610 | If a cpuset has its 'cpus' modified, then each task in that cpuset | 610 | If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset |
611 | will have its allowed CPU placement changed immediately. Similarly, | 611 | will have its allowed CPU placement changed immediately. Similarly, |
612 | if a tasks pid is written to another cpusets 'tasks' file, then its | 612 | if a task's pid is written to another cpusets 'cpuset.tasks' file, then its |
613 | allowed CPU placement is changed immediately. If such a task had been | 613 | allowed CPU placement is changed immediately. If such a task had been |
614 | bound to some subset of its cpuset using the sched_setaffinity() call, | 614 | bound to some subset of its cpuset using the sched_setaffinity() call, |
615 | the task will be allowed to run on any CPU allowed in its new cpuset, | 615 | the task will be allowed to run on any CPU allowed in its new cpuset, |
@@ -622,21 +622,21 @@ and the processor placement is updated immediately. | |||
622 | Normally, once a page is allocated (given a physical page | 622 | Normally, once a page is allocated (given a physical page |
623 | of main memory) then that page stays on whatever node it | 623 | of main memory) then that page stays on whatever node it |
624 | was allocated, so long as it remains allocated, even if the | 624 | was allocated, so long as it remains allocated, even if the |
625 | cpusets memory placement policy 'mems' subsequently changes. | 625 | cpusets memory placement policy 'cpuset.mems' subsequently changes. |
626 | If the cpuset flag file 'memory_migrate' is set true, then when | 626 | If the cpuset flag file 'cpuset.memory_migrate' is set true, then when |
627 | tasks are attached to that cpuset, any pages that task had | 627 | tasks are attached to that cpuset, any pages that task had |
628 | allocated to it on nodes in its previous cpuset are migrated | 628 | allocated to it on nodes in its previous cpuset are migrated |
629 | to the tasks new cpuset. The relative placement of the page within | 629 | to the task's new cpuset. The relative placement of the page within |
630 | the cpuset is preserved during these migration operations if possible. | 630 | the cpuset is preserved during these migration operations if possible. |
631 | For example if the page was on the second valid node of the prior cpuset | 631 | For example if the page was on the second valid node of the prior cpuset |
632 | then the page will be placed on the second valid node of the new cpuset. | 632 | then the page will be placed on the second valid node of the new cpuset. |
633 | 633 | ||
634 | Also if 'memory_migrate' is set true, then if that cpusets | 634 | Also if 'cpuset.memory_migrate' is set true, then if that cpuset's |
635 | 'mems' file is modified, pages allocated to tasks in that | 635 | 'cpuset.mems' file is modified, pages allocated to tasks in that |
636 | cpuset, that were on nodes in the previous setting of 'mems', | 636 | cpuset, that were on nodes in the previous setting of 'cpuset.mems', |
637 | will be moved to nodes in the new setting of 'mems.' | 637 | will be moved to nodes in the new setting of 'mems.' |
638 | Pages that were not in the tasks prior cpuset, or in the cpusets | 638 | Pages that were not in the task's prior cpuset, or in the cpuset's |
639 | prior 'mems' setting, will not be moved. | 639 | prior 'cpuset.mems' setting, will not be moved. |
640 | 640 | ||
641 | There is an exception to the above. If hotplug functionality is used | 641 | There is an exception to the above. If hotplug functionality is used |
642 | to remove all the CPUs that are currently assigned to a cpuset, | 642 | to remove all the CPUs that are currently assigned to a cpuset, |
@@ -655,7 +655,7 @@ There is a second exception to the above. GFP_ATOMIC requests are | |||
655 | kernel internal allocations that must be satisfied, immediately. | 655 | kernel internal allocations that must be satisfied, immediately. |
656 | The kernel may drop some request, in rare cases even panic, if a | 656 | The kernel may drop some request, in rare cases even panic, if a |
657 | GFP_ATOMIC alloc fails. If the request cannot be satisfied within | 657 | GFP_ATOMIC alloc fails. If the request cannot be satisfied within |
658 | the current tasks cpuset, then we relax the cpuset, and look for | 658 | the current task's cpuset, then we relax the cpuset, and look for |
659 | memory anywhere we can find it. It's better to violate the cpuset | 659 | memory anywhere we can find it. It's better to violate the cpuset |
660 | than stress the kernel. | 660 | than stress the kernel. |
661 | 661 | ||
@@ -678,8 +678,8 @@ and then start a subshell 'sh' in that cpuset: | |||
678 | cd /dev/cpuset | 678 | cd /dev/cpuset |
679 | mkdir Charlie | 679 | mkdir Charlie |
680 | cd Charlie | 680 | cd Charlie |
681 | /bin/echo 2-3 > cpus | 681 | /bin/echo 2-3 > cpuset.cpus |
682 | /bin/echo 1 > mems | 682 | /bin/echo 1 > cpuset.mems |
683 | /bin/echo $$ > tasks | 683 | /bin/echo $$ > tasks |
684 | sh | 684 | sh |
685 | # The subshell 'sh' is now running in cpuset Charlie | 685 | # The subshell 'sh' is now running in cpuset Charlie |
@@ -725,10 +725,13 @@ Now you want to do something with this cpuset. | |||
725 | 725 | ||
726 | In this directory you can find several files: | 726 | In this directory you can find several files: |
727 | # ls | 727 | # ls |
728 | cpu_exclusive memory_migrate mems tasks | 728 | cpuset.cpu_exclusive cpuset.memory_spread_slab |
729 | cpus memory_pressure notify_on_release | 729 | cpuset.cpus cpuset.mems |
730 | mem_exclusive memory_spread_page sched_load_balance | 730 | cpuset.mem_exclusive cpuset.sched_load_balance |
731 | mem_hardwall memory_spread_slab sched_relax_domain_level | 731 | cpuset.mem_hardwall cpuset.sched_relax_domain_level |
732 | cpuset.memory_migrate notify_on_release | ||
733 | cpuset.memory_pressure tasks | ||
734 | cpuset.memory_spread_page | ||
732 | 735 | ||
733 | Reading them will give you information about the state of this cpuset: | 736 | Reading them will give you information about the state of this cpuset: |
734 | the CPUs and Memory Nodes it can use, the processes that are using | 737 | the CPUs and Memory Nodes it can use, the processes that are using |
@@ -736,13 +739,13 @@ it, its properties. By writing to these files you can manipulate | |||
736 | the cpuset. | 739 | the cpuset. |
737 | 740 | ||
738 | Set some flags: | 741 | Set some flags: |
739 | # /bin/echo 1 > cpu_exclusive | 742 | # /bin/echo 1 > cpuset.cpu_exclusive |
740 | 743 | ||
741 | Add some cpus: | 744 | Add some cpus: |
742 | # /bin/echo 0-7 > cpus | 745 | # /bin/echo 0-7 > cpuset.cpus |
743 | 746 | ||
744 | Add some mems: | 747 | Add some mems: |
745 | # /bin/echo 0-7 > mems | 748 | # /bin/echo 0-7 > cpuset.mems |
746 | 749 | ||
747 | Now attach your shell to this cpuset: | 750 | Now attach your shell to this cpuset: |
748 | # /bin/echo $$ > tasks | 751 | # /bin/echo $$ > tasks |
@@ -774,28 +777,28 @@ echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent | |||
774 | This is the syntax to use when writing in the cpus or mems files | 777 | This is the syntax to use when writing in the cpus or mems files |
775 | in cpuset directories: | 778 | in cpuset directories: |
776 | 779 | ||
777 | # /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 | 780 | # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 |
778 | # /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 | 781 | # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 |
779 | 782 | ||
780 | To add a CPU to a cpuset, write the new list of CPUs including the | 783 | To add a CPU to a cpuset, write the new list of CPUs including the |
781 | CPU to be added. To add 6 to the above cpuset: | 784 | CPU to be added. To add 6 to the above cpuset: |
782 | 785 | ||
783 | # /bin/echo 1-4,6 > cpus -> set cpus list to cpus 1,2,3,4,6 | 786 | # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 |
784 | 787 | ||
785 | Similarly to remove a CPU from a cpuset, write the new list of CPUs | 788 | Similarly to remove a CPU from a cpuset, write the new list of CPUs |
786 | without the CPU to be removed. | 789 | without the CPU to be removed. |
787 | 790 | ||
788 | To remove all the CPUs: | 791 | To remove all the CPUs: |
789 | 792 | ||
790 | # /bin/echo "" > cpus -> clear cpus list | 793 | # /bin/echo "" > cpuset.cpus -> clear cpus list |
791 | 794 | ||
792 | 2.3 Setting flags | 795 | 2.3 Setting flags |
793 | ----------------- | 796 | ----------------- |
794 | 797 | ||
795 | The syntax is very simple: | 798 | The syntax is very simple: |
796 | 799 | ||
797 | # /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' | 800 | # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' |
798 | # /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' | 801 | # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' |
799 | 802 | ||
800 | 2.4 Attaching processes | 803 | 2.4 Attaching processes |
801 | ----------------------- | 804 | ----------------------- |
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 72db89ed0609..b7eececfb195 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | Memory Resource Controller(Memcg) Implementation Memo. | 1 | Memory Resource Controller(Memcg) Implementation Memo. |
2 | Last Updated: 2009/1/20 | 2 | Last Updated: 2010/2 |
3 | Base Kernel Version: based on 2.6.29-rc2. | 3 | Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). |
4 | 4 | ||
5 | Because VM is getting complex (one of reasons is memcg...), memcg's behavior | 5 | Because VM is getting complex (one of reasons is memcg...), memcg's behavior |
6 | is complex. This is a document for memcg's internal behavior. | 6 | is complex. This is a document for memcg's internal behavior. |
@@ -244,7 +244,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
244 | we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). | 244 | we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). |
245 | 245 | ||
246 | 8. LRU | 246 | 8. LRU |
247 | Each memcg has its own private LRU. Now, it's handling is under global | 247 | Each memcg has its own private LRU. Now, its handling is under global |
248 | VM's control (means that it's handled under global zone->lru_lock). | 248 | VM's control (means that it's handled under global zone->lru_lock). |
249 | Almost all routines around memcg's LRU is called by global LRU's | 249 | Almost all routines around memcg's LRU is called by global LRU's |
250 | list management functions under zone->lru_lock(). | 250 | list management functions under zone->lru_lock(). |
@@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
337 | race and lock dependency with other cgroup subsystems. | 337 | race and lock dependency with other cgroup subsystems. |
338 | 338 | ||
339 | example) | 339 | example) |
340 | # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices | 340 | # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices |
341 | 341 | ||
342 | and do task move, mkdir, rmdir etc...under this. | 342 | and do task move, mkdir, rmdir etc...under this. |
343 | 343 | ||
@@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
348 | 348 | ||
349 | For example, test like following is good. | 349 | For example, test like following is good. |
350 | (Shell-A) | 350 | (Shell-A) |
351 | # mount -t cgroup none /cgroup -t memory | 351 | # mount -t cgroup none /cgroup -o memory |
352 | # mkdir /cgroup/test | 352 | # mkdir /cgroup/test |
353 | # echo 40M > /cgroup/test/memory.limit_in_bytes | 353 | # echo 40M > /cgroup/test/memory.limit_in_bytes |
354 | # echo 0 > /cgroup/test/tasks | 354 | # echo 0 > /cgroup/test/tasks |
@@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
378 | #echo 50M > memory.limit_in_bytes | 378 | #echo 50M > memory.limit_in_bytes |
379 | #echo 50M > memory.memsw.limit_in_bytes | 379 | #echo 50M > memory.memsw.limit_in_bytes |
380 | run 51M of malloc | 380 | run 51M of malloc |
381 | |||
382 | 9.9 Move charges at task migration | ||
383 | Charges associated with a task can be moved along with task migration. | ||
384 | |||
385 | (Shell-A) | ||
386 | #mkdir /cgroup/A | ||
387 | #echo $$ >/cgroup/A/tasks | ||
388 | run some programs which uses some amount of memory in /cgroup/A. | ||
389 | |||
390 | (Shell-B) | ||
391 | #mkdir /cgroup/B | ||
392 | #echo 1 >/cgroup/B/memory.move_charge_at_immigrate | ||
393 | #echo "pid of the program running in group A" >/cgroup/B/tasks | ||
394 | |||
395 | You can see charges have been moved by reading *.usage_in_bytes or | ||
396 | memory.stat of both A and B. | ||
397 | See 8.2 of Documentation/cgroups/memory.txt to see what value should be | ||
398 | written to move_charge_at_immigrate. | ||
399 | |||
400 | 9.10 Memory thresholds | ||
401 | Memory controler implements memory thresholds using cgroups notification | ||
402 | API. You can use Documentation/cgroups/cgroup_event_listener.c to test | ||
403 | it. | ||
404 | |||
405 | (Shell-A) Create cgroup and run event listener | ||
406 | # mkdir /cgroup/A | ||
407 | # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M | ||
408 | |||
409 | (Shell-B) Add task to cgroup and try to allocate and free memory | ||
410 | # echo $$ >/cgroup/A/tasks | ||
411 | # a="$(dd if=/dev/zero bs=1M count=10)" | ||
412 | # a= | ||
413 | |||
414 | You will see message from cgroup_event_listener every time you cross | ||
415 | the thresholds. | ||
416 | |||
417 | Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. | ||
418 | |||
419 | It's good idea to test root cgroup as well. | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index b871f2552b45..7781857dc940 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -1,18 +1,15 @@ | |||
1 | Memory Resource Controller | 1 | Memory Resource Controller |
2 | 2 | ||
3 | NOTE: The Memory Resource Controller has been generically been referred | 3 | NOTE: The Memory Resource Controller has been generically been referred |
4 | to as the memory controller in this document. Do not confuse memory controller | 4 | to as the memory controller in this document. Do not confuse memory |
5 | used here with the memory controller that is used in hardware. | 5 | controller used here with the memory controller that is used in hardware. |
6 | 6 | ||
7 | Salient features | 7 | (For editors) |
8 | 8 | In this document: | |
9 | a. Enable control of Anonymous, Page Cache (mapped and unmapped) and | 9 | When we mention a cgroup (cgroupfs's directory) with memory controller, |
10 | Swap Cache memory pages. | 10 | we call it "memory cgroup". When you see git-log and source code, you'll |
11 | b. The infrastructure allows easy addition of other types of memory to control | 11 | see patch's title and function names tend to use "memcg". |
12 | c. Provides *zero overhead* for non memory controller users | 12 | In this document, we avoid using it. |
13 | d. Provides a double LRU: global memory pressure causes reclaim from the | ||
14 | global LRU; a cgroup on hitting a limit, reclaims from the per | ||
15 | cgroup LRU | ||
16 | 13 | ||
17 | Benefits and Purpose of the memory controller | 14 | Benefits and Purpose of the memory controller |
18 | 15 | ||
@@ -33,6 +30,45 @@ d. A CD/DVD burner could control the amount of memory used by the | |||
33 | e. There are several other use cases, find one or use the controller just | 30 | e. There are several other use cases, find one or use the controller just |
34 | for fun (to learn and hack on the VM subsystem). | 31 | for fun (to learn and hack on the VM subsystem). |
35 | 32 | ||
33 | Current Status: linux-2.6.34-mmotm(development version of 2010/April) | ||
34 | |||
35 | Features: | ||
36 | - accounting anonymous pages, file caches, swap caches usage and limiting them. | ||
37 | - private LRU and reclaim routine. (system's global LRU and private LRU | ||
38 | work independently from each other) | ||
39 | - optionally, memory+swap usage can be accounted and limited. | ||
40 | - hierarchical accounting | ||
41 | - soft limit | ||
42 | - moving(recharging) account at moving a task is selectable. | ||
43 | - usage threshold notifier | ||
44 | - oom-killer disable knob and oom-notifier | ||
45 | - Root cgroup has no limit controls. | ||
46 | |||
47 | Kernel memory and Hugepages are not under control yet. We just manage | ||
48 | pages on LRU. To add more controls, we have to take care of performance. | ||
49 | |||
50 | Brief summary of control files. | ||
51 | |||
52 | tasks # attach a task(thread) and show list of threads | ||
53 | cgroup.procs # show list of processes | ||
54 | cgroup.event_control # an interface for event_fd() | ||
55 | memory.usage_in_bytes # show current memory(RSS+Cache) usage. | ||
56 | memory.memsw.usage_in_bytes # show current memory+Swap usage | ||
57 | memory.limit_in_bytes # set/show limit of memory usage | ||
58 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage | ||
59 | memory.failcnt # show the number of memory usage hits limits | ||
60 | memory.memsw.failcnt # show the number of memory+Swap hits limits | ||
61 | memory.max_usage_in_bytes # show max memory usage recorded | ||
62 | memory.memsw.usage_in_bytes # show max memory+Swap usage recorded | ||
63 | memory.soft_limit_in_bytes # set/show soft limit of memory usage | ||
64 | memory.stat # show various statistics | ||
65 | memory.use_hierarchy # set/show hierarchical account enabled | ||
66 | memory.force_empty # trigger forced move charge to parent | ||
67 | memory.swappiness # set/show swappiness parameter of vmscan | ||
68 | (See sysctl's vm.swappiness) | ||
69 | memory.move_charge_at_immigrate # set/show controls of moving charges | ||
70 | memory.oom_control # set/show oom controls. | ||
71 | |||
36 | 1. History | 72 | 1. History |
37 | 73 | ||
38 | The memory controller has a long history. A request for comments for the memory | 74 | The memory controller has a long history. A request for comments for the memory |
@@ -106,14 +142,14 @@ the necessary data structures and check if the cgroup that is being charged | |||
106 | is over its limit. If it is then reclaim is invoked on the cgroup. | 142 | is over its limit. If it is then reclaim is invoked on the cgroup. |
107 | More details can be found in the reclaim section of this document. | 143 | More details can be found in the reclaim section of this document. |
108 | If everything goes well, a page meta-data-structure called page_cgroup is | 144 | If everything goes well, a page meta-data-structure called page_cgroup is |
109 | allocated and associated with the page. This routine also adds the page to | 145 | updated. page_cgroup has its own LRU on cgroup. |
110 | the per cgroup LRU. | 146 | (*) page_cgroup structure is allocated at boot/memory-hotplug time. |
111 | 147 | ||
112 | 2.2.1 Accounting details | 148 | 2.2.1 Accounting details |
113 | 149 | ||
114 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. | 150 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. |
115 | (some pages which never be reclaimable and will not be on global LRU | 151 | Some pages which are never reclaimable and will not be on the global LRU |
116 | are not accounted. we just accounts pages under usual vm management.) | 152 | are not accounted. We just account pages under usual VM management. |
117 | 153 | ||
118 | RSS pages are accounted at page_fault unless they've already been accounted | 154 | RSS pages are accounted at page_fault unless they've already been accounted |
119 | for earlier. A file page will be accounted for as Page Cache when it's | 155 | for earlier. A file page will be accounted for as Page Cache when it's |
@@ -121,12 +157,19 @@ inserted into inode (radix-tree). While it's mapped into the page tables of | |||
121 | processes, duplicate accounting is carefully avoided. | 157 | processes, duplicate accounting is carefully avoided. |
122 | 158 | ||
123 | A RSS page is unaccounted when it's fully unmapped. A PageCache page is | 159 | A RSS page is unaccounted when it's fully unmapped. A PageCache page is |
124 | unaccounted when it's removed from radix-tree. | 160 | unaccounted when it's removed from radix-tree. Even if RSS pages are fully |
161 | unmapped (by kswapd), they may exist as SwapCache in the system until they | ||
162 | are really freed. Such SwapCaches also also accounted. | ||
163 | A swapped-in page is not accounted until it's mapped. | ||
164 | |||
165 | Note: The kernel does swapin-readahead and read multiple swaps at once. | ||
166 | This means swapped-in pages may contain pages for other tasks than a task | ||
167 | causing page fault. So, we avoid accounting at swap-in I/O. | ||
125 | 168 | ||
126 | At page migration, accounting information is kept. | 169 | At page migration, accounting information is kept. |
127 | 170 | ||
128 | Note: we just account pages-on-lru because our purpose is to control amount | 171 | Note: we just account pages-on-LRU because our purpose is to control amount |
129 | of used pages. not-on-lru pages are tend to be out-of-control from vm view. | 172 | of used pages; not-on-LRU pages tend to be out-of-control from VM view. |
130 | 173 | ||
131 | 2.3 Shared Page Accounting | 174 | 2.3 Shared Page Accounting |
132 | 175 | ||
@@ -143,6 +186,7 @@ caller of swapoff rather than the users of shmem. | |||
143 | 186 | ||
144 | 187 | ||
145 | 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) | 188 | 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) |
189 | |||
146 | Swap Extension allows you to record charge for swap. A swapped-in page is | 190 | Swap Extension allows you to record charge for swap. A swapped-in page is |
147 | charged back to original page allocator if possible. | 191 | charged back to original page allocator if possible. |
148 | 192 | ||
@@ -150,13 +194,20 @@ When swap is accounted, following files are added. | |||
150 | - memory.memsw.usage_in_bytes. | 194 | - memory.memsw.usage_in_bytes. |
151 | - memory.memsw.limit_in_bytes. | 195 | - memory.memsw.limit_in_bytes. |
152 | 196 | ||
153 | usage of mem+swap is limited by memsw.limit_in_bytes. | 197 | memsw means memory+swap. Usage of memory+swap is limited by |
198 | memsw.limit_in_bytes. | ||
154 | 199 | ||
155 | * why 'mem+swap' rather than swap. | 200 | Example: Assume a system with 4G of swap. A task which allocates 6G of memory |
201 | (by mistake) under 2G memory limitation will use all swap. | ||
202 | In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. | ||
203 | By using memsw limit, you can avoid system OOM which can be caused by swap | ||
204 | shortage. | ||
205 | |||
206 | * why 'memory+swap' rather than swap. | ||
156 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means | 207 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means |
157 | to move account from memory to swap...there is no change in usage of | 208 | to move account from memory to swap...there is no change in usage of |
158 | mem+swap. In other words, when we want to limit the usage of swap without | 209 | memory+swap. In other words, when we want to limit the usage of swap without |
159 | affecting global LRU, mem+swap limit is better than just limiting swap from | 210 | affecting global LRU, memory+swap limit is better than just limiting swap from |
160 | OS point of view. | 211 | OS point of view. |
161 | 212 | ||
162 | * What happens when a cgroup hits memory.memsw.limit_in_bytes | 213 | * What happens when a cgroup hits memory.memsw.limit_in_bytes |
@@ -168,12 +219,12 @@ it by cgroup. | |||
168 | 219 | ||
169 | 2.5 Reclaim | 220 | 2.5 Reclaim |
170 | 221 | ||
171 | Each cgroup maintains a per cgroup LRU that consists of an active | 222 | Each cgroup maintains a per cgroup LRU which has the same structure as |
172 | and inactive list. When a cgroup goes over its limit, we first try | 223 | global VM. When a cgroup goes over its limit, we first try |
173 | to reclaim memory from the cgroup so as to make space for the new | 224 | to reclaim memory from the cgroup so as to make space for the new |
174 | pages that the cgroup has touched. If the reclaim is unsuccessful, | 225 | pages that the cgroup has touched. If the reclaim is unsuccessful, |
175 | an OOM routine is invoked to select and kill the bulkiest task in the | 226 | an OOM routine is invoked to select and kill the bulkiest task in the |
176 | cgroup. | 227 | cgroup. (See 10. OOM Control below.) |
177 | 228 | ||
178 | The reclaim algorithm has not been modified for cgroups, except that | 229 | The reclaim algorithm has not been modified for cgroups, except that |
179 | pages that are selected for reclaiming come from the per cgroup LRU | 230 | pages that are selected for reclaiming come from the per cgroup LRU |
@@ -182,13 +233,24 @@ list. | |||
182 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any | 233 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any |
183 | limits on the root cgroup. | 234 | limits on the root cgroup. |
184 | 235 | ||
185 | 2. Locking | 236 | Note2: When panic_on_oom is set to "2", the whole system will panic. |
237 | |||
238 | When oom event notifier is registered, event will be delivered. | ||
239 | (See oom_control section) | ||
240 | |||
241 | 2.6 Locking | ||
186 | 242 | ||
187 | The memory controller uses the following hierarchy | 243 | lock_page_cgroup()/unlock_page_cgroup() should not be called under |
244 | mapping->tree_lock. | ||
188 | 245 | ||
189 | 1. zone->lru_lock is used for selecting pages to be isolated | 246 | Other lock order is following: |
190 | 2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) | 247 | PG_locked. |
191 | 3. lock_page_cgroup() is used to protect page->page_cgroup | 248 | mm->page_table_lock |
249 | zone->lru_lock | ||
250 | lock_page_cgroup. | ||
251 | In many cases, just lock_page_cgroup() is called. | ||
252 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by | ||
253 | zone->lru_lock, it has no lock of its own. | ||
192 | 254 | ||
193 | 3. User Interface | 255 | 3. User Interface |
194 | 256 | ||
@@ -197,6 +259,7 @@ The memory controller uses the following hierarchy | |||
197 | a. Enable CONFIG_CGROUPS | 259 | a. Enable CONFIG_CGROUPS |
198 | b. Enable CONFIG_RESOURCE_COUNTERS | 260 | b. Enable CONFIG_RESOURCE_COUNTERS |
199 | c. Enable CONFIG_CGROUP_MEM_RES_CTLR | 261 | c. Enable CONFIG_CGROUP_MEM_RES_CTLR |
262 | d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension) | ||
200 | 263 | ||
201 | 1. Prepare the cgroups | 264 | 1. Prepare the cgroups |
202 | # mkdir -p /cgroups | 265 | # mkdir -p /cgroups |
@@ -204,31 +267,28 @@ c. Enable CONFIG_CGROUP_MEM_RES_CTLR | |||
204 | 267 | ||
205 | 2. Make the new group and move bash into it | 268 | 2. Make the new group and move bash into it |
206 | # mkdir /cgroups/0 | 269 | # mkdir /cgroups/0 |
207 | # echo $$ > /cgroups/0/tasks | 270 | # echo $$ > /cgroups/0/tasks |
208 | 271 | ||
209 | Since now we're in the 0 cgroup, | 272 | Since now we're in the 0 cgroup, we can alter the memory limit: |
210 | We can alter the memory limit: | ||
211 | # echo 4M > /cgroups/0/memory.limit_in_bytes | 273 | # echo 4M > /cgroups/0/memory.limit_in_bytes |
212 | 274 | ||
213 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, | 275 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, |
214 | mega or gigabytes. | 276 | mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) |
277 | |||
215 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). | 278 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). |
216 | NOTE: We cannot set limits on the root cgroup any more. | 279 | NOTE: We cannot set limits on the root cgroup any more. |
217 | 280 | ||
218 | # cat /cgroups/0/memory.limit_in_bytes | 281 | # cat /cgroups/0/memory.limit_in_bytes |
219 | 4194304 | 282 | 4194304 |
220 | 283 | ||
221 | NOTE: The interface has now changed to display the usage in bytes | ||
222 | instead of pages | ||
223 | |||
224 | We can check the usage: | 284 | We can check the usage: |
225 | # cat /cgroups/0/memory.usage_in_bytes | 285 | # cat /cgroups/0/memory.usage_in_bytes |
226 | 1216512 | 286 | 1216512 |
227 | 287 | ||
228 | A successful write to this file does not guarantee a successful set of | 288 | A successful write to this file does not guarantee a successful set of |
229 | this limit to the value written into the file. This can be due to a | 289 | this limit to the value written into the file. This can be due to a |
230 | number of factors, such as rounding up to page boundaries or the total | 290 | number of factors, such as rounding up to page boundaries or the total |
231 | availability of memory on the system. The user is required to re-read | 291 | availability of memory on the system. The user is required to re-read |
232 | this file after a write to guarantee the value committed by the kernel. | 292 | this file after a write to guarantee the value committed by the kernel. |
233 | 293 | ||
234 | # echo 1 > memory.limit_in_bytes | 294 | # echo 1 > memory.limit_in_bytes |
@@ -243,15 +303,23 @@ caches, RSS and Active pages/Inactive pages are shown. | |||
243 | 303 | ||
244 | 4. Testing | 304 | 4. Testing |
245 | 305 | ||
246 | Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. | 306 | For testing features and implementation, see memcg_test.txt. |
247 | Apart from that v6 has been tested with several applications and regular | 307 | |
248 | daily use. The controller has also been tested on the PPC64, x86_64 and | 308 | Performance test is also important. To see pure memory controller's overhead, |
249 | UML platforms. | 309 | testing on tmpfs will give you good numbers of small overheads. |
310 | Example: do kernel make on tmpfs. | ||
311 | |||
312 | Page-fault scalability is also important. At measuring parallel | ||
313 | page fault test, multi-process test may be better than multi-thread | ||
314 | test because it has noise of shared objects/status. | ||
315 | |||
316 | But the above two are testing extreme situations. | ||
317 | Trying usual test under memory controller is always helpful. | ||
250 | 318 | ||
251 | 4.1 Troubleshooting | 319 | 4.1 Troubleshooting |
252 | 320 | ||
253 | Sometimes a user might find that the application under a cgroup is | 321 | Sometimes a user might find that the application under a cgroup is |
254 | terminated. There are several causes for this: | 322 | terminated by OOM killer. There are several causes for this: |
255 | 323 | ||
256 | 1. The cgroup limit is too low (just too low to do anything useful) | 324 | 1. The cgroup limit is too low (just too low to do anything useful) |
257 | 2. The user is using anonymous memory and swap is turned off or too low | 325 | 2. The user is using anonymous memory and swap is turned off or too low |
@@ -259,21 +327,29 @@ terminated. There are several causes for this: | |||
259 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of | 327 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of |
260 | some of the pages cached in the cgroup (page cache pages). | 328 | some of the pages cached in the cgroup (page cache pages). |
261 | 329 | ||
330 | To know what happens, disable OOM_Kill by 10. OOM Control(see below) and | ||
331 | seeing what happens will be helpful. | ||
332 | |||
262 | 4.2 Task migration | 333 | 4.2 Task migration |
263 | 334 | ||
264 | When a task migrates from one cgroup to another, it's charge is not | 335 | When a task migrates from one cgroup to another, its charge is not |
265 | carried forward. The pages allocated from the original cgroup still | 336 | carried forward by default. The pages allocated from the original cgroup still |
266 | remain charged to it, the charge is dropped when the page is freed or | 337 | remain charged to it, the charge is dropped when the page is freed or |
267 | reclaimed. | 338 | reclaimed. |
268 | 339 | ||
340 | You can move charges of a task along with task migration. | ||
341 | See 8. "Move charges at task migration" | ||
342 | |||
269 | 4.3 Removing a cgroup | 343 | 4.3 Removing a cgroup |
270 | 344 | ||
271 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a | 345 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
272 | cgroup might have some charge associated with it, even though all | 346 | cgroup might have some charge associated with it, even though all |
273 | tasks have migrated away from it. | 347 | tasks have migrated away from it. (because we charge against pages, not |
274 | Such charges are freed(at default) or moved to its parent. When moved, | 348 | against tasks.) |
275 | both of RSS and CACHES are moved to parent. | 349 | |
276 | If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. | 350 | Such charges are freed or moved to their parent. At moving, both of RSS |
351 | and CACHES are moved to parent. | ||
352 | rmdir() may return -EBUSY if freeing/moving fails. See 5.1 also. | ||
277 | 353 | ||
278 | Charges recorded in swap information is not updated at removal of cgroup. | 354 | Charges recorded in swap information is not updated at removal of cgroup. |
279 | Recorded information is discarded and a cgroup which uses swap (swapcache) | 355 | Recorded information is discarded and a cgroup which uses swap (swapcache) |
@@ -289,10 +365,10 @@ will be charged as a new owner of it. | |||
289 | 365 | ||
290 | # echo 0 > memory.force_empty | 366 | # echo 0 > memory.force_empty |
291 | 367 | ||
292 | Almost all pages tracked by this memcg will be unmapped and freed. Some of | 368 | Almost all pages tracked by this memory cgroup will be unmapped and freed. |
293 | pages cannot be freed because it's locked or in-use. Such pages are moved | 369 | Some pages cannot be freed because they are locked or in-use. Such pages are |
294 | to parent and this cgroup will be empty. But this may return -EBUSY in | 370 | moved to parent and this cgroup will be empty. This may return -EBUSY if |
295 | some too busy case. | 371 | VM is too busy to free/move all pages immediately. |
296 | 372 | ||
297 | Typical use case of this interface is that calling this before rmdir(). | 373 | Typical use case of this interface is that calling this before rmdir(). |
298 | Because rmdir() moves all pages to parent, some out-of-use page caches can be | 374 | Because rmdir() moves all pages to parent, some out-of-use page caches can be |
@@ -302,19 +378,41 @@ will be charged as a new owner of it. | |||
302 | 378 | ||
303 | memory.stat file includes following statistics | 379 | memory.stat file includes following statistics |
304 | 380 | ||
381 | # per-memory cgroup local status | ||
305 | cache - # of bytes of page cache memory. | 382 | cache - # of bytes of page cache memory. |
306 | rss - # of bytes of anonymous and swap cache memory. | 383 | rss - # of bytes of anonymous and swap cache memory. |
384 | mapped_file - # of bytes of mapped file (includes tmpfs/shmem) | ||
307 | pgpgin - # of pages paged in (equivalent to # of charging events). | 385 | pgpgin - # of pages paged in (equivalent to # of charging events). |
308 | pgpgout - # of pages paged out (equivalent to # of uncharging events). | 386 | pgpgout - # of pages paged out (equivalent to # of uncharging events). |
309 | active_anon - # of bytes of anonymous and swap cache memory on active | 387 | swap - # of bytes of swap usage |
310 | lru list. | ||
311 | inactive_anon - # of bytes of anonymous memory and swap cache memory on | 388 | inactive_anon - # of bytes of anonymous memory and swap cache memory on |
312 | inactive lru list. | 389 | LRU list. |
313 | active_file - # of bytes of file-backed memory on active lru list. | 390 | active_anon - # of bytes of anonymous and swap cache memory on active |
314 | inactive_file - # of bytes of file-backed memory on inactive lru list. | 391 | inactive LRU list. |
392 | inactive_file - # of bytes of file-backed memory on inactive LRU list. | ||
393 | active_file - # of bytes of file-backed memory on active LRU list. | ||
315 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). | 394 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). |
316 | 395 | ||
317 | The following additional stats are dependent on CONFIG_DEBUG_VM. | 396 | # status considering hierarchy (see memory.use_hierarchy settings) |
397 | |||
398 | hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy | ||
399 | under which the memory cgroup is | ||
400 | hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to | ||
401 | hierarchy under which memory cgroup is. | ||
402 | |||
403 | total_cache - sum of all children's "cache" | ||
404 | total_rss - sum of all children's "rss" | ||
405 | total_mapped_file - sum of all children's "cache" | ||
406 | total_pgpgin - sum of all children's "pgpgin" | ||
407 | total_pgpgout - sum of all children's "pgpgout" | ||
408 | total_swap - sum of all children's "swap" | ||
409 | total_inactive_anon - sum of all children's "inactive_anon" | ||
410 | total_active_anon - sum of all children's "active_anon" | ||
411 | total_inactive_file - sum of all children's "inactive_file" | ||
412 | total_active_file - sum of all children's "active_file" | ||
413 | total_unevictable - sum of all children's "unevictable" | ||
414 | |||
415 | # The following additional stats are dependent on CONFIG_DEBUG_VM. | ||
318 | 416 | ||
319 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) | 417 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) |
320 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) | 418 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) |
@@ -323,24 +421,37 @@ recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | |||
323 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | 421 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) |
324 | 422 | ||
325 | Memo: | 423 | Memo: |
326 | recent_rotated means recent frequency of lru rotation. | 424 | recent_rotated means recent frequency of LRU rotation. |
327 | recent_scanned means recent # of scans to lru. | 425 | recent_scanned means recent # of scans to LRU. |
328 | showing for better debug please see the code for meanings. | 426 | showing for better debug please see the code for meanings. |
329 | 427 | ||
330 | Note: | 428 | Note: |
331 | Only anonymous and swap cache memory is listed as part of 'rss' stat. | 429 | Only anonymous and swap cache memory is listed as part of 'rss' stat. |
332 | This should not be confused with the true 'resident set size' or the | 430 | This should not be confused with the true 'resident set size' or the |
333 | amount of physical memory used by the cgroup. Per-cgroup rss | 431 | amount of physical memory used by the cgroup. |
334 | accounting is not done yet. | 432 | 'rss + file_mapped" will give you resident set size of cgroup. |
433 | (Note: file and shmem may be shared among other cgroups. In that case, | ||
434 | file_mapped is accounted only when the memory cgroup is owner of page | ||
435 | cache.) | ||
335 | 436 | ||
336 | 5.3 swappiness | 437 | 5.3 swappiness |
337 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. | ||
338 | 438 | ||
339 | Following cgroups' swapiness can't be changed. | 439 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. |
340 | - root cgroup (uses /proc/sys/vm/swappiness). | 440 | |
341 | - a cgroup which uses hierarchy and it has child cgroup. | 441 | Following cgroups' swappiness can't be changed. |
342 | - a cgroup which uses hierarchy and not the root of hierarchy. | 442 | - root cgroup (uses /proc/sys/vm/swappiness). |
443 | - a cgroup which uses hierarchy and it has other cgroup(s) below it. | ||
444 | - a cgroup which uses hierarchy and not the root of hierarchy. | ||
445 | |||
446 | 5.4 failcnt | ||
343 | 447 | ||
448 | A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. | ||
449 | This failcnt(== failure count) shows the number of times that a usage counter | ||
450 | hit its limit. When a memory cgroup hits a limit, failcnt increases and | ||
451 | memory under it will be reclaimed. | ||
452 | |||
453 | You can reset failcnt by writing 0 to failcnt file. | ||
454 | # echo 0 > .../memory.failcnt | ||
344 | 455 | ||
345 | 6. Hierarchy support | 456 | 6. Hierarchy support |
346 | 457 | ||
@@ -359,13 +470,13 @@ hierarchy | |||
359 | 470 | ||
360 | In the diagram above, with hierarchical accounting enabled, all memory | 471 | In the diagram above, with hierarchical accounting enabled, all memory |
361 | usage of e, is accounted to its ancestors up until the root (i.e, c and root), | 472 | usage of e, is accounted to its ancestors up until the root (i.e, c and root), |
362 | that has memory.use_hierarchy enabled. If one of the ancestors goes over its | 473 | that has memory.use_hierarchy enabled. If one of the ancestors goes over its |
363 | limit, the reclaim algorithm reclaims from the tasks in the ancestor and the | 474 | limit, the reclaim algorithm reclaims from the tasks in the ancestor and the |
364 | children of the ancestor. | 475 | children of the ancestor. |
365 | 476 | ||
366 | 6.1 Enabling hierarchical accounting and reclaim | 477 | 6.1 Enabling hierarchical accounting and reclaim |
367 | 478 | ||
368 | The memory controller by default disables the hierarchy feature. Support | 479 | A memory cgroup by default disables the hierarchy feature. Support |
369 | can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup | 480 | can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup |
370 | 481 | ||
371 | # echo 1 > memory.use_hierarchy | 482 | # echo 1 > memory.use_hierarchy |
@@ -375,9 +486,10 @@ The feature can be disabled by | |||
375 | # echo 0 > memory.use_hierarchy | 486 | # echo 0 > memory.use_hierarchy |
376 | 487 | ||
377 | NOTE1: Enabling/disabling will fail if the cgroup already has other | 488 | NOTE1: Enabling/disabling will fail if the cgroup already has other |
378 | cgroups created below it. | 489 | cgroups created below it. |
379 | 490 | ||
380 | NOTE2: This feature can be enabled/disabled per subtree. | 491 | NOTE2: When panic_on_oom is set to "2", the whole system will panic in |
492 | case of an OOM event in any cgroup. | ||
381 | 493 | ||
382 | 7. Soft limits | 494 | 7. Soft limits |
383 | 495 | ||
@@ -387,7 +499,7 @@ is to allow control groups to use as much of the memory as needed, provided | |||
387 | a. There is no memory contention | 499 | a. There is no memory contention |
388 | b. They do not exceed their hard limit | 500 | b. They do not exceed their hard limit |
389 | 501 | ||
390 | When the system detects memory contention or low memory control groups | 502 | When the system detects memory contention or low memory, control groups |
391 | are pushed back to their soft limits. If the soft limit of each control | 503 | are pushed back to their soft limits. If the soft limit of each control |
392 | group is very high, they are pushed back as much as possible to make | 504 | group is very high, they are pushed back as much as possible to make |
393 | sure that one control group does not starve the others of memory. | 505 | sure that one control group does not starve the others of memory. |
@@ -401,7 +513,7 @@ it gets invoked from balance_pgdat (kswapd). | |||
401 | 7.1 Interface | 513 | 7.1 Interface |
402 | 514 | ||
403 | Soft limits can be setup by using the following commands (in this example we | 515 | Soft limits can be setup by using the following commands (in this example we |
404 | assume a soft limit of 256 megabytes) | 516 | assume a soft limit of 256 MiB) |
405 | 517 | ||
406 | # echo 256M > memory.soft_limit_in_bytes | 518 | # echo 256M > memory.soft_limit_in_bytes |
407 | 519 | ||
@@ -414,7 +526,121 @@ NOTE1: Soft limits take effect over a long period of time, since they involve | |||
414 | NOTE2: It is recommended to set the soft limit always below the hard limit, | 526 | NOTE2: It is recommended to set the soft limit always below the hard limit, |
415 | otherwise the hard limit will take precedence. | 527 | otherwise the hard limit will take precedence. |
416 | 528 | ||
417 | 8. TODO | 529 | 8. Move charges at task migration |
530 | |||
531 | Users can move charges associated with a task along with task migration, that | ||
532 | is, uncharge task's pages from the old cgroup and charge them to the new cgroup. | ||
533 | This feature is not supported in !CONFIG_MMU environments because of lack of | ||
534 | page tables. | ||
535 | |||
536 | 8.1 Interface | ||
537 | |||
538 | This feature is disabled by default. It can be enabled(and disabled again) by | ||
539 | writing to memory.move_charge_at_immigrate of the destination cgroup. | ||
540 | |||
541 | If you want to enable it: | ||
542 | |||
543 | # echo (some positive value) > memory.move_charge_at_immigrate | ||
544 | |||
545 | Note: Each bits of move_charge_at_immigrate has its own meaning about what type | ||
546 | of charges should be moved. See 8.2 for details. | ||
547 | Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread | ||
548 | group. | ||
549 | Note: If we cannot find enough space for the task in the destination cgroup, we | ||
550 | try to make space by reclaiming memory. Task migration may fail if we | ||
551 | cannot make enough space. | ||
552 | Note: It can take several seconds if you move charges much. | ||
553 | |||
554 | And if you want disable it again: | ||
555 | |||
556 | # echo 0 > memory.move_charge_at_immigrate | ||
557 | |||
558 | 8.2 Type of charges which can be move | ||
559 | |||
560 | Each bits of move_charge_at_immigrate has its own meaning about what type of | ||
561 | charges should be moved. But in any cases, it must be noted that an account of | ||
562 | a page or a swap can be moved only when it is charged to the task's current(old) | ||
563 | memory cgroup. | ||
564 | |||
565 | bit | what type of charges would be moved ? | ||
566 | -----+------------------------------------------------------------------------ | ||
567 | 0 | A charge of an anonymous page(or swap of it) used by the target task. | ||
568 | | Those pages and swaps must be used only by the target task. You must | ||
569 | | enable Swap Extension(see 2.4) to enable move of swap charges. | ||
570 | -----+------------------------------------------------------------------------ | ||
571 | 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) | ||
572 | | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | ||
573 | | anonymous pages, file pages(and swaps) in the range mmapped by the task | ||
574 | | will be moved even if the task hasn't done page fault, i.e. they might | ||
575 | | not be the task's "RSS", but other task's "RSS" that maps the same file. | ||
576 | | And mapcount of the page is ignored(the page can be moved even if | ||
577 | | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to | ||
578 | | enable move of swap charges. | ||
579 | |||
580 | 8.3 TODO | ||
581 | |||
582 | - Implement madvise(2) to let users decide the vma to be moved or not to be | ||
583 | moved. | ||
584 | - All of moving charge operations are done under cgroup_mutex. It's not good | ||
585 | behavior to hold the mutex too long, so we may need some trick. | ||
586 | |||
587 | 9. Memory thresholds | ||
588 | |||
589 | Memory cgroup implements memory thresholds using cgroups notification | ||
590 | API (see cgroups.txt). It allows to register multiple memory and memsw | ||
591 | thresholds and gets notifications when it crosses. | ||
592 | |||
593 | To register a threshold application need: | ||
594 | - create an eventfd using eventfd(2); | ||
595 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; | ||
596 | - write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to | ||
597 | cgroup.event_control. | ||
598 | |||
599 | Application will be notified through eventfd when memory usage crosses | ||
600 | threshold in any direction. | ||
601 | |||
602 | It's applicable for root and non-root cgroup. | ||
603 | |||
604 | 10. OOM Control | ||
605 | |||
606 | memory.oom_control file is for OOM notification and other controls. | ||
607 | |||
608 | Memory cgroup implements OOM notifier using cgroup notification | ||
609 | API (See cgroups.txt). It allows to register multiple OOM notification | ||
610 | delivery and gets notification when OOM happens. | ||
611 | |||
612 | To register a notifier, application need: | ||
613 | - create an eventfd using eventfd(2) | ||
614 | - open memory.oom_control file | ||
615 | - write string like "<event_fd> <fd of memory.oom_control>" to | ||
616 | cgroup.event_control | ||
617 | |||
618 | Application will be notified through eventfd when OOM happens. | ||
619 | OOM notification doesn't work for root cgroup. | ||
620 | |||
621 | You can disable OOM-killer by writing "1" to memory.oom_control file, as: | ||
622 | |||
623 | #echo 1 > memory.oom_control | ||
624 | |||
625 | This operation is only allowed to the top cgroup of sub-hierarchy. | ||
626 | If OOM-killer is disabled, tasks under cgroup will hang/sleep | ||
627 | in memory cgroup's OOM-waitqueue when they request accountable memory. | ||
628 | |||
629 | For running them, you have to relax the memory cgroup's OOM status by | ||
630 | * enlarge limit or reduce usage. | ||
631 | To reduce usage, | ||
632 | * kill some tasks. | ||
633 | * move some tasks to other group with account migration. | ||
634 | * remove some files (on tmpfs?) | ||
635 | |||
636 | Then, stopped tasks will work again. | ||
637 | |||
638 | At reading, current status of OOM is shown. | ||
639 | oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) | ||
640 | under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may | ||
641 | be stopped.) | ||
642 | |||
643 | 11. TODO | ||
418 | 644 | ||
419 | 1. Add support for accounting huge pages (as a separate controller) | 645 | 1. Add support for accounting huge pages (as a separate controller) |
420 | 2. Make per-cgroup scanner reclaim not-shared pages first | 646 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/Documentation/circular-buffers.txt b/Documentation/circular-buffers.txt new file mode 100644 index 000000000000..8117e5bf6065 --- /dev/null +++ b/Documentation/circular-buffers.txt | |||
@@ -0,0 +1,234 @@ | |||
1 | ================ | ||
2 | CIRCULAR BUFFERS | ||
3 | ================ | ||
4 | |||
5 | By: David Howells <dhowells@redhat.com> | ||
6 | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
7 | |||
8 | |||
9 | Linux provides a number of features that can be used to implement circular | ||
10 | buffering. There are two sets of such features: | ||
11 | |||
12 | (1) Convenience functions for determining information about power-of-2 sized | ||
13 | buffers. | ||
14 | |||
15 | (2) Memory barriers for when the producer and the consumer of objects in the | ||
16 | buffer don't want to share a lock. | ||
17 | |||
18 | To use these facilities, as discussed below, there needs to be just one | ||
19 | producer and just one consumer. It is possible to handle multiple producers by | ||
20 | serialising them, and to handle multiple consumers by serialising them. | ||
21 | |||
22 | |||
23 | Contents: | ||
24 | |||
25 | (*) What is a circular buffer? | ||
26 | |||
27 | (*) Measuring power-of-2 buffers. | ||
28 | |||
29 | (*) Using memory barriers with circular buffers. | ||
30 | - The producer. | ||
31 | - The consumer. | ||
32 | |||
33 | |||
34 | ========================== | ||
35 | WHAT IS A CIRCULAR BUFFER? | ||
36 | ========================== | ||
37 | |||
38 | First of all, what is a circular buffer? A circular buffer is a buffer of | ||
39 | fixed, finite size into which there are two indices: | ||
40 | |||
41 | (1) A 'head' index - the point at which the producer inserts items into the | ||
42 | buffer. | ||
43 | |||
44 | (2) A 'tail' index - the point at which the consumer finds the next item in | ||
45 | the buffer. | ||
46 | |||
47 | Typically when the tail pointer is equal to the head pointer, the buffer is | ||
48 | empty; and the buffer is full when the head pointer is one less than the tail | ||
49 | pointer. | ||
50 | |||
51 | The head index is incremented when items are added, and the tail index when | ||
52 | items are removed. The tail index should never jump the head index, and both | ||
53 | indices should be wrapped to 0 when they reach the end of the buffer, thus | ||
54 | allowing an infinite amount of data to flow through the buffer. | ||
55 | |||
56 | Typically, items will all be of the same unit size, but this isn't strictly | ||
57 | required to use the techniques below. The indices can be increased by more | ||
58 | than 1 if multiple items or variable-sized items are to be included in the | ||
59 | buffer, provided that neither index overtakes the other. The implementer must | ||
60 | be careful, however, as a region more than one unit in size may wrap the end of | ||
61 | the buffer and be broken into two segments. | ||
62 | |||
63 | |||
64 | ============================ | ||
65 | MEASURING POWER-OF-2 BUFFERS | ||
66 | ============================ | ||
67 | |||
68 | Calculation of the occupancy or the remaining capacity of an arbitrarily sized | ||
69 | circular buffer would normally be a slow operation, requiring the use of a | ||
70 | modulus (divide) instruction. However, if the buffer is of a power-of-2 size, | ||
71 | then a much quicker bitwise-AND instruction can be used instead. | ||
72 | |||
73 | Linux provides a set of macros for handling power-of-2 circular buffers. These | ||
74 | can be made use of by: | ||
75 | |||
76 | #include <linux/circ_buf.h> | ||
77 | |||
78 | The macros are: | ||
79 | |||
80 | (*) Measure the remaining capacity of a buffer: | ||
81 | |||
82 | CIRC_SPACE(head_index, tail_index, buffer_size); | ||
83 | |||
84 | This returns the amount of space left in the buffer[1] into which items | ||
85 | can be inserted. | ||
86 | |||
87 | |||
88 | (*) Measure the maximum consecutive immediate space in a buffer: | ||
89 | |||
90 | CIRC_SPACE_TO_END(head_index, tail_index, buffer_size); | ||
91 | |||
92 | This returns the amount of consecutive space left in the buffer[1] into | ||
93 | which items can be immediately inserted without having to wrap back to the | ||
94 | beginning of the buffer. | ||
95 | |||
96 | |||
97 | (*) Measure the occupancy of a buffer: | ||
98 | |||
99 | CIRC_CNT(head_index, tail_index, buffer_size); | ||
100 | |||
101 | This returns the number of items currently occupying a buffer[2]. | ||
102 | |||
103 | |||
104 | (*) Measure the non-wrapping occupancy of a buffer: | ||
105 | |||
106 | CIRC_CNT_TO_END(head_index, tail_index, buffer_size); | ||
107 | |||
108 | This returns the number of consecutive items[2] that can be extracted from | ||
109 | the buffer without having to wrap back to the beginning of the buffer. | ||
110 | |||
111 | |||
112 | Each of these macros will nominally return a value between 0 and buffer_size-1, | ||
113 | however: | ||
114 | |||
115 | [1] CIRC_SPACE*() are intended to be used in the producer. To the producer | ||
116 | they will return a lower bound as the producer controls the head index, | ||
117 | but the consumer may still be depleting the buffer on another CPU and | ||
118 | moving the tail index. | ||
119 | |||
120 | To the consumer it will show an upper bound as the producer may be busy | ||
121 | depleting the space. | ||
122 | |||
123 | [2] CIRC_CNT*() are intended to be used in the consumer. To the consumer they | ||
124 | will return a lower bound as the consumer controls the tail index, but the | ||
125 | producer may still be filling the buffer on another CPU and moving the | ||
126 | head index. | ||
127 | |||
128 | To the producer it will show an upper bound as the consumer may be busy | ||
129 | emptying the buffer. | ||
130 | |||
131 | [3] To a third party, the order in which the writes to the indices by the | ||
132 | producer and consumer become visible cannot be guaranteed as they are | ||
133 | independent and may be made on different CPUs - so the result in such a | ||
134 | situation will merely be a guess, and may even be negative. | ||
135 | |||
136 | |||
137 | =========================================== | ||
138 | USING MEMORY BARRIERS WITH CIRCULAR BUFFERS | ||
139 | =========================================== | ||
140 | |||
141 | By using memory barriers in conjunction with circular buffers, you can avoid | ||
142 | the need to: | ||
143 | |||
144 | (1) use a single lock to govern access to both ends of the buffer, thus | ||
145 | allowing the buffer to be filled and emptied at the same time; and | ||
146 | |||
147 | (2) use atomic counter operations. | ||
148 | |||
149 | There are two sides to this: the producer that fills the buffer, and the | ||
150 | consumer that empties it. Only one thing should be filling a buffer at any one | ||
151 | time, and only one thing should be emptying a buffer at any one time, but the | ||
152 | two sides can operate simultaneously. | ||
153 | |||
154 | |||
155 | THE PRODUCER | ||
156 | ------------ | ||
157 | |||
158 | The producer will look something like this: | ||
159 | |||
160 | spin_lock(&producer_lock); | ||
161 | |||
162 | unsigned long head = buffer->head; | ||
163 | unsigned long tail = ACCESS_ONCE(buffer->tail); | ||
164 | |||
165 | if (CIRC_SPACE(head, tail, buffer->size) >= 1) { | ||
166 | /* insert one item into the buffer */ | ||
167 | struct item *item = buffer[head]; | ||
168 | |||
169 | produce_item(item); | ||
170 | |||
171 | smp_wmb(); /* commit the item before incrementing the head */ | ||
172 | |||
173 | buffer->head = (head + 1) & (buffer->size - 1); | ||
174 | |||
175 | /* wake_up() will make sure that the head is committed before | ||
176 | * waking anyone up */ | ||
177 | wake_up(consumer); | ||
178 | } | ||
179 | |||
180 | spin_unlock(&producer_lock); | ||
181 | |||
182 | This will instruct the CPU that the contents of the new item must be written | ||
183 | before the head index makes it available to the consumer and then instructs the | ||
184 | CPU that the revised head index must be written before the consumer is woken. | ||
185 | |||
186 | Note that wake_up() doesn't have to be the exact mechanism used, but whatever | ||
187 | is used must guarantee a (write) memory barrier between the update of the head | ||
188 | index and the change of state of the consumer, if a change of state occurs. | ||
189 | |||
190 | |||
191 | THE CONSUMER | ||
192 | ------------ | ||
193 | |||
194 | The consumer will look something like this: | ||
195 | |||
196 | spin_lock(&consumer_lock); | ||
197 | |||
198 | unsigned long head = ACCESS_ONCE(buffer->head); | ||
199 | unsigned long tail = buffer->tail; | ||
200 | |||
201 | if (CIRC_CNT(head, tail, buffer->size) >= 1) { | ||
202 | /* read index before reading contents at that index */ | ||
203 | smp_read_barrier_depends(); | ||
204 | |||
205 | /* extract one item from the buffer */ | ||
206 | struct item *item = buffer[tail]; | ||
207 | |||
208 | consume_item(item); | ||
209 | |||
210 | smp_mb(); /* finish reading descriptor before incrementing tail */ | ||
211 | |||
212 | buffer->tail = (tail + 1) & (buffer->size - 1); | ||
213 | } | ||
214 | |||
215 | spin_unlock(&consumer_lock); | ||
216 | |||
217 | This will instruct the CPU to make sure the index is up to date before reading | ||
218 | the new item, and then it shall make sure the CPU has finished reading the item | ||
219 | before it writes the new tail pointer, which will erase the item. | ||
220 | |||
221 | |||
222 | Note the use of ACCESS_ONCE() in both algorithms to read the opposition index. | ||
223 | This prevents the compiler from discarding and reloading its cached value - | ||
224 | which some compilers will do across smp_read_barrier_depends(). This isn't | ||
225 | strictly needed if you can be sure that the opposition index will _only_ be | ||
226 | used the once. | ||
227 | |||
228 | |||
229 | =============== | ||
230 | FURTHER READING | ||
231 | =============== | ||
232 | |||
233 | See also Documentation/memory-barriers.txt for a description of Linux's memory | ||
234 | barrier facilities. | ||
diff --git a/Documentation/connector/cn_test.c b/Documentation/connector/cn_test.c index b07add3467f1..7764594778d4 100644 --- a/Documentation/connector/cn_test.c +++ b/Documentation/connector/cn_test.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/moduleparam.h> | 26 | #include <linux/moduleparam.h> |
27 | #include <linux/skbuff.h> | 27 | #include <linux/skbuff.h> |
28 | #include <linux/slab.h> | ||
28 | #include <linux/timer.h> | 29 | #include <linux/timer.h> |
29 | 30 | ||
30 | #include <linux/connector.h> | 31 | #include <linux/connector.h> |
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt index 78c9466a9aa8..e5c5f5e6ab70 100644 --- a/Documentation/connector/connector.txt +++ b/Documentation/connector/connector.txt | |||
@@ -88,7 +88,7 @@ int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask); | |||
88 | int gfp_mask - GFP mask. | 88 | int gfp_mask - GFP mask. |
89 | 89 | ||
90 | Note: When registering new callback user, connector core assigns | 90 | Note: When registering new callback user, connector core assigns |
91 | netlink group to the user which is equal to it's id.idx. | 91 | netlink group to the user which is equal to its id.idx. |
92 | 92 | ||
93 | /*****************************************/ | 93 | /*****************************************/ |
94 | Protocol description. | 94 | Protocol description. |
diff --git a/Documentation/console/console.txt b/Documentation/console/console.txt index 877a1b26cc3d..926cf1b5e63e 100644 --- a/Documentation/console/console.txt +++ b/Documentation/console/console.txt | |||
@@ -74,7 +74,7 @@ driver takes over the consoles vacated by the driver. Binding, on the other | |||
74 | hand, will bind the driver to the consoles that are currently occupied by a | 74 | hand, will bind the driver to the consoles that are currently occupied by a |
75 | system driver. | 75 | system driver. |
76 | 76 | ||
77 | NOTE1: Binding and binding must be selected in Kconfig. It's under: | 77 | NOTE1: Binding and unbinding must be selected in Kconfig. It's under: |
78 | 78 | ||
79 | Device Drivers -> Character devices -> Support for binding and unbinding | 79 | Device Drivers -> Character devices -> Support for binding and unbinding |
80 | console drivers | 80 | console drivers |
diff --git a/Documentation/cpu-freq/pcc-cpufreq.txt b/Documentation/cpu-freq/pcc-cpufreq.txt new file mode 100644 index 000000000000..9e3c3b33514c --- /dev/null +++ b/Documentation/cpu-freq/pcc-cpufreq.txt | |||
@@ -0,0 +1,207 @@ | |||
1 | /* | ||
2 | * pcc-cpufreq.txt - PCC interface documentation | ||
3 | * | ||
4 | * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> | ||
5 | * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. | ||
6 | * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> | ||
7 | * | ||
8 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; version 2 of the License. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON | ||
17 | * INFRINGEMENT. See the GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License along | ||
20 | * with this program; if not, write to the Free Software Foundation, Inc., | ||
21 | * 675 Mass Ave, Cambridge, MA 02139, USA. | ||
22 | * | ||
23 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
24 | */ | ||
25 | |||
26 | |||
27 | Processor Clocking Control Driver | ||
28 | --------------------------------- | ||
29 | |||
30 | Contents: | ||
31 | --------- | ||
32 | 1. Introduction | ||
33 | 1.1 PCC interface | ||
34 | 1.1.1 Get Average Frequency | ||
35 | 1.1.2 Set Desired Frequency | ||
36 | 1.2 Platforms affected | ||
37 | 2. Driver and /sys details | ||
38 | 2.1 scaling_available_frequencies | ||
39 | 2.2 cpuinfo_transition_latency | ||
40 | 2.3 cpuinfo_cur_freq | ||
41 | 2.4 related_cpus | ||
42 | 3. Caveats | ||
43 | |||
44 | 1. Introduction: | ||
45 | ---------------- | ||
46 | Processor Clocking Control (PCC) is an interface between the platform | ||
47 | firmware and OSPM. It is a mechanism for coordinating processor | ||
48 | performance (ie: frequency) between the platform firmware and the OS. | ||
49 | |||
50 | The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC | ||
51 | interface. | ||
52 | |||
53 | OS utilizes the PCC interface to inform platform firmware what frequency the | ||
54 | OS wants for a logical processor. The platform firmware attempts to achieve | ||
55 | the requested frequency. If the request for the target frequency could not be | ||
56 | satisfied by platform firmware, then it usually means that power budget | ||
57 | conditions are in place, and "power capping" is taking place. | ||
58 | |||
59 | 1.1 PCC interface: | ||
60 | ------------------ | ||
61 | The complete PCC specification is available here: | ||
62 | http://www.acpica.org/download/Processor-Clocking-Control-v1p0.pdf | ||
63 | |||
64 | PCC relies on a shared memory region that provides a channel for communication | ||
65 | between the OS and platform firmware. PCC also implements a "doorbell" that | ||
66 | is used by the OS to inform the platform firmware that a command has been | ||
67 | sent. | ||
68 | |||
69 | The ACPI PCCH() method is used to discover the location of the PCC shared | ||
70 | memory region. The shared memory region header contains the "command" and | ||
71 | "status" interface. PCCH() also contains details on how to access the platform | ||
72 | doorbell. | ||
73 | |||
74 | The following commands are supported by the PCC interface: | ||
75 | * Get Average Frequency | ||
76 | * Set Desired Frequency | ||
77 | |||
78 | The ACPI PCCP() method is implemented for each logical processor and is | ||
79 | used to discover the offsets for the input and output buffers in the shared | ||
80 | memory region. | ||
81 | |||
82 | When PCC mode is enabled, the platform will not expose processor performance | ||
83 | or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore, | ||
84 | the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for | ||
85 | AMD) will not load. | ||
86 | |||
87 | However, OSPM remains in control of policy. The governor (eg: "ondemand") | ||
88 | computes the required performance for each processor based on server workload. | ||
89 | The PCC driver fills in the command interface, and the input buffer and | ||
90 | communicates the request to the platform firmware. The platform firmware is | ||
91 | responsible for delivering the requested performance. | ||
92 | |||
93 | Each PCC command is "global" in scope and can affect all the logical CPUs in | ||
94 | the system. Therefore, PCC is capable of performing "group" updates. With PCC | ||
95 | the OS is capable of getting/setting the frequency of all the logical CPUs in | ||
96 | the system with a single call to the BIOS. | ||
97 | |||
98 | 1.1.1 Get Average Frequency: | ||
99 | ---------------------------- | ||
100 | This command is used by the OSPM to query the running frequency of the | ||
101 | processor since the last time this command was completed. The output buffer | ||
102 | indicates the average unhalted frequency of the logical processor expressed as | ||
103 | a percentage of the nominal (ie: maximum) CPU frequency. The output buffer | ||
104 | also signifies if the CPU frequency is limited by a power budget condition. | ||
105 | |||
106 | 1.1.2 Set Desired Frequency: | ||
107 | ---------------------------- | ||
108 | This command is used by the OSPM to communicate to the platform firmware the | ||
109 | desired frequency for a logical processor. The output buffer is currently | ||
110 | ignored by OSPM. The next invocation of "Get Average Frequency" will inform | ||
111 | OSPM if the desired frequency was achieved or not. | ||
112 | |||
113 | 1.2 Platforms affected: | ||
114 | ----------------------- | ||
115 | The PCC driver will load on any system where the platform firmware: | ||
116 | * supports the PCC interface, and the associated PCCH() and PCCP() methods | ||
117 | * assumes responsibility for managing the hardware clocking controls in order | ||
118 | to deliver the requested processor performance | ||
119 | |||
120 | Currently, certain HP ProLiant platforms implement the PCC interface. On those | ||
121 | platforms PCC is the "default" choice. | ||
122 | |||
123 | However, it is possible to disable this interface via a BIOS setting. In | ||
124 | such an instance, as is also the case on platforms where the PCC interface | ||
125 | is not implemented, the PCC driver will fail to load silently. | ||
126 | |||
127 | 2. Driver and /sys details: | ||
128 | --------------------------- | ||
129 | When the driver loads, it merely prints the lowest and the highest CPU | ||
130 | frequencies supported by the platform firmware. | ||
131 | |||
132 | The PCC driver loads with a message such as: | ||
133 | pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933 | ||
134 | MHz | ||
135 | |||
136 | This means that the OPSM can request the CPU to run at any frequency in | ||
137 | between the limits (1600 MHz, and 2933 MHz) specified in the message. | ||
138 | |||
139 | Internally, there is no need for the driver to convert the "target" frequency | ||
140 | to a corresponding P-state. | ||
141 | |||
142 | The VERSION number for the driver will be of the format v.xy.ab. | ||
143 | eg: 1.00.02 | ||
144 | ----- -- | ||
145 | | | | ||
146 | | -- this will increase with bug fixes/enhancements to the driver | ||
147 | |-- this is the version of the PCC specification the driver adheres to | ||
148 | |||
149 | |||
150 | The following is a brief discussion on some of the fields exported via the | ||
151 | /sys filesystem and how their values are affected by the PCC driver: | ||
152 | |||
153 | 2.1 scaling_available_frequencies: | ||
154 | ---------------------------------- | ||
155 | scaling_available_frequencies is not created in /sys. No intermediate | ||
156 | frequencies need to be listed because the BIOS will try to achieve any | ||
157 | frequency, within limits, requested by the governor. A frequency does not have | ||
158 | to be strictly associated with a P-state. | ||
159 | |||
160 | 2.2 cpuinfo_transition_latency: | ||
161 | ------------------------------- | ||
162 | The cpuinfo_transition_latency field is 0. The PCC specification does | ||
163 | not include a field to expose this value currently. | ||
164 | |||
165 | 2.3 cpuinfo_cur_freq: | ||
166 | --------------------- | ||
167 | A) Often cpuinfo_cur_freq will show a value different than what is declared | ||
168 | in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq. | ||
169 | This is due to "turbo boost" available on recent Intel processors. If certain | ||
170 | conditions are met the BIOS can achieve a slightly higher speed than requested | ||
171 | by OSPM. An example: | ||
172 | |||
173 | scaling_cur_freq : 2933000 | ||
174 | cpuinfo_cur_freq : 3196000 | ||
175 | |||
176 | B) There is a round-off error associated with the cpuinfo_cur_freq value. | ||
177 | Since the driver obtains the current frequency as a "percentage" (%) of the | ||
178 | nominal frequency from the BIOS, sometimes, the values displayed by | ||
179 | scaling_cur_freq and cpuinfo_cur_freq may not match. An example: | ||
180 | |||
181 | scaling_cur_freq : 1600000 | ||
182 | cpuinfo_cur_freq : 1583000 | ||
183 | |||
184 | In this example, the nominal frequency is 2933 MHz. The driver obtains the | ||
185 | current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency: | ||
186 | |||
187 | 54% of 2933 MHz = 1583 MHz | ||
188 | |||
189 | Nominal frequency is the maximum frequency of the processor, and it usually | ||
190 | corresponds to the frequency of the P0 P-state. | ||
191 | |||
192 | 2.4 related_cpus: | ||
193 | ----------------- | ||
194 | The related_cpus field is identical to affected_cpus. | ||
195 | |||
196 | affected_cpus : 4 | ||
197 | related_cpus : 4 | ||
198 | |||
199 | Currently, the PCC driver does not evaluate _PSD. The platforms that support | ||
200 | PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination | ||
201 | to ensure that the same frequency is requested of all dependent CPUs. | ||
202 | |||
203 | 3. Caveats: | ||
204 | ----------- | ||
205 | The "cpufreq_stats" module in its present form cannot be loaded and | ||
206 | expected to work with the PCC driver. Since the "cpufreq_stats" module | ||
207 | provides information wrt each P-state, it is not applicable to the PCC driver. | ||
diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt index df03169782ea..a2db35287003 100644 --- a/Documentation/credentials.txt +++ b/Documentation/credentials.txt | |||
@@ -408,9 +408,6 @@ This should be used inside the RCU read lock, as in the following example: | |||
408 | ... | 408 | ... |
409 | } | 409 | } |
410 | 410 | ||
411 | A function need not get RCU read lock to use __task_cred() if it is holding a | ||
412 | spinlock at the time as this implicitly holds the RCU read lock. | ||
413 | |||
414 | Should it be necessary to hold another task's credentials for a long period of | 411 | Should it be necessary to hold another task's credentials for a long period of |
415 | time, and possibly to sleep whilst doing so, then the caller should get a | 412 | time, and possibly to sleep whilst doing so, then the caller should get a |
416 | reference on them using: | 413 | reference on them using: |
@@ -426,17 +423,16 @@ credentials, hiding the RCU magic from the caller: | |||
426 | uid_t task_uid(task) Task's real UID | 423 | uid_t task_uid(task) Task's real UID |
427 | uid_t task_euid(task) Task's effective UID | 424 | uid_t task_euid(task) Task's effective UID |
428 | 425 | ||
429 | If the caller is holding a spinlock or the RCU read lock at the time anyway, | 426 | If the caller is holding the RCU read lock at the time anyway, then: |
430 | then: | ||
431 | 427 | ||
432 | __task_cred(task)->uid | 428 | __task_cred(task)->uid |
433 | __task_cred(task)->euid | 429 | __task_cred(task)->euid |
434 | 430 | ||
435 | should be used instead. Similarly, if multiple aspects of a task's credentials | 431 | should be used instead. Similarly, if multiple aspects of a task's credentials |
436 | need to be accessed, RCU read lock or a spinlock should be used, __task_cred() | 432 | need to be accessed, RCU read lock should be used, __task_cred() called, the |
437 | called, the result stored in a temporary pointer and then the credential | 433 | result stored in a temporary pointer and then the credential aspects called |
438 | aspects called from that before dropping the lock. This prevents the | 434 | from that before dropping the lock. This prevents the potentially expensive |
439 | potentially expensive RCU magic from being invoked multiple times. | 435 | RCU magic from being invoked multiple times. |
440 | 436 | ||
441 | Should some other single aspect of another task's credentials need to be | 437 | Should some other single aspect of another task's credentials need to be |
442 | accessed, then this can be used: | 438 | accessed, then this can be used: |
diff --git a/Documentation/development-process/2.Process b/Documentation/development-process/2.Process index d750321acd5a..97726eba6102 100644 --- a/Documentation/development-process/2.Process +++ b/Documentation/development-process/2.Process | |||
@@ -151,7 +151,7 @@ The stages that a patch goes through are, generally: | |||
151 | well. | 151 | well. |
152 | 152 | ||
153 | - Wider review. When the patch is getting close to ready for mainline | 153 | - Wider review. When the patch is getting close to ready for mainline |
154 | inclusion, it will be accepted by a relevant subsystem maintainer - | 154 | inclusion, it should be accepted by a relevant subsystem maintainer - |
155 | though this acceptance is not a guarantee that the patch will make it | 155 | though this acceptance is not a guarantee that the patch will make it |
156 | all the way to the mainline. The patch will show up in the maintainer's | 156 | all the way to the mainline. The patch will show up in the maintainer's |
157 | subsystem tree and into the staging trees (described below). When the | 157 | subsystem tree and into the staging trees (described below). When the |
@@ -159,6 +159,15 @@ The stages that a patch goes through are, generally: | |||
159 | the discovery of any problems resulting from the integration of this | 159 | the discovery of any problems resulting from the integration of this |
160 | patch with work being done by others. | 160 | patch with work being done by others. |
161 | 161 | ||
162 | - Please note that most maintainers also have day jobs, so merging | ||
163 | your patch may not be their highest priority. If your patch is | ||
164 | getting feedback about changes that are needed, you should either | ||
165 | make those changes or justify why they should not be made. If your | ||
166 | patch has no review complaints but is not being merged by its | ||
167 | appropriate subsystem or driver maintainer, you should be persistent | ||
168 | in updating the patch to the current kernel so that it applies cleanly | ||
169 | and keep sending it for review and merging. | ||
170 | |||
162 | - Merging into the mainline. Eventually, a successful patch will be | 171 | - Merging into the mainline. Eventually, a successful patch will be |
163 | merged into the mainline repository managed by Linus Torvalds. More | 172 | merged into the mainline repository managed by Linus Torvalds. More |
164 | comments and/or problems may surface at this time; it is important that | 173 | comments and/or problems may surface at this time; it is important that |
@@ -258,12 +267,8 @@ an appropriate subsystem tree or be sent directly to Linus. In a typical | |||
258 | development cycle, approximately 10% of the patches going into the mainline | 267 | development cycle, approximately 10% of the patches going into the mainline |
259 | get there via -mm. | 268 | get there via -mm. |
260 | 269 | ||
261 | The current -mm patch can always be found from the front page of | 270 | The current -mm patch is available in the "mmotm" (-mm of the moment) |
262 | 271 | directory at: | |
263 | http://kernel.org/ | ||
264 | |||
265 | Those who want to see the current state of -mm can get the "-mm of the | ||
266 | moment" tree, found at: | ||
267 | 272 | ||
268 | http://userweb.kernel.org/~akpm/mmotm/ | 273 | http://userweb.kernel.org/~akpm/mmotm/ |
269 | 274 | ||
@@ -298,6 +303,12 @@ volatility of linux-next tends to make it a difficult development target. | |||
298 | See http://lwn.net/Articles/289013/ for more information on this topic, and | 303 | See http://lwn.net/Articles/289013/ for more information on this topic, and |
299 | stay tuned; much is still in flux where linux-next is involved. | 304 | stay tuned; much is still in flux where linux-next is involved. |
300 | 305 | ||
306 | Besides the mmotm and linux-next trees, the kernel source tree now contains | ||
307 | the drivers/staging/ directory and many sub-directories for drivers or | ||
308 | filesystems that are on their way to being added to the kernel tree | ||
309 | proper, but they remain in drivers/staging/ while they still need more | ||
310 | work. | ||
311 | |||
301 | 312 | ||
302 | 2.5: TOOLS | 313 | 2.5: TOOLS |
303 | 314 | ||
@@ -319,9 +330,9 @@ developers; even if they do not use it for their own work, they'll need git | |||
319 | to keep up with what other developers (and the mainline) are doing. | 330 | to keep up with what other developers (and the mainline) are doing. |
320 | 331 | ||
321 | Git is now packaged by almost all Linux distributions. There is a home | 332 | Git is now packaged by almost all Linux distributions. There is a home |
322 | page at | 333 | page at: |
323 | 334 | ||
324 | http://git.or.cz/ | 335 | http://git-scm.com/ |
325 | 336 | ||
326 | That page has pointers to documentation and tutorials. One should be | 337 | That page has pointers to documentation and tutorials. One should be |
327 | aware, in particular, of the Kernel Hacker's Guide to git, which has | 338 | aware, in particular, of the Kernel Hacker's Guide to git, which has |
diff --git a/Documentation/development-process/7.AdvancedTopics b/Documentation/development-process/7.AdvancedTopics index a2cf74093aa1..837179447e17 100644 --- a/Documentation/development-process/7.AdvancedTopics +++ b/Documentation/development-process/7.AdvancedTopics | |||
@@ -25,7 +25,7 @@ long document in its own right. Instead, the focus here will be on how git | |||
25 | fits into the kernel development process in particular. Developers who | 25 | fits into the kernel development process in particular. Developers who |
26 | wish to come up to speed with git will find more information at: | 26 | wish to come up to speed with git will find more information at: |
27 | 27 | ||
28 | http://git.or.cz/ | 28 | http://git-scm.com/ |
29 | 29 | ||
30 | http://www.kernel.org/pub/software/scm/git/docs/user-manual.html | 30 | http://www.kernel.org/pub/software/scm/git/docs/user-manual.html |
31 | 31 | ||
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt index e3a77b215135..0d5bc46dc167 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.txt | |||
@@ -122,3 +122,47 @@ volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 | |||
122 | brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real | 122 | brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real |
123 | brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow | 123 | brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow |
124 | brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base | 124 | brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base |
125 | |||
126 | |||
127 | How to determine when a merging is complete | ||
128 | =========================================== | ||
129 | The snapshot-merge and snapshot status lines end with: | ||
130 | <sectors_allocated>/<total_sectors> <metadata_sectors> | ||
131 | |||
132 | Both <sectors_allocated> and <total_sectors> include both data and metadata. | ||
133 | During merging, the number of sectors allocated gets smaller and | ||
134 | smaller. Merging has finished when the number of sectors holding data | ||
135 | is zero, in other words <sectors_allocated> == <metadata_sectors>. | ||
136 | |||
137 | Here is a practical example (using a hybrid of lvm and dmsetup commands): | ||
138 | |||
139 | # lvs | ||
140 | LV VG Attr LSize Origin Snap% Move Log Copy% Convert | ||
141 | base volumeGroup owi-a- 4.00g | ||
142 | snap volumeGroup swi-a- 1.00g base 18.97 | ||
143 | |||
144 | # dmsetup status volumeGroup-snap | ||
145 | 0 8388608 snapshot 397896/2097152 1560 | ||
146 | ^^^^ metadata sectors | ||
147 | |||
148 | # lvconvert --merge -b volumeGroup/snap | ||
149 | Merging of volume snap started. | ||
150 | |||
151 | # lvs volumeGroup/snap | ||
152 | LV VG Attr LSize Origin Snap% Move Log Copy% Convert | ||
153 | base volumeGroup Owi-a- 4.00g 17.23 | ||
154 | |||
155 | # dmsetup status volumeGroup-base | ||
156 | 0 8388608 snapshot-merge 281688/2097152 1104 | ||
157 | |||
158 | # dmsetup status volumeGroup-base | ||
159 | 0 8388608 snapshot-merge 180480/2097152 712 | ||
160 | |||
161 | # dmsetup status volumeGroup-base | ||
162 | 0 8388608 snapshot-merge 16/2097152 16 | ||
163 | |||
164 | Merging has finished. | ||
165 | |||
166 | # lvs | ||
167 | LV VG Attr LSize Origin Snap% Move Log Copy% Convert | ||
168 | base volumeGroup owi-a- 4.00g | ||
diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 53d64d382343..1d83d124056c 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt | |||
@@ -443,6 +443,8 @@ Your cooperation is appreciated. | |||
443 | 231 = /dev/snapshot System memory snapshot device | 443 | 231 = /dev/snapshot System memory snapshot device |
444 | 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) | 444 | 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) |
445 | 233 = /dev/kmview View-OS A process with a view | 445 | 233 = /dev/kmview View-OS A process with a view |
446 | 234 = /dev/btrfs-control Btrfs control device | ||
447 | 235 = /dev/autofs Autofs control device | ||
446 | 240-254 Reserved for local use | 448 | 240-254 Reserved for local use |
447 | 255 Reserved for MISC_DYNAMIC_MINOR | 449 | 255 Reserved for MISC_DYNAMIC_MINOR |
448 | 450 | ||
diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 3ad6acead949..d9bcffd59433 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff | |||
@@ -69,7 +69,6 @@ av_permissions.h | |||
69 | bbootsect | 69 | bbootsect |
70 | bin2c | 70 | bin2c |
71 | binkernel.spec | 71 | binkernel.spec |
72 | binoffset | ||
73 | bootsect | 72 | bootsect |
74 | bounds.h | 73 | bounds.h |
75 | bsetup | 74 | bsetup |
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt index 2e2c2ea90ceb..41f41632ee55 100644 --- a/Documentation/driver-model/platform.txt +++ b/Documentation/driver-model/platform.txt | |||
@@ -192,7 +192,7 @@ command line. This will execute all matching early_param() callbacks. | |||
192 | User specified early platform devices will be registered at this point. | 192 | User specified early platform devices will be registered at this point. |
193 | For the early serial console case the user can specify port on the | 193 | For the early serial console case the user can specify port on the |
194 | kernel command line as "earlyprintk=serial.0" where "earlyprintk" is | 194 | kernel command line as "earlyprintk=serial.0" where "earlyprintk" is |
195 | the class string, "serial" is the name of the platfrom driver and | 195 | the class string, "serial" is the name of the platform driver and |
196 | 0 is the platform device id. If the id is -1 then the dot and the | 196 | 0 is the platform device id. If the id is -1 then the dot and the |
197 | id can be omitted. | 197 | id can be omitted. |
198 | 198 | ||
diff --git a/Documentation/dvb/ci.txt b/Documentation/dvb/ci.txt index 2ecd834585e6..4a0c2b56e690 100644 --- a/Documentation/dvb/ci.txt +++ b/Documentation/dvb/ci.txt | |||
@@ -41,7 +41,7 @@ This application requires the following to function properly as of now. | |||
41 | 41 | ||
42 | * Cards that fall in this category | 42 | * Cards that fall in this category |
43 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 43 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
44 | At present the cards that fall in this category are the Twinhan and it's | 44 | At present the cards that fall in this category are the Twinhan and its |
45 | clones, these cards are available as VVMER, Tomato, Hercules, Orange and | 45 | clones, these cards are available as VVMER, Tomato, Hercules, Orange and |
46 | so on. | 46 | so on. |
47 | 47 | ||
diff --git a/Documentation/dvb/contributors.txt b/Documentation/dvb/contributors.txt index 4865addebe1c..47c30098dab6 100644 --- a/Documentation/dvb/contributors.txt +++ b/Documentation/dvb/contributors.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | Thanks go to the following people for patches and contributions: | 1 | Thanks go to the following people for patches and contributions: |
2 | 2 | ||
3 | Michael Hunold <m.hunold@gmx.de> | 3 | Michael Hunold <m.hunold@gmx.de> |
4 | for the initial saa7146 driver and it's recent overhaul | 4 | for the initial saa7146 driver and its recent overhaul |
5 | 5 | ||
6 | Christian Theiss | 6 | Christian Theiss |
7 | for his work on the initial Linux DVB driver | 7 | for his work on the initial Linux DVB driver |
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware index 14b7b5a3bcb9..239cbdbf4d12 100644 --- a/Documentation/dvb/get_dvb_firmware +++ b/Documentation/dvb/get_dvb_firmware | |||
@@ -26,7 +26,7 @@ use IO::Handle; | |||
26 | "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004", | 26 | "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004", |
27 | "or51211", "or51132_qam", "or51132_vsb", "bluebird", | 27 | "or51211", "or51132_qam", "or51132_vsb", "bluebird", |
28 | "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2", "mpc718", | 28 | "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2", "mpc718", |
29 | "af9015"); | 29 | "af9015", "ngene"); |
30 | 30 | ||
31 | # Check args | 31 | # Check args |
32 | syntax() if (scalar(@ARGV) != 1); | 32 | syntax() if (scalar(@ARGV) != 1); |
@@ -39,7 +39,7 @@ for ($i=0; $i < scalar(@components); $i++) { | |||
39 | die $@ if $@; | 39 | die $@ if $@; |
40 | print STDERR <<EOF; | 40 | print STDERR <<EOF; |
41 | Firmware(s) $outfile extracted successfully. | 41 | Firmware(s) $outfile extracted successfully. |
42 | Now copy it(they) to either /usr/lib/hotplug/firmware or /lib/firmware | 42 | Now copy it(them) to either /usr/lib/hotplug/firmware or /lib/firmware |
43 | (depending on configuration of firmware hotplug). | 43 | (depending on configuration of firmware hotplug). |
44 | EOF | 44 | EOF |
45 | exit(0); | 45 | exit(0); |
@@ -549,6 +549,24 @@ sub af9015 { | |||
549 | close INFILE; | 549 | close INFILE; |
550 | } | 550 | } |
551 | 551 | ||
552 | sub ngene { | ||
553 | my $url = "http://www.digitaldevices.de/download/"; | ||
554 | my $file1 = "ngene_15.fw"; | ||
555 | my $hash1 = "d798d5a757121174f0dbc5f2833c0c85"; | ||
556 | my $file2 = "ngene_17.fw"; | ||
557 | my $hash2 = "26b687136e127b8ac24b81e0eeafc20b"; | ||
558 | |||
559 | checkstandard(); | ||
560 | |||
561 | wgetfile($file1, $url . $file1); | ||
562 | verify($file1, $hash1); | ||
563 | |||
564 | wgetfile($file2, $url . $file2); | ||
565 | verify($file2, $hash2); | ||
566 | |||
567 | "$file1, $file2"; | ||
568 | } | ||
569 | |||
552 | # --------------------------------------------------------------- | 570 | # --------------------------------------------------------------- |
553 | # Utilities | 571 | # Utilities |
554 | 572 | ||
@@ -667,6 +685,7 @@ sub delzero{ | |||
667 | sub syntax() { | 685 | sub syntax() { |
668 | print STDERR "syntax: get_dvb_firmware <component>\n"; | 686 | print STDERR "syntax: get_dvb_firmware <component>\n"; |
669 | print STDERR "Supported components:\n"; | 687 | print STDERR "Supported components:\n"; |
688 | @components = sort @components; | ||
670 | for($i=0; $i < scalar(@components); $i++) { | 689 | for($i=0; $i < scalar(@components); $i++) { |
671 | print STDERR "\t" . $components[$i] . "\n"; | 690 | print STDERR "\t" . $components[$i] . "\n"; |
672 | } | 691 | } |
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt index 60e361ba08c0..f297fc1202ae 100644 --- a/Documentation/eisa.txt +++ b/Documentation/eisa.txt | |||
@@ -171,7 +171,7 @@ device. | |||
171 | virtual_root.force_probe : | 171 | virtual_root.force_probe : |
172 | 172 | ||
173 | Force the probing code to probe EISA slots even when it cannot find an | 173 | Force the probing code to probe EISA slots even when it cannot find an |
174 | EISA compliant mainboard (nothing appears on slot 0). Defaultd to 0 | 174 | EISA compliant mainboard (nothing appears on slot 0). Defaults to 0 |
175 | (don't force), and set to 1 (force probing) when either | 175 | (don't force), and set to 1 (force probing) when either |
176 | CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set. | 176 | CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set. |
177 | 177 | ||
diff --git a/Documentation/email-clients.txt b/Documentation/email-clients.txt index a618efab7b15..945ff3fda433 100644 --- a/Documentation/email-clients.txt +++ b/Documentation/email-clients.txt | |||
@@ -216,26 +216,14 @@ Works. Use "Insert file..." or external editor. | |||
216 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 216 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
217 | Gmail (Web GUI) | 217 | Gmail (Web GUI) |
218 | 218 | ||
219 | If you just have to use Gmail to send patches, it CAN be made to work. It | 219 | Does not work for sending patches. |
220 | requires a bit of external help, though. | 220 | |
221 | 221 | Gmail web client converts tabs to spaces automatically. | |
222 | The first problem is that Gmail converts tabs to spaces. This will | 222 | |
223 | totally break your patches. To prevent this, you have to use a different | 223 | At the same time it wraps lines every 78 chars with CRLF style line breaks |
224 | editor. There is a firefox extension called "ViewSourceWith" | 224 | although tab2space problem can be solved with external editor. |
225 | (https://addons.mozilla.org/en-US/firefox/addon/394) which allows you to | 225 | |
226 | edit any text box in the editor of your choice. Configure it to launch | 226 | Another problem is that Gmail will base64-encode any message that has a |
227 | your favorite editor. When you want to send a patch, use this technique. | 227 | non-ASCII character. That includes things like European names. |
228 | Once you have crafted your messsage + patch, save and exit the editor, | ||
229 | which should reload the Gmail edit box. GMAIL WILL PRESERVE THE TABS. | ||
230 | Hoorah. Apparently you can cut-n-paste literal tabs, but Gmail will | ||
231 | convert those to spaces upon sending! | ||
232 | |||
233 | The second problem is that Gmail converts tabs to spaces on replies. If | ||
234 | you reply to a patch, don't expect to be able to apply it as a patch. | ||
235 | |||
236 | The last problem is that Gmail will base64-encode any message that has a | ||
237 | non-ASCII character. That includes things like European names. Be aware. | ||
238 | |||
239 | Gmail is not convenient for lkml patches, but CAN be made to work. | ||
240 | 228 | ||
241 | ### | 229 | ### |
diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt new file mode 100644 index 000000000000..7a9d3d81525b --- /dev/null +++ b/Documentation/fault-injection/provoke-crashes.txt | |||
@@ -0,0 +1,38 @@ | |||
1 | The lkdtm module provides an interface to crash or injure the kernel at | ||
2 | predefined crashpoints to evaluate the reliability of crash dumps obtained | ||
3 | using different dumping solutions. The module uses KPROBEs to instrument | ||
4 | crashing points, but can also crash the kernel directly without KRPOBE | ||
5 | support. | ||
6 | |||
7 | |||
8 | You can provide the way either through module arguments when inserting | ||
9 | the module, or through a debugfs interface. | ||
10 | |||
11 | Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<> | ||
12 | [cpoint_count={>0}] | ||
13 | |||
14 | recur_count : Recursion level for the stack overflow test. Default is 10. | ||
15 | |||
16 | cpoint_name : Crash point where the kernel is to be crashed. It can be | ||
17 | one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, | ||
18 | FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, | ||
19 | IDE_CORE_CP, DIRECT | ||
20 | |||
21 | cpoint_type : Indicates the action to be taken on hitting the crash point. | ||
22 | It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW, | ||
23 | CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION, | ||
24 | WRITE_AFTER_FREE, | ||
25 | |||
26 | cpoint_count : Indicates the number of times the crash point is to be hit | ||
27 | to trigger an action. The default is 10. | ||
28 | |||
29 | You can also induce failures by mounting debugfs and writing the type to | ||
30 | <mountpoint>/provoke-crash/<crashpoint>. E.g., | ||
31 | |||
32 | mount -t debugfs debugfs /mnt | ||
33 | echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY | ||
34 | |||
35 | |||
36 | A special file is `DIRECT' which will induce the crash directly without | ||
37 | KPROBE instrumentation. This mode is the only one available when the module | ||
38 | is built on a kernel without KPROBEs support. | ||
diff --git a/Documentation/fb/imacfb.txt b/Documentation/fb/efifb.txt index 316ec9bb7deb..a59916c29b33 100644 --- a/Documentation/fb/imacfb.txt +++ b/Documentation/fb/efifb.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | 1 | ||
2 | What is imacfb? | 2 | What is efifb? |
3 | =============== | 3 | =============== |
4 | 4 | ||
5 | This is a generic EFI platform driver for Intel based Apple computers. | 5 | This is a generic EFI platform driver for Intel based Apple computers. |
6 | Imacfb is only for EFI booted Intel Macs. | 6 | efifb is only for EFI booted Intel Macs. |
7 | 7 | ||
8 | Supported Hardware | 8 | Supported Hardware |
9 | ================== | 9 | ================== |
@@ -16,16 +16,16 @@ MacMini | |||
16 | How to use it? | 16 | How to use it? |
17 | ============== | 17 | ============== |
18 | 18 | ||
19 | Imacfb does not have any kind of autodetection of your machine. | 19 | efifb does not have any kind of autodetection of your machine. |
20 | You have to add the following kernel parameters in your elilo.conf: | 20 | You have to add the following kernel parameters in your elilo.conf: |
21 | Macbook : | 21 | Macbook : |
22 | video=imacfb:macbook | 22 | video=efifb:macbook |
23 | MacMini : | 23 | MacMini : |
24 | video=imacfb:mini | 24 | video=efifb:mini |
25 | Macbook Pro 15", iMac 17" : | 25 | Macbook Pro 15", iMac 17" : |
26 | video=imacfb:i17 | 26 | video=efifb:i17 |
27 | Macbook Pro 17", iMac 20" : | 27 | Macbook Pro 17", iMac 20" : |
28 | video=imacfb:i20 | 28 | video=efifb:i20 |
29 | 29 | ||
30 | -- | 30 | -- |
31 | Edgar Hucek <gimli@dark-green.com> | 31 | Edgar Hucek <gimli@dark-green.com> |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 88a216d18092..672be0109d02 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -6,21 +6,6 @@ be removed from this file. | |||
6 | 6 | ||
7 | --------------------------- | 7 | --------------------------- |
8 | 8 | ||
9 | What: USER_SCHED | ||
10 | When: 2.6.34 | ||
11 | |||
12 | Why: USER_SCHED was implemented as a proof of concept for group scheduling. | ||
13 | The effect of USER_SCHED can already be achieved from userspace with | ||
14 | the help of libcgroup. The removal of USER_SCHED will also simplify | ||
15 | the scheduler code with the removal of one major ifdef. There are also | ||
16 | issues USER_SCHED has with USER_NS. A decision was taken not to fix | ||
17 | those and instead remove USER_SCHED. Also new group scheduling | ||
18 | features will not be implemented for USER_SCHED. | ||
19 | |||
20 | Who: Dhaval Giani <dhaval@linux.vnet.ibm.com> | ||
21 | |||
22 | --------------------------- | ||
23 | |||
24 | What: PRISM54 | 9 | What: PRISM54 |
25 | When: 2.6.34 | 10 | When: 2.6.34 |
26 | 11 | ||
@@ -64,6 +49,17 @@ Who: Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com> | |||
64 | 49 | ||
65 | --------------------------- | 50 | --------------------------- |
66 | 51 | ||
52 | What: Deprecated snapshot ioctls | ||
53 | When: 2.6.36 | ||
54 | |||
55 | Why: The ioctls in kernel/power/user.c were marked as deprecated long time | ||
56 | ago. Now they notify users about that so that they need to replace | ||
57 | their userspace. After some more time, remove them completely. | ||
58 | |||
59 | Who: Jiri Slaby <jirislaby@gmail.com> | ||
60 | |||
61 | --------------------------- | ||
62 | |||
67 | What: The ieee80211_regdom module parameter | 63 | What: The ieee80211_regdom module parameter |
68 | When: March 2010 / desktop catchup | 64 | When: March 2010 / desktop catchup |
69 | 65 | ||
@@ -88,27 +84,6 @@ Who: Luis R. Rodriguez <lrodriguez@atheros.com> | |||
88 | 84 | ||
89 | --------------------------- | 85 | --------------------------- |
90 | 86 | ||
91 | What: CONFIG_WIRELESS_OLD_REGULATORY - old static regulatory information | ||
92 | When: March 2010 / desktop catchup | ||
93 | |||
94 | Why: The old regulatory infrastructure has been replaced with a new one | ||
95 | which does not require statically defined regulatory domains. We do | ||
96 | not want to keep static regulatory domains in the kernel due to the | ||
97 | the dynamic nature of regulatory law and localization. We kept around | ||
98 | the old static definitions for the regulatory domains of: | ||
99 | |||
100 | * US | ||
101 | * JP | ||
102 | * EU | ||
103 | |||
104 | and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was | ||
105 | set. We will remove this option once the standard Linux desktop catches | ||
106 | up with the new userspace APIs we have implemented. | ||
107 | |||
108 | Who: Luis R. Rodriguez <lrodriguez@atheros.com> | ||
109 | |||
110 | --------------------------- | ||
111 | |||
112 | What: dev->power.power_state | 87 | What: dev->power.power_state |
113 | When: July 2007 | 88 | When: July 2007 |
114 | Why: Broken design for runtime control over driver power states, confusing | 89 | Why: Broken design for runtime control over driver power states, confusing |
@@ -142,19 +117,25 @@ Who: Mauro Carvalho Chehab <mchehab@infradead.org> | |||
142 | --------------------------- | 117 | --------------------------- |
143 | 118 | ||
144 | What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) | 119 | What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) |
145 | When: November 2005 | 120 | When: 2.6.35/2.6.36 |
146 | Files: drivers/pcmcia/: pcmcia_ioctl.c | 121 | Files: drivers/pcmcia/: pcmcia_ioctl.c |
147 | Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a | 122 | Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a |
148 | normal hotpluggable bus, and with it using the default kernel | 123 | normal hotpluggable bus, and with it using the default kernel |
149 | infrastructure (hotplug, driver core, sysfs) keeping the PCMCIA | 124 | infrastructure (hotplug, driver core, sysfs) keeping the PCMCIA |
150 | control ioctl needed by cardmgr and cardctl from pcmcia-cs is | 125 | control ioctl needed by cardmgr and cardctl from pcmcia-cs is |
151 | unnecessary, and makes further cleanups and integration of the | 126 | unnecessary and potentially harmful (it does not provide for |
127 | proper locking), and makes further cleanups and integration of the | ||
152 | PCMCIA subsystem into the Linux kernel device driver model more | 128 | PCMCIA subsystem into the Linux kernel device driver model more |
153 | difficult. The features provided by cardmgr and cardctl are either | 129 | difficult. The features provided by cardmgr and cardctl are either |
154 | handled by the kernel itself now or are available in the new | 130 | handled by the kernel itself now or are available in the new |
155 | pcmciautils package available at | 131 | pcmciautils package available at |
156 | http://kernel.org/pub/linux/utils/kernel/pcmcia/ | 132 | http://kernel.org/pub/linux/utils/kernel/pcmcia/ |
157 | Who: Dominik Brodowski <linux@brodo.de> | 133 | |
134 | For all architectures except ARM, the associated config symbol | ||
135 | has been removed from kernel 2.6.34; for ARM, it will be likely | ||
136 | be removed from kernel 2.6.35. The actual code will then likely | ||
137 | be removed from kernel 2.6.36. | ||
138 | Who: Dominik Brodowski <linux@dominikbrodowski.net> | ||
158 | 139 | ||
159 | --------------------------- | 140 | --------------------------- |
160 | 141 | ||
@@ -260,16 +241,6 @@ Who: Thomas Gleixner <tglx@linutronix.de> | |||
260 | 241 | ||
261 | --------------------------- | 242 | --------------------------- |
262 | 243 | ||
263 | What (Why): | ||
264 | - xt_recent: the old ipt_recent proc dir | ||
265 | (superseded by /proc/net/xt_recent) | ||
266 | |||
267 | When: January 2009 or Linux 2.7.0, whichever comes first | ||
268 | Why: Superseded by newer revisions or modules | ||
269 | Who: Jan Engelhardt <jengelh@computergmbh.de> | ||
270 | |||
271 | --------------------------- | ||
272 | |||
273 | What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib | 244 | What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib |
274 | When: February 2010 | 245 | When: February 2010 |
275 | Why: All callers should use explicit gpio_request()/gpio_free(). | 246 | Why: All callers should use explicit gpio_request()/gpio_free(). |
@@ -468,12 +439,6 @@ Who: Alok N Kataria <akataria@vmware.com> | |||
468 | 439 | ||
469 | ---------------------------- | 440 | ---------------------------- |
470 | 441 | ||
471 | What: adt7473 hardware monitoring driver | ||
472 | When: February 2010 | ||
473 | Why: Obsoleted by the adt7475 driver. | ||
474 | Who: Jean Delvare <khali@linux-fr.org> | ||
475 | |||
476 | --------------------------- | ||
477 | What: Support for lcd_switch and display_get in asus-laptop driver | 442 | What: Support for lcd_switch and display_get in asus-laptop driver |
478 | When: March 2010 | 443 | When: March 2010 |
479 | Why: These two features use non-standard interfaces. There are the | 444 | Why: These two features use non-standard interfaces. There are the |
@@ -545,6 +510,142 @@ Who: Hans de Goede <hdegoede@redhat.com> | |||
545 | 510 | ||
546 | ---------------------------- | 511 | ---------------------------- |
547 | 512 | ||
513 | What: sysfs-class-rfkill state file | ||
514 | When: Feb 2014 | ||
515 | Files: net/rfkill/core.c | ||
516 | Why: Documented as obsolete since Feb 2010. This file is limited to 3 | ||
517 | states while the rfkill drivers can have 4 states. | ||
518 | Who: anybody or Florian Mickler <florian@mickler.org> | ||
519 | |||
520 | ---------------------------- | ||
521 | |||
522 | What: sysfs-class-rfkill claim file | ||
523 | When: Feb 2012 | ||
524 | Files: net/rfkill/core.c | ||
525 | Why: It is not possible to claim an rfkill driver since 2007. This is | ||
526 | Documented as obsolete since Feb 2010. | ||
527 | Who: anybody or Florian Mickler <florian@mickler.org> | ||
528 | |||
529 | ---------------------------- | ||
530 | |||
531 | What: capifs | ||
532 | When: February 2011 | ||
533 | Files: drivers/isdn/capi/capifs.* | ||
534 | Why: udev fully replaces this special file system that only contains CAPI | ||
535 | NCCI TTY device nodes. User space (pppdcapiplugin) works without | ||
536 | noticing the difference. | ||
537 | Who: Jan Kiszka <jan.kiszka@web.de> | ||
538 | |||
539 | ---------------------------- | ||
540 | |||
541 | What: KVM memory aliases support | ||
542 | When: July 2010 | ||
543 | Why: Memory aliasing support is used for speeding up guest vga access | ||
544 | through the vga windows. | ||
545 | |||
546 | Modern userspace no longer uses this feature, so it's just bitrotted | ||
547 | code and can be removed with no impact. | ||
548 | Who: Avi Kivity <avi@redhat.com> | ||
549 | |||
550 | ---------------------------- | ||
551 | |||
552 | What: xtime, wall_to_monotonic | ||
553 | When: 2.6.36+ | ||
554 | Files: kernel/time/timekeeping.c include/linux/time.h | ||
555 | Why: Cleaning up timekeeping internal values. Please use | ||
556 | existing timekeeping accessor functions to access | ||
557 | the equivalent functionality. | ||
558 | Who: John Stultz <johnstul@us.ibm.com> | ||
559 | |||
560 | ---------------------------- | ||
561 | |||
562 | What: KVM kernel-allocated memory slots | ||
563 | When: July 2010 | ||
564 | Why: Since 2.6.25, kvm supports user-allocated memory slots, which are | ||
565 | much more flexible than kernel-allocated slots. All current userspace | ||
566 | supports the newer interface and this code can be removed with no | ||
567 | impact. | ||
568 | Who: Avi Kivity <avi@redhat.com> | ||
569 | |||
570 | ---------------------------- | ||
571 | |||
572 | What: KVM paravirt mmu host support | ||
573 | When: January 2011 | ||
574 | Why: The paravirt mmu host support is slower than non-paravirt mmu, both | ||
575 | on newer and older hardware. It is already not exposed to the guest, | ||
576 | and kept only for live migration purposes. | ||
577 | Who: Avi Kivity <avi@redhat.com> | ||
578 | |||
579 | ---------------------------- | ||
580 | |||
581 | What: "acpi=ht" boot option | ||
582 | When: 2.6.35 | ||
583 | Why: Useful in 2003, implementation is a hack. | ||
584 | Generally invoked by accident today. | ||
585 | Seen as doing more harm than good. | ||
586 | Who: Len Brown <len.brown@intel.com> | ||
587 | |||
588 | ---------------------------- | ||
589 | |||
590 | What: iwlwifi 50XX module parameters | ||
591 | When: 2.6.40 | ||
592 | Why: The "..50" modules parameters were used to configure 5000 series and | ||
593 | up devices; different set of module parameters also available for 4965 | ||
594 | with same functionalities. Consolidate both set into single place | ||
595 | in drivers/net/wireless/iwlwifi/iwl-agn.c | ||
596 | |||
597 | Who: Wey-Yi Guy <wey-yi.w.guy@intel.com> | ||
598 | |||
599 | ---------------------------- | ||
600 | |||
601 | What: iwl4965 alias support | ||
602 | When: 2.6.40 | ||
603 | Why: Internal alias support has been present in module-init-tools for some | ||
604 | time, the MODULE_ALIAS("iwl4965") boilerplate aliases can be removed | ||
605 | with no impact. | ||
606 | |||
607 | Who: Wey-Yi Guy <wey-yi.w.guy@intel.com> | ||
608 | |||
609 | --------------------------- | ||
610 | |||
611 | What: xt_NOTRACK | ||
612 | Files: net/netfilter/xt_NOTRACK.c | ||
613 | When: April 2011 | ||
614 | Why: Superseded by xt_CT | ||
615 | Who: Netfilter developer team <netfilter-devel@vger.kernel.org> | ||
616 | |||
617 | --------------------------- | ||
618 | |||
619 | What: video4linux /dev/vtx teletext API support | ||
620 | When: 2.6.35 | ||
621 | Files: drivers/media/video/saa5246a.c drivers/media/video/saa5249.c | ||
622 | include/linux/videotext.h | ||
623 | Why: The vtx device nodes have been superseded by vbi device nodes | ||
624 | for many years. No applications exist that use the vtx support. | ||
625 | Of the two i2c drivers that actually support this API the saa5249 | ||
626 | has been impossible to use for a year now and no known hardware | ||
627 | that supports this device exists. The saa5246a is theoretically | ||
628 | supported by the old mxb boards, but it never actually worked. | ||
629 | |||
630 | In summary: there is no hardware that can use this API and there | ||
631 | are no applications actually implementing this API. | ||
632 | |||
633 | The vtx support still reserves minors 192-223 and we would really | ||
634 | like to reuse those for upcoming new functionality. In the unlikely | ||
635 | event that new hardware appears that wants to use the functionality | ||
636 | provided by the vtx API, then that functionality should be build | ||
637 | around the sliced VBI API instead. | ||
638 | Who: Hans Verkuil <hverkuil@xs4all.nl> | ||
639 | |||
640 | ---------------------------- | ||
641 | |||
642 | What: IRQF_DISABLED | ||
643 | When: 2.6.36 | ||
644 | Why: The flag is a NOOP as we run interrupt handlers with interrupts disabled | ||
645 | Who: Thomas Gleixner <tglx@linutronix.de> | ||
646 | |||
647 | ---------------------------- | ||
648 | |||
548 | What: old ieee1394 subsystem (CONFIG_IEEE1394) | 649 | What: old ieee1394 subsystem (CONFIG_IEEE1394) |
549 | When: 2.6.37 | 650 | When: 2.6.37 |
550 | Files: drivers/ieee1394/ except init_ohci1394_dma.c | 651 | Files: drivers/ieee1394/ except init_ohci1394_dma.c |
@@ -552,3 +653,6 @@ Why: superseded by drivers/firewire/ (CONFIG_FIREWIRE) which offers more | |||
552 | features, better performance, and better security, all with smaller | 653 | features, better performance, and better security, all with smaller |
553 | and more modern code base | 654 | and more modern code base |
554 | Who: Stefan Richter <stefanr@s5r6.in-berlin.de> | 655 | Who: Stefan Richter <stefanr@s5r6.in-berlin.de> |
656 | |||
657 | ---------------------------- | ||
658 | |||
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 875d49696b6e..4303614b5add 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX | |||
@@ -16,6 +16,8 @@ befs.txt | |||
16 | - information about the BeOS filesystem for Linux. | 16 | - information about the BeOS filesystem for Linux. |
17 | bfs.txt | 17 | bfs.txt |
18 | - info for the SCO UnixWare Boot Filesystem (BFS). | 18 | - info for the SCO UnixWare Boot Filesystem (BFS). |
19 | ceph.txt | ||
20 | - info for the Ceph Distributed File System | ||
19 | cifs.txt | 21 | cifs.txt |
20 | - description of the CIFS filesystem. | 22 | - description of the CIFS filesystem. |
21 | coda.txt | 23 | coda.txt |
@@ -32,6 +34,8 @@ dlmfs.txt | |||
32 | - info on the userspace interface to the OCFS2 DLM. | 34 | - info on the userspace interface to the OCFS2 DLM. |
33 | dnotify.txt | 35 | dnotify.txt |
34 | - info about directory notification in Linux. | 36 | - info about directory notification in Linux. |
37 | dnotify_test.c | ||
38 | - example program for dnotify | ||
35 | ecryptfs.txt | 39 | ecryptfs.txt |
36 | - docs on eCryptfs: stacked cryptographic filesystem for Linux. | 40 | - docs on eCryptfs: stacked cryptographic filesystem for Linux. |
37 | exofs.txt | 41 | exofs.txt |
@@ -62,6 +66,8 @@ jfs.txt | |||
62 | - info and mount options for the JFS filesystem. | 66 | - info and mount options for the JFS filesystem. |
63 | locks.txt | 67 | locks.txt |
64 | - info on file locking implementations, flock() vs. fcntl(), etc. | 68 | - info on file locking implementations, flock() vs. fcntl(), etc. |
69 | logfs.txt | ||
70 | - info on the LogFS flash filesystem. | ||
65 | mandatory-locking.txt | 71 | mandatory-locking.txt |
66 | - info on the Linux implementation of Sys V mandatory file locking. | 72 | - info on the Linux implementation of Sys V mandatory file locking. |
67 | ncpfs.txt | 73 | ncpfs.txt |
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index 57e0b80a5274..c0236e753bc8 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt | |||
@@ -37,6 +37,15 @@ For Plan 9 From User Space applications (http://swtch.com/plan9) | |||
37 | 37 | ||
38 | mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER | 38 | mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER |
39 | 39 | ||
40 | For server running on QEMU host with virtio transport: | ||
41 | |||
42 | mount -t 9p -o trans=virtio <mount_tag> /mnt/9 | ||
43 | |||
44 | where mount_tag is the tag associated by the server to each of the exported | ||
45 | mount points. Each 9P export is seen by the client as a virtio device with an | ||
46 | associated "mount_tag" property. Available mount tags can be | ||
47 | seen by reading /sys/bus/virtio/drivers/9pnet_virtio/virtio<n>/mount_tag files. | ||
48 | |||
40 | OPTIONS | 49 | OPTIONS |
41 | ======= | 50 | ======= |
42 | 51 | ||
@@ -47,7 +56,7 @@ OPTIONS | |||
47 | fd - used passed file descriptors for connection | 56 | fd - used passed file descriptors for connection |
48 | (see rfdno and wfdno) | 57 | (see rfdno and wfdno) |
49 | virtio - connect to the next virtio channel available | 58 | virtio - connect to the next virtio channel available |
50 | (from lguest or KVM with trans_virtio module) | 59 | (from QEMU with trans_virtio module) |
51 | rdma - connect to a specified RDMA channel | 60 | rdma - connect to a specified RDMA channel |
52 | 61 | ||
53 | uname=name user name to attempt mount as on the remote server. The | 62 | uname=name user name to attempt mount as on the remote server. The |
@@ -85,7 +94,12 @@ OPTIONS | |||
85 | 94 | ||
86 | port=n port to connect to on the remote server | 95 | port=n port to connect to on the remote server |
87 | 96 | ||
88 | noextend force legacy mode (no 9p2000.u semantics) | 97 | noextend force legacy mode (no 9p2000.u or 9p2000.L semantics) |
98 | |||
99 | version=name Select 9P protocol version. Valid options are: | ||
100 | 9p2000 - Legacy mode (same as noextend) | ||
101 | 9p2000.u - Use 9P2000.u protocol | ||
102 | 9p2000.L - Use 9P2000.L protocol | ||
89 | 103 | ||
90 | dfltuid attempt to mount as a particular uid | 104 | dfltuid attempt to mount as a particular uid |
91 | 105 | ||
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca0630..61c98f03baa1 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -178,7 +178,7 @@ prototypes: | |||
178 | locking rules: | 178 | locking rules: |
179 | All except set_page_dirty may block | 179 | All except set_page_dirty may block |
180 | 180 | ||
181 | BKL PageLocked(page) i_sem | 181 | BKL PageLocked(page) i_mutex |
182 | writepage: no yes, unlocks (see below) | 182 | writepage: no yes, unlocks (see below) |
183 | readpage: no yes, unlocks | 183 | readpage: no yes, unlocks |
184 | sync_page: no maybe | 184 | sync_page: no maybe |
@@ -429,8 +429,9 @@ check_flags: no | |||
429 | implementations. If your fs is not using generic_file_llseek, you | 429 | implementations. If your fs is not using generic_file_llseek, you |
430 | need to acquire and release the appropriate locks in your ->llseek(). | 430 | need to acquire and release the appropriate locks in your ->llseek(). |
431 | For many filesystems, it is probably safe to acquire the inode | 431 | For many filesystems, it is probably safe to acquire the inode |
432 | semaphore. Note some filesystems (i.e. remote ones) provide no | 432 | mutex or just to use i_size_read() instead. |
433 | protection for i_size so you will need to use the BKL. | 433 | Note: this does not protect the file->f_pos against concurrent modifications |
434 | since this is something the userspace has to take care about. | ||
434 | 435 | ||
435 | Note: ext2_release() was *the* source of contention on fs-intensive | 436 | Note: ext2_release() was *the* source of contention on fs-intensive |
436 | loads and dropping BKL on ->release() helps to get rid of that (we still | 437 | loads and dropping BKL on ->release() helps to get rid of that (we still |
@@ -460,13 +461,6 @@ in sys_read() and friends. | |||
460 | 461 | ||
461 | --------------------------- dquot_operations ------------------------------- | 462 | --------------------------- dquot_operations ------------------------------- |
462 | prototypes: | 463 | prototypes: |
463 | int (*initialize) (struct inode *, int); | ||
464 | int (*drop) (struct inode *); | ||
465 | int (*alloc_space) (struct inode *, qsize_t, int); | ||
466 | int (*alloc_inode) (const struct inode *, unsigned long); | ||
467 | int (*free_space) (struct inode *, qsize_t); | ||
468 | int (*free_inode) (const struct inode *, unsigned long); | ||
469 | int (*transfer) (struct inode *, struct iattr *); | ||
470 | int (*write_dquot) (struct dquot *); | 464 | int (*write_dquot) (struct dquot *); |
471 | int (*acquire_dquot) (struct dquot *); | 465 | int (*acquire_dquot) (struct dquot *); |
472 | int (*release_dquot) (struct dquot *); | 466 | int (*release_dquot) (struct dquot *); |
@@ -479,13 +473,6 @@ a proper locking wrt the filesystem and call the generic quota operations. | |||
479 | What filesystem should expect from the generic quota functions: | 473 | What filesystem should expect from the generic quota functions: |
480 | 474 | ||
481 | FS recursion Held locks when called | 475 | FS recursion Held locks when called |
482 | initialize: yes maybe dqonoff_sem | ||
483 | drop: yes - | ||
484 | alloc_space: ->mark_dirty() - | ||
485 | alloc_inode: ->mark_dirty() - | ||
486 | free_space: ->mark_dirty() - | ||
487 | free_inode: ->mark_dirty() - | ||
488 | transfer: yes - | ||
489 | write_dquot: yes dqonoff_sem or dqptr_sem | 476 | write_dquot: yes dqonoff_sem or dqptr_sem |
490 | acquire_dquot: yes dqonoff_sem or dqptr_sem | 477 | acquire_dquot: yes dqonoff_sem or dqptr_sem |
491 | release_dquot: yes dqonoff_sem or dqptr_sem | 478 | release_dquot: yes dqonoff_sem or dqptr_sem |
@@ -495,10 +482,6 @@ write_info: yes dqonoff_sem | |||
495 | FS recursion means calling ->quota_read() and ->quota_write() from superblock | 482 | FS recursion means calling ->quota_read() and ->quota_write() from superblock |
496 | operations. | 483 | operations. |
497 | 484 | ||
498 | ->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called | ||
499 | only directly by the filesystem and do not call any fs functions only | ||
500 | the ->mark_dirty() operation. | ||
501 | |||
502 | More details about quota locking can be found in fs/dquot.c. | 485 | More details about quota locking can be found in fs/dquot.c. |
503 | 486 | ||
504 | --------------------------- vm_operations_struct ----------------------------- | 487 | --------------------------- vm_operations_struct ----------------------------- |
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile new file mode 100644 index 000000000000..a5dd114da14f --- /dev/null +++ b/Documentation/filesystems/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # kbuild trick to avoid linker error. Can be omitted if a module is built. | ||
2 | obj- := dummy.o | ||
3 | |||
4 | # List of programs to build | ||
5 | hostprogs-y := dnotify_test | ||
6 | |||
7 | # Tell kbuild to always build the programs | ||
8 | always := $(hostprogs-y) | ||
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 8f78ded4b648..51986bf08a4d 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt | |||
@@ -146,7 +146,7 @@ found to be inadequate, in this case. The Generic Netlink system was | |||
146 | used for this as raw Netlink would lead to a significant increase in | 146 | used for this as raw Netlink would lead to a significant increase in |
147 | complexity. There's no question that the Generic Netlink system is an | 147 | complexity. There's no question that the Generic Netlink system is an |
148 | elegant solution for common case ioctl functions but it's not a complete | 148 | elegant solution for common case ioctl functions but it's not a complete |
149 | replacement probably because it's primary purpose in life is to be a | 149 | replacement probably because its primary purpose in life is to be a |
150 | message bus implementation rather than specifically an ioctl replacement. | 150 | message bus implementation rather than specifically an ioctl replacement. |
151 | While it would be possible to work around this there is one concern | 151 | While it would be possible to work around this there is one concern |
152 | that lead to the decision to not use it. This is that the autofs | 152 | that lead to the decision to not use it. This is that the autofs |
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt new file mode 100644 index 000000000000..763d8ebbbebd --- /dev/null +++ b/Documentation/filesystems/ceph.txt | |||
@@ -0,0 +1,140 @@ | |||
1 | Ceph Distributed File System | ||
2 | ============================ | ||
3 | |||
4 | Ceph is a distributed network file system designed to provide good | ||
5 | performance, reliability, and scalability. | ||
6 | |||
7 | Basic features include: | ||
8 | |||
9 | * POSIX semantics | ||
10 | * Seamless scaling from 1 to many thousands of nodes | ||
11 | * High availability and reliability. No single point of failure. | ||
12 | * N-way replication of data across storage nodes | ||
13 | * Fast recovery from node failures | ||
14 | * Automatic rebalancing of data on node addition/removal | ||
15 | * Easy deployment: most FS components are userspace daemons | ||
16 | |||
17 | Also, | ||
18 | * Flexible snapshots (on any directory) | ||
19 | * Recursive accounting (nested files, directories, bytes) | ||
20 | |||
21 | In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely | ||
22 | on symmetric access by all clients to shared block devices, Ceph | ||
23 | separates data and metadata management into independent server | ||
24 | clusters, similar to Lustre. Unlike Lustre, however, metadata and | ||
25 | storage nodes run entirely as user space daemons. Storage nodes | ||
26 | utilize btrfs to store data objects, leveraging its advanced features | ||
27 | (checksumming, metadata replication, etc.). File data is striped | ||
28 | across storage nodes in large chunks to distribute workload and | ||
29 | facilitate high throughputs. When storage nodes fail, data is | ||
30 | re-replicated in a distributed fashion by the storage nodes themselves | ||
31 | (with some minimal coordination from a cluster monitor), making the | ||
32 | system extremely efficient and scalable. | ||
33 | |||
34 | Metadata servers effectively form a large, consistent, distributed | ||
35 | in-memory cache above the file namespace that is extremely scalable, | ||
36 | dynamically redistributes metadata in response to workload changes, | ||
37 | and can tolerate arbitrary (well, non-Byzantine) node failures. The | ||
38 | metadata server takes a somewhat unconventional approach to metadata | ||
39 | storage to significantly improve performance for common workloads. In | ||
40 | particular, inodes with only a single link are embedded in | ||
41 | directories, allowing entire directories of dentries and inodes to be | ||
42 | loaded into its cache with a single I/O operation. The contents of | ||
43 | extremely large directories can be fragmented and managed by | ||
44 | independent metadata servers, allowing scalable concurrent access. | ||
45 | |||
46 | The system offers automatic data rebalancing/migration when scaling | ||
47 | from a small cluster of just a few nodes to many hundreds, without | ||
48 | requiring an administrator carve the data set into static volumes or | ||
49 | go through the tedious process of migrating data between servers. | ||
50 | When the file system approaches full, new nodes can be easily added | ||
51 | and things will "just work." | ||
52 | |||
53 | Ceph includes flexible snapshot mechanism that allows a user to create | ||
54 | a snapshot on any subdirectory (and its nested contents) in the | ||
55 | system. Snapshot creation and deletion are as simple as 'mkdir | ||
56 | .snap/foo' and 'rmdir .snap/foo'. | ||
57 | |||
58 | Ceph also provides some recursive accounting on directories for nested | ||
59 | files and bytes. That is, a 'getfattr -d foo' on any directory in the | ||
60 | system will reveal the total number of nested regular files and | ||
61 | subdirectories, and a summation of all nested file sizes. This makes | ||
62 | the identification of large disk space consumers relatively quick, as | ||
63 | no 'du' or similar recursive scan of the file system is required. | ||
64 | |||
65 | |||
66 | Mount Syntax | ||
67 | ============ | ||
68 | |||
69 | The basic mount syntax is: | ||
70 | |||
71 | # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt | ||
72 | |||
73 | You only need to specify a single monitor, as the client will get the | ||
74 | full list when it connects. (However, if the monitor you specify | ||
75 | happens to be down, the mount won't succeed.) The port can be left | ||
76 | off if the monitor is using the default. So if the monitor is at | ||
77 | 1.2.3.4, | ||
78 | |||
79 | # mount -t ceph 1.2.3.4:/ /mnt/ceph | ||
80 | |||
81 | is sufficient. If /sbin/mount.ceph is installed, a hostname can be | ||
82 | used instead of an IP address. | ||
83 | |||
84 | |||
85 | |||
86 | Mount Options | ||
87 | ============= | ||
88 | |||
89 | ip=A.B.C.D[:N] | ||
90 | Specify the IP and/or port the client should bind to locally. | ||
91 | There is normally not much reason to do this. If the IP is not | ||
92 | specified, the client's IP address is determined by looking at the | ||
93 | address its connection to the monitor originates from. | ||
94 | |||
95 | wsize=X | ||
96 | Specify the maximum write size in bytes. By default there is no | ||
97 | maximum. Ceph will normally size writes based on the file stripe | ||
98 | size. | ||
99 | |||
100 | rsize=X | ||
101 | Specify the maximum readahead. | ||
102 | |||
103 | mount_timeout=X | ||
104 | Specify the timeout value for mount (in seconds), in the case | ||
105 | of a non-responsive Ceph file system. The default is 30 | ||
106 | seconds. | ||
107 | |||
108 | rbytes | ||
109 | When stat() is called on a directory, set st_size to 'rbytes', | ||
110 | the summation of file sizes over all files nested beneath that | ||
111 | directory. This is the default. | ||
112 | |||
113 | norbytes | ||
114 | When stat() is called on a directory, set st_size to the | ||
115 | number of entries in that directory. | ||
116 | |||
117 | nocrc | ||
118 | Disable CRC32C calculation for data writes. If set, the storage node | ||
119 | must rely on TCP's error correction to detect data corruption | ||
120 | in the data payload. | ||
121 | |||
122 | noasyncreaddir | ||
123 | Disable client's use its local cache to satisfy readdir | ||
124 | requests. (This does not change correctness; the client uses | ||
125 | cached metadata only when a lease or capability ensures it is | ||
126 | valid.) | ||
127 | |||
128 | |||
129 | More Information | ||
130 | ================ | ||
131 | |||
132 | For more information on Ceph, see the home page at | ||
133 | http://ceph.newdream.net/ | ||
134 | |||
135 | The Linux kernel client source tree is available at | ||
136 | git://ceph.newdream.net/git/ceph-client.git | ||
137 | git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git | ||
138 | |||
139 | and the source for the full system is at | ||
140 | git://ceph.newdream.net/git/ceph.git | ||
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt index 4c0c575a4012..79334ed5daa7 100644 --- a/Documentation/filesystems/dentry-locking.txt +++ b/Documentation/filesystems/dentry-locking.txt | |||
@@ -62,7 +62,8 @@ changes are : | |||
62 | 2. Insertion of a dentry into the hash table is done using | 62 | 2. Insertion of a dentry into the hash table is done using |
63 | hlist_add_head_rcu() which take care of ordering the writes - the | 63 | hlist_add_head_rcu() which take care of ordering the writes - the |
64 | writes to the dentry must be visible before the dentry is | 64 | writes to the dentry must be visible before the dentry is |
65 | inserted. This works in conjunction with hlist_for_each_rcu() while | 65 | inserted. This works in conjunction with hlist_for_each_rcu(), |
66 | which has since been replaced by hlist_for_each_entry_rcu(), while | ||
66 | walking the hash chain. The only requirement is that all | 67 | walking the hash chain. The only requirement is that all |
67 | initialization to the dentry must be done before | 68 | initialization to the dentry must be done before |
68 | hlist_add_head_rcu() since we don't have dcache_lock protection | 69 | hlist_add_head_rcu() since we don't have dcache_lock protection |
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt index c50bbb2d52b4..1b528b2ad809 100644 --- a/Documentation/filesystems/dlmfs.txt +++ b/Documentation/filesystems/dlmfs.txt | |||
@@ -47,7 +47,7 @@ You'll want to start heartbeating on a volume which all the nodes in | |||
47 | your lockspace can access. The easiest way to do this is via | 47 | your lockspace can access. The easiest way to do this is via |
48 | ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires | 48 | ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires |
49 | that an OCFS2 file system be in place so that it can automatically | 49 | that an OCFS2 file system be in place so that it can automatically |
50 | find it's heartbeat area, though it will eventually support heartbeat | 50 | find its heartbeat area, though it will eventually support heartbeat |
51 | against raw disks. | 51 | against raw disks. |
52 | 52 | ||
53 | Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed | 53 | Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed |
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt index 9f5d338ddbb8..6baf88f46859 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.txt | |||
@@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. | |||
62 | 62 | ||
63 | Example | 63 | Example |
64 | ------- | 64 | ------- |
65 | See Documentation/filesystems/dnotify_test.c for an example. | ||
65 | 66 | ||
66 | #define _GNU_SOURCE /* needed to get the defines */ | 67 | NOTE |
67 | #include <fcntl.h> /* in glibc 2.2 this has the needed | 68 | ---- |
68 | values defined */ | 69 | Beginning with Linux 2.6.13, dnotify has been replaced by inotify. |
69 | #include <signal.h> | 70 | See Documentation/filesystems/inotify.txt for more information on it. |
70 | #include <stdio.h> | ||
71 | #include <unistd.h> | ||
72 | |||
73 | static volatile int event_fd; | ||
74 | |||
75 | static void handler(int sig, siginfo_t *si, void *data) | ||
76 | { | ||
77 | event_fd = si->si_fd; | ||
78 | } | ||
79 | |||
80 | int main(void) | ||
81 | { | ||
82 | struct sigaction act; | ||
83 | int fd; | ||
84 | |||
85 | act.sa_sigaction = handler; | ||
86 | sigemptyset(&act.sa_mask); | ||
87 | act.sa_flags = SA_SIGINFO; | ||
88 | sigaction(SIGRTMIN + 1, &act, NULL); | ||
89 | |||
90 | fd = open(".", O_RDONLY); | ||
91 | fcntl(fd, F_SETSIG, SIGRTMIN + 1); | ||
92 | fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); | ||
93 | /* we will now be notified if any of the files | ||
94 | in "." is modified or new files are created */ | ||
95 | while (1) { | ||
96 | pause(); | ||
97 | printf("Got event on fd=%d\n", event_fd); | ||
98 | } | ||
99 | } | ||
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c new file mode 100644 index 000000000000..8b37b4a1e18d --- /dev/null +++ b/Documentation/filesystems/dnotify_test.c | |||
@@ -0,0 +1,34 @@ | |||
1 | #define _GNU_SOURCE /* needed to get the defines */ | ||
2 | #include <fcntl.h> /* in glibc 2.2 this has the needed | ||
3 | values defined */ | ||
4 | #include <signal.h> | ||
5 | #include <stdio.h> | ||
6 | #include <unistd.h> | ||
7 | |||
8 | static volatile int event_fd; | ||
9 | |||
10 | static void handler(int sig, siginfo_t *si, void *data) | ||
11 | { | ||
12 | event_fd = si->si_fd; | ||
13 | } | ||
14 | |||
15 | int main(void) | ||
16 | { | ||
17 | struct sigaction act; | ||
18 | int fd; | ||
19 | |||
20 | act.sa_sigaction = handler; | ||
21 | sigemptyset(&act.sa_mask); | ||
22 | act.sa_flags = SA_SIGINFO; | ||
23 | sigaction(SIGRTMIN + 1, &act, NULL); | ||
24 | |||
25 | fd = open(".", O_RDONLY); | ||
26 | fcntl(fd, F_SETSIG, SIGRTMIN + 1); | ||
27 | fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); | ||
28 | /* we will now be notified if any of the files | ||
29 | in "." is modified or new files are created */ | ||
30 | while (1) { | ||
31 | pause(); | ||
32 | printf("Got event on fd=%d\n", event_fd); | ||
33 | } | ||
34 | } | ||
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 867c5b50cb42..272f80d5f966 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt | |||
@@ -59,8 +59,19 @@ commit=nrsec (*) Ext3 can be told to sync all its data and metadata | |||
59 | Setting it to very large values will improve | 59 | Setting it to very large values will improve |
60 | performance. | 60 | performance. |
61 | 61 | ||
62 | barrier=1 This enables/disables barriers. barrier=0 disables | 62 | barrier=<0(*)|1> This enables/disables the use of write barriers in |
63 | it, barrier=1 enables it. | 63 | barrier the jbd code. barrier=0 disables, barrier=1 enables. |
64 | nobarrier (*) This also requires an IO stack which can support | ||
65 | barriers, and if jbd gets an error on a barrier | ||
66 | write, it will disable again with a warning. | ||
67 | Write barriers enforce proper on-disk ordering | ||
68 | of journal commits, making volatile disk write caches | ||
69 | safe to use, at some performance penalty. If | ||
70 | your disks are battery-backed in one way or another, | ||
71 | disabling barriers may safely improve performance. | ||
72 | The mount options "barrier" and "nobarrier" can | ||
73 | also be used to enable or disable barriers, for | ||
74 | consistency with other ext3 mount options. | ||
64 | 75 | ||
65 | orlov (*) This enables the new Orlov block allocator. It is | 76 | orlov (*) This enables the new Orlov block allocator. It is |
66 | enabled by default. | 77 | enabled by default. |
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt index 606233cd4618..1b805a0efbb0 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.txt | |||
@@ -38,7 +38,7 @@ flags, it will return EBADR and the contents of fm_flags will contain | |||
38 | the set of flags which caused the error. If the kernel is compatible | 38 | the set of flags which caused the error. If the kernel is compatible |
39 | with all flags passed, the contents of fm_flags will be unmodified. | 39 | with all flags passed, the contents of fm_flags will be unmodified. |
40 | It is up to userspace to determine whether rejection of a particular | 40 | It is up to userspace to determine whether rejection of a particular |
41 | flag is fatal to it's operation. This scheme is intended to allow the | 41 | flag is fatal to its operation. This scheme is intended to allow the |
42 | fiemap interface to grow in the future but without losing | 42 | fiemap interface to grow in the future but without losing |
43 | compatibility with old software. | 43 | compatibility with old software. |
44 | 44 | ||
@@ -56,7 +56,7 @@ If this flag is set, the kernel will sync the file before mapping extents. | |||
56 | 56 | ||
57 | * FIEMAP_FLAG_XATTR | 57 | * FIEMAP_FLAG_XATTR |
58 | If this flag is set, the extents returned will describe the inodes | 58 | If this flag is set, the extents returned will describe the inodes |
59 | extended attribute lookup tree, instead of it's data tree. | 59 | extended attribute lookup tree, instead of its data tree. |
60 | 60 | ||
61 | 61 | ||
62 | Extent Mapping | 62 | Extent Mapping |
@@ -89,7 +89,7 @@ struct fiemap_extent { | |||
89 | }; | 89 | }; |
90 | 90 | ||
91 | All offsets and lengths are in bytes and mirror those on disk. It is valid | 91 | All offsets and lengths are in bytes and mirror those on disk. It is valid |
92 | for an extents logical offset to start before the request or it's logical | 92 | for an extents logical offset to start before the request or its logical |
93 | length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is | 93 | length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is |
94 | returned, fe_logical, fe_physical, and fe_length will be aligned to the | 94 | returned, fe_logical, fe_physical, and fe_length will be aligned to the |
95 | block size of the file system. With the exception of extents flagged as | 95 | block size of the file system. With the exception of extents flagged as |
@@ -125,7 +125,7 @@ been allocated for the file yet. | |||
125 | 125 | ||
126 | * FIEMAP_EXTENT_DELALLOC | 126 | * FIEMAP_EXTENT_DELALLOC |
127 | - This will also set FIEMAP_EXTENT_UNKNOWN. | 127 | - This will also set FIEMAP_EXTENT_UNKNOWN. |
128 | Delayed allocation - while there is data for this extent, it's | 128 | Delayed allocation - while there is data for this extent, its |
129 | physical location has not been allocated yet. | 129 | physical location has not been allocated yet. |
130 | 130 | ||
131 | * FIEMAP_EXTENT_ENCODED | 131 | * FIEMAP_EXTENT_ENCODED |
@@ -159,7 +159,7 @@ Data is located within a meta data block. | |||
159 | Data is packed into a block with data from other files. | 159 | Data is packed into a block with data from other files. |
160 | 160 | ||
161 | * FIEMAP_EXTENT_UNWRITTEN | 161 | * FIEMAP_EXTENT_UNWRITTEN |
162 | Unwritten extent - the extent is allocated but it's data has not been | 162 | Unwritten extent - the extent is allocated but its data has not been |
163 | initialized. This indicates the extent's data will be all zero if read | 163 | initialized. This indicates the extent's data will be all zero if read |
164 | through the filesystem but the contents are undefined if read directly from | 164 | through the filesystem but the contents are undefined if read directly from |
165 | the device. | 165 | the device. |
@@ -176,7 +176,7 @@ VFS -> File System Implementation | |||
176 | 176 | ||
177 | File systems wishing to support fiemap must implement a ->fiemap callback on | 177 | File systems wishing to support fiemap must implement a ->fiemap callback on |
178 | their inode_operations structure. The fs ->fiemap call is responsible for | 178 | their inode_operations structure. The fs ->fiemap call is responsible for |
179 | defining it's set of supported fiemap flags, and calling a helper function on | 179 | defining its set of supported fiemap flags, and calling a helper function on |
180 | each discovered extent: | 180 | each discovered extent: |
181 | 181 | ||
182 | struct inode_operations { | 182 | struct inode_operations { |
diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt index 397a41adb4c3..13af4a49e7db 100644 --- a/Documentation/filesystems/fuse.txt +++ b/Documentation/filesystems/fuse.txt | |||
@@ -91,7 +91,7 @@ Mount options | |||
91 | 'default_permissions' | 91 | 'default_permissions' |
92 | 92 | ||
93 | By default FUSE doesn't check file access permissions, the | 93 | By default FUSE doesn't check file access permissions, the |
94 | filesystem is free to implement it's access policy or leave it to | 94 | filesystem is free to implement its access policy or leave it to |
95 | the underlying file access mechanism (e.g. in case of network | 95 | the underlying file access mechanism (e.g. in case of network |
96 | filesystems). This option enables permission checking, restricting | 96 | filesystems). This option enables permission checking, restricting |
97 | access based on file mode. It is usually useful together with the | 97 | access based on file mode. It is usually useful together with the |
@@ -171,7 +171,7 @@ or may honor them by sending a reply to the _original_ request, with | |||
171 | the error set to EINTR. | 171 | the error set to EINTR. |
172 | 172 | ||
173 | It is also possible that there's a race between processing the | 173 | It is also possible that there's a race between processing the |
174 | original request and it's INTERRUPT request. There are two possibilities: | 174 | original request and its INTERRUPT request. There are two possibilities: |
175 | 175 | ||
176 | 1) The INTERRUPT request is processed before the original request is | 176 | 1) The INTERRUPT request is processed before the original request is |
177 | processed | 177 | processed |
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 5e3ab8f3beff..0b59c0200912 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | Global File System | 1 | Global File System |
2 | ------------------ | 2 | ------------------ |
3 | 3 | ||
4 | http://sources.redhat.com/cluster/ | 4 | http://sources.redhat.com/cluster/wiki/ |
5 | 5 | ||
6 | GFS is a cluster file system. It allows a cluster of computers to | 6 | GFS is a cluster file system. It allows a cluster of computers to |
7 | simultaneously use a block device that is shared between them (with FC, | 7 | simultaneously use a block device that is shared between them (with FC, |
@@ -36,11 +36,11 @@ GFS2 is not on-disk compatible with previous versions of GFS, but it | |||
36 | is pretty close. | 36 | is pretty close. |
37 | 37 | ||
38 | The following man pages can be found at the URL above: | 38 | The following man pages can be found at the URL above: |
39 | fsck.gfs2 to repair a filesystem | 39 | fsck.gfs2 to repair a filesystem |
40 | gfs2_grow to expand a filesystem online | 40 | gfs2_grow to expand a filesystem online |
41 | gfs2_jadd to add journals to a filesystem online | 41 | gfs2_jadd to add journals to a filesystem online |
42 | gfs2_tool to manipulate, examine and tune a filesystem | 42 | gfs2_tool to manipulate, examine and tune a filesystem |
43 | gfs2_quota to examine and change quota values in a filesystem | 43 | gfs2_quota to examine and change quota values in a filesystem |
44 | gfs2_convert to convert a gfs filesystem to gfs2 in-place | 44 | gfs2_convert to convert a gfs filesystem to gfs2 in-place |
45 | mount.gfs2 to help mount(8) mount a filesystem | 45 | mount.gfs2 to help mount(8) mount a filesystem |
46 | mkfs.gfs2 to make a filesystem | 46 | mkfs.gfs2 to make a filesystem |
diff --git a/Documentation/filesystems/hpfs.txt b/Documentation/filesystems/hpfs.txt index fa45c3baed98..74630bd504fb 100644 --- a/Documentation/filesystems/hpfs.txt +++ b/Documentation/filesystems/hpfs.txt | |||
@@ -103,7 +103,7 @@ to analyze or change OS2SYS.INI. | |||
103 | Codepages | 103 | Codepages |
104 | 104 | ||
105 | HPFS can contain several uppercasing tables for several codepages and each | 105 | HPFS can contain several uppercasing tables for several codepages and each |
106 | file has a pointer to codepage it's name is in. However OS/2 was created in | 106 | file has a pointer to codepage its name is in. However OS/2 was created in |
107 | America where people don't care much about codepages and so multiple codepages | 107 | America where people don't care much about codepages and so multiple codepages |
108 | support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk. | 108 | support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk. |
109 | Once I booted English OS/2 working in cp 850 and I created a file on my 852 | 109 | Once I booted English OS/2 working in cp 850 and I created a file on my 852 |
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt new file mode 100644 index 000000000000..bca42c22a143 --- /dev/null +++ b/Documentation/filesystems/logfs.txt | |||
@@ -0,0 +1,241 @@ | |||
1 | |||
2 | The LogFS Flash Filesystem | ||
3 | ========================== | ||
4 | |||
5 | Specification | ||
6 | ============= | ||
7 | |||
8 | Superblocks | ||
9 | ----------- | ||
10 | |||
11 | Two superblocks exist at the beginning and end of the filesystem. | ||
12 | Each superblock is 256 Bytes large, with another 3840 Bytes reserved | ||
13 | for future purposes, making a total of 4096 Bytes. | ||
14 | |||
15 | Superblock locations may differ for MTD and block devices. On MTD the | ||
16 | first non-bad block contains a superblock in the first 4096 Bytes and | ||
17 | the last non-bad block contains a superblock in the last 4096 Bytes. | ||
18 | On block devices, the first 4096 Bytes of the device contain the first | ||
19 | superblock and the last aligned 4096 Byte-block contains the second | ||
20 | superblock. | ||
21 | |||
22 | For the most part, the superblocks can be considered read-only. They | ||
23 | are written only to correct errors detected within the superblocks, | ||
24 | move the journal and change the filesystem parameters through tunefs. | ||
25 | As a result, the superblock does not contain any fields that require | ||
26 | constant updates, like the amount of free space, etc. | ||
27 | |||
28 | Segments | ||
29 | -------- | ||
30 | |||
31 | The space in the device is split up into equal-sized segments. | ||
32 | Segments are the primary write unit of LogFS. Within each segments, | ||
33 | writes happen from front (low addresses) to back (high addresses. If | ||
34 | only a partial segment has been written, the segment number, the | ||
35 | current position within and optionally a write buffer are stored in | ||
36 | the journal. | ||
37 | |||
38 | Segments are erased as a whole. Therefore Garbage Collection may be | ||
39 | required to completely free a segment before doing so. | ||
40 | |||
41 | Journal | ||
42 | -------- | ||
43 | |||
44 | The journal contains all global information about the filesystem that | ||
45 | is subject to frequent change. At mount time, it has to be scanned | ||
46 | for the most recent commit entry, which contains a list of pointers to | ||
47 | all currently valid entries. | ||
48 | |||
49 | Object Store | ||
50 | ------------ | ||
51 | |||
52 | All space except for the superblocks and journal is part of the object | ||
53 | store. Each segment contains a segment header and a number of | ||
54 | objects, each consisting of the object header and the payload. | ||
55 | Objects are either inodes, directory entries (dentries), file data | ||
56 | blocks or indirect blocks. | ||
57 | |||
58 | Levels | ||
59 | ------ | ||
60 | |||
61 | Garbage collection (GC) may fail if all data is written | ||
62 | indiscriminately. One requirement of GC is that data is separated | ||
63 | roughly according to the distance between the tree root and the data. | ||
64 | Effectively that means all file data is on level 0, indirect blocks | ||
65 | are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, | ||
66 | respectively. Inode file data is on level 6 for the inodes and 7-11 | ||
67 | for indirect blocks. | ||
68 | |||
69 | Each segment contains objects of a single level only. As a result, | ||
70 | each level requires its own separate segment to be open for writing. | ||
71 | |||
72 | Inode File | ||
73 | ---------- | ||
74 | |||
75 | All inodes are stored in a special file, the inode file. Single | ||
76 | exception is the inode file's inode (master inode) which for obvious | ||
77 | reasons is stored in the journal instead. Instead of data blocks, the | ||
78 | leaf nodes of the inode files are inodes. | ||
79 | |||
80 | Aliases | ||
81 | ------- | ||
82 | |||
83 | Writes in LogFS are done by means of a wandering tree. A naïve | ||
84 | implementation would require that for each write or a block, all | ||
85 | parent blocks are written as well, since the block pointers have | ||
86 | changed. Such an implementation would not be very efficient. | ||
87 | |||
88 | In LogFS, the block pointer changes are cached in the journal by means | ||
89 | of alias entries. Each alias consists of its logical address - inode | ||
90 | number, block index, level and child number (index into block) - and | ||
91 | the changed data. Any 8-byte word can be changes in this manner. | ||
92 | |||
93 | Currently aliases are used for block pointers, file size, file used | ||
94 | bytes and the height of an inodes indirect tree. | ||
95 | |||
96 | Segment Aliases | ||
97 | --------------- | ||
98 | |||
99 | Related to regular aliases, these are used to handle bad blocks. | ||
100 | Initially, bad blocks are handled by moving the affected segment | ||
101 | content to a spare segment and noting this move in the journal with a | ||
102 | segment alias, a simple (to, from) tupel. GC will later empty this | ||
103 | segment and the alias can be removed again. This is used on MTD only. | ||
104 | |||
105 | Vim | ||
106 | --- | ||
107 | |||
108 | By cleverly predicting the life time of data, it is possible to | ||
109 | separate long-living data from short-living data and thereby reduce | ||
110 | the GC overhead later. Each type of distinc life expectency (vim) can | ||
111 | have a separate segment open for writing. Each (level, vim) tupel can | ||
112 | be open just once. If an open segment with unknown vim is encountered | ||
113 | at mount time, it is closed and ignored henceforth. | ||
114 | |||
115 | Indirect Tree | ||
116 | ------------- | ||
117 | |||
118 | Inodes in LogFS are similar to FFS-style filesystems with direct and | ||
119 | indirect block pointers. One difference is that LogFS uses a single | ||
120 | indirect pointer that can be either a 1x, 2x, etc. indirect pointer. | ||
121 | A height field in the inode defines the height of the indirect tree | ||
122 | and thereby the indirection of the pointer. | ||
123 | |||
124 | Another difference is the addressing of indirect blocks. In LogFS, | ||
125 | the first 16 pointers in the first indirect block are left empty, | ||
126 | corresponding to the 16 direct pointers in the inode. In ext2 (maybe | ||
127 | others as well) the first pointer in the first indirect block | ||
128 | corresponds to logical block 12, skipping the 12 direct pointers. | ||
129 | So where ext2 is using arithmetic to better utilize space, LogFS keeps | ||
130 | arithmetic simple and uses compression to save space. | ||
131 | |||
132 | Compression | ||
133 | ----------- | ||
134 | |||
135 | Both file data and metadata can be compressed. Compression for file | ||
136 | data can be enabled with chattr +c and disabled with chattr -c. Doing | ||
137 | so has no effect on existing data, but new data will be stored | ||
138 | accordingly. New inodes will inherit the compression flag of the | ||
139 | parent directory. | ||
140 | |||
141 | Metadata is always compressed. However, the space accounting ignores | ||
142 | this and charges for the uncompressed size. Failing to do so could | ||
143 | result in GC failures when, after moving some data, indirect blocks | ||
144 | compress worse than previously. Even on a 100% full medium, GC may | ||
145 | not consume any extra space, so the compression gains are lost space | ||
146 | to the user. | ||
147 | |||
148 | However, they are not lost space to the filesystem internals. By | ||
149 | cheating the user for those bytes, the filesystem gained some slack | ||
150 | space and GC will run less often and faster. | ||
151 | |||
152 | Garbage Collection and Wear Leveling | ||
153 | ------------------------------------ | ||
154 | |||
155 | Garbage collection is invoked whenever the number of free segments | ||
156 | falls below a threshold. The best (known) candidate is picked based | ||
157 | on the least amount of valid data contained in the segment. All | ||
158 | remaining valid data is copied elsewhere, thereby invalidating it. | ||
159 | |||
160 | The GC code also checks for aliases and writes then back if their | ||
161 | number gets too large. | ||
162 | |||
163 | Wear leveling is done by occasionally picking a suboptimal segment for | ||
164 | garbage collection. If a stale segments erase count is significantly | ||
165 | lower than the active segments' erase counts, it will be picked. Wear | ||
166 | leveling is rate limited, so it will never monopolize the device for | ||
167 | more than one segment worth at a time. | ||
168 | |||
169 | Values for "occasionally", "significantly lower" are compile time | ||
170 | constants. | ||
171 | |||
172 | Hashed directories | ||
173 | ------------------ | ||
174 | |||
175 | To satisfy efficient lookup(), directory entries are hashed and | ||
176 | located based on the hash. In order to both support large directories | ||
177 | and not be overly inefficient for small directories, several hash | ||
178 | tables of increasing size are used. For each table, the hash value | ||
179 | modulo the table size gives the table index. | ||
180 | |||
181 | Tables sizes are chosen to limit the number of indirect blocks with a | ||
182 | fully populated table to 0, 1, 2 or 3 respectively. So the first | ||
183 | table contains 16 entries, the second 512-16, etc. | ||
184 | |||
185 | The last table is special in several ways. First its size depends on | ||
186 | the effective 32bit limit on telldir/seekdir cookies. Since logfs | ||
187 | uses the upper half of the address space for indirect blocks, the size | ||
188 | is limited to 2^31. Secondly the table contains hash buckets with 16 | ||
189 | entries each. | ||
190 | |||
191 | Using single-entry buckets would result in birthday "attacks". At | ||
192 | just 2^16 used entries, hash collisions would be likely (P >= 0.5). | ||
193 | My math skills are insufficient to do the combinatorics for the 17x | ||
194 | collisions necessary to overflow a bucket, but testing showed that in | ||
195 | 10,000 runs the lowest directory fill before a bucket overflow was | ||
196 | 188,057,130 entries with an average of 315,149,915 entries. So for | ||
197 | directory sizes of up to a million, bucket overflows should be | ||
198 | virtually impossible under normal circumstances. | ||
199 | |||
200 | With carefully chosen filenames, it is obviously possible to cause an | ||
201 | overflow with just 21 entries (4 higher tables + 16 entries + 1). So | ||
202 | there may be a security concern if a malicious user has write access | ||
203 | to a directory. | ||
204 | |||
205 | Open For Discussion | ||
206 | =================== | ||
207 | |||
208 | Device Address Space | ||
209 | -------------------- | ||
210 | |||
211 | A device address space is used for caching. Both block devices and | ||
212 | MTD provide functions to either read a single page or write a segment. | ||
213 | Partial segments may be written for data integrity, but where possible | ||
214 | complete segments are written for performance on simple block device | ||
215 | flash media. | ||
216 | |||
217 | Meta Inodes | ||
218 | ----------- | ||
219 | |||
220 | Inodes are stored in the inode file, which is just a regular file for | ||
221 | most purposes. At umount time, however, the inode file needs to | ||
222 | remain open until all dirty inodes are written. So | ||
223 | generic_shutdown_super() may not close this inode, but shouldn't | ||
224 | complain about remaining inodes due to the inode file either. Same | ||
225 | goes for mapping inode of the device address space. | ||
226 | |||
227 | Currently logfs uses a hack that essentially copies part of fs/inode.c | ||
228 | code over. A general solution would be preferred. | ||
229 | |||
230 | Indirect block mapping | ||
231 | ---------------------- | ||
232 | |||
233 | With compression, the block device (or mapping inode) cannot be used | ||
234 | to cache indirect blocks. Some other place is required. Currently | ||
235 | logfs uses the top half of each inode's address space. The low 8TB | ||
236 | (on 32bit) are filled with file data, the high 8TB are used for | ||
237 | indirect blocks. | ||
238 | |||
239 | One problem is that 16TB files created on 64bit systems actually have | ||
240 | data in the top 8TB. But files >16TB would cause problems anyway, so | ||
241 | only the limit has changed. | ||
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 1bd0d0c05171..04884914a1c8 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt | |||
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4 | |||
17 | on or off; rpc.nfsd does this correctly.) | 17 | on or off; rpc.nfsd does this correctly.) |
18 | 18 | ||
19 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based | 19 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based |
20 | on the latest NFSv4.1 Internet Draft: | 20 | on RFC 5661. |
21 | http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 | ||
22 | 21 | ||
23 | From the many new features in NFSv4.1 the current implementation | 22 | From the many new features in NFSv4.1 the current implementation |
24 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing | 23 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing |
@@ -44,7 +43,7 @@ interoperability problems with future clients. Known issues: | |||
44 | trunking, but this is a mandatory feature, and its use is | 43 | trunking, but this is a mandatory feature, and its use is |
45 | recommended to clients in a number of places. (E.g. to ensure | 44 | recommended to clients in a number of places. (E.g. to ensure |
46 | timely renewal in case an existing connection's retry timeouts | 45 | timely renewal in case an existing connection's retry timeouts |
47 | have gotten too long; see section 8.3 of the draft.) | 46 | have gotten too long; see section 8.3 of the RFC.) |
48 | Therefore, lack of this feature may cause future clients to | 47 | Therefore, lack of this feature may cause future clients to |
49 | fail. | 48 | fail. |
50 | - Incomplete backchannel support: incomplete backchannel gss | 49 | - Incomplete backchannel support: incomplete backchannel gss |
@@ -138,7 +137,7 @@ NS*| OPENATTR | OPT | | Section 18.17 | | |||
138 | | READ | REQ | | Section 18.22 | | 137 | | READ | REQ | | Section 18.22 | |
139 | | READDIR | REQ | | Section 18.23 | | 138 | | READDIR | REQ | | Section 18.23 | |
140 | | READLINK | OPT | | Section 18.24 | | 139 | | READLINK | OPT | | Section 18.24 | |
141 | NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | | 140 | | RECLAIM_COMPLETE | REQ | | Section 18.51 | |
142 | | RELEASE_LOCKOWNER | MNI | | N/A | | 141 | | RELEASE_LOCKOWNER | MNI | | N/A | |
143 | | REMOVE | REQ | | Section 18.25 | | 142 | | REMOVE | REQ | | Section 18.25 | |
144 | | RENAME | REQ | | Section 18.26 | | 143 | | RENAME | REQ | | Section 18.26 | |
diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt index 8a382bea6808..ebcaaee21616 100644 --- a/Documentation/filesystems/nfs/rpc-cache.txt +++ b/Documentation/filesystems/nfs/rpc-cache.txt | |||
@@ -185,7 +185,7 @@ failed lookup meant a definite 'no'. | |||
185 | request/response format | 185 | request/response format |
186 | ----------------------- | 186 | ----------------------- |
187 | 187 | ||
188 | While each cache is free to use it's own format for requests | 188 | While each cache is free to use its own format for requests |
189 | and responses over channel, the following is recommended as | 189 | and responses over channel, the following is recommended as |
190 | appropriate and support routines are available to help: | 190 | appropriate and support routines are available to help: |
191 | Each request or response record should be printable ASCII | 191 | Each request or response record should be printable ASCII |
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 839efd8a8a8c..d3e7673995eb 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt | |||
@@ -50,8 +50,8 @@ NILFS2 supports the following mount options: | |||
50 | (*) == default | 50 | (*) == default |
51 | 51 | ||
52 | nobarrier Disables barriers. | 52 | nobarrier Disables barriers. |
53 | errors=continue(*) Keep going on a filesystem error. | 53 | errors=continue Keep going on a filesystem error. |
54 | errors=remount-ro Remount the filesystem read-only on an error. | 54 | errors=remount-ro(*) Remount the filesystem read-only on an error. |
55 | errors=panic Panic and halt the machine if an error occurs. | 55 | errors=panic Panic and halt the machine if an error occurs. |
56 | cp=n Specify the checkpoint-number of the snapshot to be | 56 | cp=n Specify the checkpoint-number of the snapshot to be |
57 | mounted. Checkpoints and snapshots are listed by lscp | 57 | mounted. Checkpoints and snapshots are listed by lscp |
@@ -74,6 +74,9 @@ norecovery Disable recovery of the filesystem on mount. | |||
74 | This disables every write access on the device for | 74 | This disables every write access on the device for |
75 | read-only mounts or snapshots. This option will fail | 75 | read-only mounts or snapshots. This option will fail |
76 | for r/w mounts on an unclean volume. | 76 | for r/w mounts on an unclean volume. |
77 | discard Issue discard/TRIM commands to the underlying block | ||
78 | device when blocks are freed. This is useful for SSD | ||
79 | devices and sparse/thinly-provisioned LUNs. | ||
77 | 80 | ||
78 | NILFS2 usage | 81 | NILFS2 usage |
79 | ============ | 82 | ============ |
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index c58b9f5ba002..1f7ae144f6d8 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt | |||
@@ -80,3 +80,10 @@ user_xattr (*) Enables Extended User Attributes. | |||
80 | nouser_xattr Disables Extended User Attributes. | 80 | nouser_xattr Disables Extended User Attributes. |
81 | acl Enables POSIX Access Control Lists support. | 81 | acl Enables POSIX Access Control Lists support. |
82 | noacl (*) Disables POSIX Access Control Lists support. | 82 | noacl (*) Disables POSIX Access Control Lists support. |
83 | resv_level=2 (*) Set how agressive allocation reservations will be. | ||
84 | Valid values are between 0 (reservations off) to 8 | ||
85 | (maximum space for reservations). | ||
86 | dir_resv_level= (*) By default, directory reservations will scale with file | ||
87 | reservations - users should rarely need to change this | ||
88 | value. If allocation reservations are turned off, this | ||
89 | option will have no effect. | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a6..9fb6cbe70bde 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -164,6 +164,7 @@ read the file /proc/PID/status: | |||
164 | VmExe: 68 kB | 164 | VmExe: 68 kB |
165 | VmLib: 1412 kB | 165 | VmLib: 1412 kB |
166 | VmPTE: 20 kb | 166 | VmPTE: 20 kb |
167 | VmSwap: 0 kB | ||
167 | Threads: 1 | 168 | Threads: 1 |
168 | SigQ: 0/28578 | 169 | SigQ: 0/28578 |
169 | SigPnd: 0000000000000000 | 170 | SigPnd: 0000000000000000 |
@@ -188,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file | |||
188 | contains details information about the process itself. Its fields are | 189 | contains details information about the process itself. Its fields are |
189 | explained in Table 1-4. | 190 | explained in Table 1-4. |
190 | 191 | ||
191 | Table 1-2: Contents of the statm files (as of 2.6.30-rc7) | 192 | (for SMP CONFIG users) |
193 | For making accounting scalable, RSS related information are handled in | ||
194 | asynchronous manner and the vaule may not be very precise. To see a precise | ||
195 | snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table. | ||
196 | It's slow but very precise. | ||
197 | |||
198 | Table 1-2: Contents of the status files (as of 2.6.30-rc7) | ||
192 | .............................................................................. | 199 | .............................................................................. |
193 | Field Content | 200 | Field Content |
194 | Name filename of the executable | 201 | Name filename of the executable |
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) | |||
213 | VmExe size of text segment | 220 | VmExe size of text segment |
214 | VmLib size of shared library code | 221 | VmLib size of shared library code |
215 | VmPTE size of page table entries | 222 | VmPTE size of page table entries |
223 | VmSwap size of swap usage (the number of referred swapents) | ||
216 | Threads number of threads | 224 | Threads number of threads |
217 | SigQ number of signals queued/max. number for queue | 225 | SigQ number of signals queued/max. number for queue |
218 | SigPnd bitmap of pending signals for the thread | 226 | SigPnd bitmap of pending signals for the thread |
@@ -297,7 +305,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) | |||
297 | cgtime guest time of the task children in jiffies | 305 | cgtime guest time of the task children in jiffies |
298 | .............................................................................. | 306 | .............................................................................. |
299 | 307 | ||
300 | The /proc/PID/map file containing the currently mapped memory regions and | 308 | The /proc/PID/maps file containing the currently mapped memory regions and |
301 | their access permissions. | 309 | their access permissions. |
302 | 310 | ||
303 | The format is: | 311 | The format is: |
@@ -308,7 +316,7 @@ address perms offset dev inode pathname | |||
308 | 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test | 316 | 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test |
309 | 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] | 317 | 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] |
310 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 | 318 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 |
311 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] | 319 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 |
312 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 | 320 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 |
313 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 | 321 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 |
314 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 | 322 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 |
@@ -344,7 +352,6 @@ is not associated with a file: | |||
344 | [stack] = the stack of the main process | 352 | [stack] = the stack of the main process |
345 | [vdso] = the "virtual dynamic shared object", | 353 | [vdso] = the "virtual dynamic shared object", |
346 | the kernel system call handler | 354 | the kernel system call handler |
347 | [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size | ||
348 | 355 | ||
349 | or if empty, the mapping is anonymous. | 356 | or if empty, the mapping is anonymous. |
350 | 357 | ||
@@ -430,6 +437,7 @@ Table 1-5: Kernel info in /proc | |||
430 | modules List of loaded modules | 437 | modules List of loaded modules |
431 | mounts Mounted filesystems | 438 | mounts Mounted filesystems |
432 | net Networking info (see text) | 439 | net Networking info (see text) |
440 | pagetypeinfo Additional page allocator information (see text) (2.5) | ||
433 | partitions Table of partitions known to the system | 441 | partitions Table of partitions known to the system |
434 | pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, | 442 | pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, |
435 | decoupled by lspci (2.4) | 443 | decoupled by lspci (2.4) |
@@ -557,6 +565,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the | |||
557 | IRQs which have not yet been allocated/activated, and hence which lack a | 565 | IRQs which have not yet been allocated/activated, and hence which lack a |
558 | /proc/irq/[0-9]* directory. | 566 | /proc/irq/[0-9]* directory. |
559 | 567 | ||
568 | The node file on an SMP system shows the node to which the device using the IRQ | ||
569 | reports itself as being attached. This hardware locality information does not | ||
570 | include information about any possible driver locality preference. | ||
571 | |||
560 | prof_cpu_mask specifies which CPUs are to be profiled by the system wide | 572 | prof_cpu_mask specifies which CPUs are to be profiled by the system wide |
561 | profiler. Default value is ffffffff (all cpus). | 573 | profiler. Default value is ffffffff (all cpus). |
562 | 574 | ||
@@ -584,7 +596,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ... | |||
584 | Node 0, zone Normal 1 0 0 1 101 8 ... | 596 | Node 0, zone Normal 1 0 0 1 101 8 ... |
585 | Node 0, zone HighMem 2 0 0 1 1 0 ... | 597 | Node 0, zone HighMem 2 0 0 1 1 0 ... |
586 | 598 | ||
587 | Memory fragmentation is a problem under some workloads, and buddyinfo is a | 599 | External fragmentation is a problem under some workloads, and buddyinfo is a |
588 | useful tool for helping diagnose these problems. Buddyinfo will give you a | 600 | useful tool for helping diagnose these problems. Buddyinfo will give you a |
589 | clue as to how big an area you can safely allocate, or why a previous | 601 | clue as to how big an area you can safely allocate, or why a previous |
590 | allocation failed. | 602 | allocation failed. |
@@ -594,6 +606,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in | |||
594 | ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE | 606 | ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE |
595 | available in ZONE_NORMAL, etc... | 607 | available in ZONE_NORMAL, etc... |
596 | 608 | ||
609 | More information relevant to external fragmentation can be found in | ||
610 | pagetypeinfo. | ||
611 | |||
612 | > cat /proc/pagetypeinfo | ||
613 | Page block order: 9 | ||
614 | Pages per block: 512 | ||
615 | |||
616 | Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 | ||
617 | Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0 | ||
618 | Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 | ||
619 | Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2 | ||
620 | Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0 | ||
621 | Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 | ||
622 | Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9 | ||
623 | Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0 | ||
624 | Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452 | ||
625 | Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0 | ||
626 | Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 | ||
627 | |||
628 | Number of blocks type Unmovable Reclaimable Movable Reserve Isolate | ||
629 | Node 0, zone DMA 2 0 5 1 0 | ||
630 | Node 0, zone DMA32 41 6 967 2 0 | ||
631 | |||
632 | Fragmentation avoidance in the kernel works by grouping pages of different | ||
633 | migrate types into the same contiguous regions of memory called page blocks. | ||
634 | A page block is typically the size of the default hugepage size e.g. 2MB on | ||
635 | X86-64. By keeping pages grouped based on their ability to move, the kernel | ||
636 | can reclaim pages within a page block to satisfy a high-order allocation. | ||
637 | |||
638 | The pagetypinfo begins with information on the size of a page block. It | ||
639 | then gives the same type of information as buddyinfo except broken down | ||
640 | by migrate-type and finishes with details on how many page blocks of each | ||
641 | type exist. | ||
642 | |||
643 | If min_free_kbytes has been tuned correctly (recommendations made by hugeadm | ||
644 | from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can | ||
645 | make an estimate of the likely number of huge pages that can be allocated | ||
646 | at a given point in time. All the "Movable" blocks should be allocatable | ||
647 | unless memory has been mlock()'d. Some of the Reclaimable blocks should | ||
648 | also be allocatable although a lot of filesystem metadata may have to be | ||
649 | reclaimed to achieve this. | ||
650 | |||
597 | .............................................................................. | 651 | .............................................................................. |
598 | 652 | ||
599 | meminfo: | 653 | meminfo: |
@@ -914,7 +968,7 @@ your system and how much traffic was routed over those devices: | |||
914 | ...] 1375103 17405 0 0 0 0 0 0 | 968 | ...] 1375103 17405 0 0 0 0 0 0 |
915 | ...] 1703981 5535 0 0 0 3 0 0 | 969 | ...] 1703981 5535 0 0 0 3 0 0 |
916 | 970 | ||
917 | In addition, each Channel Bond interface has it's own directory. For | 971 | In addition, each Channel Bond interface has its own directory. For |
918 | example, the bond0 device will have a directory called /proc/net/bond0/. | 972 | example, the bond0 device will have a directory called /proc/net/bond0/. |
919 | It will contain information that is specific to that bond, such as the | 973 | It will contain information that is specific to that bond, such as the |
920 | current slaves of the bond, the link status of the slaves, and how | 974 | current slaves of the bond, the link status of the slaves, and how |
@@ -1311,7 +1365,7 @@ been accounted as having caused 1MB of write. | |||
1311 | In other words: The number of bytes which this process caused to not happen, | 1365 | In other words: The number of bytes which this process caused to not happen, |
1312 | by truncating pagecache. A task can cause "negative" IO too. If this task | 1366 | by truncating pagecache. A task can cause "negative" IO too. If this task |
1313 | truncates some dirty pagecache, some IO which another task has been accounted | 1367 | truncates some dirty pagecache, some IO which another task has been accounted |
1314 | for (in it's write_bytes) will not be happening. We _could_ just subtract that | 1368 | for (in its write_bytes) will not be happening. We _could_ just subtract that |
1315 | from the truncating task's write_bytes, but there is information loss in doing | 1369 | from the truncating task's write_bytes, but there is information loss in doing |
1316 | that. | 1370 | that. |
1317 | 1371 | ||
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index 23a181074f94..fc0e39af43c3 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt | |||
@@ -837,6 +837,9 @@ replicas continue to be exactly same. | |||
837 | individual lists does not affect propagation or the way propagation | 837 | individual lists does not affect propagation or the way propagation |
838 | tree is modified by operations. | 838 | tree is modified by operations. |
839 | 839 | ||
840 | All vfsmounts in a peer group have the same ->mnt_master. If it is | ||
841 | non-NULL, they form a contiguous (ordered) segment of slave list. | ||
842 | |||
840 | A example propagation tree looks as shown in the figure below. | 843 | A example propagation tree looks as shown in the figure below. |
841 | [ NOTE: Though it looks like a forest, if we consider all the shared | 844 | [ NOTE: Though it looks like a forest, if we consider all the shared |
842 | mounts as a conceptual entity called 'pnode', it becomes a tree] | 845 | mounts as a conceptual entity called 'pnode', it becomes a tree] |
@@ -874,8 +877,19 @@ replicas continue to be exactly same. | |||
874 | 877 | ||
875 | NOTE: The propagation tree is orthogonal to the mount tree. | 878 | NOTE: The propagation tree is orthogonal to the mount tree. |
876 | 879 | ||
880 | 8B Locking: | ||
881 | |||
882 | ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected | ||
883 | by namespace_sem (exclusive for modifications, shared for reading). | ||
884 | |||
885 | Normally we have ->mnt_flags modifications serialized by vfsmount_lock. | ||
886 | There are two exceptions: do_add_mount() and clone_mnt(). | ||
887 | The former modifies a vfsmount that has not been visible in any shared | ||
888 | data structures yet. | ||
889 | The latter holds namespace_sem and the only references to vfsmount | ||
890 | are in lists that can't be traversed without namespace_sem. | ||
877 | 891 | ||
878 | 8B Algorithm: | 892 | 8C Algorithm: |
879 | 893 | ||
880 | The crux of the implementation resides in rbind/move operation. | 894 | The crux of the implementation resides in rbind/move operation. |
881 | 895 | ||
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt index f673ef0de0f7..194fb0decd2c 100644 --- a/Documentation/filesystems/smbfs.txt +++ b/Documentation/filesystems/smbfs.txt | |||
@@ -3,6 +3,6 @@ protocol used by Windows for Workgroups, Windows 95 and Windows NT. | |||
3 | Smbfs was inspired by Samba, the program written by Andrew Tridgell | 3 | Smbfs was inspired by Samba, the program written by Andrew Tridgell |
4 | that turns any Unix host into a file server for DOS or Windows clients. | 4 | that turns any Unix host into a file server for DOS or Windows clients. |
5 | 5 | ||
6 | Smbfs is a SMB client, but uses parts of samba for it's operation. For | 6 | Smbfs is a SMB client, but uses parts of samba for its operation. For |
7 | more info on samba, including documentation, please go to | 7 | more info on samba, including documentation, please go to |
8 | http://www.samba.org/ and then on to your nearest mirror. | 8 | http://www.samba.org/ and then on to your nearest mirror. |
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index b324c033035a..203f7202cc9e 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt | |||
@@ -38,7 +38,8 @@ Hard link support: yes no | |||
38 | Real inode numbers: yes no | 38 | Real inode numbers: yes no |
39 | 32-bit uids/gids: yes no | 39 | 32-bit uids/gids: yes no |
40 | File creation time: yes no | 40 | File creation time: yes no |
41 | Xattr and ACL support: no no | 41 | Xattr support: yes no |
42 | ACL support: no no | ||
42 | 43 | ||
43 | Squashfs compresses data, inodes and directories. In addition, inode and | 44 | Squashfs compresses data, inodes and directories. In addition, inode and |
44 | directory data are highly compacted, and packed on byte boundaries. Each | 45 | directory data are highly compacted, and packed on byte boundaries. Each |
@@ -58,7 +59,7 @@ obtained from this site also. | |||
58 | 3. SQUASHFS FILESYSTEM DESIGN | 59 | 3. SQUASHFS FILESYSTEM DESIGN |
59 | ----------------------------- | 60 | ----------------------------- |
60 | 61 | ||
61 | A squashfs filesystem consists of seven parts, packed together on a byte | 62 | A squashfs filesystem consists of a maximum of eight parts, packed together on a byte |
62 | alignment: | 63 | alignment: |
63 | 64 | ||
64 | --------------- | 65 | --------------- |
@@ -80,6 +81,9 @@ alignment: | |||
80 | |---------------| | 81 | |---------------| |
81 | | uid/gid | | 82 | | uid/gid | |
82 | | lookup table | | 83 | | lookup table | |
84 | |---------------| | ||
85 | | xattr | | ||
86 | | table | | ||
83 | --------------- | 87 | --------------- |
84 | 88 | ||
85 | Compressed data blocks are written to the filesystem as files are read from | 89 | Compressed data blocks are written to the filesystem as files are read from |
@@ -192,6 +196,26 @@ This table is stored compressed into metadata blocks. A second index table is | |||
192 | used to locate these. This second index table for speed of access (and because | 196 | used to locate these. This second index table for speed of access (and because |
193 | it is small) is read at mount time and cached in memory. | 197 | it is small) is read at mount time and cached in memory. |
194 | 198 | ||
199 | 3.7 Xattr table | ||
200 | --------------- | ||
201 | |||
202 | The xattr table contains extended attributes for each inode. The xattrs | ||
203 | for each inode are stored in a list, each list entry containing a type, | ||
204 | name and value field. The type field encodes the xattr prefix | ||
205 | ("user.", "trusted." etc) and it also encodes how the name/value fields | ||
206 | should be interpreted. Currently the type indicates whether the value | ||
207 | is stored inline (in which case the value field contains the xattr value), | ||
208 | or if it is stored out of line (in which case the value field stores a | ||
209 | reference to where the actual value is stored). This allows large values | ||
210 | to be stored out of line improving scanning and lookup performance and it | ||
211 | also allows values to be de-duplicated, the value being stored once, and | ||
212 | all other occurences holding an out of line reference to that value. | ||
213 | |||
214 | The xattr lists are packed into compressed 8K metadata blocks. | ||
215 | To reduce overhead in inodes, rather than storing the on-disk | ||
216 | location of the xattr list inside each inode, a 32-bit xattr id | ||
217 | is stored. This xattr id is mapped into the location of the xattr | ||
218 | list using a second xattr id lookup table. | ||
195 | 219 | ||
196 | 4. TODOS AND OUTSTANDING ISSUES | 220 | 4. TODOS AND OUTSTANDING ISSUES |
197 | ------------------------------- | 221 | ------------------------------- |
@@ -199,9 +223,7 @@ it is small) is read at mount time and cached in memory. | |||
199 | 4.1 Todo list | 223 | 4.1 Todo list |
200 | ------------- | 224 | ------------- |
201 | 225 | ||
202 | Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks | 226 | Implement ACL support. |
203 | for these but the code has not been written. Once the code has been written | ||
204 | the existing layout should not require modification. | ||
205 | 227 | ||
206 | 4.2 Squashfs internal cache | 228 | 4.2 Squashfs internal cache |
207 | --------------------------- | 229 | --------------------------- |
diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt new file mode 100644 index 000000000000..caaaf1266d8f --- /dev/null +++ b/Documentation/filesystems/sysfs-tagging.txt | |||
@@ -0,0 +1,42 @@ | |||
1 | Sysfs tagging | ||
2 | ------------- | ||
3 | |||
4 | (Taken almost verbatim from Eric Biederman's netns tagging patch | ||
5 | commit msg) | ||
6 | |||
7 | The problem. Network devices show up in sysfs and with the network | ||
8 | namespace active multiple devices with the same name can show up in | ||
9 | the same directory, ouch! | ||
10 | |||
11 | To avoid that problem and allow existing applications in network | ||
12 | namespaces to see the same interface that is currently presented in | ||
13 | sysfs, sysfs now has tagging directory support. | ||
14 | |||
15 | By using the network namespace pointers as tags to separate out the | ||
16 | the sysfs directory entries we ensure that we don't have conflicts | ||
17 | in the directories and applications only see a limited set of | ||
18 | the network devices. | ||
19 | |||
20 | Each sysfs directory entry may be tagged with zero or one | ||
21 | namespaces. A sysfs_dirent is augmented with a void *s_ns. If a | ||
22 | directory entry is tagged, then sysfs_dirent->s_flags will have a | ||
23 | flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and s_ns will | ||
24 | point to the namespace to which it belongs. | ||
25 | |||
26 | Each sysfs superblock's sysfs_super_info contains an array void | ||
27 | *ns[KOBJ_NS_TYPES]. When a a task in a tagging namespace | ||
28 | kobj_nstype first mounts sysfs, a new superblock is created. It | ||
29 | will be differentiated from other sysfs mounts by having its | ||
30 | s_fs_info->ns[kobj_nstype] set to the new namespace. Note that | ||
31 | through bind mounting and mounts propagation, a task can easily view | ||
32 | the contents of other namespaces' sysfs mounts. Therefore, when a | ||
33 | namespace exits, it will call kobj_ns_exit() to invalidate any | ||
34 | sysfs_dirent->s_ns pointers pointing to it. | ||
35 | |||
36 | Users of this interface: | ||
37 | - define a type in the kobj_ns_type enumeration. | ||
38 | - call kobj_ns_type_register() with its kobj_ns_type_operations which has | ||
39 | - current_ns() which returns current's namespace | ||
40 | - netlink_ns() which returns a socket's namespace | ||
41 | - initial_ns() which returns the initial namesapce | ||
42 | - call kobj_ns_exit() when an individual tag is no longer valid | ||
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index 3015da0c6b2a..98ef55124158 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt | |||
@@ -82,21 +82,31 @@ tmpfs has a mount option to set the NUMA memory allocation policy for | |||
82 | all files in that instance (if CONFIG_NUMA is enabled) - which can be | 82 | all files in that instance (if CONFIG_NUMA is enabled) - which can be |
83 | adjusted on the fly via 'mount -o remount ...' | 83 | adjusted on the fly via 'mount -o remount ...' |
84 | 84 | ||
85 | mpol=default prefers to allocate memory from the local node | 85 | mpol=default use the process allocation policy |
86 | (see set_mempolicy(2)) | ||
86 | mpol=prefer:Node prefers to allocate memory from the given Node | 87 | mpol=prefer:Node prefers to allocate memory from the given Node |
87 | mpol=bind:NodeList allocates memory only from nodes in NodeList | 88 | mpol=bind:NodeList allocates memory only from nodes in NodeList |
88 | mpol=interleave prefers to allocate from each node in turn | 89 | mpol=interleave prefers to allocate from each node in turn |
89 | mpol=interleave:NodeList allocates from each node of NodeList in turn | 90 | mpol=interleave:NodeList allocates from each node of NodeList in turn |
91 | mpol=local prefers to allocate memory from the local node | ||
90 | 92 | ||
91 | NodeList format is a comma-separated list of decimal numbers and ranges, | 93 | NodeList format is a comma-separated list of decimal numbers and ranges, |
92 | a range being two hyphen-separated decimal numbers, the smallest and | 94 | a range being two hyphen-separated decimal numbers, the smallest and |
93 | largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 | 95 | largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 |
94 | 96 | ||
97 | A memory policy with a valid NodeList will be saved, as specified, for | ||
98 | use at file creation time. When a task allocates a file in the file | ||
99 | system, the mount option memory policy will be applied with a NodeList, | ||
100 | if any, modified by the calling task's cpuset constraints | ||
101 | [See Documentation/cgroups/cpusets.txt] and any optional flags, listed | ||
102 | below. If the resulting NodeLists is the empty set, the effective memory | ||
103 | policy for the file will revert to "default" policy. | ||
104 | |||
95 | NUMA memory allocation policies have optional flags that can be used in | 105 | NUMA memory allocation policies have optional flags that can be used in |
96 | conjunction with their modes. These optional flags can be specified | 106 | conjunction with their modes. These optional flags can be specified |
97 | when tmpfs is mounted by appending them to the mode before the NodeList. | 107 | when tmpfs is mounted by appending them to the mode before the NodeList. |
98 | See Documentation/vm/numa_memory_policy.txt for a list of all available | 108 | See Documentation/vm/numa_memory_policy.txt for a list of all available |
99 | memory allocation policy mode flags. | 109 | memory allocation policy mode flags and their effect on memory policy. |
100 | 110 | ||
101 | =static is equivalent to MPOL_F_STATIC_NODES | 111 | =static is equivalent to MPOL_F_STATIC_NODES |
102 | =relative is equivalent to MPOL_F_RELATIVE_NODES | 112 | =relative is equivalent to MPOL_F_RELATIVE_NODES |
@@ -134,3 +144,5 @@ Author: | |||
134 | Christoph Rohland <cr@sap.com>, 1.12.01 | 144 | Christoph Rohland <cr@sap.com>, 1.12.01 |
135 | Updated: | 145 | Updated: |
136 | Hugh Dickins, 4 June 2007 | 146 | Hugh Dickins, 4 June 2007 |
147 | Updated: | ||
148 | KOSAKI Motohiro, 16 Mar 2010 | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 3de2f32edd90..b66858538df5 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -72,7 +72,7 @@ structure (this is the kernel-side implementation of file | |||
72 | descriptors). The freshly allocated file structure is initialized with | 72 | descriptors). The freshly allocated file structure is initialized with |
73 | a pointer to the dentry and a set of file operation member functions. | 73 | a pointer to the dentry and a set of file operation member functions. |
74 | These are taken from the inode data. The open() file method is then | 74 | These are taken from the inode data. The open() file method is then |
75 | called so the specific filesystem implementation can do it's work. You | 75 | called so the specific filesystem implementation can do its work. You |
76 | can see that this is another switch performed by the VFS. The file | 76 | can see that this is another switch performed by the VFS. The file |
77 | structure is placed into the file descriptor table for the process. | 77 | structure is placed into the file descriptor table for the process. |
78 | 78 | ||
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt new file mode 100644 index 000000000000..d8119e9d2d60 --- /dev/null +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt | |||
@@ -0,0 +1,816 @@ | |||
1 | XFS Delayed Logging Design | ||
2 | -------------------------- | ||
3 | |||
4 | Introduction to Re-logging in XFS | ||
5 | --------------------------------- | ||
6 | |||
7 | XFS logging is a combination of logical and physical logging. Some objects, | ||
8 | such as inodes and dquots, are logged in logical format where the details | ||
9 | logged are made up of the changes to in-core structures rather than on-disk | ||
10 | structures. Other objects - typically buffers - have their physical changes | ||
11 | logged. The reason for these differences is to reduce the amount of log space | ||
12 | required for objects that are frequently logged. Some parts of inodes are more | ||
13 | frequently logged than others, and inodes are typically more frequently logged | ||
14 | than any other object (except maybe the superblock buffer) so keeping the | ||
15 | amount of metadata logged low is of prime importance. | ||
16 | |||
17 | The reason that this is such a concern is that XFS allows multiple separate | ||
18 | modifications to a single object to be carried in the log at any given time. | ||
19 | This allows the log to avoid needing to flush each change to disk before | ||
20 | recording a new change to the object. XFS does this via a method called | ||
21 | "re-logging". Conceptually, this is quite simple - all it requires is that any | ||
22 | new change to the object is recorded with a *new copy* of all the existing | ||
23 | changes in the new transaction that is written to the log. | ||
24 | |||
25 | That is, if we have a sequence of changes A through to F, and the object was | ||
26 | written to disk after change D, we would see in the log the following series | ||
27 | of transactions, their contents and the log sequence number (LSN) of the | ||
28 | transaction: | ||
29 | |||
30 | Transaction Contents LSN | ||
31 | A A X | ||
32 | B A+B X+n | ||
33 | C A+B+C X+n+m | ||
34 | D A+B+C+D X+n+m+o | ||
35 | <object written to disk> | ||
36 | E E Y (> X+n+m+o) | ||
37 | F E+F YÙ+p | ||
38 | |||
39 | In other words, each time an object is relogged, the new transaction contains | ||
40 | the aggregation of all the previous changes currently held only in the log. | ||
41 | |||
42 | This relogging technique also allows objects to be moved forward in the log so | ||
43 | that an object being relogged does not prevent the tail of the log from ever | ||
44 | moving forward. This can be seen in the table above by the changing | ||
45 | (increasing) LSN of each subsquent transaction - the LSN is effectively a | ||
46 | direct encoding of the location in the log of the transaction. | ||
47 | |||
48 | This relogging is also used to implement long-running, multiple-commit | ||
49 | transactions. These transaction are known as rolling transactions, and require | ||
50 | a special log reservation known as a permanent transaction reservation. A | ||
51 | typical example of a rolling transaction is the removal of extents from an | ||
52 | inode which can only be done at a rate of two extents per transaction because | ||
53 | of reservation size limitations. Hence a rolling extent removal transaction | ||
54 | keeps relogging the inode and btree buffers as they get modified in each | ||
55 | removal operation. This keeps them moving forward in the log as the operation | ||
56 | progresses, ensuring that current operation never gets blocked by itself if the | ||
57 | log wraps around. | ||
58 | |||
59 | Hence it can be seen that the relogging operation is fundamental to the correct | ||
60 | working of the XFS journalling subsystem. From the above description, most | ||
61 | people should be able to see why the XFS metadata operations writes so much to | ||
62 | the log - repeated operations to the same objects write the same changes to | ||
63 | the log over and over again. Worse is the fact that objects tend to get | ||
64 | dirtier as they get relogged, so each subsequent transaction is writing more | ||
65 | metadata into the log. | ||
66 | |||
67 | Another feature of the XFS transaction subsystem is that most transactions are | ||
68 | asynchronous. That is, they don't commit to disk until either a log buffer is | ||
69 | filled (a log buffer can hold multiple transactions) or a synchronous operation | ||
70 | forces the log buffers holding the transactions to disk. This means that XFS is | ||
71 | doing aggregation of transactions in memory - batching them, if you like - to | ||
72 | minimise the impact of the log IO on transaction throughput. | ||
73 | |||
74 | The limitation on asynchronous transaction throughput is the number and size of | ||
75 | log buffers made available by the log manager. By default there are 8 log | ||
76 | buffers available and the size of each is 32kB - the size can be increased up | ||
77 | to 256kB by use of a mount option. | ||
78 | |||
79 | Effectively, this gives us the maximum bound of outstanding metadata changes | ||
80 | that can be made to the filesystem at any point in time - if all the log | ||
81 | buffers are full and under IO, then no more transactions can be committed until | ||
82 | the current batch completes. It is now common for a single current CPU core to | ||
83 | be to able to issue enough transactions to keep the log buffers full and under | ||
84 | IO permanently. Hence the XFS journalling subsystem can be considered to be IO | ||
85 | bound. | ||
86 | |||
87 | Delayed Logging: Concepts | ||
88 | ------------------------- | ||
89 | |||
90 | The key thing to note about the asynchronous logging combined with the | ||
91 | relogging technique XFS uses is that we can be relogging changed objects | ||
92 | multiple times before they are committed to disk in the log buffers. If we | ||
93 | return to the previous relogging example, it is entirely possible that | ||
94 | transactions A through D are committed to disk in the same log buffer. | ||
95 | |||
96 | That is, a single log buffer may contain multiple copies of the same object, | ||
97 | but only one of those copies needs to be there - the last one "D", as it | ||
98 | contains all the changes from the previous changes. In other words, we have one | ||
99 | necessary copy in the log buffer, and three stale copies that are simply | ||
100 | wasting space. When we are doing repeated operations on the same set of | ||
101 | objects, these "stale objects" can be over 90% of the space used in the log | ||
102 | buffers. It is clear that reducing the number of stale objects written to the | ||
103 | log would greatly reduce the amount of metadata we write to the log, and this | ||
104 | is the fundamental goal of delayed logging. | ||
105 | |||
106 | From a conceptual point of view, XFS is already doing relogging in memory (where | ||
107 | memory == log buffer), only it is doing it extremely inefficiently. It is using | ||
108 | logical to physical formatting to do the relogging because there is no | ||
109 | infrastructure to keep track of logical changes in memory prior to physically | ||
110 | formatting the changes in a transaction to the log buffer. Hence we cannot avoid | ||
111 | accumulating stale objects in the log buffers. | ||
112 | |||
113 | Delayed logging is the name we've given to keeping and tracking transactional | ||
114 | changes to objects in memory outside the log buffer infrastructure. Because of | ||
115 | the relogging concept fundamental to the XFS journalling subsystem, this is | ||
116 | actually relatively easy to do - all the changes to logged items are already | ||
117 | tracked in the current infrastructure. The big problem is how to accumulate | ||
118 | them and get them to the log in a consistent, recoverable manner. | ||
119 | Describing the problems and how they have been solved is the focus of this | ||
120 | document. | ||
121 | |||
122 | One of the key changes that delayed logging makes to the operation of the | ||
123 | journalling subsystem is that it disassociates the amount of outstanding | ||
124 | metadata changes from the size and number of log buffers available. In other | ||
125 | words, instead of there only being a maximum of 2MB of transaction changes not | ||
126 | written to the log at any point in time, there may be a much greater amount | ||
127 | being accumulated in memory. Hence the potential for loss of metadata on a | ||
128 | crash is much greater than for the existing logging mechanism. | ||
129 | |||
130 | It should be noted that this does not change the guarantee that log recovery | ||
131 | will result in a consistent filesystem. What it does mean is that as far as the | ||
132 | recovered filesystem is concerned, there may be many thousands of transactions | ||
133 | that simply did not occur as a result of the crash. This makes it even more | ||
134 | important that applications that care about their data use fsync() where they | ||
135 | need to ensure application level data integrity is maintained. | ||
136 | |||
137 | It should be noted that delayed logging is not an innovative new concept that | ||
138 | warrants rigorous proofs to determine whether it is correct or not. The method | ||
139 | of accumulating changes in memory for some period before writing them to the | ||
140 | log is used effectively in many filesystems including ext3 and ext4. Hence | ||
141 | no time is spent in this document trying to convince the reader that the | ||
142 | concept is sound. Instead it is simply considered a "solved problem" and as | ||
143 | such implementing it in XFS is purely an exercise in software engineering. | ||
144 | |||
145 | The fundamental requirements for delayed logging in XFS are simple: | ||
146 | |||
147 | 1. Reduce the amount of metadata written to the log by at least | ||
148 | an order of magnitude. | ||
149 | 2. Supply sufficient statistics to validate Requirement #1. | ||
150 | 3. Supply sufficient new tracing infrastructure to be able to debug | ||
151 | problems with the new code. | ||
152 | 4. No on-disk format change (metadata or log format). | ||
153 | 5. Enable and disable with a mount option. | ||
154 | 6. No performance regressions for synchronous transaction workloads. | ||
155 | |||
156 | Delayed Logging: Design | ||
157 | ----------------------- | ||
158 | |||
159 | Storing Changes | ||
160 | |||
161 | The problem with accumulating changes at a logical level (i.e. just using the | ||
162 | existing log item dirty region tracking) is that when it comes to writing the | ||
163 | changes to the log buffers, we need to ensure that the object we are formatting | ||
164 | is not changing while we do this. This requires locking the object to prevent | ||
165 | concurrent modification. Hence flushing the logical changes to the log would | ||
166 | require us to lock every object, format them, and then unlock them again. | ||
167 | |||
168 | This introduces lots of scope for deadlocks with transactions that are already | ||
169 | running. For example, a transaction has object A locked and modified, but needs | ||
170 | the delayed logging tracking lock to commit the transaction. However, the | ||
171 | flushing thread has the delayed logging tracking lock already held, and is | ||
172 | trying to get the lock on object A to flush it to the log buffer. This appears | ||
173 | to be an unsolvable deadlock condition, and it was solving this problem that | ||
174 | was the barrier to implementing delayed logging for so long. | ||
175 | |||
176 | The solution is relatively simple - it just took a long time to recognise it. | ||
177 | Put simply, the current logging code formats the changes to each item into an | ||
178 | vector array that points to the changed regions in the item. The log write code | ||
179 | simply copies the memory these vectors point to into the log buffer during | ||
180 | transaction commit while the item is locked in the transaction. Instead of | ||
181 | using the log buffer as the destination of the formatting code, we can use an | ||
182 | allocated memory buffer big enough to fit the formatted vector. | ||
183 | |||
184 | If we then copy the vector into the memory buffer and rewrite the vector to | ||
185 | point to the memory buffer rather than the object itself, we now have a copy of | ||
186 | the changes in a format that is compatible with the log buffer writing code. | ||
187 | that does not require us to lock the item to access. This formatting and | ||
188 | rewriting can all be done while the object is locked during transaction commit, | ||
189 | resulting in a vector that is transactionally consistent and can be accessed | ||
190 | without needing to lock the owning item. | ||
191 | |||
192 | Hence we avoid the need to lock items when we need to flush outstanding | ||
193 | asynchronous transactions to the log. The differences between the existing | ||
194 | formatting method and the delayed logging formatting can be seen in the | ||
195 | diagram below. | ||
196 | |||
197 | Current format log vector: | ||
198 | |||
199 | Object +---------------------------------------------+ | ||
200 | Vector 1 +----+ | ||
201 | Vector 2 +----+ | ||
202 | Vector 3 +----------+ | ||
203 | |||
204 | After formatting: | ||
205 | |||
206 | Log Buffer +-V1-+-V2-+----V3----+ | ||
207 | |||
208 | Delayed logging vector: | ||
209 | |||
210 | Object +---------------------------------------------+ | ||
211 | Vector 1 +----+ | ||
212 | Vector 2 +----+ | ||
213 | Vector 3 +----------+ | ||
214 | |||
215 | After formatting: | ||
216 | |||
217 | Memory Buffer +-V1-+-V2-+----V3----+ | ||
218 | Vector 1 +----+ | ||
219 | Vector 2 +----+ | ||
220 | Vector 3 +----------+ | ||
221 | |||
222 | The memory buffer and associated vector need to be passed as a single object, | ||
223 | but still need to be associated with the parent object so if the object is | ||
224 | relogged we can replace the current memory buffer with a new memory buffer that | ||
225 | contains the latest changes. | ||
226 | |||
227 | The reason for keeping the vector around after we've formatted the memory | ||
228 | buffer is to support splitting vectors across log buffer boundaries correctly. | ||
229 | If we don't keep the vector around, we do not know where the region boundaries | ||
230 | are in the item, so we'd need a new encapsulation method for regions in the log | ||
231 | buffer writing (i.e. double encapsulation). This would be an on-disk format | ||
232 | change and as such is not desirable. It also means we'd have to write the log | ||
233 | region headers in the formatting stage, which is problematic as there is per | ||
234 | region state that needs to be placed into the headers during the log write. | ||
235 | |||
236 | Hence we need to keep the vector, but by attaching the memory buffer to it and | ||
237 | rewriting the vector addresses to point at the memory buffer we end up with a | ||
238 | self-describing object that can be passed to the log buffer write code to be | ||
239 | handled in exactly the same manner as the existing log vectors are handled. | ||
240 | Hence we avoid needing a new on-disk format to handle items that have been | ||
241 | relogged in memory. | ||
242 | |||
243 | |||
244 | Tracking Changes | ||
245 | |||
246 | Now that we can record transactional changes in memory in a form that allows | ||
247 | them to be used without limitations, we need to be able to track and accumulate | ||
248 | them so that they can be written to the log at some later point in time. The | ||
249 | log item is the natural place to store this vector and buffer, and also makes sense | ||
250 | to be the object that is used to track committed objects as it will always | ||
251 | exist once the object has been included in a transaction. | ||
252 | |||
253 | The log item is already used to track the log items that have been written to | ||
254 | the log but not yet written to disk. Such log items are considered "active" | ||
255 | and as such are stored in the Active Item List (AIL) which is a LSN-ordered | ||
256 | double linked list. Items are inserted into this list during log buffer IO | ||
257 | completion, after which they are unpinned and can be written to disk. An object | ||
258 | that is in the AIL can be relogged, which causes the object to be pinned again | ||
259 | and then moved forward in the AIL when the log buffer IO completes for that | ||
260 | transaction. | ||
261 | |||
262 | Essentially, this shows that an item that is in the AIL can still be modified | ||
263 | and relogged, so any tracking must be separate to the AIL infrastructure. As | ||
264 | such, we cannot reuse the AIL list pointers for tracking committed items, nor | ||
265 | can we store state in any field that is protected by the AIL lock. Hence the | ||
266 | committed item tracking needs it's own locks, lists and state fields in the log | ||
267 | item. | ||
268 | |||
269 | Similar to the AIL, tracking of committed items is done through a new list | ||
270 | called the Committed Item List (CIL). The list tracks log items that have been | ||
271 | committed and have formatted memory buffers attached to them. It tracks objects | ||
272 | in transaction commit order, so when an object is relogged it is removed from | ||
273 | it's place in the list and re-inserted at the tail. This is entirely arbitrary | ||
274 | and done to make it easy for debugging - the last items in the list are the | ||
275 | ones that are most recently modified. Ordering of the CIL is not necessary for | ||
276 | transactional integrity (as discussed in the next section) so the ordering is | ||
277 | done for convenience/sanity of the developers. | ||
278 | |||
279 | |||
280 | Delayed Logging: Checkpoints | ||
281 | |||
282 | When we have a log synchronisation event, commonly known as a "log force", | ||
283 | all the items in the CIL must be written into the log via the log buffers. | ||
284 | We need to write these items in the order that they exist in the CIL, and they | ||
285 | need to be written as an atomic transaction. The need for all the objects to be | ||
286 | written as an atomic transaction comes from the requirements of relogging and | ||
287 | log replay - all the changes in all the objects in a given transaction must | ||
288 | either be completely replayed during log recovery, or not replayed at all. If | ||
289 | a transaction is not replayed because it is not complete in the log, then | ||
290 | no later transactions should be replayed, either. | ||
291 | |||
292 | To fulfill this requirement, we need to write the entire CIL in a single log | ||
293 | transaction. Fortunately, the XFS log code has no fixed limit on the size of a | ||
294 | transaction, nor does the log replay code. The only fundamental limit is that | ||
295 | the transaction cannot be larger than just under half the size of the log. The | ||
296 | reason for this limit is that to find the head and tail of the log, there must | ||
297 | be at least one complete transaction in the log at any given time. If a | ||
298 | transaction is larger than half the log, then there is the possibility that a | ||
299 | crash during the write of a such a transaction could partially overwrite the | ||
300 | only complete previous transaction in the log. This will result in a recovery | ||
301 | failure and an inconsistent filesystem and hence we must enforce the maximum | ||
302 | size of a checkpoint to be slightly less than a half the log. | ||
303 | |||
304 | Apart from this size requirement, a checkpoint transaction looks no different | ||
305 | to any other transaction - it contains a transaction header, a series of | ||
306 | formatted log items and a commit record at the tail. From a recovery | ||
307 | perspective, the checkpoint transaction is also no different - just a lot | ||
308 | bigger with a lot more items in it. The worst case effect of this is that we | ||
309 | might need to tune the recovery transaction object hash size. | ||
310 | |||
311 | Because the checkpoint is just another transaction and all the changes to log | ||
312 | items are stored as log vectors, we can use the existing log buffer writing | ||
313 | code to write the changes into the log. To do this efficiently, we need to | ||
314 | minimise the time we hold the CIL locked while writing the checkpoint | ||
315 | transaction. The current log write code enables us to do this easily with the | ||
316 | way it separates the writing of the transaction contents (the log vectors) from | ||
317 | the transaction commit record, but tracking this requires us to have a | ||
318 | per-checkpoint context that travels through the log write process through to | ||
319 | checkpoint completion. | ||
320 | |||
321 | Hence a checkpoint has a context that tracks the state of the current | ||
322 | checkpoint from initiation to checkpoint completion. A new context is initiated | ||
323 | at the same time a checkpoint transaction is started. That is, when we remove | ||
324 | all the current items from the CIL during a checkpoint operation, we move all | ||
325 | those changes into the current checkpoint context. We then initialise a new | ||
326 | context and attach that to the CIL for aggregation of new transactions. | ||
327 | |||
328 | This allows us to unlock the CIL immediately after transfer of all the | ||
329 | committed items and effectively allow new transactions to be issued while we | ||
330 | are formatting the checkpoint into the log. It also allows concurrent | ||
331 | checkpoints to be written into the log buffers in the case of log force heavy | ||
332 | workloads, just like the existing transaction commit code does. This, however, | ||
333 | requires that we strictly order the commit records in the log so that | ||
334 | checkpoint sequence order is maintained during log replay. | ||
335 | |||
336 | To ensure that we can be writing an item into a checkpoint transaction at | ||
337 | the same time another transaction modifies the item and inserts the log item | ||
338 | into the new CIL, then checkpoint transaction commit code cannot use log items | ||
339 | to store the list of log vectors that need to be written into the transaction. | ||
340 | Hence log vectors need to be able to be chained together to allow them to be | ||
341 | detatched from the log items. That is, when the CIL is flushed the memory | ||
342 | buffer and log vector attached to each log item needs to be attached to the | ||
343 | checkpoint context so that the log item can be released. In diagrammatic form, | ||
344 | the CIL would look like this before the flush: | ||
345 | |||
346 | CIL Head | ||
347 | | | ||
348 | V | ||
349 | Log Item <-> log vector 1 -> memory buffer | ||
350 | | -> vector array | ||
351 | V | ||
352 | Log Item <-> log vector 2 -> memory buffer | ||
353 | | -> vector array | ||
354 | V | ||
355 | ...... | ||
356 | | | ||
357 | V | ||
358 | Log Item <-> log vector N-1 -> memory buffer | ||
359 | | -> vector array | ||
360 | V | ||
361 | Log Item <-> log vector N -> memory buffer | ||
362 | -> vector array | ||
363 | |||
364 | And after the flush the CIL head is empty, and the checkpoint context log | ||
365 | vector list would look like: | ||
366 | |||
367 | Checkpoint Context | ||
368 | | | ||
369 | V | ||
370 | log vector 1 -> memory buffer | ||
371 | | -> vector array | ||
372 | | -> Log Item | ||
373 | V | ||
374 | log vector 2 -> memory buffer | ||
375 | | -> vector array | ||
376 | | -> Log Item | ||
377 | V | ||
378 | ...... | ||
379 | | | ||
380 | V | ||
381 | log vector N-1 -> memory buffer | ||
382 | | -> vector array | ||
383 | | -> Log Item | ||
384 | V | ||
385 | log vector N -> memory buffer | ||
386 | -> vector array | ||
387 | -> Log Item | ||
388 | |||
389 | Once this transfer is done, the CIL can be unlocked and new transactions can | ||
390 | start, while the checkpoint flush code works over the log vector chain to | ||
391 | commit the checkpoint. | ||
392 | |||
393 | Once the checkpoint is written into the log buffers, the checkpoint context is | ||
394 | attached to the log buffer that the commit record was written to along with a | ||
395 | completion callback. Log IO completion will call that callback, which can then | ||
396 | run transaction committed processing for the log items (i.e. insert into AIL | ||
397 | and unpin) in the log vector chain and then free the log vector chain and | ||
398 | checkpoint context. | ||
399 | |||
400 | Discussion Point: I am uncertain as to whether the log item is the most | ||
401 | efficient way to track vectors, even though it seems like the natural way to do | ||
402 | it. The fact that we walk the log items (in the CIL) just to chain the log | ||
403 | vectors and break the link between the log item and the log vector means that | ||
404 | we take a cache line hit for the log item list modification, then another for | ||
405 | the log vector chaining. If we track by the log vectors, then we only need to | ||
406 | break the link between the log item and the log vector, which means we should | ||
407 | dirty only the log item cachelines. Normally I wouldn't be concerned about one | ||
408 | vs two dirty cachelines except for the fact I've seen upwards of 80,000 log | ||
409 | vectors in one checkpoint transaction. I'd guess this is a "measure and | ||
410 | compare" situation that can be done after a working and reviewed implementation | ||
411 | is in the dev tree.... | ||
412 | |||
413 | Delayed Logging: Checkpoint Sequencing | ||
414 | |||
415 | One of the key aspects of the XFS transaction subsystem is that it tags | ||
416 | committed transactions with the log sequence number of the transaction commit. | ||
417 | This allows transactions to be issued asynchronously even though there may be | ||
418 | future operations that cannot be completed until that transaction is fully | ||
419 | committed to the log. In the rare case that a dependent operation occurs (e.g. | ||
420 | re-using a freed metadata extent for a data extent), a special, optimised log | ||
421 | force can be issued to force the dependent transaction to disk immediately. | ||
422 | |||
423 | To do this, transactions need to record the LSN of the commit record of the | ||
424 | transaction. This LSN comes directly from the log buffer the transaction is | ||
425 | written into. While this works just fine for the existing transaction | ||
426 | mechanism, it does not work for delayed logging because transactions are not | ||
427 | written directly into the log buffers. Hence some other method of sequencing | ||
428 | transactions is required. | ||
429 | |||
430 | As discussed in the checkpoint section, delayed logging uses per-checkpoint | ||
431 | contexts, and as such it is simple to assign a sequence number to each | ||
432 | checkpoint. Because the switching of checkpoint contexts must be done | ||
433 | atomically, it is simple to ensure that each new context has a monotonically | ||
434 | increasing sequence number assigned to it without the need for an external | ||
435 | atomic counter - we can just take the current context sequence number and add | ||
436 | one to it for the new context. | ||
437 | |||
438 | Then, instead of assigning a log buffer LSN to the transaction commit LSN | ||
439 | during the commit, we can assign the current checkpoint sequence. This allows | ||
440 | operations that track transactions that have not yet completed know what | ||
441 | checkpoint sequence needs to be committed before they can continue. As a | ||
442 | result, the code that forces the log to a specific LSN now needs to ensure that | ||
443 | the log forces to a specific checkpoint. | ||
444 | |||
445 | To ensure that we can do this, we need to track all the checkpoint contexts | ||
446 | that are currently committing to the log. When we flush a checkpoint, the | ||
447 | context gets added to a "committing" list which can be searched. When a | ||
448 | checkpoint commit completes, it is removed from the committing list. Because | ||
449 | the checkpoint context records the LSN of the commit record for the checkpoint, | ||
450 | we can also wait on the log buffer that contains the commit record, thereby | ||
451 | using the existing log force mechanisms to execute synchronous forces. | ||
452 | |||
453 | It should be noted that the synchronous forces may need to be extended with | ||
454 | mitigation algorithms similar to the current log buffer code to allow | ||
455 | aggregation of multiple synchronous transactions if there are already | ||
456 | synchronous transactions being flushed. Investigation of the performance of the | ||
457 | current design is needed before making any decisions here. | ||
458 | |||
459 | The main concern with log forces is to ensure that all the previous checkpoints | ||
460 | are also committed to disk before the one we need to wait for. Therefore we | ||
461 | need to check that all the prior contexts in the committing list are also | ||
462 | complete before waiting on the one we need to complete. We do this | ||
463 | synchronisation in the log force code so that we don't need to wait anywhere | ||
464 | else for such serialisation - it only matters when we do a log force. | ||
465 | |||
466 | The only remaining complexity is that a log force now also has to handle the | ||
467 | case where the forcing sequence number is the same as the current context. That | ||
468 | is, we need to flush the CIL and potentially wait for it to complete. This is a | ||
469 | simple addition to the existing log forcing code to check the sequence numbers | ||
470 | and push if required. Indeed, placing the current sequence checkpoint flush in | ||
471 | the log force code enables the current mechanism for issuing synchronous | ||
472 | transactions to remain untouched (i.e. commit an asynchronous transaction, then | ||
473 | force the log at the LSN of that transaction) and so the higher level code | ||
474 | behaves the same regardless of whether delayed logging is being used or not. | ||
475 | |||
476 | Delayed Logging: Checkpoint Log Space Accounting | ||
477 | |||
478 | The big issue for a checkpoint transaction is the log space reservation for the | ||
479 | transaction. We don't know how big a checkpoint transaction is going to be | ||
480 | ahead of time, nor how many log buffers it will take to write out, nor the | ||
481 | number of split log vector regions are going to be used. We can track the | ||
482 | amount of log space required as we add items to the commit item list, but we | ||
483 | still need to reserve the space in the log for the checkpoint. | ||
484 | |||
485 | A typical transaction reserves enough space in the log for the worst case space | ||
486 | usage of the transaction. The reservation accounts for log record headers, | ||
487 | transaction and region headers, headers for split regions, buffer tail padding, | ||
488 | etc. as well as the actual space for all the changed metadata in the | ||
489 | transaction. While some of this is fixed overhead, much of it is dependent on | ||
490 | the size of the transaction and the number of regions being logged (the number | ||
491 | of log vectors in the transaction). | ||
492 | |||
493 | An example of the differences would be logging directory changes versus logging | ||
494 | inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then | ||
495 | there are lots of transactions that only contain an inode core and an inode log | ||
496 | format structure. That is, two vectors totaling roughly 150 bytes. If we modify | ||
497 | 10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each | ||
498 | vector is 12 bytes, so the total to be logged is approximately 1.75MB. In | ||
499 | comparison, if we are logging full directory buffers, they are typically 4KB | ||
500 | each, so we in 1.5MB of directory buffers we'd have roughly 400 buffers and a | ||
501 | buffer format structure for each buffer - roughly 800 vectors or 1.51MB total | ||
502 | space. From this, it should be obvious that a static log space reservation is | ||
503 | not particularly flexible and is difficult to select the "optimal value" for | ||
504 | all workloads. | ||
505 | |||
506 | Further, if we are going to use a static reservation, which bit of the entire | ||
507 | reservation does it cover? We account for space used by the transaction | ||
508 | reservation by tracking the space currently used by the object in the CIL and | ||
509 | then calculating the increase or decrease in space used as the object is | ||
510 | relogged. This allows for a checkpoint reservation to only have to account for | ||
511 | log buffer metadata used such as log header records. | ||
512 | |||
513 | However, even using a static reservation for just the log metadata is | ||
514 | problematic. Typically log record headers use at least 16KB of log space per | ||
515 | 1MB of log space consumed (512 bytes per 32k) and the reservation needs to be | ||
516 | large enough to handle arbitrary sized checkpoint transactions. This | ||
517 | reservation needs to be made before the checkpoint is started, and we need to | ||
518 | be able to reserve the space without sleeping. For a 8MB checkpoint, we need a | ||
519 | reservation of around 150KB, which is a non-trivial amount of space. | ||
520 | |||
521 | A static reservation needs to manipulate the log grant counters - we can take a | ||
522 | permanent reservation on the space, but we still need to make sure we refresh | ||
523 | the write reservation (the actual space available to the transaction) after | ||
524 | every checkpoint transaction completion. Unfortunately, if this space is not | ||
525 | available when required, then the regrant code will sleep waiting for it. | ||
526 | |||
527 | The problem with this is that it can lead to deadlocks as we may need to commit | ||
528 | checkpoints to be able to free up log space (refer back to the description of | ||
529 | rolling transactions for an example of this). Hence we *must* always have | ||
530 | space available in the log if we are to use static reservations, and that is | ||
531 | very difficult and complex to arrange. It is possible to do, but there is a | ||
532 | simpler way. | ||
533 | |||
534 | The simpler way of doing this is tracking the entire log space used by the | ||
535 | items in the CIL and using this to dynamically calculate the amount of log | ||
536 | space required by the log metadata. If this log metadata space changes as a | ||
537 | result of a transaction commit inserting a new memory buffer into the CIL, then | ||
538 | the difference in space required is removed from the transaction that causes | ||
539 | the change. Transactions at this level will *always* have enough space | ||
540 | available in their reservation for this as they have already reserved the | ||
541 | maximal amount of log metadata space they require, and such a delta reservation | ||
542 | will always be less than or equal to the maximal amount in the reservation. | ||
543 | |||
544 | Hence we can grow the checkpoint transaction reservation dynamically as items | ||
545 | are added to the CIL and avoid the need for reserving and regranting log space | ||
546 | up front. This avoids deadlocks and removes a blocking point from the | ||
547 | checkpoint flush code. | ||
548 | |||
549 | As mentioned early, transactions can't grow to more than half the size of the | ||
550 | log. Hence as part of the reservation growing, we need to also check the size | ||
551 | of the reservation against the maximum allowed transaction size. If we reach | ||
552 | the maximum threshold, we need to push the CIL to the log. This is effectively | ||
553 | a "background flush" and is done on demand. This is identical to | ||
554 | a CIL push triggered by a log force, only that there is no waiting for the | ||
555 | checkpoint commit to complete. This background push is checked and executed by | ||
556 | transaction commit code. | ||
557 | |||
558 | If the transaction subsystem goes idle while we still have items in the CIL, | ||
559 | they will be flushed by the periodic log force issued by the xfssyncd. This log | ||
560 | force will push the CIL to disk, and if the transaction subsystem stays idle, | ||
561 | allow the idle log to be covered (effectively marked clean) in exactly the same | ||
562 | manner that is done for the existing logging method. A discussion point is | ||
563 | whether this log force needs to be done more frequently than the current rate | ||
564 | which is once every 30s. | ||
565 | |||
566 | |||
567 | Delayed Logging: Log Item Pinning | ||
568 | |||
569 | Currently log items are pinned during transaction commit while the items are | ||
570 | still locked. This happens just after the items are formatted, though it could | ||
571 | be done any time before the items are unlocked. The result of this mechanism is | ||
572 | that items get pinned once for every transaction that is committed to the log | ||
573 | buffers. Hence items that are relogged in the log buffers will have a pin count | ||
574 | for every outstanding transaction they were dirtied in. When each of these | ||
575 | transactions is completed, they will unpin the item once. As a result, the item | ||
576 | only becomes unpinned when all the transactions complete and there are no | ||
577 | pending transactions. Thus the pinning and unpinning of a log item is symmetric | ||
578 | as there is a 1:1 relationship with transaction commit and log item completion. | ||
579 | |||
580 | For delayed logging, however, we have an assymetric transaction commit to | ||
581 | completion relationship. Every time an object is relogged in the CIL it goes | ||
582 | through the commit process without a corresponding completion being registered. | ||
583 | That is, we now have a many-to-one relationship between transaction commit and | ||
584 | log item completion. The result of this is that pinning and unpinning of the | ||
585 | log items becomes unbalanced if we retain the "pin on transaction commit, unpin | ||
586 | on transaction completion" model. | ||
587 | |||
588 | To keep pin/unpin symmetry, the algorithm needs to change to a "pin on | ||
589 | insertion into the CIL, unpin on checkpoint completion". In other words, the | ||
590 | pinning and unpinning becomes symmetric around a checkpoint context. We have to | ||
591 | pin the object the first time it is inserted into the CIL - if it is already in | ||
592 | the CIL during a transaction commit, then we do not pin it again. Because there | ||
593 | can be multiple outstanding checkpoint contexts, we can still see elevated pin | ||
594 | counts, but as each checkpoint completes the pin count will retain the correct | ||
595 | value according to it's context. | ||
596 | |||
597 | Just to make matters more slightly more complex, this checkpoint level context | ||
598 | for the pin count means that the pinning of an item must take place under the | ||
599 | CIL commit/flush lock. If we pin the object outside this lock, we cannot | ||
600 | guarantee which context the pin count is associated with. This is because of | ||
601 | the fact pinning the item is dependent on whether the item is present in the | ||
602 | current CIL or not. If we don't pin the CIL first before we check and pin the | ||
603 | object, we have a race with CIL being flushed between the check and the pin | ||
604 | (or not pinning, as the case may be). Hence we must hold the CIL flush/commit | ||
605 | lock to guarantee that we pin the items correctly. | ||
606 | |||
607 | Delayed Logging: Concurrent Scalability | ||
608 | |||
609 | A fundamental requirement for the CIL is that accesses through transaction | ||
610 | commits must scale to many concurrent commits. The current transaction commit | ||
611 | code does not break down even when there are transactions coming from 2048 | ||
612 | processors at once. The current transaction code does not go any faster than if | ||
613 | there was only one CPU using it, but it does not slow down either. | ||
614 | |||
615 | As a result, the delayed logging transaction commit code needs to be designed | ||
616 | for concurrency from the ground up. It is obvious that there are serialisation | ||
617 | points in the design - the three important ones are: | ||
618 | |||
619 | 1. Locking out new transaction commits while flushing the CIL | ||
620 | 2. Adding items to the CIL and updating item space accounting | ||
621 | 3. Checkpoint commit ordering | ||
622 | |||
623 | Looking at the transaction commit and CIL flushing interactions, it is clear | ||
624 | that we have a many-to-one interaction here. That is, the only restriction on | ||
625 | the number of concurrent transactions that can be trying to commit at once is | ||
626 | the amount of space available in the log for their reservations. The practical | ||
627 | limit here is in the order of several hundred concurrent transactions for a | ||
628 | 128MB log, which means that it is generally one per CPU in a machine. | ||
629 | |||
630 | The amount of time a transaction commit needs to hold out a flush is a | ||
631 | relatively long period of time - the pinning of log items needs to be done | ||
632 | while we are holding out a CIL flush, so at the moment that means it is held | ||
633 | across the formatting of the objects into memory buffers (i.e. while memcpy()s | ||
634 | are in progress). Ultimately a two pass algorithm where the formatting is done | ||
635 | separately to the pinning of objects could be used to reduce the hold time of | ||
636 | the transaction commit side. | ||
637 | |||
638 | Because of the number of potential transaction commit side holders, the lock | ||
639 | really needs to be a sleeping lock - if the CIL flush takes the lock, we do not | ||
640 | want every other CPU in the machine spinning on the CIL lock. Given that | ||
641 | flushing the CIL could involve walking a list of tens of thousands of log | ||
642 | items, it will get held for a significant time and so spin contention is a | ||
643 | significant concern. Preventing lots of CPUs spinning doing nothing is the | ||
644 | main reason for choosing a sleeping lock even though nothing in either the | ||
645 | transaction commit or CIL flush side sleeps with the lock held. | ||
646 | |||
647 | It should also be noted that CIL flushing is also a relatively rare operation | ||
648 | compared to transaction commit for asynchronous transaction workloads - only | ||
649 | time will tell if using a read-write semaphore for exclusion will limit | ||
650 | transaction commit concurrency due to cache line bouncing of the lock on the | ||
651 | read side. | ||
652 | |||
653 | The second serialisation point is on the transaction commit side where items | ||
654 | are inserted into the CIL. Because transactions can enter this code | ||
655 | concurrently, the CIL needs to be protected separately from the above | ||
656 | commit/flush exclusion. It also needs to be an exclusive lock but it is only | ||
657 | held for a very short time and so a spin lock is appropriate here. It is | ||
658 | possible that this lock will become a contention point, but given the short | ||
659 | hold time once per transaction I think that contention is unlikely. | ||
660 | |||
661 | The final serialisation point is the checkpoint commit record ordering code | ||
662 | that is run as part of the checkpoint commit and log force sequencing. The code | ||
663 | path that triggers a CIL flush (i.e. whatever triggers the log force) will enter | ||
664 | an ordering loop after writing all the log vectors into the log buffers but | ||
665 | before writing the commit record. This loop walks the list of committing | ||
666 | checkpoints and needs to block waiting for checkpoints to complete their commit | ||
667 | record write. As a result it needs a lock and a wait variable. Log force | ||
668 | sequencing also requires the same lock, list walk, and blocking mechanism to | ||
669 | ensure completion of checkpoints. | ||
670 | |||
671 | These two sequencing operations can use the mechanism even though the | ||
672 | events they are waiting for are different. The checkpoint commit record | ||
673 | sequencing needs to wait until checkpoint contexts contain a commit LSN | ||
674 | (obtained through completion of a commit record write) while log force | ||
675 | sequencing needs to wait until previous checkpoint contexts are removed from | ||
676 | the committing list (i.e. they've completed). A simple wait variable and | ||
677 | broadcast wakeups (thundering herds) has been used to implement these two | ||
678 | serialisation queues. They use the same lock as the CIL, too. If we see too | ||
679 | much contention on the CIL lock, or too many context switches as a result of | ||
680 | the broadcast wakeups these operations can be put under a new spinlock and | ||
681 | given separate wait lists to reduce lock contention and the number of processes | ||
682 | woken by the wrong event. | ||
683 | |||
684 | |||
685 | Lifecycle Changes | ||
686 | |||
687 | The existing log item life cycle is as follows: | ||
688 | |||
689 | 1. Transaction allocate | ||
690 | 2. Transaction reserve | ||
691 | 3. Lock item | ||
692 | 4. Join item to transaction | ||
693 | If not already attached, | ||
694 | Allocate log item | ||
695 | Attach log item to owner item | ||
696 | Attach log item to transaction | ||
697 | 5. Modify item | ||
698 | Record modifications in log item | ||
699 | 6. Transaction commit | ||
700 | Pin item in memory | ||
701 | Format item into log buffer | ||
702 | Write commit LSN into transaction | ||
703 | Unlock item | ||
704 | Attach transaction to log buffer | ||
705 | |||
706 | <log buffer IO dispatched> | ||
707 | <log buffer IO completes> | ||
708 | |||
709 | 7. Transaction completion | ||
710 | Mark log item committed | ||
711 | Insert log item into AIL | ||
712 | Write commit LSN into log item | ||
713 | Unpin log item | ||
714 | 8. AIL traversal | ||
715 | Lock item | ||
716 | Mark log item clean | ||
717 | Flush item to disk | ||
718 | |||
719 | <item IO completion> | ||
720 | |||
721 | 9. Log item removed from AIL | ||
722 | Moves log tail | ||
723 | Item unlocked | ||
724 | |||
725 | Essentially, steps 1-6 operate independently from step 7, which is also | ||
726 | independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9 | ||
727 | at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur | ||
728 | at the same time. If the log item is in the AIL or between steps 6 and 7 | ||
729 | and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9 | ||
730 | are entered and completed is the object considered clean. | ||
731 | |||
732 | With delayed logging, there are new steps inserted into the life cycle: | ||
733 | |||
734 | 1. Transaction allocate | ||
735 | 2. Transaction reserve | ||
736 | 3. Lock item | ||
737 | 4. Join item to transaction | ||
738 | If not already attached, | ||
739 | Allocate log item | ||
740 | Attach log item to owner item | ||
741 | Attach log item to transaction | ||
742 | 5. Modify item | ||
743 | Record modifications in log item | ||
744 | 6. Transaction commit | ||
745 | Pin item in memory if not pinned in CIL | ||
746 | Format item into log vector + buffer | ||
747 | Attach log vector and buffer to log item | ||
748 | Insert log item into CIL | ||
749 | Write CIL context sequence into transaction | ||
750 | Unlock item | ||
751 | |||
752 | <next log force> | ||
753 | |||
754 | 7. CIL push | ||
755 | lock CIL flush | ||
756 | Chain log vectors and buffers together | ||
757 | Remove items from CIL | ||
758 | unlock CIL flush | ||
759 | write log vectors into log | ||
760 | sequence commit records | ||
761 | attach checkpoint context to log buffer | ||
762 | |||
763 | <log buffer IO dispatched> | ||
764 | <log buffer IO completes> | ||
765 | |||
766 | 8. Checkpoint completion | ||
767 | Mark log item committed | ||
768 | Insert item into AIL | ||
769 | Write commit LSN into log item | ||
770 | Unpin log item | ||
771 | 9. AIL traversal | ||
772 | Lock item | ||
773 | Mark log item clean | ||
774 | Flush item to disk | ||
775 | <item IO completion> | ||
776 | 10. Log item removed from AIL | ||
777 | Moves log tail | ||
778 | Item unlocked | ||
779 | |||
780 | From this, it can be seen that the only life cycle differences between the two | ||
781 | logging methods are in the middle of the life cycle - they still have the same | ||
782 | beginning and end and execution constraints. The only differences are in the | ||
783 | commiting of the log items to the log itself and the completion processing. | ||
784 | Hence delayed logging should not introduce any constraints on log item | ||
785 | behaviour, allocation or freeing that don't already exist. | ||
786 | |||
787 | As a result of this zero-impact "insertion" of delayed logging infrastructure | ||
788 | and the design of the internal structures to avoid on disk format changes, we | ||
789 | can basically switch between delayed logging and the existing mechanism with a | ||
790 | mount option. Fundamentally, there is no reason why the log manager would not | ||
791 | be able to swap methods automatically and transparently depending on load | ||
792 | characteristics, but this should not be necessary if delayed logging works as | ||
793 | designed. | ||
794 | |||
795 | Roadmap: | ||
796 | |||
797 | 2.6.35 Inclusion in mainline as an experimental mount option | ||
798 | => approximately 2-3 months to merge window | ||
799 | => needs to be in xfs-dev tree in 4-6 weeks | ||
800 | => code is nearing readiness for review | ||
801 | |||
802 | 2.6.37 Remove experimental tag from mount option | ||
803 | => should be roughly 6 months after initial merge | ||
804 | => enough time to: | ||
805 | => gain confidence and fix problems reported by early | ||
806 | adopters (a.k.a. guinea pigs) | ||
807 | => address worst performance regressions and undesired | ||
808 | behaviours | ||
809 | => start tuning/optimising code for parallelism | ||
810 | => start tuning/optimising algorithms consuming | ||
811 | excessive CPU time | ||
812 | |||
813 | 2.6.39 Switch default mount option to use delayed logging | ||
814 | => should be roughly 12 months after initial merge | ||
815 | => enough time to shake out remaining problems before next round of | ||
816 | enterprise distro kernel rebases | ||
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index 1866c27eec69..c2c6e9b39bbe 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt | |||
@@ -253,6 +253,70 @@ pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown). | |||
253 | Also note that it's your responsibility to have stopped using a GPIO | 253 | Also note that it's your responsibility to have stopped using a GPIO |
254 | before you free it. | 254 | before you free it. |
255 | 255 | ||
256 | Considering in most cases GPIOs are actually configured right after they | ||
257 | are claimed, three additional calls are defined: | ||
258 | |||
259 | /* request a single GPIO, with initial configuration specified by | ||
260 | * 'flags', identical to gpio_request() wrt other arguments and | ||
261 | * return value | ||
262 | */ | ||
263 | int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); | ||
264 | |||
265 | /* request multiple GPIOs in a single call | ||
266 | */ | ||
267 | int gpio_request_array(struct gpio *array, size_t num); | ||
268 | |||
269 | /* release multiple GPIOs in a single call | ||
270 | */ | ||
271 | void gpio_free_array(struct gpio *array, size_t num); | ||
272 | |||
273 | where 'flags' is currently defined to specify the following properties: | ||
274 | |||
275 | * GPIOF_DIR_IN - to configure direction as input | ||
276 | * GPIOF_DIR_OUT - to configure direction as output | ||
277 | |||
278 | * GPIOF_INIT_LOW - as output, set initial level to LOW | ||
279 | * GPIOF_INIT_HIGH - as output, set initial level to HIGH | ||
280 | |||
281 | since GPIOF_INIT_* are only valid when configured as output, so group valid | ||
282 | combinations as: | ||
283 | |||
284 | * GPIOF_IN - configure as input | ||
285 | * GPIOF_OUT_INIT_LOW - configured as output, initial level LOW | ||
286 | * GPIOF_OUT_INIT_HIGH - configured as output, initial level HIGH | ||
287 | |||
288 | In the future, these flags can be extended to support more properties such | ||
289 | as open-drain status. | ||
290 | |||
291 | Further more, to ease the claim/release of multiple GPIOs, 'struct gpio' is | ||
292 | introduced to encapsulate all three fields as: | ||
293 | |||
294 | struct gpio { | ||
295 | unsigned gpio; | ||
296 | unsigned long flags; | ||
297 | const char *label; | ||
298 | }; | ||
299 | |||
300 | A typical example of usage: | ||
301 | |||
302 | static struct gpio leds_gpios[] = { | ||
303 | { 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* default to ON */ | ||
304 | { 33, GPIOF_OUT_INIT_LOW, "Green LED" }, /* default to OFF */ | ||
305 | { 34, GPIOF_OUT_INIT_LOW, "Red LED" }, /* default to OFF */ | ||
306 | { 35, GPIOF_OUT_INIT_LOW, "Blue LED" }, /* default to OFF */ | ||
307 | { ... }, | ||
308 | }; | ||
309 | |||
310 | err = gpio_request_one(31, GPIOF_IN, "Reset Button"); | ||
311 | if (err) | ||
312 | ... | ||
313 | |||
314 | err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios)); | ||
315 | if (err) | ||
316 | ... | ||
317 | |||
318 | gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios)); | ||
319 | |||
256 | 320 | ||
257 | GPIOs mapped to IRQs | 321 | GPIOs mapped to IRQs |
258 | -------------------- | 322 | -------------------- |
diff --git a/Documentation/hwmon/abituguru b/Documentation/hwmon/abituguru index 87ffa0f5ec70..5eb3b9d5f0d5 100644 --- a/Documentation/hwmon/abituguru +++ b/Documentation/hwmon/abituguru | |||
@@ -30,7 +30,7 @@ Supported chips: | |||
30 | bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1 | 30 | bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1 |
31 | You may also need to specify the fan_sensors option for these boards | 31 | You may also need to specify the fan_sensors option for these boards |
32 | fan_sensors=5 | 32 | fan_sensors=5 |
33 | 2) There is a seperate abituguru3 driver for these motherboards, | 33 | 2) There is a separate abituguru3 driver for these motherboards, |
34 | the abituguru (without the 3 !) driver will not work on these | 34 | the abituguru (without the 3 !) driver will not work on these |
35 | motherboards (and visa versa)! | 35 | motherboards (and visa versa)! |
36 | 36 | ||
diff --git a/Documentation/hwmon/adt7411 b/Documentation/hwmon/adt7411 new file mode 100644 index 000000000000..1632960f9745 --- /dev/null +++ b/Documentation/hwmon/adt7411 | |||
@@ -0,0 +1,42 @@ | |||
1 | Kernel driver adt7411 | ||
2 | ===================== | ||
3 | |||
4 | Supported chips: | ||
5 | * Analog Devices ADT7411 | ||
6 | Prefix: 'adt7411' | ||
7 | Addresses scanned: 0x48, 0x4a, 0x4b | ||
8 | Datasheet: Publicly available at the Analog Devices website | ||
9 | |||
10 | Author: Wolfram Sang (based on adt7470 by Darrick J. Wong) | ||
11 | |||
12 | Description | ||
13 | ----------- | ||
14 | |||
15 | This driver implements support for the Analog Devices ADT7411 chip. There may | ||
16 | be other chips that implement this interface. | ||
17 | |||
18 | The ADT7411 can use an I2C/SMBus compatible 2-wire interface or an | ||
19 | SPI-compatible 4-wire interface. It provides a 10-bit analog to digital | ||
20 | converter which measures 1 temperature, vdd and 8 input voltages. It has an | ||
21 | internal temperature sensor, but an external one can also be connected (one | ||
22 | loses 2 inputs then). There are high- and low-limit registers for all inputs. | ||
23 | |||
24 | Check the datasheet for details. | ||
25 | |||
26 | sysfs-Interface | ||
27 | --------------- | ||
28 | |||
29 | in0_input - vdd voltage input | ||
30 | in[1-8]_input - analog 1-8 input | ||
31 | temp1_input - temperature input | ||
32 | |||
33 | Besides standard interfaces, this driver adds (0 = off, 1 = on): | ||
34 | |||
35 | adc_ref_vdd - Use vdd as reference instead of 2.25 V | ||
36 | fast_sampling - Sample at 22.5 kHz instead of 1.4 kHz, but drop filters | ||
37 | no_average - Turn off averaging over 16 samples | ||
38 | |||
39 | Notes | ||
40 | ----- | ||
41 | |||
42 | SPI, external temperature sensor and limit registers are not supported yet. | ||
diff --git a/Documentation/hwmon/adt7473 b/Documentation/hwmon/adt7473 deleted file mode 100644 index 446612bd1fb9..000000000000 --- a/Documentation/hwmon/adt7473 +++ /dev/null | |||
@@ -1,74 +0,0 @@ | |||
1 | Kernel driver adt7473 | ||
2 | ====================== | ||
3 | |||
4 | Supported chips: | ||
5 | * Analog Devices ADT7473 | ||
6 | Prefix: 'adt7473' | ||
7 | Addresses scanned: I2C 0x2C, 0x2D, 0x2E | ||
8 | Datasheet: Publicly available at the Analog Devices website | ||
9 | |||
10 | Author: Darrick J. Wong | ||
11 | |||
12 | This driver is depreacted, please use the adt7475 driver instead. | ||
13 | |||
14 | Description | ||
15 | ----------- | ||
16 | |||
17 | This driver implements support for the Analog Devices ADT7473 chip family. | ||
18 | |||
19 | The ADT7473 uses the 2-wire interface compatible with the SMBUS 2.0 | ||
20 | specification. Using an analog to digital converter it measures three (3) | ||
21 | temperatures and two (2) voltages. It has four (4) 16-bit counters for | ||
22 | measuring fan speed. There are three (3) PWM outputs that can be used | ||
23 | to control fan speed. | ||
24 | |||
25 | A sophisticated control system for the PWM outputs is designed into the | ||
26 | ADT7473 that allows fan speed to be adjusted automatically based on any of the | ||
27 | three temperature sensors. Each PWM output is individually adjustable and | ||
28 | programmable. Once configured, the ADT7473 will adjust the PWM outputs in | ||
29 | response to the measured temperatures without further host intervention. | ||
30 | This feature can also be disabled for manual control of the PWM's. | ||
31 | |||
32 | Each of the measured inputs (voltage, temperature, fan speed) has | ||
33 | corresponding high/low limit values. The ADT7473 will signal an ALARM if | ||
34 | any measured value exceeds either limit. | ||
35 | |||
36 | The ADT7473 samples all inputs continuously. The driver will not read | ||
37 | the registers more often than once every other second. Further, | ||
38 | configuration data is only read once per minute. | ||
39 | |||
40 | Special Features | ||
41 | ---------------- | ||
42 | |||
43 | The ADT7473 have a 10-bit ADC and can therefore measure temperatures | ||
44 | with 0.25 degC resolution. Temperature readings can be configured either | ||
45 | for twos complement format or "Offset 64" format, wherein 63 is subtracted | ||
46 | from the raw value to get the temperature value. | ||
47 | |||
48 | The Analog Devices datasheet is very detailed and describes a procedure for | ||
49 | determining an optimal configuration for the automatic PWM control. | ||
50 | |||
51 | Configuration Notes | ||
52 | ------------------- | ||
53 | |||
54 | Besides standard interfaces driver adds the following: | ||
55 | |||
56 | * PWM Control | ||
57 | |||
58 | * pwm#_auto_point1_pwm and temp#_auto_point1_temp and | ||
59 | * pwm#_auto_point2_pwm and temp#_auto_point2_temp - | ||
60 | |||
61 | point1: Set the pwm speed at a lower temperature bound. | ||
62 | point2: Set the pwm speed at a higher temperature bound. | ||
63 | |||
64 | The ADT7473 will scale the pwm between the lower and higher pwm speed when | ||
65 | the temperature is between the two temperature boundaries. PWM values range | ||
66 | from 0 (off) to 255 (full speed). Fan speed will be set to maximum when the | ||
67 | temperature sensor associated with the PWM control exceeds temp#_max. | ||
68 | |||
69 | Notes | ||
70 | ----- | ||
71 | |||
72 | The NVIDIA binary driver presents an ADT7473 chip via an on-card i2c bus. | ||
73 | Unfortunately, they fail to set the i2c adapter class, so this driver may | ||
74 | fail to find the chip until the nvidia driver is patched. | ||
diff --git a/Documentation/hwmon/asc7621 b/Documentation/hwmon/asc7621 new file mode 100644 index 000000000000..7287be7e1f21 --- /dev/null +++ b/Documentation/hwmon/asc7621 | |||
@@ -0,0 +1,296 @@ | |||
1 | Kernel driver asc7621 | ||
2 | ================== | ||
3 | |||
4 | Supported chips: | ||
5 | Andigilog aSC7621 and aSC7621a | ||
6 | Prefix: 'asc7621' | ||
7 | Addresses scanned: I2C 0x2c, 0x2d, 0x2e | ||
8 | Datasheet: http://www.fairview5.com/linux/asc7621/asc7621.pdf | ||
9 | |||
10 | Author: | ||
11 | George Joseph | ||
12 | |||
13 | Description provided by Dave Pivin @ Andigilog: | ||
14 | |||
15 | Andigilog has both the PECI and pre-PECI versions of the Heceta-6, as | ||
16 | Intel calls them. Heceta-6e has high frequency PWM and Heceta-6p has | ||
17 | added PECI and a 4th thermal zone. The Andigilog aSC7611 is the | ||
18 | Heceta-6e part and aSC7621 is the Heceta-6p part. They are both in | ||
19 | volume production, shipping to Intel and their subs. | ||
20 | |||
21 | We have enhanced both parts relative to the governing Intel | ||
22 | specification. First enhancement is temperature reading resolution. We | ||
23 | have used registers below 20h for vendor-specific functions in addition | ||
24 | to those in the Intel-specified vendor range. | ||
25 | |||
26 | Our conversion process produces a result that is reported as two bytes. | ||
27 | The fan speed control uses this finer value to produce a "step-less" fan | ||
28 | PWM output. These two bytes are "read-locked" to guarantee that once a | ||
29 | high or low byte is read, the other byte is locked-in until after the | ||
30 | next read of any register. So to get an atomic reading, read high or low | ||
31 | byte, then the very next read should be the opposite byte. Our data | ||
32 | sheet says 10-bits of resolution, although you may find the lower bits | ||
33 | are active, they are not necessarily reliable or useful externally. We | ||
34 | chose not to mask them. | ||
35 | |||
36 | We employ significant filtering that is user tunable as described in the | ||
37 | data sheet. Our temperature reports and fan PWM outputs are very smooth | ||
38 | when compared to the competition, in addition to the higher resolution | ||
39 | temperature reports. The smoother PWM output does not require user | ||
40 | intervention. | ||
41 | |||
42 | We offer GPIO features on the former VID pins. These are open-drain | ||
43 | outputs or inputs and may be used as general purpose I/O or as alarm | ||
44 | outputs that are based on temperature limits. These are in 19h and 1Ah. | ||
45 | |||
46 | We offer flexible mapping of temperature readings to thermal zones. Any | ||
47 | temperature may be mapped to any zone, which has a default assignment | ||
48 | that follows Intel's specs. | ||
49 | |||
50 | Since there is a fan to zone assignment that allows for the "hotter" of | ||
51 | a set of zones to control the PWM of an individual fan, but there is no | ||
52 | indication to the user, we have added an indicator that shows which zone | ||
53 | is currently controlling the PWM for a given fan. This is in register | ||
54 | 00h. | ||
55 | |||
56 | Both remote diode temperature readings may be given an offset value such | ||
57 | that the reported reading as well as the temperature used to determine | ||
58 | PWM may be offset for system calibration purposes. | ||
59 | |||
60 | PECI Extended configuration allows for having more than two domains per | ||
61 | PECI address and also provides an enabling function for each PECI | ||
62 | address. One could use our flexible zone assignment to have a zone | ||
63 | assigned to up to 4 PECI addresses. This is not possible in the default | ||
64 | Intel configuration. This would be useful in multi-CPU systems with | ||
65 | individual fans on each that would benefit from individual fan control. | ||
66 | This is in register 0Eh. | ||
67 | |||
68 | The tachometer measurement system is flexible and able to adapt to many | ||
69 | fan types. We can also support pulse-stretched PWM so that 3-wire fans | ||
70 | may be used. These characteristics are in registers 04h to 07h. | ||
71 | |||
72 | Finally, we have added a tach disable function that turns off the tach | ||
73 | measurement system for individual tachs in order to save power. That is | ||
74 | in register 75h. | ||
75 | |||
76 | -- | ||
77 | aSC7621 Product Description | ||
78 | |||
79 | The aSC7621 has a two wire digital interface compatible with SMBus 2.0. | ||
80 | Using a 10-bit ADC, the aSC7621 measures the temperature of two remote diode | ||
81 | connected transistors as well as its own die. Support for Platform | ||
82 | Environmental Control Interface (PECI) is included. | ||
83 | |||
84 | Using temperature information from these four zones, an automatic fan speed | ||
85 | control algorithm is employed to minimize acoustic impact while achieving | ||
86 | recommended CPU temperature under varying operational loads. | ||
87 | |||
88 | To set fan speed, the aSC7621 has three independent pulse width modulation | ||
89 | (PWM) outputs that are controlled by one, or a combination of three, | ||
90 | temperature zones. Both high- and low-frequency PWM ranges are supported. | ||
91 | |||
92 | The aSC7621 also includes a digital filter that can be invoked to smooth | ||
93 | temperature readings for better control of fan speed and minimum acoustic | ||
94 | impact. | ||
95 | |||
96 | The aSC7621 has tachometer inputs to measure fan speed on up to four fans. | ||
97 | Limit and status registers for all measured values are included to alert | ||
98 | the system host that any measurements are outside of programmed limits | ||
99 | via status registers. | ||
100 | |||
101 | System voltages of VCCP, 2.5V, 3.3V, 5.0V, and 12V motherboard power are | ||
102 | monitored efficiently with internal scaling resistors. | ||
103 | |||
104 | Features | ||
105 | - Supports PECI interface and monitors internal and remote thermal diodes | ||
106 | - 2-wire, SMBus 2.0 compliant, serial interface | ||
107 | - 10-bit ADC | ||
108 | - Monitors VCCP, 2.5V, 3.3V, 5.0V, and 12V motherboard/processor supplies | ||
109 | - Programmable autonomous fan control based on temperature readings | ||
110 | - Noise filtering of temperature reading for fan speed control | ||
111 | - 0.25C digital temperature sensor resolution | ||
112 | - 3 PWM fan speed control outputs for 2-, 3- or 4-wire fans and up to 4 fan | ||
113 | tachometer inputs | ||
114 | - Enhanced measured temperature to Temperature Zone assignment. | ||
115 | - Provides high and low PWM frequency ranges | ||
116 | - 3 GPIO pins for custom use | ||
117 | - 24-Lead QSOP package | ||
118 | |||
119 | Configuration Notes | ||
120 | =================== | ||
121 | |||
122 | Except where noted below, the sysfs entries created by this driver follow | ||
123 | the standards defined in "sysfs-interface". | ||
124 | |||
125 | temp1_source | ||
126 | 0 (default) peci_legacy = 0, Remote 1 Temperature | ||
127 | peci_legacy = 1, PECI Processor Temperature 0 | ||
128 | 1 Remote 1 Temperature | ||
129 | 2 Remote 2 Temperature | ||
130 | 3 Internal Temperature | ||
131 | 4 PECI Processor Temperature 0 | ||
132 | 5 PECI Processor Temperature 1 | ||
133 | 6 PECI Processor Temperature 2 | ||
134 | 7 PECI Processor Temperature 3 | ||
135 | |||
136 | temp2_source | ||
137 | 0 (default) Internal Temperature | ||
138 | 1 Remote 1 Temperature | ||
139 | 2 Remote 2 Temperature | ||
140 | 3 Internal Temperature | ||
141 | 4 PECI Processor Temperature 0 | ||
142 | 5 PECI Processor Temperature 1 | ||
143 | 6 PECI Processor Temperature 2 | ||
144 | 7 PECI Processor Temperature 3 | ||
145 | |||
146 | temp3_source | ||
147 | 0 (default) Remote 2 Temperature | ||
148 | 1 Remote 1 Temperature | ||
149 | 2 Remote 2 Temperature | ||
150 | 3 Internal Temperature | ||
151 | 4 PECI Processor Temperature 0 | ||
152 | 5 PECI Processor Temperature 1 | ||
153 | 6 PECI Processor Temperature 2 | ||
154 | 7 PECI Processor Temperature 3 | ||
155 | |||
156 | temp4_source | ||
157 | 0 (default) peci_legacy = 0, PECI Processor Temperature 0 | ||
158 | peci_legacy = 1, Remote 1 Temperature | ||
159 | 1 Remote 1 Temperature | ||
160 | 2 Remote 2 Temperature | ||
161 | 3 Internal Temperature | ||
162 | 4 PECI Processor Temperature 0 | ||
163 | 5 PECI Processor Temperature 1 | ||
164 | 6 PECI Processor Temperature 2 | ||
165 | 7 PECI Processor Temperature 3 | ||
166 | |||
167 | temp[1-4]_smoothing_enable | ||
168 | temp[1-4]_smoothing_time | ||
169 | Smooths spikes in temp readings caused by noise. | ||
170 | Valid values in milliseconds are: | ||
171 | 35000 | ||
172 | 17600 | ||
173 | 11800 | ||
174 | 7000 | ||
175 | 4400 | ||
176 | 3000 | ||
177 | 1600 | ||
178 | 800 | ||
179 | |||
180 | temp[1-4]_crit | ||
181 | When the corresponding zone temperature reaches this value, | ||
182 | ALL pwm outputs will got to 100%. | ||
183 | |||
184 | temp[5-8]_input | ||
185 | temp[5-8]_enable | ||
186 | The aSC7621 can also read temperatures provided by the processor | ||
187 | via the PECI bus. Usually these are "core" temps and are relative | ||
188 | to the point where the automatic thermal control circuit starts | ||
189 | throttling. This means that these are usually negative numbers. | ||
190 | |||
191 | pwm[1-3]_enable | ||
192 | 0 Fan off. | ||
193 | 1 Fan on manual control. | ||
194 | 2 Fan on automatic control and will run at the minimum pwm | ||
195 | if the temperature for the zone is below the minimum. | ||
196 | 3 Fan on automatic control but will be off if the temperature | ||
197 | for the zone is below the minimum. | ||
198 | 4-254 Ignored. | ||
199 | 255 Fan on full. | ||
200 | |||
201 | pwm[1-3]_auto_channels | ||
202 | Bitmap as described in sysctl-interface with the following | ||
203 | exceptions... | ||
204 | Only the following combination of zones (and their corresponding masks) | ||
205 | are valid: | ||
206 | 1 | ||
207 | 2 | ||
208 | 3 | ||
209 | 2,3 | ||
210 | 1,2,3 | ||
211 | 4 | ||
212 | 1,2,3,4 | ||
213 | |||
214 | Special values: | ||
215 | 0 Disabled. | ||
216 | 16 Fan on manual control. | ||
217 | 31 Fan on full. | ||
218 | |||
219 | |||
220 | pwm[1-3]_invert | ||
221 | When set, inverts the meaning of pwm[1-3]. | ||
222 | i.e. when pwm = 0, the fan will be on full and | ||
223 | when pwm = 255 the fan will be off. | ||
224 | |||
225 | pwm[1-3]_freq | ||
226 | PWM frequency in Hz | ||
227 | Valid values in Hz are: | ||
228 | |||
229 | 10 | ||
230 | 15 | ||
231 | 23 | ||
232 | 30 (default) | ||
233 | 38 | ||
234 | 47 | ||
235 | 62 | ||
236 | 94 | ||
237 | 23000 | ||
238 | 24000 | ||
239 | 25000 | ||
240 | 26000 | ||
241 | 27000 | ||
242 | 28000 | ||
243 | 29000 | ||
244 | 30000 | ||
245 | |||
246 | Setting any other value will be ignored. | ||
247 | |||
248 | peci_enable | ||
249 | Enables or disables PECI | ||
250 | |||
251 | peci_avg | ||
252 | Input filter average time. | ||
253 | |||
254 | 0 0 Sec. (no Smoothing) (default) | ||
255 | 1 0.25 Sec. | ||
256 | 2 0.5 Sec. | ||
257 | 3 1.0 Sec. | ||
258 | 4 2.0 Sec. | ||
259 | 5 4.0 Sec. | ||
260 | 6 8.0 Sec. | ||
261 | 7 0.0 Sec. | ||
262 | |||
263 | peci_legacy | ||
264 | |||
265 | 0 Standard Mode (default) | ||
266 | Remote Diode 1 reading is associated with | ||
267 | Temperature Zone 1, PECI is associated with | ||
268 | Zone 4 | ||
269 | |||
270 | 1 Legacy Mode | ||
271 | PECI is associated with Temperature Zone 1, | ||
272 | Remote Diode 1 is associated with Zone 4 | ||
273 | |||
274 | peci_diode | ||
275 | Diode filter | ||
276 | |||
277 | 0 0.25 Sec. | ||
278 | 1 1.1 Sec. | ||
279 | 2 2.4 Sec. (default) | ||
280 | 3 3.4 Sec. | ||
281 | 4 5.0 Sec. | ||
282 | 5 6.8 Sec. | ||
283 | 6 10.2 Sec. | ||
284 | 7 16.4 Sec. | ||
285 | |||
286 | peci_4domain | ||
287 | Four domain enable | ||
288 | |||
289 | 0 1 or 2 Domains for enabled processors (default) | ||
290 | 1 3 or 4 Domains for enabled processors | ||
291 | |||
292 | peci_domain | ||
293 | Domain | ||
294 | |||
295 | 0 Processor contains a single domain (0) (default) | ||
296 | 1 Processor contains two domains (0,1) | ||
diff --git a/Documentation/hwmon/it87 b/Documentation/hwmon/it87 index f9ba96c0ac4a..8d08bf0d38ed 100644 --- a/Documentation/hwmon/it87 +++ b/Documentation/hwmon/it87 | |||
@@ -5,31 +5,23 @@ Supported chips: | |||
5 | * IT8705F | 5 | * IT8705F |
6 | Prefix: 'it87' | 6 | Prefix: 'it87' |
7 | Addresses scanned: from Super I/O config space (8 I/O ports) | 7 | Addresses scanned: from Super I/O config space (8 I/O ports) |
8 | Datasheet: Publicly available at the ITE website | 8 | Datasheet: Once publicly available at the ITE website, but no longer |
9 | http://www.ite.com.tw/product_info/file/pc/IT8705F_V.0.4.1.pdf | ||
10 | * IT8712F | 9 | * IT8712F |
11 | Prefix: 'it8712' | 10 | Prefix: 'it8712' |
12 | Addresses scanned: from Super I/O config space (8 I/O ports) | 11 | Addresses scanned: from Super I/O config space (8 I/O ports) |
13 | Datasheet: Publicly available at the ITE website | 12 | Datasheet: Once publicly available at the ITE website, but no longer |
14 | http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.1.pdf | ||
15 | http://www.ite.com.tw/product_info/file/pc/Errata%20V0.1%20for%20IT8712F%20V0.9.1.pdf | ||
16 | http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.3.pdf | ||
17 | * IT8716F/IT8726F | 13 | * IT8716F/IT8726F |
18 | Prefix: 'it8716' | 14 | Prefix: 'it8716' |
19 | Addresses scanned: from Super I/O config space (8 I/O ports) | 15 | Addresses scanned: from Super I/O config space (8 I/O ports) |
20 | Datasheet: Publicly available at the ITE website | 16 | Datasheet: Once publicly available at the ITE website, but no longer |
21 | http://www.ite.com.tw/product_info/file/pc/IT8716F_V0.3.ZIP | ||
22 | http://www.ite.com.tw/product_info/file/pc/IT8726F_V0.3.pdf | ||
23 | * IT8718F | 17 | * IT8718F |
24 | Prefix: 'it8718' | 18 | Prefix: 'it8718' |
25 | Addresses scanned: from Super I/O config space (8 I/O ports) | 19 | Addresses scanned: from Super I/O config space (8 I/O ports) |
26 | Datasheet: Publicly available at the ITE website | 20 | Datasheet: Once publicly available at the ITE website, but no longer |
27 | http://www.ite.com.tw/product_info/file/pc/IT8718F_V0.2.zip | ||
28 | http://www.ite.com.tw/product_info/file/pc/IT8718F_V0%203_(for%20C%20version).zip | ||
29 | * IT8720F | 21 | * IT8720F |
30 | Prefix: 'it8720' | 22 | Prefix: 'it8720' |
31 | Addresses scanned: from Super I/O config space (8 I/O ports) | 23 | Addresses scanned: from Super I/O config space (8 I/O ports) |
32 | Datasheet: Not yet publicly available. | 24 | Datasheet: Not publicly available |
33 | * SiS950 [clone of IT8705F] | 25 | * SiS950 [clone of IT8705F] |
34 | Prefix: 'it87' | 26 | Prefix: 'it87' |
35 | Addresses scanned: from Super I/O config space (8 I/O ports) | 27 | Addresses scanned: from Super I/O config space (8 I/O ports) |
@@ -136,6 +128,10 @@ registers are read whenever any data is read (unless it is less than 1.5 | |||
136 | seconds since the last update). This means that you can easily miss | 128 | seconds since the last update). This means that you can easily miss |
137 | once-only alarms. | 129 | once-only alarms. |
138 | 130 | ||
131 | Out-of-limit readings can also result in beeping, if the chip is properly | ||
132 | wired and configured. Beeping can be enabled or disabled per sensor type | ||
133 | (temperatures, voltages and fans.) | ||
134 | |||
139 | The IT87xx only updates its values each 1.5 seconds; reading it more often | 135 | The IT87xx only updates its values each 1.5 seconds; reading it more often |
140 | will do no harm, but will return 'old' values. | 136 | will do no harm, but will return 'old' values. |
141 | 137 | ||
@@ -150,11 +146,38 @@ Fan speed control | |||
150 | ----------------- | 146 | ----------------- |
151 | 147 | ||
152 | The fan speed control features are limited to manual PWM mode. Automatic | 148 | The fan speed control features are limited to manual PWM mode. Automatic |
153 | "Smart Guardian" mode control handling is not implemented. However | 149 | "Smart Guardian" mode control handling is only implemented for older chips |
154 | if you want to go for "manual mode" just write 1 to pwmN_enable. | 150 | (see below.) However if you want to go for "manual mode" just write 1 to |
151 | pwmN_enable. | ||
155 | 152 | ||
156 | If you are only able to control the fan speed with very small PWM values, | 153 | If you are only able to control the fan speed with very small PWM values, |
157 | try lowering the PWM base frequency (pwm1_freq). Depending on the fan, | 154 | try lowering the PWM base frequency (pwm1_freq). Depending on the fan, |
158 | it may give you a somewhat greater control range. The same frequency is | 155 | it may give you a somewhat greater control range. The same frequency is |
159 | used to drive all fan outputs, which is why pwm2_freq and pwm3_freq are | 156 | used to drive all fan outputs, which is why pwm2_freq and pwm3_freq are |
160 | read-only. | 157 | read-only. |
158 | |||
159 | |||
160 | Automatic fan speed control (old interface) | ||
161 | ------------------------------------------- | ||
162 | |||
163 | The driver supports the old interface to automatic fan speed control | ||
164 | which is implemented by IT8705F chips up to revision F and IT8712F | ||
165 | chips up to revision G. | ||
166 | |||
167 | This interface implements 4 temperature vs. PWM output trip points. | ||
168 | The PWM output of trip point 4 is always the maximum value (fan running | ||
169 | at full speed) while the PWM output of the other 3 trip points can be | ||
170 | freely chosen. The temperature of all 4 trip points can be freely chosen. | ||
171 | Additionally, trip point 1 has an hysteresis temperature attached, to | ||
172 | prevent fast switching between fan on and off. | ||
173 | |||
174 | The chip automatically computes the PWM output value based on the input | ||
175 | temperature, based on this simple rule: if the temperature value is | ||
176 | between trip point N and trip point N+1 then the PWM output value is | ||
177 | the one of trip point N. The automatic control mode is less flexible | ||
178 | than the manual control mode, but it reacts faster, is more robust and | ||
179 | doesn't use CPU cycles. | ||
180 | |||
181 | Trip points must be set properly before switching to automatic fan speed | ||
182 | control mode. The driver will perform basic integrity checks before | ||
183 | actually switching to automatic control mode. | ||
diff --git a/Documentation/hwmon/lm85 b/Documentation/hwmon/lm85 index a13680871bc7..a76aefeeb68a 100644 --- a/Documentation/hwmon/lm85 +++ b/Documentation/hwmon/lm85 | |||
@@ -157,7 +157,7 @@ temperature configuration points: | |||
157 | 157 | ||
158 | There are three PWM outputs. The LM85 datasheet suggests that the | 158 | There are three PWM outputs. The LM85 datasheet suggests that the |
159 | pwm3 output control both fan3 and fan4. Each PWM can be individually | 159 | pwm3 output control both fan3 and fan4. Each PWM can be individually |
160 | configured and assigned to a zone for it's control value. Each PWM can be | 160 | configured and assigned to a zone for its control value. Each PWM can be |
161 | configured individually according to the following options. | 161 | configured individually according to the following options. |
162 | 162 | ||
163 | * pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off | 163 | * pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off |
diff --git a/Documentation/hwmon/lm90 b/Documentation/hwmon/lm90 index 93d8e3d55150..6a03dd4bcc94 100644 --- a/Documentation/hwmon/lm90 +++ b/Documentation/hwmon/lm90 | |||
@@ -84,6 +84,10 @@ Supported chips: | |||
84 | Addresses scanned: I2C 0x4c | 84 | Addresses scanned: I2C 0x4c |
85 | Datasheet: Publicly available at the Maxim website | 85 | Datasheet: Publicly available at the Maxim website |
86 | http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3500 | 86 | http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3500 |
87 | * Winbond/Nuvoton W83L771AWG/ASG | ||
88 | Prefix: 'w83l771' | ||
89 | Addresses scanned: I2C 0x4c | ||
90 | Datasheet: Not publicly available, can be requested from Nuvoton | ||
87 | 91 | ||
88 | 92 | ||
89 | Author: Jean Delvare <khali@linux-fr.org> | 93 | Author: Jean Delvare <khali@linux-fr.org> |
@@ -147,6 +151,12 @@ MAX6680 and MAX6681: | |||
147 | * Selectable address | 151 | * Selectable address |
148 | * Remote sensor type selection | 152 | * Remote sensor type selection |
149 | 153 | ||
154 | W83L771AWG/ASG | ||
155 | * The AWG and ASG variants only differ in package format. | ||
156 | * Filter and alert configuration register at 0xBF | ||
157 | * Diode ideality factor configuration (remote sensor) at 0xE3 | ||
158 | * Moving average (depending on conversion rate) | ||
159 | |||
150 | All temperature values are given in degrees Celsius. Resolution | 160 | All temperature values are given in degrees Celsius. Resolution |
151 | is 1.0 degree for the local temperature, 0.125 degree for the remote | 161 | is 1.0 degree for the local temperature, 0.125 degree for the remote |
152 | temperature, except for the MAX6657, MAX6658 and MAX6659 which have a | 162 | temperature, except for the MAX6657, MAX6658 and MAX6659 which have a |
@@ -163,6 +173,18 @@ The lm90 driver will not update its values more frequently than every | |||
163 | other second; reading them more often will do no harm, but will return | 173 | other second; reading them more often will do no harm, but will return |
164 | 'old' values. | 174 | 'old' values. |
165 | 175 | ||
176 | SMBus Alert Support | ||
177 | ------------------- | ||
178 | |||
179 | This driver has basic support for SMBus alert. When an alert is received, | ||
180 | the status register is read and the faulty temperature channel is logged. | ||
181 | |||
182 | The Analog Devices chips (ADM1032 and ADT7461) do not implement the SMBus | ||
183 | alert protocol properly so additional care is needed: the ALERT output is | ||
184 | disabled when an alert is received, and is re-enabled only when the alarm | ||
185 | is gone. Otherwise the chip would block alerts from other chips in the bus | ||
186 | as long as the alarm is active. | ||
187 | |||
166 | PEC Support | 188 | PEC Support |
167 | ----------- | 189 | ----------- |
168 | 190 | ||
diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801 index 81c0c59a60ea..e307914a3eda 100644 --- a/Documentation/i2c/busses/i2c-i801 +++ b/Documentation/i2c/busses/i2c-i801 | |||
@@ -15,7 +15,8 @@ Supported adapters: | |||
15 | * Intel 82801I (ICH9) | 15 | * Intel 82801I (ICH9) |
16 | * Intel EP80579 (Tolapai) | 16 | * Intel EP80579 (Tolapai) |
17 | * Intel 82801JI (ICH10) | 17 | * Intel 82801JI (ICH10) |
18 | * Intel PCH | 18 | * Intel 3400/5 Series (PCH) |
19 | * Intel Cougar Point (PCH) | ||
19 | Datasheets: Publicly available at the Intel website | 20 | Datasheets: Publicly available at the Intel website |
20 | 21 | ||
21 | Authors: | 22 | Authors: |
@@ -26,7 +27,13 @@ Authors: | |||
26 | Module Parameters | 27 | Module Parameters |
27 | ----------------- | 28 | ----------------- |
28 | 29 | ||
29 | None. | 30 | * disable_features (bit vector) |
31 | Disable selected features normally supported by the device. This makes it | ||
32 | possible to work around possible driver or hardware bugs if the feature in | ||
33 | question doesn't work as intended for whatever reason. Bit values: | ||
34 | 1 disable SMBus PEC | ||
35 | 2 disable the block buffer | ||
36 | 8 disable the I2C block read functionality | ||
30 | 37 | ||
31 | 38 | ||
32 | Description | 39 | Description |
diff --git a/Documentation/i2c/busses/i2c-parport b/Documentation/i2c/busses/i2c-parport index dceaba1ad930..2461c7b53b2c 100644 --- a/Documentation/i2c/busses/i2c-parport +++ b/Documentation/i2c/busses/i2c-parport | |||
@@ -29,6 +29,9 @@ can be easily added when needed. | |||
29 | Earlier kernels defaulted to type=0 (Philips). But now, if the type | 29 | Earlier kernels defaulted to type=0 (Philips). But now, if the type |
30 | parameter is missing, the driver will simply fail to initialize. | 30 | parameter is missing, the driver will simply fail to initialize. |
31 | 31 | ||
32 | SMBus alert support is available on adapters which have this line properly | ||
33 | connected to the parallel port's interrupt pin. | ||
34 | |||
32 | 35 | ||
33 | Building your own adapter | 36 | Building your own adapter |
34 | ------------------------- | 37 | ------------------------- |
diff --git a/Documentation/i2c/busses/i2c-parport-light b/Documentation/i2c/busses/i2c-parport-light index 287436478520..bdc9cbb2e0f2 100644 --- a/Documentation/i2c/busses/i2c-parport-light +++ b/Documentation/i2c/busses/i2c-parport-light | |||
@@ -9,3 +9,14 @@ parport handling is not an option. The drawback is a reduced portability | |||
9 | and the impossibility to daisy-chain other parallel port devices. | 9 | and the impossibility to daisy-chain other parallel port devices. |
10 | 10 | ||
11 | Please see i2c-parport for documentation. | 11 | Please see i2c-parport for documentation. |
12 | |||
13 | Module parameters: | ||
14 | |||
15 | * type: type of adapter (see i2c-parport or modinfo) | ||
16 | |||
17 | * base: base I/O address | ||
18 | Default is 0x378 which is fairly common for parallel ports, at least on PC. | ||
19 | |||
20 | * irq: optional IRQ | ||
21 | This must be passed if you want SMBus alert support, assuming your adapter | ||
22 | actually supports this. | ||
diff --git a/Documentation/i2c/smbus-protocol b/Documentation/i2c/smbus-protocol index 9df47441f0e7..7c19d1a2bea0 100644 --- a/Documentation/i2c/smbus-protocol +++ b/Documentation/i2c/smbus-protocol | |||
@@ -185,6 +185,22 @@ the protocol. All ARP communications use slave address 0x61 and | |||
185 | require PEC checksums. | 185 | require PEC checksums. |
186 | 186 | ||
187 | 187 | ||
188 | SMBus Alert | ||
189 | =========== | ||
190 | |||
191 | SMBus Alert was introduced in Revision 1.0 of the specification. | ||
192 | |||
193 | The SMBus alert protocol allows several SMBus slave devices to share a | ||
194 | single interrupt pin on the SMBus master, while still allowing the master | ||
195 | to know which slave triggered the interrupt. | ||
196 | |||
197 | This is implemented the following way in the Linux kernel: | ||
198 | * I2C bus drivers which support SMBus alert should call | ||
199 | i2c_setup_smbus_alert() to setup SMBus alert support. | ||
200 | * I2C drivers for devices which can trigger SMBus alerts should implement | ||
201 | the optional alert() callback. | ||
202 | |||
203 | |||
188 | I2C Block Transactions | 204 | I2C Block Transactions |
189 | ====================== | 205 | ====================== |
190 | 206 | ||
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index 0a74603eb671..5ebf5af1d716 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients | |||
@@ -74,6 +74,11 @@ structure at all. You should use this to keep device-specific data. | |||
74 | /* retrieve the value */ | 74 | /* retrieve the value */ |
75 | void *i2c_get_clientdata(const struct i2c_client *client); | 75 | void *i2c_get_clientdata(const struct i2c_client *client); |
76 | 76 | ||
77 | Note that starting with kernel 2.6.34, you don't have to set the `data' field | ||
78 | to NULL in remove() or if probe() failed anymore. The i2c-core does this | ||
79 | automatically on these occasions. Those are also the only times the core will | ||
80 | touch this field. | ||
81 | |||
77 | 82 | ||
78 | Accessing the client | 83 | Accessing the client |
79 | ==================== | 84 | ==================== |
@@ -318,8 +323,9 @@ Plain I2C communication | |||
318 | These routines read and write some bytes from/to a client. The client | 323 | These routines read and write some bytes from/to a client. The client |
319 | contains the i2c address, so you do not have to include it. The second | 324 | contains the i2c address, so you do not have to include it. The second |
320 | parameter contains the bytes to read/write, the third the number of bytes | 325 | parameter contains the bytes to read/write, the third the number of bytes |
321 | to read/write (must be less than the length of the buffer.) Returned is | 326 | to read/write (must be less than the length of the buffer, also should be |
322 | the actual number of bytes read/written. | 327 | less than 64k since msg.len is u16.) Returned is the actual number of bytes |
328 | read/written. | ||
323 | 329 | ||
324 | int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msg, | 330 | int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msg, |
325 | int num); | 331 | int num); |
diff --git a/Documentation/init.txt b/Documentation/init.txt new file mode 100644 index 000000000000..535ad5e82b98 --- /dev/null +++ b/Documentation/init.txt | |||
@@ -0,0 +1,49 @@ | |||
1 | Explaining the dreaded "No init found." boot hang message | ||
2 | ========================================================= | ||
3 | |||
4 | OK, so you've got this pretty unintuitive message (currently located | ||
5 | in init/main.c) and are wondering what the H*** went wrong. | ||
6 | Some high-level reasons for failure (listed roughly in order of execution) | ||
7 | to load the init binary are: | ||
8 | A) Unable to mount root FS | ||
9 | B) init binary doesn't exist on rootfs | ||
10 | C) broken console device | ||
11 | D) binary exists but dependencies not available | ||
12 | E) binary cannot be loaded | ||
13 | |||
14 | Detailed explanations: | ||
15 | 0) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE) | ||
16 | to get more detailed kernel messages. | ||
17 | A) make sure you have the correct root FS type | ||
18 | (and root= kernel parameter points to the correct partition), | ||
19 | required drivers such as storage hardware (such as SCSI or USB!) | ||
20 | and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules, | ||
21 | to be pre-loaded by an initrd) | ||
22 | C) Possibly a conflict in console= setup --> initial console unavailable. | ||
23 | E.g. some serial consoles are unreliable due to serial IRQ issues (e.g. | ||
24 | missing interrupt-based configuration). | ||
25 | Try using a different console= device or e.g. netconsole= . | ||
26 | D) e.g. required library dependencies of the init binary such as | ||
27 | /lib/ld-linux.so.2 missing or broken. Use readelf -d <INIT>|grep NEEDED | ||
28 | to find out which libraries are required. | ||
29 | E) make sure the binary's architecture matches your hardware. | ||
30 | E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware. | ||
31 | In case you tried loading a non-binary file here (shell script?), | ||
32 | you should make sure that the script specifies an interpreter in its shebang | ||
33 | header line (#!/...) that is fully working (including its library | ||
34 | dependencies). And before tackling scripts, better first test a simple | ||
35 | non-script binary such as /bin/sh and confirm its successful execution. | ||
36 | To find out more, add code to init/main.c to display kernel_execve()s | ||
37 | return values. | ||
38 | |||
39 | Please extend this explanation whenever you find new failure causes | ||
40 | (after all loading the init binary is a CRITICAL and hard transition step | ||
41 | which needs to be made as painless as possible), then submit patch to LKML. | ||
42 | Further TODOs: | ||
43 | - Implement the various run_init_process() invocations via a struct array | ||
44 | which can then store the kernel_execve() result value and on failure | ||
45 | log it all by iterating over _all_ results (very important usability fix). | ||
46 | - try to make the implementation itself more helpful in general, | ||
47 | e.g. by providing additional error messages at affected places. | ||
48 | |||
49 | Andreas Mohr <andi at lisas period de> | ||
diff --git a/Documentation/input/elantech.txt b/Documentation/input/elantech.txt index a10c3b6ba7c4..56941ae1f5db 100644 --- a/Documentation/input/elantech.txt +++ b/Documentation/input/elantech.txt | |||
@@ -333,14 +333,14 @@ byte 0: | |||
333 | byte 1: | 333 | byte 1: |
334 | 334 | ||
335 | bit 7 6 5 4 3 2 1 0 | 335 | bit 7 6 5 4 3 2 1 0 |
336 | x15 x14 x13 x12 x11 x10 x9 x8 | 336 | . . . . . x10 x9 x8 |
337 | 337 | ||
338 | byte 2: | 338 | byte 2: |
339 | 339 | ||
340 | bit 7 6 5 4 3 2 1 0 | 340 | bit 7 6 5 4 3 2 1 0 |
341 | x7 x6 x5 x4 x4 x2 x1 x0 | 341 | x7 x6 x5 x4 x4 x2 x1 x0 |
342 | 342 | ||
343 | x15..x0 = absolute x value (horizontal) | 343 | x10..x0 = absolute x value (horizontal) |
344 | 344 | ||
345 | byte 3: | 345 | byte 3: |
346 | 346 | ||
@@ -350,14 +350,14 @@ byte 3: | |||
350 | byte 4: | 350 | byte 4: |
351 | 351 | ||
352 | bit 7 6 5 4 3 2 1 0 | 352 | bit 7 6 5 4 3 2 1 0 |
353 | y15 y14 y13 y12 y11 y10 y8 y8 | 353 | . . . . . . y9 y8 |
354 | 354 | ||
355 | byte 5: | 355 | byte 5: |
356 | 356 | ||
357 | bit 7 6 5 4 3 2 1 0 | 357 | bit 7 6 5 4 3 2 1 0 |
358 | y7 y6 y5 y4 y3 y2 y1 y0 | 358 | y7 y6 y5 y4 y3 y2 y1 y0 |
359 | 359 | ||
360 | y15..y0 = absolute y value (vertical) | 360 | y9..y0 = absolute y value (vertical) |
361 | 361 | ||
362 | 362 | ||
363 | 4.2.2 Two finger touch | 363 | 4.2.2 Two finger touch |
diff --git a/Documentation/input/joystick.txt b/Documentation/input/joystick.txt index 154d767b2acb..8007b7ca87bf 100644 --- a/Documentation/input/joystick.txt +++ b/Documentation/input/joystick.txt | |||
@@ -402,7 +402,7 @@ for the port of the SoundFusion is supported by the cs461x.c module. | |||
402 | ~~~~~~~~~~~~~~~~~~~~~~~~ | 402 | ~~~~~~~~~~~~~~~~~~~~~~~~ |
403 | The Live! has a special PCI gameport, which, although it doesn't provide | 403 | The Live! has a special PCI gameport, which, although it doesn't provide |
404 | any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than | 404 | any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than |
405 | it's ISA counterparts. It also requires special support, hence the | 405 | its ISA counterparts. It also requires special support, hence the |
406 | emu10k1-gp.c module for it instead of the normal ns558.c one. | 406 | emu10k1-gp.c module for it instead of the normal ns558.c one. |
407 | 407 | ||
408 | 3.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes | 408 | 3.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes |
diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt index 8490480ce432..c0fc1c75fd88 100644 --- a/Documentation/input/multi-touch-protocol.txt +++ b/Documentation/input/multi-touch-protocol.txt | |||
@@ -68,6 +68,22 @@ like: | |||
68 | SYN_MT_REPORT | 68 | SYN_MT_REPORT |
69 | SYN_REPORT | 69 | SYN_REPORT |
70 | 70 | ||
71 | Here is the sequence after lifting one of the fingers: | ||
72 | |||
73 | ABS_MT_POSITION_X | ||
74 | ABS_MT_POSITION_Y | ||
75 | SYN_MT_REPORT | ||
76 | SYN_REPORT | ||
77 | |||
78 | And here is the sequence after lifting the remaining finger: | ||
79 | |||
80 | SYN_MT_REPORT | ||
81 | SYN_REPORT | ||
82 | |||
83 | If the driver reports one of BTN_TOUCH or ABS_PRESSURE in addition to the | ||
84 | ABS_MT events, the last SYN_MT_REPORT event may be omitted. Otherwise, the | ||
85 | last SYN_REPORT will be dropped by the input core, resulting in no | ||
86 | zero-finger event reaching userland. | ||
71 | 87 | ||
72 | Event Semantics | 88 | Event Semantics |
73 | --------------- | 89 | --------------- |
@@ -217,11 +233,6 @@ where examples can be found. | |||
217 | difference between the contact position and the approaching tool position | 233 | difference between the contact position and the approaching tool position |
218 | could be used to derive tilt. | 234 | could be used to derive tilt. |
219 | [2] The list can of course be extended. | 235 | [2] The list can of course be extended. |
220 | [3] The multi-touch X driver is currently in the prototyping stage. At the | 236 | [3] Multitouch X driver project: http://bitmath.org/code/multitouch/. |
221 | time of writing (April 2009), the MT protocol is not yet merged, and the | ||
222 | prototype implements finger matching, basic mouse support and two-finger | ||
223 | scrolling. The project aims at improving the quality of current multi-touch | ||
224 | functionality available in the Synaptics X driver, and in addition | ||
225 | implement more advanced gestures. | ||
226 | [4] See the section on event computation. | 237 | [4] See the section on event computation. |
227 | [5] See the section on finger tracking. | 238 | [5] See the section on finger tracking. |
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt index 3a6aec40c0b0..8b4129de1d2d 100644 --- a/Documentation/input/rotary-encoder.txt +++ b/Documentation/input/rotary-encoder.txt | |||
@@ -75,7 +75,7 @@ and the number of steps or will clamp at the maximum and zero depending on | |||
75 | the configuration. | 75 | the configuration. |
76 | 76 | ||
77 | Because GPIO to IRQ mapping is platform specific, this information must | 77 | Because GPIO to IRQ mapping is platform specific, this information must |
78 | be given in seperately to the driver. See the example below. | 78 | be given in separately to the driver. See the example below. |
79 | 79 | ||
80 | ---------<snip>--------- | 80 | ---------<snip>--------- |
81 | 81 | ||
diff --git a/Documentation/input/sentelic.txt b/Documentation/input/sentelic.txt index f7160a2fb6a2..b35affd5c649 100644 --- a/Documentation/input/sentelic.txt +++ b/Documentation/input/sentelic.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | Copyright (C) 2002-2008 Sentelic Corporation. | 1 | Copyright (C) 2002-2010 Sentelic Corporation. |
2 | Last update: Oct-31-2008 | 2 | Last update: Jan-13-2010 |
3 | 3 | ||
4 | ============================================================================== | 4 | ============================================================================== |
5 | * Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons) | 5 | * Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons) |
@@ -44,7 +44,7 @@ B) MSID 6: Horizontal and Vertical scrolling. | |||
44 | Packet 1 | 44 | Packet 1 |
45 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 | 45 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 |
46 | BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| | 46 | BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| |
47 | 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|l|r|u|d| | 47 | 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|r|l|u|d| |
48 | |---------------| |---------------| |---------------| |---------------| | 48 | |---------------| |---------------| |---------------| |---------------| |
49 | 49 | ||
50 | Byte 1: Bit7 => Y overflow | 50 | Byte 1: Bit7 => Y overflow |
@@ -59,15 +59,15 @@ Byte 2: X Movement(9-bit 2's complement integers) | |||
59 | Byte 3: Y Movement(9-bit 2's complement integers) | 59 | Byte 3: Y Movement(9-bit 2's complement integers) |
60 | Byte 4: Bit0 => the Vertical scrolling movement downward. | 60 | Byte 4: Bit0 => the Vertical scrolling movement downward. |
61 | Bit1 => the Vertical scrolling movement upward. | 61 | Bit1 => the Vertical scrolling movement upward. |
62 | Bit2 => the Vertical scrolling movement rightward. | 62 | Bit2 => the Horizontal scrolling movement leftward. |
63 | Bit3 => the Vertical scrolling movement leftward. | 63 | Bit3 => the Horizontal scrolling movement rightward. |
64 | Bit4 => 1 = 4th mouse button is pressed, Forward one page. | 64 | Bit4 => 1 = 4th mouse button is pressed, Forward one page. |
65 | 0 = 4th mouse button is not pressed. | 65 | 0 = 4th mouse button is not pressed. |
66 | Bit5 => 1 = 5th mouse button is pressed, Backward one page. | 66 | Bit5 => 1 = 5th mouse button is pressed, Backward one page. |
67 | 0 = 5th mouse button is not pressed. | 67 | 0 = 5th mouse button is not pressed. |
68 | 68 | ||
69 | C) MSID 7: | 69 | C) MSID 7: |
70 | # FSP uses 2 packets(8 Bytes) data to represent Absolute Position | 70 | # FSP uses 2 packets (8 Bytes) to represent Absolute Position. |
71 | so we have PACKET NUMBER to identify packets. | 71 | so we have PACKET NUMBER to identify packets. |
72 | If PACKET NUMBER is 0, the packet is Packet 1. | 72 | If PACKET NUMBER is 0, the packet is Packet 1. |
73 | If PACKET NUMBER is 1, the packet is Packet 2. | 73 | If PACKET NUMBER is 1, the packet is Packet 2. |
@@ -129,7 +129,7 @@ Byte 3: Message Type => 0x00 (Disabled) | |||
129 | Byte 4: Bit7~Bit0 => Don't Care | 129 | Byte 4: Bit7~Bit0 => Don't Care |
130 | 130 | ||
131 | ============================================================================== | 131 | ============================================================================== |
132 | * Absolute position for STL3888-A0. | 132 | * Absolute position for STL3888-Ax. |
133 | ============================================================================== | 133 | ============================================================================== |
134 | Packet 1 (ABSOLUTE POSITION) | 134 | Packet 1 (ABSOLUTE POSITION) |
135 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 | 135 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 |
@@ -179,14 +179,14 @@ Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) | |||
179 | Bit5~Bit4 => y2_g | 179 | Bit5~Bit4 => y2_g |
180 | Bit7~Bit6 => x2_g | 180 | Bit7~Bit6 => x2_g |
181 | 181 | ||
182 | Notify Packet for STL3888-A0 | 182 | Notify Packet for STL3888-Ax |
183 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 | 183 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 |
184 | BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| | 184 | BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| |
185 | 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0| | 185 | 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0| |
186 | |---------------| |---------------| |---------------| |---------------| | 186 | |---------------| |---------------| |---------------| |---------------| |
187 | 187 | ||
188 | Byte 1: Bit7~Bit6 => 00, Normal data packet | 188 | Byte 1: Bit7~Bit6 => 00, Normal data packet |
189 | => 01, Absolute coordination packet | 189 | => 01, Absolute coordinates packet |
190 | => 10, Notify packet | 190 | => 10, Notify packet |
191 | Bit5 => 1 | 191 | Bit5 => 1 |
192 | Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): | 192 | Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): |
@@ -205,15 +205,106 @@ Byte 4: Bit7 => scroll right button | |||
205 | Bit6 => scroll left button | 205 | Bit6 => scroll left button |
206 | Bit5 => scroll down button | 206 | Bit5 => scroll down button |
207 | Bit4 => scroll up button | 207 | Bit4 => scroll up button |
208 | * Note that if gesture and additional button (Bit4~Bit7) | 208 | * Note that if gesture and additional buttoni (Bit4~Bit7) |
209 | happen at the same time, the button information will not | 209 | happen at the same time, the button information will not |
210 | be sent. | 210 | be sent. |
211 | Bit3~Bit0 => Reserved | ||
212 | |||
213 | Sample sequence of Multi-finger, Multi-coordinate mode: | ||
214 | |||
215 | notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, | ||
216 | abs pkt 2, ..., notify packet (valid bit == 0) | ||
217 | |||
218 | ============================================================================== | ||
219 | * Absolute position for STL3888-B0. | ||
220 | ============================================================================== | ||
221 | Packet 1(ABSOLUTE POSITION) | ||
222 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 | ||
223 | BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| | ||
224 | 1 |0|1|V|F|1|0|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y| | ||
225 | |---------------| |---------------| |---------------| |---------------| | ||
226 | |||
227 | Byte 1: Bit7~Bit6 => 00, Normal data packet | ||
228 | => 01, Absolute coordinates packet | ||
229 | => 10, Notify packet | ||
230 | Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. | ||
231 | When both fingers are up, the last two reports have zero valid | ||
232 | bit. | ||
233 | Bit4 => finger up/down information. 1: finger down, 0: finger up. | ||
234 | Bit3 => 1 | ||
235 | Bit2 => finger index, 0 is the first finger, 1 is the second finger. | ||
236 | Bit1 => Right Button, 1 is pressed, 0 is not pressed. | ||
237 | Bit0 => Left Button, 1 is pressed, 0 is not pressed. | ||
238 | Byte 2: X coordinate (xpos[9:2]) | ||
239 | Byte 3: Y coordinate (ypos[9:2]) | ||
240 | Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) | ||
241 | Bit3~Bit2 => X coordinate (ypos[1:0]) | ||
242 | Bit4 => scroll down button | ||
243 | Bit5 => scroll up button | ||
244 | Bit6 => scroll left button | ||
245 | Bit7 => scroll right button | ||
246 | |||
247 | Packet 2 (ABSOLUTE POSITION) | ||
248 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 | ||
249 | BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| | ||
250 | 1 |0|1|V|F|1|1|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y| | ||
251 | |---------------| |---------------| |---------------| |---------------| | ||
252 | |||
253 | Byte 1: Bit7~Bit6 => 00, Normal data packet | ||
254 | => 01, Absolute coordination packet | ||
255 | => 10, Notify packet | ||
256 | Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. | ||
257 | When both fingers are up, the last two reports have zero valid | ||
258 | bit. | ||
259 | Bit4 => finger up/down information. 1: finger down, 0: finger up. | ||
260 | Bit3 => 1 | ||
261 | Bit2 => finger index, 0 is the first finger, 1 is the second finger. | ||
262 | Bit1 => Right Button, 1 is pressed, 0 is not pressed. | ||
263 | Bit0 => Left Button, 1 is pressed, 0 is not pressed. | ||
264 | Byte 2: X coordinate (xpos[9:2]) | ||
265 | Byte 3: Y coordinate (ypos[9:2]) | ||
266 | Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) | ||
267 | Bit3~Bit2 => X coordinate (ypos[1:0]) | ||
268 | Bit4 => scroll down button | ||
269 | Bit5 => scroll up button | ||
270 | Bit6 => scroll left button | ||
271 | Bit7 => scroll right button | ||
272 | |||
273 | Notify Packet for STL3888-B0 | ||
274 | Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 | ||
275 | BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| | ||
276 | 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|u|d|0|0|0|0| | ||
277 | |---------------| |---------------| |---------------| |---------------| | ||
278 | |||
279 | Byte 1: Bit7~Bit6 => 00, Normal data packet | ||
280 | => 01, Absolute coordination packet | ||
281 | => 10, Notify packet | ||
282 | Bit5 => 1 | ||
283 | Bit4 => when in absolute coordinate mode (valid when EN_PKT_GO is 1): | ||
284 | 0: left button is generated by the on-pad command | ||
285 | 1: left button is generated by the external button | ||
286 | Bit3 => 1 | ||
287 | Bit2 => Middle Button, 1 is pressed, 0 is not pressed. | ||
288 | Bit1 => Right Button, 1 is pressed, 0 is not pressed. | ||
289 | Bit0 => Left Button, 1 is pressed, 0 is not pressed. | ||
290 | Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode) | ||
291 | Byte 3: Bit7~Bit6 => Don't care | ||
292 | Bit5~Bit4 => Number of fingers | ||
293 | Bit3~Bit1 => Reserved | ||
294 | Bit0 => 1: enter gesture mode; 0: leaving gesture mode | ||
295 | Byte 4: Bit7 => scroll right button | ||
296 | Bit6 => scroll left button | ||
297 | Bit5 => scroll up button | ||
298 | Bit4 => scroll down button | ||
299 | * Note that if gesture and additional button(Bit4~Bit7) | ||
300 | happen at the same time, the button information will not | ||
301 | be sent. | ||
211 | Bit3~Bit0 => Reserved | 302 | Bit3~Bit0 => Reserved |
212 | 303 | ||
213 | Sample sequence of Multi-finger, Multi-coordinate mode: | 304 | Sample sequence of Multi-finger, Multi-coordinate mode: |
214 | 305 | ||
215 | notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, | 306 | notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, |
216 | abs pkt 2, ..., notify packet(valid bit == 0) | 307 | abs pkt 2, ..., notify packet (valid bit == 0) |
217 | 308 | ||
218 | ============================================================================== | 309 | ============================================================================== |
219 | * FSP Enable/Disable packet | 310 | * FSP Enable/Disable packet |
@@ -409,7 +500,8 @@ offset width default r/w name | |||
409 | 0: read only, 1: read/write enable | 500 | 0: read only, 1: read/write enable |
410 | (Note that following registers does not require clock gating being | 501 | (Note that following registers does not require clock gating being |
411 | enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e | 502 | enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e |
412 | 40 41 42 43.) | 503 | 40 41 42 43. In addition to that, this bit must be 1 when gesture |
504 | mode is enabled) | ||
413 | 505 | ||
414 | 0x31 RW on-pad command detection | 506 | 0x31 RW on-pad command detection |
415 | bit7 0 RW on-pad command left button down tag | 507 | bit7 0 RW on-pad command left button down tag |
@@ -463,6 +555,10 @@ offset width default r/w name | |||
463 | absolute coordinates; otherwise, host only receives packets with | 555 | absolute coordinates; otherwise, host only receives packets with |
464 | relative coordinate.) | 556 | relative coordinate.) |
465 | 557 | ||
558 | bit7 0 RW EN_PS2_F2: PS/2 gesture mode 2nd | ||
559 | finger packet enable | ||
560 | 0: disable, 1: enable | ||
561 | |||
466 | 0x43 RW on-pad control | 562 | 0x43 RW on-pad control |
467 | bit0 0 RW on-pad control enable | 563 | bit0 0 RW on-pad control enable |
468 | 0: disable, 1: enable | 564 | 0: disable, 1: enable |
diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt index f40a1f030019..5dc59b04a71f 100644 --- a/Documentation/intel_txt.txt +++ b/Documentation/intel_txt.txt | |||
@@ -126,7 +126,7 @@ o Tboot then applies an (optional) user-defined launch policy to | |||
126 | o Tboot adjusts the e820 table provided by the bootloader to reserve | 126 | o Tboot adjusts the e820 table provided by the bootloader to reserve |
127 | its own location in memory as well as to reserve certain other | 127 | its own location in memory as well as to reserve certain other |
128 | TXT-related regions. | 128 | TXT-related regions. |
129 | o As part of it's launch, tboot DMA protects all of RAM (using the | 129 | o As part of its launch, tboot DMA protects all of RAM (using the |
130 | VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' | 130 | VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' |
131 | in order to remove this blanket protection and use VT-d's | 131 | in order to remove this blanket protection and use VT-d's |
132 | page-level protection. | 132 | page-level protection. |
@@ -161,13 +161,15 @@ o In order to put a system into any of the sleep states after a TXT | |||
161 | has been restored, it will restore the TPM PCRs and then | 161 | has been restored, it will restore the TPM PCRs and then |
162 | transfer control back to the kernel's S3 resume vector. | 162 | transfer control back to the kernel's S3 resume vector. |
163 | In order to preserve system integrity across S3, the kernel | 163 | In order to preserve system integrity across S3, the kernel |
164 | provides tboot with a set of memory ranges (kernel | 164 | provides tboot with a set of memory ranges (RAM and RESERVED_KERN |
165 | code/data/bss, S3 resume code, and AP trampoline) that tboot | 165 | in the e820 table, but not any memory that BIOS might alter over |
166 | will calculate a MAC (message authentication code) over and then | 166 | the S3 transition) that tboot will calculate a MAC (message |
167 | seal with the TPM. On resume and once the measured environment | 167 | authentication code) over and then seal with the TPM. On resume |
168 | has been re-established, tboot will re-calculate the MAC and | 168 | and once the measured environment has been re-established, tboot |
169 | verify it against the sealed value. Tboot's policy determines | 169 | will re-calculate the MAC and verify it against the sealed value. |
170 | what happens if the verification fails. | 170 | Tboot's policy determines what happens if the verification fails. |
171 | Note that the c/s 194 of tboot which has the new MAC code supports | ||
172 | this. | ||
171 | 173 | ||
172 | That's pretty much it for TXT support. | 174 | That's pretty much it for TXT support. |
173 | 175 | ||
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 35cf64d4436d..dd5806f4fcc4 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt | |||
@@ -139,7 +139,6 @@ Code Seq#(hex) Include File Comments | |||
139 | 'K' all linux/kd.h | 139 | 'K' all linux/kd.h |
140 | 'L' 00-1F linux/loop.h conflict! | 140 | 'L' 00-1F linux/loop.h conflict! |
141 | 'L' 10-1F drivers/scsi/mpt2sas/mpt2sas_ctl.h conflict! | 141 | 'L' 10-1F drivers/scsi/mpt2sas/mpt2sas_ctl.h conflict! |
142 | 'L' 20-2F linux/usb/vstusb.h | ||
143 | 'L' E0-FF linux/ppdd.h encrypted disk device driver | 142 | 'L' E0-FF linux/ppdd.h encrypted disk device driver |
144 | <http://linux01.gwdg.de/~alatham/ppdd.html> | 143 | <http://linux01.gwdg.de/~alatham/ppdd.html> |
145 | 'M' all linux/soundcard.h conflict! | 144 | 'M' all linux/soundcard.h conflict! |
@@ -292,6 +291,7 @@ Code Seq#(hex) Include File Comments | |||
292 | 0x92 00-0F drivers/usb/mon/mon_bin.c | 291 | 0x92 00-0F drivers/usb/mon/mon_bin.c |
293 | 0x93 60-7F linux/auto_fs.h | 292 | 0x93 60-7F linux/auto_fs.h |
294 | 0x94 all fs/btrfs/ioctl.h | 293 | 0x94 all fs/btrfs/ioctl.h |
294 | 0x97 00-7F fs/ceph/ioctl.h Ceph file system | ||
295 | 0x99 00-0F 537-Addinboard driver | 295 | 0x99 00-0F 537-Addinboard driver |
296 | <mailto:buk@buks.ipn.de> | 296 | <mailto:buk@buks.ipn.de> |
297 | 0xA0 all linux/sdp/sdp.h Industrial Device Project | 297 | 0xA0 all linux/sdp/sdp.h Industrial Device Project |
diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI index 5fe8de5cc727..f172091fb7cd 100644 --- a/Documentation/isdn/INTERFACE.CAPI +++ b/Documentation/isdn/INTERFACE.CAPI | |||
@@ -149,10 +149,11 @@ char *(*procinfo)(struct capi_ctr *ctrlr) | |||
149 | pointer to a callback function returning the entry for the device in | 149 | pointer to a callback function returning the entry for the device in |
150 | the CAPI controller info table, /proc/capi/controller | 150 | the CAPI controller info table, /proc/capi/controller |
151 | 151 | ||
152 | read_proc_t *ctr_read_proc | 152 | const struct file_operations *proc_fops |
153 | pointer to the read_proc callback function for the device's proc file | 153 | pointers to callback functions for the device's proc file |
154 | system entry, /proc/capi/controllers/<n>; will be called with a | 154 | system entry, /proc/capi/controllers/<n>; pointer to the device's |
155 | pointer to the device's capi_ctr structure as the last (data) argument | 155 | capi_ctr structure is available from struct proc_dir_entry::data |
156 | which is available from struct inode. | ||
156 | 157 | ||
157 | Note: Callback functions except send_message() are never called in interrupt | 158 | Note: Callback functions except send_message() are never called in interrupt |
158 | context. | 159 | context. |
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset index 794941fc9493..e472df842323 100644 --- a/Documentation/isdn/README.gigaset +++ b/Documentation/isdn/README.gigaset | |||
@@ -292,10 +292,10 @@ GigaSet 307x Device Driver | |||
292 | to /etc/modprobe.d/gigaset, /etc/modprobe.conf.local or a similar file. | 292 | to /etc/modprobe.d/gigaset, /etc/modprobe.conf.local or a similar file. |
293 | 293 | ||
294 | Problem: | 294 | Problem: |
295 | Your isdn script aborts with a message about isdnlog. | 295 | The isdnlog program emits error messages or just doesn't work. |
296 | Solution: | 296 | Solution: |
297 | Try deactivating (or commenting out) isdnlog. This driver does not | 297 | Isdnlog supports only the HiSax driver. Do not attempt to use it with |
298 | support it. | 298 | other drivers such as Gigaset. |
299 | 299 | ||
300 | Problem: | 300 | Problem: |
301 | You have two or more DECT data adapters (M101/M105) and only the | 301 | You have two or more DECT data adapters (M101/M105) and only the |
@@ -321,8 +321,8 @@ GigaSet 307x Device Driver | |||
321 | writing an appropriate value to /sys/module/gigaset/parameters/debug, e.g. | 321 | writing an appropriate value to /sys/module/gigaset/parameters/debug, e.g. |
322 | echo 0 > /sys/module/gigaset/parameters/debug | 322 | echo 0 > /sys/module/gigaset/parameters/debug |
323 | switches off debugging output completely, | 323 | switches off debugging output completely, |
324 | echo 0x10a020 > /sys/module/gigaset/parameters/debug | 324 | echo 0x302020 > /sys/module/gigaset/parameters/debug |
325 | enables the standard set of debugging output messages. These values are | 325 | enables a reasonable set of debugging output messages. These values are |
326 | bit patterns where every bit controls a certain type of debugging output. | 326 | bit patterns where every bit controls a certain type of debugging output. |
327 | See the constants DEBUG_* in the source file gigaset.h for details. | 327 | See the constants DEBUG_* in the source file gigaset.h for details. |
328 | 328 | ||
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index c412c245848f..b472e4e0ba67 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt | |||
@@ -181,7 +181,7 @@ Expressions are listed in decreasing order of precedence. | |||
181 | (7) Returns the result of max(/expr/, /expr/). | 181 | (7) Returns the result of max(/expr/, /expr/). |
182 | 182 | ||
183 | An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 | 183 | An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 |
184 | respectively for calculations). A menu entry becomes visible when it's | 184 | respectively for calculations). A menu entry becomes visible when its |
185 | expression evaluates to 'm' or 'y'. | 185 | expression evaluates to 'm' or 'y'. |
186 | 186 | ||
187 | There are two types of symbols: constant and non-constant symbols. | 187 | There are two types of symbols: constant and non-constant symbols. |
diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt index 49efae703979..b2cb16ebcb16 100644 --- a/Documentation/kbuild/kconfig.txt +++ b/Documentation/kbuild/kconfig.txt | |||
@@ -96,7 +96,7 @@ Environment variables for 'silentoldconfig' | |||
96 | KCONFIG_NOSILENTUPDATE | 96 | KCONFIG_NOSILENTUPDATE |
97 | -------------------------------------------------- | 97 | -------------------------------------------------- |
98 | If this variable has a non-blank value, it prevents silent kernel | 98 | If this variable has a non-blank value, it prevents silent kernel |
99 | config udpates (requires explicit updates). | 99 | config updates (requires explicit updates). |
100 | 100 | ||
101 | KCONFIG_AUTOCONFIG | 101 | KCONFIG_AUTOCONFIG |
102 | -------------------------------------------------- | 102 | -------------------------------------------------- |
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index 28cdc2af2131..ec8d31ee12e0 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt | |||
@@ -116,7 +116,7 @@ | |||
116 | Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza. | 116 | Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza. |
117 | URL: http://www.linuxjournal.com/article.php?sid=2391 | 117 | URL: http://www.linuxjournal.com/article.php?sid=2391 |
118 | Keywords: RAID, MD driver. | 118 | Keywords: RAID, MD driver. |
119 | Description: Linux Journal Kernel Korner article. Here is it's | 119 | Description: Linux Journal Kernel Korner article. Here is its |
120 | abstract: "A description of the implementation of the RAID-1, | 120 | abstract: "A description of the implementation of the RAID-1, |
121 | RAID-4 and RAID-5 personalities of the MD device driver in the | 121 | RAID-4 and RAID-5 personalities of the MD device driver in the |
122 | Linux kernel, providing users with high performance and reliable, | 122 | Linux kernel, providing users with high performance and reliable, |
@@ -127,7 +127,7 @@ | |||
127 | URL: http://www.linuxjournal.com/article.php?sid=1219 | 127 | URL: http://www.linuxjournal.com/article.php?sid=1219 |
128 | Keywords: device driver, module, loading/unloading modules, | 128 | Keywords: device driver, module, loading/unloading modules, |
129 | allocating resources. | 129 | allocating resources. |
130 | Description: Linux Journal Kernel Korner article. Here is it's | 130 | Description: Linux Journal Kernel Korner article. Here is its |
131 | abstract: "This is the first of a series of four articles | 131 | abstract: "This is the first of a series of four articles |
132 | co-authored by Alessandro Rubini and Georg Zezchwitz which present | 132 | co-authored by Alessandro Rubini and Georg Zezchwitz which present |
133 | a practical approach to writing Linux device drivers as kernel | 133 | a practical approach to writing Linux device drivers as kernel |
@@ -141,7 +141,7 @@ | |||
141 | Keywords: character driver, init_module, clean_up module, | 141 | Keywords: character driver, init_module, clean_up module, |
142 | autodetection, mayor number, minor number, file operations, | 142 | autodetection, mayor number, minor number, file operations, |
143 | open(), close(). | 143 | open(), close(). |
144 | Description: Linux Journal Kernel Korner article. Here is it's | 144 | Description: Linux Journal Kernel Korner article. Here is its |
145 | abstract: "This article, the second of four, introduces part of | 145 | abstract: "This article, the second of four, introduces part of |
146 | the actual code to create custom module implementing a character | 146 | the actual code to create custom module implementing a character |
147 | device driver. It describes the code for module initialization and | 147 | device driver. It describes the code for module initialization and |
@@ -152,7 +152,7 @@ | |||
152 | URL: http://www.linuxjournal.com/article.php?sid=1221 | 152 | URL: http://www.linuxjournal.com/article.php?sid=1221 |
153 | Keywords: read(), write(), select(), ioctl(), blocking/non | 153 | Keywords: read(), write(), select(), ioctl(), blocking/non |
154 | blocking mode, interrupt handler. | 154 | blocking mode, interrupt handler. |
155 | Description: Linux Journal Kernel Korner article. Here is it's | 155 | Description: Linux Journal Kernel Korner article. Here is its |
156 | abstract: "This article, the third of four on writing character | 156 | abstract: "This article, the third of four on writing character |
157 | device drivers, introduces concepts of reading, writing, and using | 157 | device drivers, introduces concepts of reading, writing, and using |
158 | ioctl-calls". | 158 | ioctl-calls". |
@@ -161,7 +161,7 @@ | |||
161 | Author: Alessandro Rubini and Georg v. Zezschwitz. | 161 | Author: Alessandro Rubini and Georg v. Zezschwitz. |
162 | URL: http://www.linuxjournal.com/article.php?sid=1222 | 162 | URL: http://www.linuxjournal.com/article.php?sid=1222 |
163 | Keywords: interrupts, irqs, DMA, bottom halves, task queues. | 163 | Keywords: interrupts, irqs, DMA, bottom halves, task queues. |
164 | Description: Linux Journal Kernel Korner article. Here is it's | 164 | Description: Linux Journal Kernel Korner article. Here is its |
165 | abstract: "This is the fourth in a series of articles about | 165 | abstract: "This is the fourth in a series of articles about |
166 | writing character device drivers as loadable kernel modules. This | 166 | writing character device drivers as loadable kernel modules. This |
167 | month, we further investigate the field of interrupt handling. | 167 | month, we further investigate the field of interrupt handling. |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e7848a0d99eb..b56ea860da21 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -54,9 +54,11 @@ parameter is applicable: | |||
54 | IMA Integrity measurement architecture is enabled. | 54 | IMA Integrity measurement architecture is enabled. |
55 | IOSCHED More than one I/O scheduler is enabled. | 55 | IOSCHED More than one I/O scheduler is enabled. |
56 | IP_PNP IP DHCP, BOOTP, or RARP is enabled. | 56 | IP_PNP IP DHCP, BOOTP, or RARP is enabled. |
57 | IPV6 IPv6 support is enabled. | ||
57 | ISAPNP ISA PnP code is enabled. | 58 | ISAPNP ISA PnP code is enabled. |
58 | ISDN Appropriate ISDN support is enabled. | 59 | ISDN Appropriate ISDN support is enabled. |
59 | JOY Appropriate joystick support is enabled. | 60 | JOY Appropriate joystick support is enabled. |
61 | KGDB Kernel debugger support is enabled. | ||
60 | KVM Kernel Virtual Machine support is enabled. | 62 | KVM Kernel Virtual Machine support is enabled. |
61 | LIBATA Libata driver is enabled | 63 | LIBATA Libata driver is enabled |
62 | LP Printer support is enabled. | 64 | LP Printer support is enabled. |
@@ -98,6 +100,7 @@ parameter is applicable: | |||
98 | SWSUSP Software suspend (hibernation) is enabled. | 100 | SWSUSP Software suspend (hibernation) is enabled. |
99 | SUSPEND System suspend states are enabled. | 101 | SUSPEND System suspend states are enabled. |
100 | FTRACE Function tracing enabled. | 102 | FTRACE Function tracing enabled. |
103 | TPM TPM drivers are enabled. | ||
101 | TS Appropriate touchscreen support is enabled. | 104 | TS Appropriate touchscreen support is enabled. |
102 | UMS USB Mass Storage support is enabled. | 105 | UMS USB Mass Storage support is enabled. |
103 | USB USB support is enabled. | 106 | USB USB support is enabled. |
@@ -150,6 +153,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
150 | strict -- Be less tolerant of platforms that are not | 153 | strict -- Be less tolerant of platforms that are not |
151 | strictly ACPI specification compliant. | 154 | strictly ACPI specification compliant. |
152 | rsdt -- prefer RSDT over (default) XSDT | 155 | rsdt -- prefer RSDT over (default) XSDT |
156 | copy_dsdt -- copy DSDT to memory | ||
153 | 157 | ||
154 | See also Documentation/power/pm.txt, pci=noacpi | 158 | See also Documentation/power/pm.txt, pci=noacpi |
155 | 159 | ||
@@ -199,10 +203,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
199 | acpi_display_output=video | 203 | acpi_display_output=video |
200 | See above. | 204 | See above. |
201 | 205 | ||
202 | acpi_early_pdc_eval [HW,ACPI] Evaluate processor _PDC methods | ||
203 | early. Needed on some platforms to properly | ||
204 | initialize the EC. | ||
205 | |||
206 | acpi_irq_balance [HW,ACPI] | 206 | acpi_irq_balance [HW,ACPI] |
207 | ACPI will balance active IRQs | 207 | ACPI will balance active IRQs |
208 | default in APIC mode | 208 | default in APIC mode |
@@ -290,9 +290,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
290 | advansys= [HW,SCSI] | 290 | advansys= [HW,SCSI] |
291 | See header of drivers/scsi/advansys.c. | 291 | See header of drivers/scsi/advansys.c. |
292 | 292 | ||
293 | advwdt= [HW,WDT] Advantech WDT | ||
294 | Format: <iostart>,<iostop> | ||
295 | |||
296 | aedsp16= [HW,OSS] Audio Excel DSP 16 | 293 | aedsp16= [HW,OSS] Audio Excel DSP 16 |
297 | Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> | 294 | Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> |
298 | See also header of sound/oss/aedsp16.c. | 295 | See also header of sound/oss/aedsp16.c. |
@@ -323,15 +320,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
323 | amd_iommu= [HW,X86-84] | 320 | amd_iommu= [HW,X86-84] |
324 | Pass parameters to the AMD IOMMU driver in the system. | 321 | Pass parameters to the AMD IOMMU driver in the system. |
325 | Possible values are: | 322 | Possible values are: |
326 | isolate - enable device isolation (each device, as far | ||
327 | as possible, will get its own protection | ||
328 | domain) [default] | ||
329 | share - put every device behind one IOMMU into the | ||
330 | same protection domain | ||
331 | fullflush - enable flushing of IO/TLB entries when | 323 | fullflush - enable flushing of IO/TLB entries when |
332 | they are unmapped. Otherwise they are | 324 | they are unmapped. Otherwise they are |
333 | flushed before they will be reused, which | 325 | flushed before they will be reused, which |
334 | is a lot of faster | 326 | is a lot of faster |
327 | off - do not initialize any AMD IOMMU found in | ||
328 | the system | ||
335 | 329 | ||
336 | amijoy.map= [HW,JOY] Amiga joystick support | 330 | amijoy.map= [HW,JOY] Amiga joystick support |
337 | Map of devices attached to JOY0DAT and JOY1DAT | 331 | Map of devices attached to JOY0DAT and JOY1DAT |
@@ -356,6 +350,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
356 | Change the amount of debugging information output | 350 | Change the amount of debugging information output |
357 | when initialising the APIC and IO-APIC components. | 351 | when initialising the APIC and IO-APIC components. |
358 | 352 | ||
353 | autoconf= [IPV6] | ||
354 | See Documentation/networking/ipv6.txt. | ||
355 | |||
359 | show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller | 356 | show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller |
360 | Limit apic dumping. The parameter defines the maximal | 357 | Limit apic dumping. The parameter defines the maximal |
361 | number of local apics being dumped. Also it is possible | 358 | number of local apics being dumped. Also it is possible |
@@ -638,6 +635,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
638 | See drivers/char/README.epca and | 635 | See drivers/char/README.epca and |
639 | Documentation/serial/digiepca.txt. | 636 | Documentation/serial/digiepca.txt. |
640 | 637 | ||
638 | disable= [IPV6] | ||
639 | See Documentation/networking/ipv6.txt. | ||
640 | |||
641 | disable_ipv6= [IPV6] | ||
642 | See Documentation/networking/ipv6.txt. | ||
643 | |||
641 | disable_mtrr_cleanup [X86] | 644 | disable_mtrr_cleanup [X86] |
642 | The kernel tries to adjust MTRR layout from continuous | 645 | The kernel tries to adjust MTRR layout from continuous |
643 | to discrete, to make X server driver able to add WB | 646 | to discrete, to make X server driver able to add WB |
@@ -707,6 +710,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
707 | The VGA output is eventually overwritten by the real | 710 | The VGA output is eventually overwritten by the real |
708 | console. | 711 | console. |
709 | 712 | ||
713 | ekgdboc= [X86,KGDB] Allow early kernel console debugging | ||
714 | ekgdboc=kbd | ||
715 | |||
716 | This is desgined to be used in conjunction with | ||
717 | the boot argument: earlyprintk=vga | ||
718 | |||
710 | eata= [HW,SCSI] | 719 | eata= [HW,SCSI] |
711 | 720 | ||
712 | edd= [EDD] | 721 | edd= [EDD] |
@@ -753,9 +762,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
753 | This option is obsoleted by the "netdev=" option, which | 762 | This option is obsoleted by the "netdev=" option, which |
754 | has equivalent usage. See its documentation for details. | 763 | has equivalent usage. See its documentation for details. |
755 | 764 | ||
756 | eurwdt= [HW,WDT] Eurotech CPU-1220/1410 onboard watchdog. | ||
757 | Format: <io>[,<irq>] | ||
758 | |||
759 | failslab= | 765 | failslab= |
760 | fail_page_alloc= | 766 | fail_page_alloc= |
761 | fail_make_request=[KNL] | 767 | fail_make_request=[KNL] |
@@ -783,8 +789,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
783 | as early as possible in order to facilitate early | 789 | as early as possible in order to facilitate early |
784 | boot debugging. | 790 | boot debugging. |
785 | 791 | ||
786 | ftrace_dump_on_oops | 792 | ftrace_dump_on_oops[=orig_cpu] |
787 | [FTRACE] will dump the trace buffers on oops. | 793 | [FTRACE] will dump the trace buffers on oops. |
794 | If no parameter is passed, ftrace will dump | ||
795 | buffers of all CPUs, but if you pass orig_cpu, it will | ||
796 | dump only the buffer of the CPU that triggered the | ||
797 | oops. | ||
788 | 798 | ||
789 | ftrace_filter=[function-list] | 799 | ftrace_filter=[function-list] |
790 | [FTRACE] Limit the functions traced by the function | 800 | [FTRACE] Limit the functions traced by the function |
@@ -1111,10 +1121,26 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1111 | use the HighMem zone if it exists, and the Normal | 1121 | use the HighMem zone if it exists, and the Normal |
1112 | zone if it does not. | 1122 | zone if it does not. |
1113 | 1123 | ||
1114 | kgdboc= [HW] kgdb over consoles. | 1124 | kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. |
1115 | Requires a tty driver that supports console polling. | 1125 | Format: <Controller#>[,poll interval] |
1116 | (only serial supported for now) | 1126 | The controller # is the number of the ehci usb debug |
1117 | Format: <serial_device>[,baud] | 1127 | port as it is probed via PCI. The poll interval is |
1128 | optional and is the number seconds in between | ||
1129 | each poll cycle to the debug port in case you need | ||
1130 | the functionality for interrupting the kernel with | ||
1131 | gdb or control-c on the dbgp connection. When | ||
1132 | not using this parameter you use sysrq-g to break into | ||
1133 | the kernel debugger. | ||
1134 | |||
1135 | kgdboc= [KGDB,HW] kgdb over consoles. | ||
1136 | Requires a tty driver that supports console polling, | ||
1137 | or a supported polling keyboard driver (non-usb). | ||
1138 | Serial only format: <serial_device>[,baud] | ||
1139 | keyboard only format: kbd | ||
1140 | keyboard and serial format: kbd,<serial_device>[,baud] | ||
1141 | |||
1142 | kgdbwait [KGDB] Stop kernel execution and enter the | ||
1143 | kernel debugger at the earliest opportunity. | ||
1118 | 1144 | ||
1119 | kmac= [MIPS] korina ethernet MAC address. | 1145 | kmac= [MIPS] korina ethernet MAC address. |
1120 | Configure the RouterBoard 532 series on-chip | 1146 | Configure the RouterBoard 532 series on-chip |
@@ -1193,7 +1219,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1193 | 1219 | ||
1194 | libata.force= [LIBATA] Force configurations. The format is comma | 1220 | libata.force= [LIBATA] Force configurations. The format is comma |
1195 | separated list of "[ID:]VAL" where ID is | 1221 | separated list of "[ID:]VAL" where ID is |
1196 | PORT[:DEVICE]. PORT and DEVICE are decimal numbers | 1222 | PORT[.DEVICE]. PORT and DEVICE are decimal numbers |
1197 | matching port, link or device. Basically, it matches | 1223 | matching port, link or device. Basically, it matches |
1198 | the ATA ID string printed on console by libata. If | 1224 | the ATA ID string printed on console by libata. If |
1199 | the whole ID part is omitted, the last PORT and DEVICE | 1225 | the whole ID part is omitted, the last PORT and DEVICE |
@@ -1738,6 +1764,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1738 | nomfgpt [X86-32] Disable Multi-Function General Purpose | 1764 | nomfgpt [X86-32] Disable Multi-Function General Purpose |
1739 | Timer usage (for AMD Geode machines). | 1765 | Timer usage (for AMD Geode machines). |
1740 | 1766 | ||
1767 | nopat [X86] Disable PAT (page attribute table extension of | ||
1768 | pagetables) support. | ||
1769 | |||
1741 | norandmaps Don't use address space randomization. Equivalent to | 1770 | norandmaps Don't use address space randomization. Equivalent to |
1742 | echo 0 > /proc/sys/kernel/randomize_va_space | 1771 | echo 0 > /proc/sys/kernel/randomize_va_space |
1743 | 1772 | ||
@@ -1781,6 +1810,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1781 | purges which is reported from either PAL_VM_SUMMARY or | 1810 | purges which is reported from either PAL_VM_SUMMARY or |
1782 | SAL PALO. | 1811 | SAL PALO. |
1783 | 1812 | ||
1813 | nr_cpus= [SMP] Maximum number of processors that an SMP kernel | ||
1814 | could support. nr_cpus=n : n >= 1 limits the kernel to | ||
1815 | supporting 'n' processors. Later in runtime you can not | ||
1816 | use hotplug cpu feature to put more cpu back to online. | ||
1817 | just like you compile the kernel NR_CPUS=n | ||
1818 | |||
1784 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. | 1819 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. |
1785 | 1820 | ||
1786 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. | 1821 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. |
@@ -1948,8 +1983,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1948 | IRQ routing is enabled. | 1983 | IRQ routing is enabled. |
1949 | noacpi [X86] Do not use ACPI for IRQ routing | 1984 | noacpi [X86] Do not use ACPI for IRQ routing |
1950 | or for PCI scanning. | 1985 | or for PCI scanning. |
1951 | use_crs [X86] Use _CRS for PCI resource | 1986 | use_crs [X86] Use PCI host bridge window information |
1952 | allocation. | 1987 | from ACPI. On BIOSes from 2008 or later, this |
1988 | is enabled by default. If you need to use this, | ||
1989 | please report a bug. | ||
1990 | nocrs [X86] Ignore PCI host bridge windows from ACPI. | ||
1991 | If you need to use this, please report a bug. | ||
1953 | routeirq Do IRQ routing for all PCI devices. | 1992 | routeirq Do IRQ routing for all PCI devices. |
1954 | This is normally done in pci_enable_device(), | 1993 | This is normally done in pci_enable_device(), |
1955 | so this option is a temporary workaround | 1994 | so this option is a temporary workaround |
@@ -1998,6 +2037,14 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1998 | force Enable ASPM even on devices that claim not to support it. | 2037 | force Enable ASPM even on devices that claim not to support it. |
1999 | WARNING: Forcing ASPM on may cause system lockups. | 2038 | WARNING: Forcing ASPM on may cause system lockups. |
2000 | 2039 | ||
2040 | pcie_pme= [PCIE,PM] Native PCIe PME signaling options: | ||
2041 | off Do not use native PCIe PME signaling. | ||
2042 | force Use native PCIe PME signaling even if the BIOS refuses | ||
2043 | to allow the kernel to control the relevant PCIe config | ||
2044 | registers. | ||
2045 | nomsi Do not use MSI for native PCIe PME signaling (this makes | ||
2046 | all PCIe root ports use INTx for everything). | ||
2047 | |||
2001 | pcmv= [HW,PCMCIA] BadgePAD 4 | 2048 | pcmv= [HW,PCMCIA] BadgePAD 4 |
2002 | 2049 | ||
2003 | pd. [PARIDE] | 2050 | pd. [PARIDE] |
@@ -2214,9 +2261,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2214 | 2261 | ||
2215 | sched_debug [KNL] Enables verbose scheduler debug messages. | 2262 | sched_debug [KNL] Enables verbose scheduler debug messages. |
2216 | 2263 | ||
2217 | sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver | ||
2218 | Format: <io>[,<timeout>[,<isapnp>]] | ||
2219 | |||
2220 | scsi_debug_*= [SCSI] | 2264 | scsi_debug_*= [SCSI] |
2221 | See drivers/scsi/scsi_debug.c. | 2265 | See drivers/scsi/scsi_debug.c. |
2222 | 2266 | ||
@@ -2588,6 +2632,15 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2588 | 2632 | ||
2589 | tp720= [HW,PS2] | 2633 | tp720= [HW,PS2] |
2590 | 2634 | ||
2635 | tpm_suspend_pcr=[HW,TPM] | ||
2636 | Format: integer pcr id | ||
2637 | Specify that at suspend time, the tpm driver | ||
2638 | should extend the specified pcr with zeros, | ||
2639 | as a workaround for some chips which fail to | ||
2640 | flush the last written pcr on TPM_SaveState. | ||
2641 | This will guarantee that all the other pcrs | ||
2642 | are saved. | ||
2643 | |||
2591 | trace_buf_size=nn[KMG] | 2644 | trace_buf_size=nn[KMG] |
2592 | [FTRACE] will set tracing buffer size. | 2645 | [FTRACE] will set tracing buffer size. |
2593 | 2646 | ||
@@ -2703,6 +2756,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2703 | medium is write-protected). | 2756 | medium is write-protected). |
2704 | Example: quirks=0419:aaf5:rl,0421:0433:rc | 2757 | Example: quirks=0419:aaf5:rl,0421:0433:rc |
2705 | 2758 | ||
2759 | userpte= | ||
2760 | [X86] Flags controlling user PTE allocations. | ||
2761 | |||
2762 | nohigh = do not allocate PTE pages in | ||
2763 | HIGHMEM regardless of setting | ||
2764 | of CONFIG_HIGHPTE. | ||
2765 | |||
2706 | vdso= [X86,SH] | 2766 | vdso= [X86,SH] |
2707 | vdso=2: enable compat VDSO (default with COMPAT_VDSO) | 2767 | vdso=2: enable compat VDSO (default with COMPAT_VDSO) |
2708 | vdso=1: enable VDSO (default) | 2768 | vdso=1: enable VDSO (default) |
@@ -2789,13 +2849,21 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2789 | wd7000= [HW,SCSI] | 2849 | wd7000= [HW,SCSI] |
2790 | See header of drivers/scsi/wd7000.c. | 2850 | See header of drivers/scsi/wd7000.c. |
2791 | 2851 | ||
2792 | wdt= [WDT] Watchdog | 2852 | watchdog timers [HW,WDT] For information on watchdog timers, |
2793 | See Documentation/watchdog/wdt.txt. | 2853 | see Documentation/watchdog/watchdog-parameters.txt |
2854 | or other driver-specific files in the | ||
2855 | Documentation/watchdog/ directory. | ||
2794 | 2856 | ||
2795 | x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of | 2857 | x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of |
2796 | default x2apic cluster mode on platforms | 2858 | default x2apic cluster mode on platforms |
2797 | supporting x2apic. | 2859 | supporting x2apic. |
2798 | 2860 | ||
2861 | x86_mrst_timer= [X86-32,APBT] | ||
2862 | Choose timer option for x86 Moorestown MID platform. | ||
2863 | Two valid options are apbt timer only and lapic timer | ||
2864 | plus one apbt timer for broadcast timer. | ||
2865 | x86_mrst_timer=apbt_only | lapic_and_apbt | ||
2866 | |||
2799 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. | 2867 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. |
2800 | xd_geo= See header of drivers/block/xd.c. | 2868 | xd_geo= See header of drivers/block/xd.c. |
2801 | 2869 | ||
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt index c79ab996dada..3ab2472509cb 100644 --- a/Documentation/kobject.txt +++ b/Documentation/kobject.txt | |||
@@ -59,37 +59,56 @@ nice to have in other objects. The C language does not allow for the | |||
59 | direct expression of inheritance, so other techniques - such as structure | 59 | direct expression of inheritance, so other techniques - such as structure |
60 | embedding - must be used. | 60 | embedding - must be used. |
61 | 61 | ||
62 | So, for example, the UIO code has a structure that defines the memory | 62 | (As an aside, for those familiar with the kernel linked list implementation, |
63 | region associated with a uio device: | 63 | this is analogous as to how "list_head" structs are rarely useful on |
64 | their own, but are invariably found embedded in the larger objects of | ||
65 | interest.) | ||
64 | 66 | ||
65 | struct uio_mem { | 67 | So, for example, the UIO code in drivers/uio/uio.c has a structure that |
68 | defines the memory region associated with a uio device: | ||
69 | |||
70 | struct uio_map { | ||
66 | struct kobject kobj; | 71 | struct kobject kobj; |
67 | unsigned long addr; | 72 | struct uio_mem *mem; |
68 | unsigned long size; | 73 | }; |
69 | int memtype; | ||
70 | void __iomem *internal_addr; | ||
71 | }; | ||
72 | 74 | ||
73 | If you have a struct uio_mem structure, finding its embedded kobject is | 75 | If you have a struct uio_map structure, finding its embedded kobject is |
74 | just a matter of using the kobj member. Code that works with kobjects will | 76 | just a matter of using the kobj member. Code that works with kobjects will |
75 | often have the opposite problem, however: given a struct kobject pointer, | 77 | often have the opposite problem, however: given a struct kobject pointer, |
76 | what is the pointer to the containing structure? You must avoid tricks | 78 | what is the pointer to the containing structure? You must avoid tricks |
77 | (such as assuming that the kobject is at the beginning of the structure) | 79 | (such as assuming that the kobject is at the beginning of the structure) |
78 | and, instead, use the container_of() macro, found in <linux/kernel.h>: | 80 | and, instead, use the container_of() macro, found in <linux/kernel.h>: |
79 | 81 | ||
80 | container_of(pointer, type, member) | 82 | container_of(pointer, type, member) |
83 | |||
84 | where: | ||
85 | |||
86 | * "pointer" is the pointer to the embedded kobject, | ||
87 | * "type" is the type of the containing structure, and | ||
88 | * "member" is the name of the structure field to which "pointer" points. | ||
89 | |||
90 | The return value from container_of() is a pointer to the corresponding | ||
91 | container type. So, for example, a pointer "kp" to a struct kobject | ||
92 | embedded *within* a struct uio_map could be converted to a pointer to the | ||
93 | *containing* uio_map structure with: | ||
94 | |||
95 | struct uio_map *u_map = container_of(kp, struct uio_map, kobj); | ||
96 | |||
97 | For convenience, programmers often define a simple macro for "back-casting" | ||
98 | kobject pointers to the containing type. Exactly this happens in the | ||
99 | earlier drivers/uio/uio.c, as you can see here: | ||
100 | |||
101 | struct uio_map { | ||
102 | struct kobject kobj; | ||
103 | struct uio_mem *mem; | ||
104 | }; | ||
81 | 105 | ||
82 | where pointer is the pointer to the embedded kobject, type is the type of | 106 | #define to_map(map) container_of(map, struct uio_map, kobj) |
83 | the containing structure, and member is the name of the structure field to | ||
84 | which pointer points. The return value from container_of() is a pointer to | ||
85 | the given type. So, for example, a pointer "kp" to a struct kobject | ||
86 | embedded within a struct uio_mem could be converted to a pointer to the | ||
87 | containing uio_mem structure with: | ||
88 | 107 | ||
89 | struct uio_mem *u_mem = container_of(kp, struct uio_mem, kobj); | 108 | where the macro argument "map" is a pointer to the struct kobject in |
109 | question. That macro is subsequently invoked with: | ||
90 | 110 | ||
91 | Programmers often define a simple macro for "back-casting" kobject pointers | 111 | struct uio_map *map = to_map(kobj); |
92 | to the containing type. | ||
93 | 112 | ||
94 | 113 | ||
95 | Initialization of kobjects | 114 | Initialization of kobjects |
@@ -266,7 +285,7 @@ kobj_type: | |||
266 | 285 | ||
267 | struct kobj_type { | 286 | struct kobj_type { |
268 | void (*release)(struct kobject *); | 287 | void (*release)(struct kobject *); |
269 | struct sysfs_ops *sysfs_ops; | 288 | const struct sysfs_ops *sysfs_ops; |
270 | struct attribute **default_attrs; | 289 | struct attribute **default_attrs; |
271 | }; | 290 | }; |
272 | 291 | ||
@@ -387,4 +406,5 @@ called, and the objects in the former circle release each other. | |||
387 | Example code to copy from | 406 | Example code to copy from |
388 | 407 | ||
389 | For a more complete example of using ksets and kobjects properly, see the | 408 | For a more complete example of using ksets and kobjects properly, see the |
390 | sample/kobject/kset-example.c code. | 409 | example programs samples/kobject/{kobject-example.c,kset-example.c}, |
410 | which will be built as loadable modules if you select CONFIG_SAMPLE_KOBJECT. | ||
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 053037a1fe6d..6653017680dd 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt | |||
@@ -1,6 +1,7 @@ | |||
1 | Title : Kernel Probes (Kprobes) | 1 | Title : Kernel Probes (Kprobes) |
2 | Authors : Jim Keniston <jkenisto@us.ibm.com> | 2 | Authors : Jim Keniston <jkenisto@us.ibm.com> |
3 | : Prasanna S Panchamukhi <prasanna@in.ibm.com> | 3 | : Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com> |
4 | : Masami Hiramatsu <mhiramat@redhat.com> | ||
4 | 5 | ||
5 | CONTENTS | 6 | CONTENTS |
6 | 7 | ||
@@ -15,6 +16,7 @@ CONTENTS | |||
15 | 9. Jprobes Example | 16 | 9. Jprobes Example |
16 | 10. Kretprobes Example | 17 | 10. Kretprobes Example |
17 | Appendix A: The kprobes debugfs interface | 18 | Appendix A: The kprobes debugfs interface |
19 | Appendix B: The kprobes sysctl interface | ||
18 | 20 | ||
19 | 1. Concepts: Kprobes, Jprobes, Return Probes | 21 | 1. Concepts: Kprobes, Jprobes, Return Probes |
20 | 22 | ||
@@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions | |||
42 | can speed up unregistration process when you have to unregister | 44 | can speed up unregistration process when you have to unregister |
43 | a lot of probes at once. | 45 | a lot of probes at once. |
44 | 46 | ||
45 | The next three subsections explain how the different types of | 47 | The next four subsections explain how the different types of |
46 | probes work. They explain certain things that you'll need to | 48 | probes work and how jump optimization works. They explain certain |
47 | know in order to make the best use of Kprobes -- e.g., the | 49 | things that you'll need to know in order to make the best use of |
48 | difference between a pre_handler and a post_handler, and how | 50 | Kprobes -- e.g., the difference between a pre_handler and |
49 | to use the maxactive and nmissed fields of a kretprobe. But | 51 | a post_handler, and how to use the maxactive and nmissed fields of |
50 | if you're in a hurry to start using Kprobes, you can skip ahead | 52 | a kretprobe. But if you're in a hurry to start using Kprobes, you |
51 | to section 2. | 53 | can skip ahead to section 2. |
52 | 54 | ||
53 | 1.1 How Does a Kprobe Work? | 55 | 1.1 How Does a Kprobe Work? |
54 | 56 | ||
@@ -161,13 +163,123 @@ In case probed function is entered but there is no kretprobe_instance | |||
161 | object available, then in addition to incrementing the nmissed count, | 163 | object available, then in addition to incrementing the nmissed count, |
162 | the user entry_handler invocation is also skipped. | 164 | the user entry_handler invocation is also skipped. |
163 | 165 | ||
166 | 1.4 How Does Jump Optimization Work? | ||
167 | |||
168 | If your kernel is built with CONFIG_OPTPROBES=y (currently this flag | ||
169 | is automatically set 'y' on x86/x86-64, non-preemptive kernel) and | ||
170 | the "debug.kprobes_optimization" kernel parameter is set to 1 (see | ||
171 | sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump | ||
172 | instruction instead of a breakpoint instruction at each probepoint. | ||
173 | |||
174 | 1.4.1 Init a Kprobe | ||
175 | |||
176 | When a probe is registered, before attempting this optimization, | ||
177 | Kprobes inserts an ordinary, breakpoint-based kprobe at the specified | ||
178 | address. So, even if it's not possible to optimize this particular | ||
179 | probepoint, there'll be a probe there. | ||
180 | |||
181 | 1.4.2 Safety Check | ||
182 | |||
183 | Before optimizing a probe, Kprobes performs the following safety checks: | ||
184 | |||
185 | - Kprobes verifies that the region that will be replaced by the jump | ||
186 | instruction (the "optimized region") lies entirely within one function. | ||
187 | (A jump instruction is multiple bytes, and so may overlay multiple | ||
188 | instructions.) | ||
189 | |||
190 | - Kprobes analyzes the entire function and verifies that there is no | ||
191 | jump into the optimized region. Specifically: | ||
192 | - the function contains no indirect jump; | ||
193 | - the function contains no instruction that causes an exception (since | ||
194 | the fixup code triggered by the exception could jump back into the | ||
195 | optimized region -- Kprobes checks the exception tables to verify this); | ||
196 | and | ||
197 | - there is no near jump to the optimized region (other than to the first | ||
198 | byte). | ||
199 | |||
200 | - For each instruction in the optimized region, Kprobes verifies that | ||
201 | the instruction can be executed out of line. | ||
202 | |||
203 | 1.4.3 Preparing Detour Buffer | ||
204 | |||
205 | Next, Kprobes prepares a "detour" buffer, which contains the following | ||
206 | instruction sequence: | ||
207 | - code to push the CPU's registers (emulating a breakpoint trap) | ||
208 | - a call to the trampoline code which calls user's probe handlers. | ||
209 | - code to restore registers | ||
210 | - the instructions from the optimized region | ||
211 | - a jump back to the original execution path. | ||
212 | |||
213 | 1.4.4 Pre-optimization | ||
214 | |||
215 | After preparing the detour buffer, Kprobes verifies that none of the | ||
216 | following situations exist: | ||
217 | - The probe has either a break_handler (i.e., it's a jprobe) or a | ||
218 | post_handler. | ||
219 | - Other instructions in the optimized region are probed. | ||
220 | - The probe is disabled. | ||
221 | In any of the above cases, Kprobes won't start optimizing the probe. | ||
222 | Since these are temporary situations, Kprobes tries to start | ||
223 | optimizing it again if the situation is changed. | ||
224 | |||
225 | If the kprobe can be optimized, Kprobes enqueues the kprobe to an | ||
226 | optimizing list, and kicks the kprobe-optimizer workqueue to optimize | ||
227 | it. If the to-be-optimized probepoint is hit before being optimized, | ||
228 | Kprobes returns control to the original instruction path by setting | ||
229 | the CPU's instruction pointer to the copied code in the detour buffer | ||
230 | -- thus at least avoiding the single-step. | ||
231 | |||
232 | 1.4.5 Optimization | ||
233 | |||
234 | The Kprobe-optimizer doesn't insert the jump instruction immediately; | ||
235 | rather, it calls synchronize_sched() for safety first, because it's | ||
236 | possible for a CPU to be interrupted in the middle of executing the | ||
237 | optimized region(*). As you know, synchronize_sched() can ensure | ||
238 | that all interruptions that were active when synchronize_sched() | ||
239 | was called are done, but only if CONFIG_PREEMPT=n. So, this version | ||
240 | of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**) | ||
241 | |||
242 | After that, the Kprobe-optimizer calls stop_machine() to replace | ||
243 | the optimized region with a jump instruction to the detour buffer, | ||
244 | using text_poke_smp(). | ||
245 | |||
246 | 1.4.6 Unoptimization | ||
247 | |||
248 | When an optimized kprobe is unregistered, disabled, or blocked by | ||
249 | another kprobe, it will be unoptimized. If this happens before | ||
250 | the optimization is complete, the kprobe is just dequeued from the | ||
251 | optimized list. If the optimization has been done, the jump is | ||
252 | replaced with the original code (except for an int3 breakpoint in | ||
253 | the first byte) by using text_poke_smp(). | ||
254 | |||
255 | (*)Please imagine that the 2nd instruction is interrupted and then | ||
256 | the optimizer replaces the 2nd instruction with the jump *address* | ||
257 | while the interrupt handler is running. When the interrupt | ||
258 | returns to original address, there is no valid instruction, | ||
259 | and it causes an unexpected result. | ||
260 | |||
261 | (**)This optimization-safety checking may be replaced with the | ||
262 | stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y | ||
263 | kernel. | ||
264 | |||
265 | NOTE for geeks: | ||
266 | The jump optimization changes the kprobe's pre_handler behavior. | ||
267 | Without optimization, the pre_handler can change the kernel's execution | ||
268 | path by changing regs->ip and returning 1. However, when the probe | ||
269 | is optimized, that modification is ignored. Thus, if you want to | ||
270 | tweak the kernel's execution path, you need to suppress optimization, | ||
271 | using one of the following techniques: | ||
272 | - Specify an empty function for the kprobe's post_handler or break_handler. | ||
273 | or | ||
274 | - Execute 'sysctl -w debug.kprobes_optimization=n' | ||
275 | |||
164 | 2. Architectures Supported | 276 | 2. Architectures Supported |
165 | 277 | ||
166 | Kprobes, jprobes, and return probes are implemented on the following | 278 | Kprobes, jprobes, and return probes are implemented on the following |
167 | architectures: | 279 | architectures: |
168 | 280 | ||
169 | - i386 | 281 | - i386 (Supports jump optimization) |
170 | - x86_64 (AMD-64, EM64T) | 282 | - x86_64 (AMD-64, EM64T) (Supports jump optimization) |
171 | - ppc64 | 283 | - ppc64 |
172 | - ia64 (Does not support probes on instruction slot1.) | 284 | - ia64 (Does not support probes on instruction slot1.) |
173 | - sparc64 (Return probes not yet implemented.) | 285 | - sparc64 (Return probes not yet implemented.) |
@@ -214,7 +326,7 @@ occurs during execution of kp->pre_handler or kp->post_handler, | |||
214 | or during single-stepping of the probed instruction, Kprobes calls | 326 | or during single-stepping of the probed instruction, Kprobes calls |
215 | kp->fault_handler. Any or all handlers can be NULL. If kp->flags | 327 | kp->fault_handler. Any or all handlers can be NULL. If kp->flags |
216 | is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, | 328 | is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, |
217 | so, it's handlers aren't hit until calling enable_kprobe(kp). | 329 | so, its handlers aren't hit until calling enable_kprobe(kp). |
218 | 330 | ||
219 | NOTE: | 331 | NOTE: |
220 | 1. With the introduction of the "symbol_name" field to struct kprobe, | 332 | 1. With the introduction of the "symbol_name" field to struct kprobe, |
@@ -389,7 +501,10 @@ the probe which has been registered. | |||
389 | 501 | ||
390 | Kprobes allows multiple probes at the same address. Currently, | 502 | Kprobes allows multiple probes at the same address. Currently, |
391 | however, there cannot be multiple jprobes on the same function at | 503 | however, there cannot be multiple jprobes on the same function at |
392 | the same time. | 504 | the same time. Also, a probepoint for which there is a jprobe or |
505 | a post_handler cannot be optimized. So if you install a jprobe, | ||
506 | or a kprobe with a post_handler, at an optimized probepoint, the | ||
507 | probepoint will be unoptimized automatically. | ||
393 | 508 | ||
394 | In general, you can install a probe anywhere in the kernel. | 509 | In general, you can install a probe anywhere in the kernel. |
395 | In particular, you can probe interrupt handlers. Known exceptions | 510 | In particular, you can probe interrupt handlers. Known exceptions |
@@ -453,6 +568,38 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes) | |||
453 | on the x86_64 version of __switch_to(); the registration functions | 568 | on the x86_64 version of __switch_to(); the registration functions |
454 | return -EINVAL. | 569 | return -EINVAL. |
455 | 570 | ||
571 | On x86/x86-64, since the Jump Optimization of Kprobes modifies | ||
572 | instructions widely, there are some limitations to optimization. To | ||
573 | explain it, we introduce some terminology. Imagine a 3-instruction | ||
574 | sequence consisting of a two 2-byte instructions and one 3-byte | ||
575 | instruction. | ||
576 | |||
577 | IA | ||
578 | | | ||
579 | [-2][-1][0][1][2][3][4][5][6][7] | ||
580 | [ins1][ins2][ ins3 ] | ||
581 | [<- DCR ->] | ||
582 | [<- JTPR ->] | ||
583 | |||
584 | ins1: 1st Instruction | ||
585 | ins2: 2nd Instruction | ||
586 | ins3: 3rd Instruction | ||
587 | IA: Insertion Address | ||
588 | JTPR: Jump Target Prohibition Region | ||
589 | DCR: Detoured Code Region | ||
590 | |||
591 | The instructions in DCR are copied to the out-of-line buffer | ||
592 | of the kprobe, because the bytes in DCR are replaced by | ||
593 | a 5-byte jump instruction. So there are several limitations. | ||
594 | |||
595 | a) The instructions in DCR must be relocatable. | ||
596 | b) The instructions in DCR must not include a call instruction. | ||
597 | c) JTPR must not be targeted by any jump or call instruction. | ||
598 | d) DCR must not straddle the border betweeen functions. | ||
599 | |||
600 | Anyway, these limitations are checked by the in-kernel instruction | ||
601 | decoder, so you don't need to worry about that. | ||
602 | |||
456 | 6. Probe Overhead | 603 | 6. Probe Overhead |
457 | 604 | ||
458 | On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 | 605 | On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 |
@@ -476,6 +623,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07 | |||
476 | ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) | 623 | ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) |
477 | k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 | 624 | k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 |
478 | 625 | ||
626 | 6.1 Optimized Probe Overhead | ||
627 | |||
628 | Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to | ||
629 | process. Here are sample overhead figures (in usec) for x86 architectures. | ||
630 | k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe, | ||
631 | r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe. | ||
632 | |||
633 | i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips | ||
634 | k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33 | ||
635 | |||
636 | x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips | ||
637 | k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30 | ||
638 | |||
479 | 7. TODO | 639 | 7. TODO |
480 | 640 | ||
481 | a. SystemTap (http://sourceware.org/systemtap): Provides a simplified | 641 | a. SystemTap (http://sourceware.org/systemtap): Provides a simplified |
@@ -523,7 +683,8 @@ is also specified. Following columns show probe status. If the probe is on | |||
523 | a virtual address that is no longer valid (module init sections, module | 683 | a virtual address that is no longer valid (module init sections, module |
524 | virtual addresses that correspond to modules that've been unloaded), | 684 | virtual addresses that correspond to modules that've been unloaded), |
525 | such probes are marked with [GONE]. If the probe is temporarily disabled, | 685 | such probes are marked with [GONE]. If the probe is temporarily disabled, |
526 | such probes are marked with [DISABLED]. | 686 | such probes are marked with [DISABLED]. If the probe is optimized, it is |
687 | marked with [OPTIMIZED]. | ||
527 | 688 | ||
528 | /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. | 689 | /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. |
529 | 690 | ||
@@ -533,3 +694,19 @@ registered probes will be disarmed, till such time a "1" is echoed to this | |||
533 | file. Note that this knob just disarms and arms all kprobes and doesn't | 694 | file. Note that this knob just disarms and arms all kprobes and doesn't |
534 | change each probe's disabling state. This means that disabled kprobes (marked | 695 | change each probe's disabling state. This means that disabled kprobes (marked |
535 | [DISABLED]) will be not enabled if you turn ON all kprobes by this knob. | 696 | [DISABLED]) will be not enabled if you turn ON all kprobes by this knob. |
697 | |||
698 | |||
699 | Appendix B: The kprobes sysctl interface | ||
700 | |||
701 | /proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF. | ||
702 | |||
703 | When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides | ||
704 | a knob to globally and forcibly turn jump optimization (see section | ||
705 | 1.4) ON or OFF. By default, jump optimization is allowed (ON). | ||
706 | If you echo "0" to this file or set "debug.kprobes_optimization" to | ||
707 | 0 via sysctl, all optimized probes will be unoptimized, and any new | ||
708 | probes registered after that will not be optimized. Note that this | ||
709 | knob *changes* the optimized state. This means that optimized probes | ||
710 | (marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be | ||
711 | removed). If the knob is turned on, they will be optimized again. | ||
712 | |||
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 2811e452f756..a237518e51b9 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt | |||
@@ -23,12 +23,12 @@ of a virtual machine. The ioctls belong to three classes | |||
23 | Only run vcpu ioctls from the same thread that was used to create the | 23 | Only run vcpu ioctls from the same thread that was used to create the |
24 | vcpu. | 24 | vcpu. |
25 | 25 | ||
26 | 2. File descritpors | 26 | 2. File descriptors |
27 | 27 | ||
28 | The kvm API is centered around file descriptors. An initial | 28 | The kvm API is centered around file descriptors. An initial |
29 | open("/dev/kvm") obtains a handle to the kvm subsystem; this handle | 29 | open("/dev/kvm") obtains a handle to the kvm subsystem; this handle |
30 | can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this | 30 | can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this |
31 | handle will create a VM file descripror which can be used to issue VM | 31 | handle will create a VM file descriptor which can be used to issue VM |
32 | ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu | 32 | ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu |
33 | and return a file descriptor pointing to it. Finally, ioctls on a vcpu | 33 | and return a file descriptor pointing to it. Finally, ioctls on a vcpu |
34 | fd can be used to control the vcpu, including the important task of | 34 | fd can be used to control the vcpu, including the important task of |
@@ -643,7 +643,7 @@ Type: vm ioctl | |||
643 | Parameters: struct kvm_clock_data (in) | 643 | Parameters: struct kvm_clock_data (in) |
644 | Returns: 0 on success, -1 on error | 644 | Returns: 0 on success, -1 on error |
645 | 645 | ||
646 | Sets the current timestamp of kvmclock to the valued specific in its parameter. | 646 | Sets the current timestamp of kvmclock to the value specified in its parameter. |
647 | In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios | 647 | In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios |
648 | such as migration. | 648 | such as migration. |
649 | 649 | ||
@@ -656,6 +656,7 @@ struct kvm_clock_data { | |||
656 | 4.29 KVM_GET_VCPU_EVENTS | 656 | 4.29 KVM_GET_VCPU_EVENTS |
657 | 657 | ||
658 | Capability: KVM_CAP_VCPU_EVENTS | 658 | Capability: KVM_CAP_VCPU_EVENTS |
659 | Extended by: KVM_CAP_INTR_SHADOW | ||
659 | Architectures: x86 | 660 | Architectures: x86 |
660 | Type: vm ioctl | 661 | Type: vm ioctl |
661 | Parameters: struct kvm_vcpu_event (out) | 662 | Parameters: struct kvm_vcpu_event (out) |
@@ -676,7 +677,7 @@ struct kvm_vcpu_events { | |||
676 | __u8 injected; | 677 | __u8 injected; |
677 | __u8 nr; | 678 | __u8 nr; |
678 | __u8 soft; | 679 | __u8 soft; |
679 | __u8 pad; | 680 | __u8 shadow; |
680 | } interrupt; | 681 | } interrupt; |
681 | struct { | 682 | struct { |
682 | __u8 injected; | 683 | __u8 injected; |
@@ -688,9 +689,13 @@ struct kvm_vcpu_events { | |||
688 | __u32 flags; | 689 | __u32 flags; |
689 | }; | 690 | }; |
690 | 691 | ||
692 | KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that | ||
693 | interrupt.shadow contains a valid state. Otherwise, this field is undefined. | ||
694 | |||
691 | 4.30 KVM_SET_VCPU_EVENTS | 695 | 4.30 KVM_SET_VCPU_EVENTS |
692 | 696 | ||
693 | Capability: KVM_CAP_VCPU_EVENTS | 697 | Capability: KVM_CAP_VCPU_EVENTS |
698 | Extended by: KVM_CAP_INTR_SHADOW | ||
694 | Architectures: x86 | 699 | Architectures: x86 |
695 | Type: vm ioctl | 700 | Type: vm ioctl |
696 | Parameters: struct kvm_vcpu_event (in) | 701 | Parameters: struct kvm_vcpu_event (in) |
@@ -709,6 +714,183 @@ current in-kernel state. The bits are: | |||
709 | KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel | 714 | KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel |
710 | KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector | 715 | KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector |
711 | 716 | ||
717 | If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in | ||
718 | the flags field to signal that interrupt.shadow contains a valid state and | ||
719 | shall be written into the VCPU. | ||
720 | |||
721 | 4.32 KVM_GET_DEBUGREGS | ||
722 | |||
723 | Capability: KVM_CAP_DEBUGREGS | ||
724 | Architectures: x86 | ||
725 | Type: vm ioctl | ||
726 | Parameters: struct kvm_debugregs (out) | ||
727 | Returns: 0 on success, -1 on error | ||
728 | |||
729 | Reads debug registers from the vcpu. | ||
730 | |||
731 | struct kvm_debugregs { | ||
732 | __u64 db[4]; | ||
733 | __u64 dr6; | ||
734 | __u64 dr7; | ||
735 | __u64 flags; | ||
736 | __u64 reserved[9]; | ||
737 | }; | ||
738 | |||
739 | 4.33 KVM_SET_DEBUGREGS | ||
740 | |||
741 | Capability: KVM_CAP_DEBUGREGS | ||
742 | Architectures: x86 | ||
743 | Type: vm ioctl | ||
744 | Parameters: struct kvm_debugregs (in) | ||
745 | Returns: 0 on success, -1 on error | ||
746 | |||
747 | Writes debug registers into the vcpu. | ||
748 | |||
749 | See KVM_GET_DEBUGREGS for the data structure. The flags field is unused | ||
750 | yet and must be cleared on entry. | ||
751 | |||
752 | 4.34 KVM_SET_USER_MEMORY_REGION | ||
753 | |||
754 | Capability: KVM_CAP_USER_MEM | ||
755 | Architectures: all | ||
756 | Type: vm ioctl | ||
757 | Parameters: struct kvm_userspace_memory_region (in) | ||
758 | Returns: 0 on success, -1 on error | ||
759 | |||
760 | struct kvm_userspace_memory_region { | ||
761 | __u32 slot; | ||
762 | __u32 flags; | ||
763 | __u64 guest_phys_addr; | ||
764 | __u64 memory_size; /* bytes */ | ||
765 | __u64 userspace_addr; /* start of the userspace allocated memory */ | ||
766 | }; | ||
767 | |||
768 | /* for kvm_memory_region::flags */ | ||
769 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | ||
770 | |||
771 | This ioctl allows the user to create or modify a guest physical memory | ||
772 | slot. When changing an existing slot, it may be moved in the guest | ||
773 | physical memory space, or its flags may be modified. It may not be | ||
774 | resized. Slots may not overlap in guest physical address space. | ||
775 | |||
776 | Memory for the region is taken starting at the address denoted by the | ||
777 | field userspace_addr, which must point at user addressable memory for | ||
778 | the entire memory slot size. Any object may back this memory, including | ||
779 | anonymous memory, ordinary files, and hugetlbfs. | ||
780 | |||
781 | It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr | ||
782 | be identical. This allows large pages in the guest to be backed by large | ||
783 | pages in the host. | ||
784 | |||
785 | The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which | ||
786 | instructs kvm to keep track of writes to memory within the slot. See | ||
787 | the KVM_GET_DIRTY_LOG ioctl. | ||
788 | |||
789 | When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory | ||
790 | region are automatically reflected into the guest. For example, an mmap() | ||
791 | that affects the region will be made visible immediately. Another example | ||
792 | is madvise(MADV_DROP). | ||
793 | |||
794 | It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. | ||
795 | The KVM_SET_MEMORY_REGION does not allow fine grained control over memory | ||
796 | allocation and is deprecated. | ||
797 | |||
798 | 4.35 KVM_SET_TSS_ADDR | ||
799 | |||
800 | Capability: KVM_CAP_SET_TSS_ADDR | ||
801 | Architectures: x86 | ||
802 | Type: vm ioctl | ||
803 | Parameters: unsigned long tss_address (in) | ||
804 | Returns: 0 on success, -1 on error | ||
805 | |||
806 | This ioctl defines the physical address of a three-page region in the guest | ||
807 | physical address space. The region must be within the first 4GB of the | ||
808 | guest physical address space and must not conflict with any memory slot | ||
809 | or any mmio address. The guest may malfunction if it accesses this memory | ||
810 | region. | ||
811 | |||
812 | This ioctl is required on Intel-based hosts. This is needed on Intel hardware | ||
813 | because of a quirk in the virtualization implementation (see the internals | ||
814 | documentation when it pops into existence). | ||
815 | |||
816 | 4.36 KVM_ENABLE_CAP | ||
817 | |||
818 | Capability: KVM_CAP_ENABLE_CAP | ||
819 | Architectures: ppc | ||
820 | Type: vcpu ioctl | ||
821 | Parameters: struct kvm_enable_cap (in) | ||
822 | Returns: 0 on success; -1 on error | ||
823 | |||
824 | +Not all extensions are enabled by default. Using this ioctl the application | ||
825 | can enable an extension, making it available to the guest. | ||
826 | |||
827 | On systems that do not support this ioctl, it always fails. On systems that | ||
828 | do support it, it only works for extensions that are supported for enablement. | ||
829 | |||
830 | To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should | ||
831 | be used. | ||
832 | |||
833 | struct kvm_enable_cap { | ||
834 | /* in */ | ||
835 | __u32 cap; | ||
836 | |||
837 | The capability that is supposed to get enabled. | ||
838 | |||
839 | __u32 flags; | ||
840 | |||
841 | A bitfield indicating future enhancements. Has to be 0 for now. | ||
842 | |||
843 | __u64 args[4]; | ||
844 | |||
845 | Arguments for enabling a feature. If a feature needs initial values to | ||
846 | function properly, this is the place to put them. | ||
847 | |||
848 | __u8 pad[64]; | ||
849 | }; | ||
850 | |||
851 | 4.37 KVM_GET_MP_STATE | ||
852 | |||
853 | Capability: KVM_CAP_MP_STATE | ||
854 | Architectures: x86, ia64 | ||
855 | Type: vcpu ioctl | ||
856 | Parameters: struct kvm_mp_state (out) | ||
857 | Returns: 0 on success; -1 on error | ||
858 | |||
859 | struct kvm_mp_state { | ||
860 | __u32 mp_state; | ||
861 | }; | ||
862 | |||
863 | Returns the vcpu's current "multiprocessing state" (though also valid on | ||
864 | uniprocessor guests). | ||
865 | |||
866 | Possible values are: | ||
867 | |||
868 | - KVM_MP_STATE_RUNNABLE: the vcpu is currently running | ||
869 | - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) | ||
870 | which has not yet received an INIT signal | ||
871 | - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is | ||
872 | now ready for a SIPI | ||
873 | - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and | ||
874 | is waiting for an interrupt | ||
875 | - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector | ||
876 | accesible via KVM_GET_VCPU_EVENTS) | ||
877 | |||
878 | This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel | ||
879 | irqchip, the multiprocessing state must be maintained by userspace. | ||
880 | |||
881 | 4.38 KVM_SET_MP_STATE | ||
882 | |||
883 | Capability: KVM_CAP_MP_STATE | ||
884 | Architectures: x86, ia64 | ||
885 | Type: vcpu ioctl | ||
886 | Parameters: struct kvm_mp_state (in) | ||
887 | Returns: 0 on success; -1 on error | ||
888 | |||
889 | Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for | ||
890 | arguments. | ||
891 | |||
892 | This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel | ||
893 | irqchip, the multiprocessing state must be maintained by userspace. | ||
712 | 894 | ||
713 | 5. The kvm_run structure | 895 | 5. The kvm_run structure |
714 | 896 | ||
@@ -795,11 +977,11 @@ Unused. | |||
795 | __u64 data_offset; /* relative to kvm_run start */ | 977 | __u64 data_offset; /* relative to kvm_run start */ |
796 | } io; | 978 | } io; |
797 | 979 | ||
798 | If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has | 980 | If exit_reason is KVM_EXIT_IO, then the vcpu has |
799 | executed a port I/O instruction which could not be satisfied by kvm. | 981 | executed a port I/O instruction which could not be satisfied by kvm. |
800 | data_offset describes where the data is located (KVM_EXIT_IO_OUT) or | 982 | data_offset describes where the data is located (KVM_EXIT_IO_OUT) or |
801 | where kvm expects application code to place the data for the next | 983 | where kvm expects application code to place the data for the next |
802 | KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array. | 984 | KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. |
803 | 985 | ||
804 | struct { | 986 | struct { |
805 | struct kvm_debug_exit_arch arch; | 987 | struct kvm_debug_exit_arch arch; |
@@ -815,11 +997,18 @@ Unused. | |||
815 | __u8 is_write; | 997 | __u8 is_write; |
816 | } mmio; | 998 | } mmio; |
817 | 999 | ||
818 | If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has | 1000 | If exit_reason is KVM_EXIT_MMIO, then the vcpu has |
819 | executed a memory-mapped I/O instruction which could not be satisfied | 1001 | executed a memory-mapped I/O instruction which could not be satisfied |
820 | by kvm. The 'data' member contains the written data if 'is_write' is | 1002 | by kvm. The 'data' member contains the written data if 'is_write' is |
821 | true, and should be filled by application code otherwise. | 1003 | true, and should be filled by application code otherwise. |
822 | 1004 | ||
1005 | NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding | ||
1006 | operations are complete (and guest state is consistent) only after userspace | ||
1007 | has re-entered the kernel with KVM_RUN. The kernel side will first finish | ||
1008 | incomplete operations and then check for pending signals. Userspace | ||
1009 | can re-enter the guest with an unmasked signal pending to complete | ||
1010 | pending operations. | ||
1011 | |||
823 | /* KVM_EXIT_HYPERCALL */ | 1012 | /* KVM_EXIT_HYPERCALL */ |
824 | struct { | 1013 | struct { |
825 | __u64 nr; | 1014 | __u64 nr; |
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise. | |||
829 | __u32 pad; | 1018 | __u32 pad; |
830 | } hypercall; | 1019 | } hypercall; |
831 | 1020 | ||
832 | Unused. | 1021 | Unused. This was once used for 'hypercall to userspace'. To implement |
1022 | such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). | ||
1023 | Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. | ||
833 | 1024 | ||
834 | /* KVM_EXIT_TPR_ACCESS */ | 1025 | /* KVM_EXIT_TPR_ACCESS */ |
835 | struct { | 1026 | struct { |
@@ -870,6 +1061,19 @@ s390 specific. | |||
870 | 1061 | ||
871 | powerpc specific. | 1062 | powerpc specific. |
872 | 1063 | ||
1064 | /* KVM_EXIT_OSI */ | ||
1065 | struct { | ||
1066 | __u64 gprs[32]; | ||
1067 | } osi; | ||
1068 | |||
1069 | MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch | ||
1070 | hypercalls and exit with this exit struct that contains all the guest gprs. | ||
1071 | |||
1072 | If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. | ||
1073 | Userspace can now handle the hypercall and when it's done modify the gprs as | ||
1074 | necessary. Upon guest entry all guest GPRs will then be replaced by the values | ||
1075 | in this struct. | ||
1076 | |||
873 | /* Fix the size of the union. */ | 1077 | /* Fix the size of the union. */ |
874 | char padding[256]; | 1078 | char padding[256]; |
875 | }; | 1079 | }; |
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt new file mode 100644 index 000000000000..14a12ea92b7f --- /dev/null +++ b/Documentation/kvm/cpuid.txt | |||
@@ -0,0 +1,42 @@ | |||
1 | KVM CPUID bits | ||
2 | Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010 | ||
3 | ===================================================== | ||
4 | |||
5 | A guest running on a kvm host, can check some of its features using | ||
6 | cpuid. This is not always guaranteed to work, since userspace can | ||
7 | mask-out some, or even all KVM-related cpuid features before launching | ||
8 | a guest. | ||
9 | |||
10 | KVM cpuid functions are: | ||
11 | |||
12 | function: KVM_CPUID_SIGNATURE (0x40000000) | ||
13 | returns : eax = 0, | ||
14 | ebx = 0x4b4d564b, | ||
15 | ecx = 0x564b4d56, | ||
16 | edx = 0x4d. | ||
17 | Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". | ||
18 | This function queries the presence of KVM cpuid leafs. | ||
19 | |||
20 | |||
21 | function: define KVM_CPUID_FEATURES (0x40000001) | ||
22 | returns : ebx, ecx, edx = 0 | ||
23 | eax = and OR'ed group of (1 << flag), where each flags is: | ||
24 | |||
25 | |||
26 | flag || value || meaning | ||
27 | ============================================================================= | ||
28 | KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs | ||
29 | || || 0x11 and 0x12. | ||
30 | ------------------------------------------------------------------------------ | ||
31 | KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays | ||
32 | || || on PIO operations. | ||
33 | ------------------------------------------------------------------------------ | ||
34 | KVM_FEATURE_MMU_OP || 2 || deprecated. | ||
35 | ------------------------------------------------------------------------------ | ||
36 | KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs | ||
37 | || || 0x4b564d00 and 0x4b564d01 | ||
38 | ------------------------------------------------------------------------------ | ||
39 | KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side | ||
40 | || || per-cpu warps are expected in | ||
41 | || || kvmclock. | ||
42 | ------------------------------------------------------------------------------ | ||
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt new file mode 100644 index 000000000000..aaed6ab9d7ab --- /dev/null +++ b/Documentation/kvm/mmu.txt | |||
@@ -0,0 +1,304 @@ | |||
1 | The x86 kvm shadow mmu | ||
2 | ====================== | ||
3 | |||
4 | The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible | ||
5 | for presenting a standard x86 mmu to the guest, while translating guest | ||
6 | physical addresses to host physical addresses. | ||
7 | |||
8 | The mmu code attempts to satisfy the following requirements: | ||
9 | |||
10 | - correctness: the guest should not be able to determine that it is running | ||
11 | on an emulated mmu except for timing (we attempt to comply | ||
12 | with the specification, not emulate the characteristics of | ||
13 | a particular implementation such as tlb size) | ||
14 | - security: the guest must not be able to touch host memory not assigned | ||
15 | to it | ||
16 | - performance: minimize the performance penalty imposed by the mmu | ||
17 | - scaling: need to scale to large memory and large vcpu guests | ||
18 | - hardware: support the full range of x86 virtualization hardware | ||
19 | - integration: Linux memory management code must be in control of guest memory | ||
20 | so that swapping, page migration, page merging, transparent | ||
21 | hugepages, and similar features work without change | ||
22 | - dirty tracking: report writes to guest memory to enable live migration | ||
23 | and framebuffer-based displays | ||
24 | - footprint: keep the amount of pinned kernel memory low (most memory | ||
25 | should be shrinkable) | ||
26 | - reliablity: avoid multipage or GFP_ATOMIC allocations | ||
27 | |||
28 | Acronyms | ||
29 | ======== | ||
30 | |||
31 | pfn host page frame number | ||
32 | hpa host physical address | ||
33 | hva host virtual address | ||
34 | gfn guest frame number | ||
35 | gpa guest physical address | ||
36 | gva guest virtual address | ||
37 | ngpa nested guest physical address | ||
38 | ngva nested guest virtual address | ||
39 | pte page table entry (used also to refer generically to paging structure | ||
40 | entries) | ||
41 | gpte guest pte (referring to gfns) | ||
42 | spte shadow pte (referring to pfns) | ||
43 | tdp two dimensional paging (vendor neutral term for NPT and EPT) | ||
44 | |||
45 | Virtual and real hardware supported | ||
46 | =================================== | ||
47 | |||
48 | The mmu supports first-generation mmu hardware, which allows an atomic switch | ||
49 | of the current paging mode and cr3 during guest entry, as well as | ||
50 | two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware | ||
51 | it exposes is the traditional 2/3/4 level x86 mmu, with support for global | ||
52 | pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support | ||
53 | exposing NPT capable hardware on NPT capable hosts. | ||
54 | |||
55 | Translation | ||
56 | =========== | ||
57 | |||
58 | The primary job of the mmu is to program the processor's mmu to translate | ||
59 | addresses for the guest. Different translations are required at different | ||
60 | times: | ||
61 | |||
62 | - when guest paging is disabled, we translate guest physical addresses to | ||
63 | host physical addresses (gpa->hpa) | ||
64 | - when guest paging is enabled, we translate guest virtual addresses, to | ||
65 | guest physical addresses, to host physical addresses (gva->gpa->hpa) | ||
66 | - when the guest launches a guest of its own, we translate nested guest | ||
67 | virtual addresses, to nested guest physical addresses, to guest physical | ||
68 | addresses, to host physical addresses (ngva->ngpa->gpa->hpa) | ||
69 | |||
70 | The primary challenge is to encode between 1 and 3 translations into hardware | ||
71 | that support only 1 (traditional) and 2 (tdp) translations. When the | ||
72 | number of required translations matches the hardware, the mmu operates in | ||
73 | direct mode; otherwise it operates in shadow mode (see below). | ||
74 | |||
75 | Memory | ||
76 | ====== | ||
77 | |||
78 | Guest memory (gpa) is part of the user address space of the process that is | ||
79 | using kvm. Userspace defines the translation between guest addresses and user | ||
80 | addresses (gpa->hva); note that two gpas may alias to the same gva, but not | ||
81 | vice versa. | ||
82 | |||
83 | These gvas may be backed using any method available to the host: anonymous | ||
84 | memory, file backed memory, and device memory. Memory might be paged by the | ||
85 | host at any time. | ||
86 | |||
87 | Events | ||
88 | ====== | ||
89 | |||
90 | The mmu is driven by events, some from the guest, some from the host. | ||
91 | |||
92 | Guest generated events: | ||
93 | - writes to control registers (especially cr3) | ||
94 | - invlpg/invlpga instruction execution | ||
95 | - access to missing or protected translations | ||
96 | |||
97 | Host generated events: | ||
98 | - changes in the gpa->hpa translation (either through gpa->hva changes or | ||
99 | through hva->hpa changes) | ||
100 | - memory pressure (the shrinker) | ||
101 | |||
102 | Shadow pages | ||
103 | ============ | ||
104 | |||
105 | The principal data structure is the shadow page, 'struct kvm_mmu_page'. A | ||
106 | shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A | ||
107 | shadow page may contain a mix of leaf and nonleaf sptes. | ||
108 | |||
109 | A nonleaf spte allows the hardware mmu to reach the leaf pages and | ||
110 | is not related to a translation directly. It points to other shadow pages. | ||
111 | |||
112 | A leaf spte corresponds to either one or two translations encoded into | ||
113 | one paging structure entry. These are always the lowest level of the | ||
114 | translation stack, with optional higher level translations left to NPT/EPT. | ||
115 | Leaf ptes point at guest pages. | ||
116 | |||
117 | The following table shows translations encoded by leaf ptes, with higher-level | ||
118 | translations in parentheses: | ||
119 | |||
120 | Non-nested guests: | ||
121 | nonpaging: gpa->hpa | ||
122 | paging: gva->gpa->hpa | ||
123 | paging, tdp: (gva->)gpa->hpa | ||
124 | Nested guests: | ||
125 | non-tdp: ngva->gpa->hpa (*) | ||
126 | tdp: (ngva->)ngpa->gpa->hpa | ||
127 | |||
128 | (*) the guest hypervisor will encode the ngva->gpa translation into its page | ||
129 | tables if npt is not present | ||
130 | |||
131 | Shadow pages contain the following information: | ||
132 | role.level: | ||
133 | The level in the shadow paging hierarchy that this shadow page belongs to. | ||
134 | 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. | ||
135 | role.direct: | ||
136 | If set, leaf sptes reachable from this page are for a linear range. | ||
137 | Examples include real mode translation, large guest pages backed by small | ||
138 | host pages, and gpa->hpa translations when NPT or EPT is active. | ||
139 | The linear range starts at (gfn << PAGE_SHIFT) and its size is determined | ||
140 | by role.level (2MB for first level, 1GB for second level, 0.5TB for third | ||
141 | level, 256TB for fourth level) | ||
142 | If clear, this page corresponds to a guest page table denoted by the gfn | ||
143 | field. | ||
144 | role.quadrant: | ||
145 | When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit | ||
146 | sptes. That means a guest page table contains more ptes than the host, | ||
147 | so multiple shadow pages are needed to shadow one guest page. | ||
148 | For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the | ||
149 | first or second 512-gpte block in the guest page table. For second-level | ||
150 | page tables, each 32-bit gpte is converted to two 64-bit sptes | ||
151 | (since each first-level guest page is shadowed by two first-level | ||
152 | shadow pages) so role.quadrant takes values in the range 0..3. Each | ||
153 | quadrant maps 1GB virtual address space. | ||
154 | role.access: | ||
155 | Inherited guest access permissions in the form uwx. Note execute | ||
156 | permission is positive, not negative. | ||
157 | role.invalid: | ||
158 | The page is invalid and should not be used. It is a root page that is | ||
159 | currently pinned (by a cpu hardware register pointing to it); once it is | ||
160 | unpinned it will be destroyed. | ||
161 | role.cr4_pae: | ||
162 | Contains the value of cr4.pae for which the page is valid (e.g. whether | ||
163 | 32-bit or 64-bit gptes are in use). | ||
164 | role.cr4_nxe: | ||
165 | Contains the value of efer.nxe for which the page is valid. | ||
166 | role.cr0_wp: | ||
167 | Contains the value of cr0.wp for which the page is valid. | ||
168 | gfn: | ||
169 | Either the guest page table containing the translations shadowed by this | ||
170 | page, or the base page frame for linear translations. See role.direct. | ||
171 | spt: | ||
172 | A pageful of 64-bit sptes containing the translations for this page. | ||
173 | Accessed by both kvm and hardware. | ||
174 | The page pointed to by spt will have its page->private pointing back | ||
175 | at the shadow page structure. | ||
176 | sptes in spt point either at guest pages, or at lower-level shadow pages. | ||
177 | Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point | ||
178 | at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. | ||
179 | The spt array forms a DAG structure with the shadow page as a node, and | ||
180 | guest pages as leaves. | ||
181 | gfns: | ||
182 | An array of 512 guest frame numbers, one for each present pte. Used to | ||
183 | perform a reverse map from a pte to a gfn. | ||
184 | slot_bitmap: | ||
185 | A bitmap containing one bit per memory slot. If the page contains a pte | ||
186 | mapping a page from memory slot n, then bit n of slot_bitmap will be set | ||
187 | (if a page is aliased among several slots, then it is not guaranteed that | ||
188 | all slots will be marked). | ||
189 | Used during dirty logging to avoid scanning a shadow page if none if its | ||
190 | pages need tracking. | ||
191 | root_count: | ||
192 | A counter keeping track of how many hardware registers (guest cr3 or | ||
193 | pdptrs) are now pointing at the page. While this counter is nonzero, the | ||
194 | page cannot be destroyed. See role.invalid. | ||
195 | multimapped: | ||
196 | Whether there exist multiple sptes pointing at this page. | ||
197 | parent_pte/parent_ptes: | ||
198 | If multimapped is zero, parent_pte points at the single spte that points at | ||
199 | this page's spt. Otherwise, parent_ptes points at a data structure | ||
200 | with a list of parent_ptes. | ||
201 | unsync: | ||
202 | If true, then the translations in this page may not match the guest's | ||
203 | translation. This is equivalent to the state of the tlb when a pte is | ||
204 | changed but before the tlb entry is flushed. Accordingly, unsync ptes | ||
205 | are synchronized when the guest executes invlpg or flushes its tlb by | ||
206 | other means. Valid for leaf pages. | ||
207 | unsync_children: | ||
208 | How many sptes in the page point at pages that are unsync (or have | ||
209 | unsynchronized children). | ||
210 | unsync_child_bitmap: | ||
211 | A bitmap indicating which sptes in spt point (directly or indirectly) at | ||
212 | pages that may be unsynchronized. Used to quickly locate all unsychronized | ||
213 | pages reachable from a given page. | ||
214 | |||
215 | Reverse map | ||
216 | =========== | ||
217 | |||
218 | The mmu maintains a reverse mapping whereby all ptes mapping a page can be | ||
219 | reached given its gfn. This is used, for example, when swapping out a page. | ||
220 | |||
221 | Synchronized and unsynchronized pages | ||
222 | ===================================== | ||
223 | |||
224 | The guest uses two events to synchronize its tlb and page tables: tlb flushes | ||
225 | and page invalidations (invlpg). | ||
226 | |||
227 | A tlb flush means that we need to synchronize all sptes reachable from the | ||
228 | guest's cr3. This is expensive, so we keep all guest page tables write | ||
229 | protected, and synchronize sptes to gptes when a gpte is written. | ||
230 | |||
231 | A special case is when a guest page table is reachable from the current | ||
232 | guest cr3. In this case, the guest is obliged to issue an invlpg instruction | ||
233 | before using the translation. We take advantage of that by removing write | ||
234 | protection from the guest page, and allowing the guest to modify it freely. | ||
235 | We synchronize modified gptes when the guest invokes invlpg. This reduces | ||
236 | the amount of emulation we have to do when the guest modifies multiple gptes, | ||
237 | or when the a guest page is no longer used as a page table and is used for | ||
238 | random guest data. | ||
239 | |||
240 | As a side effect we have to resynchronize all reachable unsynchronized shadow | ||
241 | pages on a tlb flush. | ||
242 | |||
243 | |||
244 | Reaction to events | ||
245 | ================== | ||
246 | |||
247 | - guest page fault (or npt page fault, or ept violation) | ||
248 | |||
249 | This is the most complicated event. The cause of a page fault can be: | ||
250 | |||
251 | - a true guest fault (the guest translation won't allow the access) (*) | ||
252 | - access to a missing translation | ||
253 | - access to a protected translation | ||
254 | - when logging dirty pages, memory is write protected | ||
255 | - synchronized shadow pages are write protected (*) | ||
256 | - access to untranslatable memory (mmio) | ||
257 | |||
258 | (*) not applicable in direct mode | ||
259 | |||
260 | Handling a page fault is performed as follows: | ||
261 | |||
262 | - if needed, walk the guest page tables to determine the guest translation | ||
263 | (gva->gpa or ngpa->gpa) | ||
264 | - if permissions are insufficient, reflect the fault back to the guest | ||
265 | - determine the host page | ||
266 | - if this is an mmio request, there is no host page; call the emulator | ||
267 | to emulate the instruction instead | ||
268 | - walk the shadow page table to find the spte for the translation, | ||
269 | instantiating missing intermediate page tables as necessary | ||
270 | - try to unsynchronize the page | ||
271 | - if successful, we can let the guest continue and modify the gpte | ||
272 | - emulate the instruction | ||
273 | - if failed, unshadow the page and let the guest continue | ||
274 | - update any translations that were modified by the instruction | ||
275 | |||
276 | invlpg handling: | ||
277 | |||
278 | - walk the shadow page hierarchy and drop affected translations | ||
279 | - try to reinstantiate the indicated translation in the hope that the | ||
280 | guest will use it in the near future | ||
281 | |||
282 | Guest control register updates: | ||
283 | |||
284 | - mov to cr3 | ||
285 | - look up new shadow roots | ||
286 | - synchronize newly reachable shadow pages | ||
287 | |||
288 | - mov to cr0/cr4/efer | ||
289 | - set up mmu context for new paging mode | ||
290 | - look up new shadow roots | ||
291 | - synchronize newly reachable shadow pages | ||
292 | |||
293 | Host translation updates: | ||
294 | |||
295 | - mmu notifier called with updated hva | ||
296 | - look up affected sptes through reverse map | ||
297 | - drop (or update) translations | ||
298 | |||
299 | Further reading | ||
300 | =============== | ||
301 | |||
302 | - NPT presentation from KVM Forum 2008 | ||
303 | http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf | ||
304 | |||
diff --git a/Documentation/laptops/00-INDEX b/Documentation/laptops/00-INDEX index ee5692b26dd4..fa688538e757 100644 --- a/Documentation/laptops/00-INDEX +++ b/Documentation/laptops/00-INDEX | |||
@@ -2,6 +2,12 @@ | |||
2 | - This file | 2 | - This file |
3 | acer-wmi.txt | 3 | acer-wmi.txt |
4 | - information on the Acer Laptop WMI Extras driver. | 4 | - information on the Acer Laptop WMI Extras driver. |
5 | asus-laptop.txt | ||
6 | - information on the Asus Laptop Extras driver. | ||
7 | disk-shock-protection.txt | ||
8 | - information on hard disk shock protection. | ||
9 | dslm.c | ||
10 | - Simple Disk Sleep Monitor program | ||
5 | laptop-mode.txt | 11 | laptop-mode.txt |
6 | - how to conserve battery power using laptop-mode. | 12 | - how to conserve battery power using laptop-mode. |
7 | sony-laptop.txt | 13 | sony-laptop.txt |
diff --git a/Documentation/laptops/Makefile b/Documentation/laptops/Makefile new file mode 100644 index 000000000000..5cb144af3c09 --- /dev/null +++ b/Documentation/laptops/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # kbuild trick to avoid linker error. Can be omitted if a module is built. | ||
2 | obj- := dummy.o | ||
3 | |||
4 | # List of programs to build | ||
5 | hostprogs-y := dslm | ||
6 | |||
7 | # Tell kbuild to always build the programs | ||
8 | always := $(hostprogs-y) | ||
diff --git a/Documentation/laptops/dslm.c b/Documentation/laptops/dslm.c new file mode 100644 index 000000000000..72ff290c5fc6 --- /dev/null +++ b/Documentation/laptops/dslm.c | |||
@@ -0,0 +1,166 @@ | |||
1 | /* | ||
2 | * dslm.c | ||
3 | * Simple Disk Sleep Monitor | ||
4 | * by Bartek Kania | ||
5 | * Licenced under the GPL | ||
6 | */ | ||
7 | #include <unistd.h> | ||
8 | #include <stdlib.h> | ||
9 | #include <stdio.h> | ||
10 | #include <fcntl.h> | ||
11 | #include <errno.h> | ||
12 | #include <time.h> | ||
13 | #include <string.h> | ||
14 | #include <signal.h> | ||
15 | #include <sys/ioctl.h> | ||
16 | #include <linux/hdreg.h> | ||
17 | |||
18 | #ifdef DEBUG | ||
19 | #define D(x) x | ||
20 | #else | ||
21 | #define D(x) | ||
22 | #endif | ||
23 | |||
24 | int endit = 0; | ||
25 | |||
26 | /* Check if the disk is in powersave-mode | ||
27 | * Most of the code is stolen from hdparm. | ||
28 | * 1 = active, 0 = standby/sleep, -1 = unknown */ | ||
29 | static int check_powermode(int fd) | ||
30 | { | ||
31 | unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0}; | ||
32 | int state; | ||
33 | |||
34 | if (ioctl(fd, HDIO_DRIVE_CMD, &args) | ||
35 | && (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */ | ||
36 | && ioctl(fd, HDIO_DRIVE_CMD, &args)) { | ||
37 | if (errno != EIO || args[0] != 0 || args[1] != 0) { | ||
38 | state = -1; /* "unknown"; */ | ||
39 | } else | ||
40 | state = 0; /* "sleeping"; */ | ||
41 | } else { | ||
42 | state = (args[2] == 255) ? 1 : 0; | ||
43 | } | ||
44 | D(printf(" drive state is: %d\n", state)); | ||
45 | |||
46 | return state; | ||
47 | } | ||
48 | |||
49 | static char *state_name(int i) | ||
50 | { | ||
51 | if (i == -1) return "unknown"; | ||
52 | if (i == 0) return "sleeping"; | ||
53 | if (i == 1) return "active"; | ||
54 | |||
55 | return "internal error"; | ||
56 | } | ||
57 | |||
58 | static char *myctime(time_t time) | ||
59 | { | ||
60 | char *ts = ctime(&time); | ||
61 | ts[strlen(ts) - 1] = 0; | ||
62 | |||
63 | return ts; | ||
64 | } | ||
65 | |||
66 | static void measure(int fd) | ||
67 | { | ||
68 | time_t start_time; | ||
69 | int last_state; | ||
70 | time_t last_time; | ||
71 | int curr_state; | ||
72 | time_t curr_time = 0; | ||
73 | time_t time_diff; | ||
74 | time_t active_time = 0; | ||
75 | time_t sleep_time = 0; | ||
76 | time_t unknown_time = 0; | ||
77 | time_t total_time = 0; | ||
78 | int changes = 0; | ||
79 | float tmp; | ||
80 | |||
81 | printf("Starting measurements\n"); | ||
82 | |||
83 | last_state = check_powermode(fd); | ||
84 | start_time = last_time = time(0); | ||
85 | printf(" System is in state %s\n\n", state_name(last_state)); | ||
86 | |||
87 | while(!endit) { | ||
88 | sleep(1); | ||
89 | curr_state = check_powermode(fd); | ||
90 | |||
91 | if (curr_state != last_state || endit) { | ||
92 | changes++; | ||
93 | curr_time = time(0); | ||
94 | time_diff = curr_time - last_time; | ||
95 | |||
96 | if (last_state == 1) active_time += time_diff; | ||
97 | else if (last_state == 0) sleep_time += time_diff; | ||
98 | else unknown_time += time_diff; | ||
99 | |||
100 | last_state = curr_state; | ||
101 | last_time = curr_time; | ||
102 | |||
103 | printf("%s: State-change to %s\n", myctime(curr_time), | ||
104 | state_name(curr_state)); | ||
105 | } | ||
106 | } | ||
107 | changes--; /* Compensate for SIGINT */ | ||
108 | |||
109 | total_time = time(0) - start_time; | ||
110 | printf("\nTotal running time: %lus\n", curr_time - start_time); | ||
111 | printf(" State changed %d times\n", changes); | ||
112 | |||
113 | tmp = (float)sleep_time / (float)total_time * 100; | ||
114 | printf(" Time in sleep state: %lus (%.2f%%)\n", sleep_time, tmp); | ||
115 | tmp = (float)active_time / (float)total_time * 100; | ||
116 | printf(" Time in active state: %lus (%.2f%%)\n", active_time, tmp); | ||
117 | tmp = (float)unknown_time / (float)total_time * 100; | ||
118 | printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp); | ||
119 | } | ||
120 | |||
121 | static void ender(int s) | ||
122 | { | ||
123 | endit = 1; | ||
124 | } | ||
125 | |||
126 | static void usage(void) | ||
127 | { | ||
128 | puts("usage: dslm [-w <time>] <disk>"); | ||
129 | exit(0); | ||
130 | } | ||
131 | |||
132 | int main(int argc, char **argv) | ||
133 | { | ||
134 | int fd; | ||
135 | char *disk = 0; | ||
136 | int settle_time = 60; | ||
137 | |||
138 | /* Parse the simple command-line */ | ||
139 | if (argc == 2) | ||
140 | disk = argv[1]; | ||
141 | else if (argc == 4) { | ||
142 | settle_time = atoi(argv[2]); | ||
143 | disk = argv[3]; | ||
144 | } else | ||
145 | usage(); | ||
146 | |||
147 | if (!(fd = open(disk, O_RDONLY|O_NONBLOCK))) { | ||
148 | printf("Can't open %s, because: %s\n", disk, strerror(errno)); | ||
149 | exit(-1); | ||
150 | } | ||
151 | |||
152 | if (settle_time) { | ||
153 | printf("Waiting %d seconds for the system to settle down to " | ||
154 | "'normal'\n", settle_time); | ||
155 | sleep(settle_time); | ||
156 | } else | ||
157 | puts("Not waiting for system to settle down"); | ||
158 | |||
159 | signal(SIGINT, ender); | ||
160 | |||
161 | measure(fd); | ||
162 | |||
163 | close(fd); | ||
164 | |||
165 | return 0; | ||
166 | } | ||
diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt index eeedee11c8c2..0bf25eebce94 100644 --- a/Documentation/laptops/laptop-mode.txt +++ b/Documentation/laptops/laptop-mode.txt | |||
@@ -207,7 +207,7 @@ Tips & Tricks | |||
207 | * Drew Scott Daniels observed: "I don't know why, but when I decrease the number | 207 | * Drew Scott Daniels observed: "I don't know why, but when I decrease the number |
208 | of colours that my display uses it consumes less battery power. I've seen | 208 | of colours that my display uses it consumes less battery power. I've seen |
209 | this on powerbooks too. I hope that this is a piece of information that | 209 | this on powerbooks too. I hope that this is a piece of information that |
210 | might be useful to the Laptop Mode patch or it's users." | 210 | might be useful to the Laptop Mode patch or its users." |
211 | 211 | ||
212 | * In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the | 212 | * In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the |
213 | file after every logging. When you're using laptop-mode and your disk doesn't | 213 | file after every logging. When you're using laptop-mode and your disk doesn't |
@@ -779,172 +779,4 @@ Monitoring tool | |||
779 | --------------- | 779 | --------------- |
780 | 780 | ||
781 | Bartek Kania submitted this, it can be used to measure how much time your disk | 781 | Bartek Kania submitted this, it can be used to measure how much time your disk |
782 | spends spun up/down. | 782 | spends spun up/down. See Documentation/laptops/dslm.c |
783 | |||
784 | ---------------------------dslm.c BEGIN----------------------------------------- | ||
785 | /* | ||
786 | * Simple Disk Sleep Monitor | ||
787 | * by Bartek Kania | ||
788 | * Licenced under the GPL | ||
789 | */ | ||
790 | #include <unistd.h> | ||
791 | #include <stdlib.h> | ||
792 | #include <stdio.h> | ||
793 | #include <fcntl.h> | ||
794 | #include <errno.h> | ||
795 | #include <time.h> | ||
796 | #include <string.h> | ||
797 | #include <signal.h> | ||
798 | #include <sys/ioctl.h> | ||
799 | #include <linux/hdreg.h> | ||
800 | |||
801 | #ifdef DEBUG | ||
802 | #define D(x) x | ||
803 | #else | ||
804 | #define D(x) | ||
805 | #endif | ||
806 | |||
807 | int endit = 0; | ||
808 | |||
809 | /* Check if the disk is in powersave-mode | ||
810 | * Most of the code is stolen from hdparm. | ||
811 | * 1 = active, 0 = standby/sleep, -1 = unknown */ | ||
812 | int check_powermode(int fd) | ||
813 | { | ||
814 | unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0}; | ||
815 | int state; | ||
816 | |||
817 | if (ioctl(fd, HDIO_DRIVE_CMD, &args) | ||
818 | && (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */ | ||
819 | && ioctl(fd, HDIO_DRIVE_CMD, &args)) { | ||
820 | if (errno != EIO || args[0] != 0 || args[1] != 0) { | ||
821 | state = -1; /* "unknown"; */ | ||
822 | } else | ||
823 | state = 0; /* "sleeping"; */ | ||
824 | } else { | ||
825 | state = (args[2] == 255) ? 1 : 0; | ||
826 | } | ||
827 | D(printf(" drive state is: %d\n", state)); | ||
828 | |||
829 | return state; | ||
830 | } | ||
831 | |||
832 | char *state_name(int i) | ||
833 | { | ||
834 | if (i == -1) return "unknown"; | ||
835 | if (i == 0) return "sleeping"; | ||
836 | if (i == 1) return "active"; | ||
837 | |||
838 | return "internal error"; | ||
839 | } | ||
840 | |||
841 | char *myctime(time_t time) | ||
842 | { | ||
843 | char *ts = ctime(&time); | ||
844 | ts[strlen(ts) - 1] = 0; | ||
845 | |||
846 | return ts; | ||
847 | } | ||
848 | |||
849 | void measure(int fd) | ||
850 | { | ||
851 | time_t start_time; | ||
852 | int last_state; | ||
853 | time_t last_time; | ||
854 | int curr_state; | ||
855 | time_t curr_time = 0; | ||
856 | time_t time_diff; | ||
857 | time_t active_time = 0; | ||
858 | time_t sleep_time = 0; | ||
859 | time_t unknown_time = 0; | ||
860 | time_t total_time = 0; | ||
861 | int changes = 0; | ||
862 | float tmp; | ||
863 | |||
864 | printf("Starting measurements\n"); | ||
865 | |||
866 | last_state = check_powermode(fd); | ||
867 | start_time = last_time = time(0); | ||
868 | printf(" System is in state %s\n\n", state_name(last_state)); | ||
869 | |||
870 | while(!endit) { | ||
871 | sleep(1); | ||
872 | curr_state = check_powermode(fd); | ||
873 | |||
874 | if (curr_state != last_state || endit) { | ||
875 | changes++; | ||
876 | curr_time = time(0); | ||
877 | time_diff = curr_time - last_time; | ||
878 | |||
879 | if (last_state == 1) active_time += time_diff; | ||
880 | else if (last_state == 0) sleep_time += time_diff; | ||
881 | else unknown_time += time_diff; | ||
882 | |||
883 | last_state = curr_state; | ||
884 | last_time = curr_time; | ||
885 | |||
886 | printf("%s: State-change to %s\n", myctime(curr_time), | ||
887 | state_name(curr_state)); | ||
888 | } | ||
889 | } | ||
890 | changes--; /* Compensate for SIGINT */ | ||
891 | |||
892 | total_time = time(0) - start_time; | ||
893 | printf("\nTotal running time: %lus\n", curr_time - start_time); | ||
894 | printf(" State changed %d times\n", changes); | ||
895 | |||
896 | tmp = (float)sleep_time / (float)total_time * 100; | ||
897 | printf(" Time in sleep state: %lus (%.2f%%)\n", sleep_time, tmp); | ||
898 | tmp = (float)active_time / (float)total_time * 100; | ||
899 | printf(" Time in active state: %lus (%.2f%%)\n", active_time, tmp); | ||
900 | tmp = (float)unknown_time / (float)total_time * 100; | ||
901 | printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp); | ||
902 | } | ||
903 | |||
904 | void ender(int s) | ||
905 | { | ||
906 | endit = 1; | ||
907 | } | ||
908 | |||
909 | void usage() | ||
910 | { | ||
911 | puts("usage: dslm [-w <time>] <disk>"); | ||
912 | exit(0); | ||
913 | } | ||
914 | |||
915 | int main(int argc, char **argv) | ||
916 | { | ||
917 | int fd; | ||
918 | char *disk = 0; | ||
919 | int settle_time = 60; | ||
920 | |||
921 | /* Parse the simple command-line */ | ||
922 | if (argc == 2) | ||
923 | disk = argv[1]; | ||
924 | else if (argc == 4) { | ||
925 | settle_time = atoi(argv[2]); | ||
926 | disk = argv[3]; | ||
927 | } else | ||
928 | usage(); | ||
929 | |||
930 | if (!(fd = open(disk, O_RDONLY|O_NONBLOCK))) { | ||
931 | printf("Can't open %s, because: %s\n", disk, strerror(errno)); | ||
932 | exit(-1); | ||
933 | } | ||
934 | |||
935 | if (settle_time) { | ||
936 | printf("Waiting %d seconds for the system to settle down to " | ||
937 | "'normal'\n", settle_time); | ||
938 | sleep(settle_time); | ||
939 | } else | ||
940 | puts("Not waiting for system to settle down"); | ||
941 | |||
942 | signal(SIGINT, ender); | ||
943 | |||
944 | measure(fd); | ||
945 | |||
946 | close(fd); | ||
947 | |||
948 | return 0; | ||
949 | } | ||
950 | ---------------------------dslm.c END------------------------------------------- | ||
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt index 75afa1229fd7..fc15538d8b46 100644 --- a/Documentation/laptops/thinkpad-acpi.txt +++ b/Documentation/laptops/thinkpad-acpi.txt | |||
@@ -292,13 +292,13 @@ sysfs notes: | |||
292 | 292 | ||
293 | Warning: when in NVRAM mode, the volume up/down/mute | 293 | Warning: when in NVRAM mode, the volume up/down/mute |
294 | keys are synthesized according to changes in the mixer, | 294 | keys are synthesized according to changes in the mixer, |
295 | so you have to use volume up or volume down to unmute, | 295 | which uses a single volume up or volume down hotkey |
296 | as per the ThinkPad volume mixer user interface. When | 296 | press to unmute, as per the ThinkPad volume mixer user |
297 | in ACPI event mode, volume up/down/mute are reported as | 297 | interface. When in ACPI event mode, volume up/down/mute |
298 | separate events, but this behaviour may be corrected in | 298 | events are reported by the firmware and can behave |
299 | future releases of this driver, in which case the | 299 | differently (and that behaviour changes with firmware |
300 | ThinkPad volume mixer user interface semantics will be | 300 | version -- not just with firmware models -- as well as |
301 | enforced. | 301 | OSI(Linux) state). |
302 | 302 | ||
303 | hotkey_poll_freq: | 303 | hotkey_poll_freq: |
304 | frequency in Hz for hot key polling. It must be between | 304 | frequency in Hz for hot key polling. It must be between |
@@ -309,7 +309,7 @@ sysfs notes: | |||
309 | will cause hot key presses that require NVRAM polling | 309 | will cause hot key presses that require NVRAM polling |
310 | to never be reported. | 310 | to never be reported. |
311 | 311 | ||
312 | Setting hotkey_poll_freq too low will cause repeated | 312 | Setting hotkey_poll_freq too low may cause repeated |
313 | pressings of the same hot key to be misreported as a | 313 | pressings of the same hot key to be misreported as a |
314 | single key press, or to not even be detected at all. | 314 | single key press, or to not even be detected at all. |
315 | The recommended polling frequency is 10Hz. | 315 | The recommended polling frequency is 10Hz. |
@@ -397,6 +397,7 @@ ACPI Scan | |||
397 | event code Key Notes | 397 | event code Key Notes |
398 | 398 | ||
399 | 0x1001 0x00 FN+F1 - | 399 | 0x1001 0x00 FN+F1 - |
400 | |||
400 | 0x1002 0x01 FN+F2 IBM: battery (rare) | 401 | 0x1002 0x01 FN+F2 IBM: battery (rare) |
401 | Lenovo: Screen lock | 402 | Lenovo: Screen lock |
402 | 403 | ||
@@ -404,7 +405,8 @@ event code Key Notes | |||
404 | this hot key, even with hot keys | 405 | this hot key, even with hot keys |
405 | disabled or with Fn+F3 masked | 406 | disabled or with Fn+F3 masked |
406 | off | 407 | off |
407 | IBM: screen lock | 408 | IBM: screen lock, often turns |
409 | off the ThinkLight as side-effect | ||
408 | Lenovo: battery | 410 | Lenovo: battery |
409 | 411 | ||
410 | 0x1004 0x03 FN+F4 Sleep button (ACPI sleep button | 412 | 0x1004 0x03 FN+F4 Sleep button (ACPI sleep button |
@@ -433,7 +435,8 @@ event code Key Notes | |||
433 | Do you feel lucky today? | 435 | Do you feel lucky today? |
434 | 436 | ||
435 | 0x1008 0x07 FN+F8 IBM: toggle screen expand | 437 | 0x1008 0x07 FN+F8 IBM: toggle screen expand |
436 | Lenovo: configure UltraNav | 438 | Lenovo: configure UltraNav, |
439 | or toggle screen expand | ||
437 | 440 | ||
438 | 0x1009 0x08 FN+F9 - | 441 | 0x1009 0x08 FN+F9 - |
439 | .. .. .. | 442 | .. .. .. |
@@ -444,7 +447,7 @@ event code Key Notes | |||
444 | either through the ACPI event, | 447 | either through the ACPI event, |
445 | or through a hotkey event. | 448 | or through a hotkey event. |
446 | The firmware may refuse to | 449 | The firmware may refuse to |
447 | generate further FN+F4 key | 450 | generate further FN+F12 key |
448 | press events until a S3 or S4 | 451 | press events until a S3 or S4 |
449 | ACPI sleep cycle is performed, | 452 | ACPI sleep cycle is performed, |
450 | or some time passes. | 453 | or some time passes. |
@@ -512,15 +515,19 @@ events for switches: | |||
512 | SW_RFKILL_ALL T60 and later hardware rfkill rocker switch | 515 | SW_RFKILL_ALL T60 and later hardware rfkill rocker switch |
513 | SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A | 516 | SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A |
514 | 517 | ||
515 | Non hot-key ACPI HKEY event map: | 518 | Non hotkey ACPI HKEY event map: |
519 | ------------------------------- | ||
520 | |||
521 | Events that are not propagated by the driver, except for legacy | ||
522 | compatibility purposes when hotkey_report_mode is set to 1: | ||
523 | |||
516 | 0x5001 Lid closed | 524 | 0x5001 Lid closed |
517 | 0x5002 Lid opened | 525 | 0x5002 Lid opened |
518 | 0x5009 Tablet swivel: switched to tablet mode | 526 | 0x5009 Tablet swivel: switched to tablet mode |
519 | 0x500A Tablet swivel: switched to normal mode | 527 | 0x500A Tablet swivel: switched to normal mode |
520 | 0x7000 Radio Switch may have changed state | 528 | 0x7000 Radio Switch may have changed state |
521 | 529 | ||
522 | The above events are not propagated by the driver, except for legacy | 530 | Events that are never propagated by the driver: |
523 | compatibility purposes when hotkey_report_mode is set to 1. | ||
524 | 531 | ||
525 | 0x2304 System is waking up from suspend to undock | 532 | 0x2304 System is waking up from suspend to undock |
526 | 0x2305 System is waking up from suspend to eject bay | 533 | 0x2305 System is waking up from suspend to eject bay |
@@ -528,14 +535,39 @@ compatibility purposes when hotkey_report_mode is set to 1. | |||
528 | 0x2405 System is waking up from hibernation to eject bay | 535 | 0x2405 System is waking up from hibernation to eject bay |
529 | 0x5010 Brightness level changed/control event | 536 | 0x5010 Brightness level changed/control event |
530 | 537 | ||
531 | The above events are never propagated by the driver. | 538 | Events that are propagated by the driver to userspace: |
532 | 539 | ||
540 | 0x2313 ALARM: System is waking up from suspend because | ||
541 | the battery is nearly empty | ||
542 | 0x2413 ALARM: System is waking up from hibernation because | ||
543 | the battery is nearly empty | ||
533 | 0x3003 Bay ejection (see 0x2x05) complete, can sleep again | 544 | 0x3003 Bay ejection (see 0x2x05) complete, can sleep again |
545 | 0x3006 Bay hotplug request (hint to power up SATA link when | ||
546 | the optical drive tray is ejected) | ||
534 | 0x4003 Undocked (see 0x2x04), can sleep again | 547 | 0x4003 Undocked (see 0x2x04), can sleep again |
535 | 0x500B Tablet pen inserted into its storage bay | 548 | 0x500B Tablet pen inserted into its storage bay |
536 | 0x500C Tablet pen removed from its storage bay | 549 | 0x500C Tablet pen removed from its storage bay |
537 | 550 | 0x6011 ALARM: battery is too hot | |
538 | The above events are propagated by the driver. | 551 | 0x6012 ALARM: battery is extremely hot |
552 | 0x6021 ALARM: a sensor is too hot | ||
553 | 0x6022 ALARM: a sensor is extremely hot | ||
554 | 0x6030 System thermal table changed | ||
555 | |||
556 | Battery nearly empty alarms are a last resort attempt to get the | ||
557 | operating system to hibernate or shutdown cleanly (0x2313), or shutdown | ||
558 | cleanly (0x2413) before power is lost. They must be acted upon, as the | ||
559 | wake up caused by the firmware will have negated most safety nets... | ||
560 | |||
561 | When any of the "too hot" alarms happen, according to Lenovo the user | ||
562 | should suspend or hibernate the laptop (and in the case of battery | ||
563 | alarms, unplug the AC adapter) to let it cool down. These alarms do | ||
564 | signal that something is wrong, they should never happen on normal | ||
565 | operating conditions. | ||
566 | |||
567 | The "extremely hot" alarms are emergencies. According to Lenovo, the | ||
568 | operating system is to force either an immediate suspend or hibernate | ||
569 | cycle, or a system shutdown. Obviously, something is very wrong if this | ||
570 | happens. | ||
539 | 571 | ||
540 | Compatibility notes: | 572 | Compatibility notes: |
541 | 573 | ||
@@ -650,6 +682,10 @@ LCD, CRT or DVI (if available). The following commands are available: | |||
650 | echo expand_toggle > /proc/acpi/ibm/video | 682 | echo expand_toggle > /proc/acpi/ibm/video |
651 | echo video_switch > /proc/acpi/ibm/video | 683 | echo video_switch > /proc/acpi/ibm/video |
652 | 684 | ||
685 | NOTE: Access to this feature is restricted to processes owning the | ||
686 | CAP_SYS_ADMIN capability for safety reasons, as it can interact badly | ||
687 | enough with some versions of X.org to crash it. | ||
688 | |||
653 | Each video output device can be enabled or disabled individually. | 689 | Each video output device can be enabled or disabled individually. |
654 | Reading /proc/acpi/ibm/video shows the status of each device. | 690 | Reading /proc/acpi/ibm/video shows the status of each device. |
655 | 691 | ||
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 42208511b5c0..e9ce3c554514 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -34,7 +34,6 @@ | |||
34 | #include <sys/uio.h> | 34 | #include <sys/uio.h> |
35 | #include <termios.h> | 35 | #include <termios.h> |
36 | #include <getopt.h> | 36 | #include <getopt.h> |
37 | #include <zlib.h> | ||
38 | #include <assert.h> | 37 | #include <assert.h> |
39 | #include <sched.h> | 38 | #include <sched.h> |
40 | #include <limits.h> | 39 | #include <limits.h> |
@@ -264,7 +263,7 @@ static u8 *get_feature_bits(struct device *dev) | |||
264 | * Launcher virtual with an offset. | 263 | * Launcher virtual with an offset. |
265 | * | 264 | * |
266 | * This can be tough to get your head around, but usually it just means that we | 265 | * This can be tough to get your head around, but usually it just means that we |
267 | * use these trivial conversion functions when the Guest gives us it's | 266 | * use these trivial conversion functions when the Guest gives us its |
268 | * "physical" addresses: | 267 | * "physical" addresses: |
269 | */ | 268 | */ |
270 | static void *from_guest_phys(unsigned long addr) | 269 | static void *from_guest_phys(unsigned long addr) |
diff --git a/Documentation/md.txt b/Documentation/md.txt index 188f4768f1d5..e4e893ef3e01 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt | |||
@@ -136,7 +136,7 @@ raid_disks != 0. | |||
136 | 136 | ||
137 | Then uninitialized devices can be added with ADD_NEW_DISK. The | 137 | Then uninitialized devices can be added with ADD_NEW_DISK. The |
138 | structure passed to ADD_NEW_DISK must specify the state of the device | 138 | structure passed to ADD_NEW_DISK must specify the state of the device |
139 | and it's role in the array. | 139 | and its role in the array. |
140 | 140 | ||
141 | Once started with RUN_ARRAY, uninitialized spares can be added with | 141 | Once started with RUN_ARRAY, uninitialized spares can be added with |
142 | HOT_ADD_DISK. | 142 | HOT_ADD_DISK. |
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 7f5809eddee6..631ad2f1b229 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
@@ -3,6 +3,7 @@ | |||
3 | ============================ | 3 | ============================ |
4 | 4 | ||
5 | By: David Howells <dhowells@redhat.com> | 5 | By: David Howells <dhowells@redhat.com> |
6 | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
6 | 7 | ||
7 | Contents: | 8 | Contents: |
8 | 9 | ||
@@ -60,6 +61,10 @@ Contents: | |||
60 | 61 | ||
61 | - And then there's the Alpha. | 62 | - And then there's the Alpha. |
62 | 63 | ||
64 | (*) Example uses. | ||
65 | |||
66 | - Circular buffers. | ||
67 | |||
63 | (*) References. | 68 | (*) References. |
64 | 69 | ||
65 | 70 | ||
@@ -2226,6 +2231,21 @@ The Alpha defines the Linux kernel's memory barrier model. | |||
2226 | See the subsection on "Cache Coherency" above. | 2231 | See the subsection on "Cache Coherency" above. |
2227 | 2232 | ||
2228 | 2233 | ||
2234 | ============ | ||
2235 | EXAMPLE USES | ||
2236 | ============ | ||
2237 | |||
2238 | CIRCULAR BUFFERS | ||
2239 | ---------------- | ||
2240 | |||
2241 | Memory barriers can be used to implement circular buffering without the need | ||
2242 | of a lock to serialise the producer with the consumer. See: | ||
2243 | |||
2244 | Documentation/circular-buffers.txt | ||
2245 | |||
2246 | for details. | ||
2247 | |||
2248 | |||
2229 | ========== | 2249 | ========== |
2230 | REFERENCES | 2250 | REFERENCES |
2231 | ========== | 2251 | ========== |
diff --git a/Documentation/netlabel/lsm_interface.txt b/Documentation/netlabel/lsm_interface.txt index 98dd9f7430f2..638c74f7de7f 100644 --- a/Documentation/netlabel/lsm_interface.txt +++ b/Documentation/netlabel/lsm_interface.txt | |||
@@ -38,7 +38,7 @@ Depending on the exact configuration, translation between the network packet | |||
38 | label and the internal LSM security identifier can be time consuming. The | 38 | label and the internal LSM security identifier can be time consuming. The |
39 | NetLabel label mapping cache is a caching mechanism which can be used to | 39 | NetLabel label mapping cache is a caching mechanism which can be used to |
40 | sidestep much of this overhead once a mapping has been established. Once the | 40 | sidestep much of this overhead once a mapping has been established. Once the |
41 | LSM has received a packet, used NetLabel to decode it's security attributes, | 41 | LSM has received a packet, used NetLabel to decode its security attributes, |
42 | and translated the security attributes into a LSM internal identifier the LSM | 42 | and translated the security attributes into a LSM internal identifier the LSM |
43 | can use the NetLabel caching functions to associate the LSM internal | 43 | can use the NetLabel caching functions to associate the LSM internal |
44 | identifier with the network packet's label. This means that in the future | 44 | identifier with the network packet's label. This means that in the future |
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index 50189bf07d53..fe5c099b8fc8 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX | |||
@@ -32,6 +32,8 @@ cs89x0.txt | |||
32 | - the Crystal LAN (CS8900/20-based) Ethernet ISA adapter driver | 32 | - the Crystal LAN (CS8900/20-based) Ethernet ISA adapter driver |
33 | cxacru.txt | 33 | cxacru.txt |
34 | - Conexant AccessRunner USB ADSL Modem | 34 | - Conexant AccessRunner USB ADSL Modem |
35 | cxacru-cf.py | ||
36 | - Conexant AccessRunner USB ADSL Modem configuration file parser | ||
35 | de4x5.txt | 37 | de4x5.txt |
36 | - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver | 38 | - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver |
37 | decnet.txt | 39 | decnet.txt |
diff --git a/Documentation/networking/Makefile b/Documentation/networking/Makefile index 6d8af1ac56c4..5aba7a33aeeb 100644 --- a/Documentation/networking/Makefile +++ b/Documentation/networking/Makefile | |||
@@ -6,3 +6,5 @@ hostprogs-y := ifenslave | |||
6 | 6 | ||
7 | # Tell kbuild to always build the programs | 7 | # Tell kbuild to always build the programs |
8 | always := $(hostprogs-y) | 8 | always := $(hostprogs-y) |
9 | |||
10 | obj-m := timestamping/ | ||
diff --git a/Documentation/networking/caif/Linux-CAIF.txt b/Documentation/networking/caif/Linux-CAIF.txt new file mode 100644 index 000000000000..7fe7a9a33a4f --- /dev/null +++ b/Documentation/networking/caif/Linux-CAIF.txt | |||
@@ -0,0 +1,212 @@ | |||
1 | Linux CAIF | ||
2 | =========== | ||
3 | copyright (C) ST-Ericsson AB 2010 | ||
4 | Author: Sjur Brendeland/ sjur.brandeland@stericsson.com | ||
5 | License terms: GNU General Public License (GPL) version 2 | ||
6 | |||
7 | |||
8 | Introduction | ||
9 | ------------ | ||
10 | CAIF is a MUX protocol used by ST-Ericsson cellular modems for | ||
11 | communication between Modem and host. The host processes can open virtual AT | ||
12 | channels, initiate GPRS Data connections, Video channels and Utility Channels. | ||
13 | The Utility Channels are general purpose pipes between modem and host. | ||
14 | |||
15 | ST-Ericsson modems support a number of transports between modem | ||
16 | and host. Currently, UART and Loopback are available for Linux. | ||
17 | |||
18 | |||
19 | Architecture: | ||
20 | ------------ | ||
21 | The implementation of CAIF is divided into: | ||
22 | * CAIF Socket Layer, Kernel API, and Net Device. | ||
23 | * CAIF Core Protocol Implementation | ||
24 | * CAIF Link Layer, implemented as NET devices. | ||
25 | |||
26 | |||
27 | RTNL | ||
28 | ! | ||
29 | ! +------+ +------+ +------+ | ||
30 | ! +------+! +------+! +------+! | ||
31 | ! ! Sock !! !Kernel!! ! Net !! | ||
32 | ! ! API !+ ! API !+ ! Dev !+ <- CAIF Client APIs | ||
33 | ! +------+ +------! +------+ | ||
34 | ! ! ! ! | ||
35 | ! +----------!----------+ | ||
36 | ! +------+ <- CAIF Protocol Implementation | ||
37 | +-------> ! CAIF ! | ||
38 | ! Core ! | ||
39 | +------+ | ||
40 | +--------!--------+ | ||
41 | ! ! | ||
42 | +------+ +-----+ | ||
43 | ! ! ! TTY ! <- Link Layer (Net Devices) | ||
44 | +------+ +-----+ | ||
45 | |||
46 | |||
47 | Using the Kernel API | ||
48 | ---------------------- | ||
49 | The Kernel API is used for accessing CAIF channels from the | ||
50 | kernel. | ||
51 | The user of the API has to implement two callbacks for receive | ||
52 | and control. | ||
53 | The receive callback gives a CAIF packet as a SKB. The control | ||
54 | callback will | ||
55 | notify of channel initialization complete, and flow-on/flow- | ||
56 | off. | ||
57 | |||
58 | |||
59 | struct caif_device caif_dev = { | ||
60 | .caif_config = { | ||
61 | .name = "MYDEV" | ||
62 | .type = CAIF_CHTY_AT | ||
63 | } | ||
64 | .receive_cb = my_receive, | ||
65 | .control_cb = my_control, | ||
66 | }; | ||
67 | caif_add_device(&caif_dev); | ||
68 | caif_transmit(&caif_dev, skb); | ||
69 | |||
70 | See the caif_kernel.h for details about the CAIF kernel API. | ||
71 | |||
72 | |||
73 | I M P L E M E N T A T I O N | ||
74 | =========================== | ||
75 | =========================== | ||
76 | |||
77 | CAIF Core Protocol Layer | ||
78 | ========================================= | ||
79 | |||
80 | CAIF Core layer implements the CAIF protocol as defined by ST-Ericsson. | ||
81 | It implements the CAIF protocol stack in a layered approach, where | ||
82 | each layer described in the specification is implemented as a separate layer. | ||
83 | The architecture is inspired by the design patterns "Protocol Layer" and | ||
84 | "Protocol Packet". | ||
85 | |||
86 | == CAIF structure == | ||
87 | The Core CAIF implementation contains: | ||
88 | - Simple implementation of CAIF. | ||
89 | - Layered architecture (a la Streams), each layer in the CAIF | ||
90 | specification is implemented in a separate c-file. | ||
91 | - Clients must implement PHY layer to access physical HW | ||
92 | with receive and transmit functions. | ||
93 | - Clients must call configuration function to add PHY layer. | ||
94 | - Clients must implement CAIF layer to consume/produce | ||
95 | CAIF payload with receive and transmit functions. | ||
96 | - Clients must call configuration function to add and connect the | ||
97 | Client layer. | ||
98 | - When receiving / transmitting CAIF Packets (cfpkt), ownership is passed | ||
99 | to the called function (except for framing layers' receive functions | ||
100 | or if a transmit function returns an error, in which case the caller | ||
101 | must free the packet). | ||
102 | |||
103 | Layered Architecture | ||
104 | -------------------- | ||
105 | The CAIF protocol can be divided into two parts: Support functions and Protocol | ||
106 | Implementation. The support functions include: | ||
107 | |||
108 | - CFPKT CAIF Packet. Implementation of CAIF Protocol Packet. The | ||
109 | CAIF Packet has functions for creating, destroying and adding content | ||
110 | and for adding/extracting header and trailers to protocol packets. | ||
111 | |||
112 | - CFLST CAIF list implementation. | ||
113 | |||
114 | - CFGLUE CAIF Glue. Contains OS Specifics, such as memory | ||
115 | allocation, endianness, etc. | ||
116 | |||
117 | The CAIF Protocol implementation contains: | ||
118 | |||
119 | - CFCNFG CAIF Configuration layer. Configures the CAIF Protocol | ||
120 | Stack and provides a Client interface for adding Link-Layer and | ||
121 | Driver interfaces on top of the CAIF Stack. | ||
122 | |||
123 | - CFCTRL CAIF Control layer. Encodes and Decodes control messages | ||
124 | such as enumeration and channel setup. Also matches request and | ||
125 | response messages. | ||
126 | |||
127 | - CFSERVL General CAIF Service Layer functionality; handles flow | ||
128 | control and remote shutdown requests. | ||
129 | |||
130 | - CFVEI CAIF VEI layer. Handles CAIF AT Channels on VEI (Virtual | ||
131 | External Interface). This layer encodes/decodes VEI frames. | ||
132 | |||
133 | - CFDGML CAIF Datagram layer. Handles CAIF Datagram layer (IP | ||
134 | traffic), encodes/decodes Datagram frames. | ||
135 | |||
136 | - CFMUX CAIF Mux layer. Handles multiplexing between multiple | ||
137 | physical bearers and multiple channels such as VEI, Datagram, etc. | ||
138 | The MUX keeps track of the existing CAIF Channels and | ||
139 | Physical Instances and selects the apropriate instance based | ||
140 | on Channel-Id and Physical-ID. | ||
141 | |||
142 | - CFFRML CAIF Framing layer. Handles Framing i.e. Frame length | ||
143 | and frame checksum. | ||
144 | |||
145 | - CFSERL CAIF Serial layer. Handles concatenation/split of frames | ||
146 | into CAIF Frames with correct length. | ||
147 | |||
148 | |||
149 | |||
150 | +---------+ | ||
151 | | Config | | ||
152 | | CFCNFG | | ||
153 | +---------+ | ||
154 | ! | ||
155 | +---------+ +---------+ +---------+ | ||
156 | | AT | | Control | | Datagram| | ||
157 | | CFVEIL | | CFCTRL | | CFDGML | | ||
158 | +---------+ +---------+ +---------+ | ||
159 | \_____________!______________/ | ||
160 | ! | ||
161 | +---------+ | ||
162 | | MUX | | ||
163 | | | | ||
164 | +---------+ | ||
165 | _____!_____ | ||
166 | / \ | ||
167 | +---------+ +---------+ | ||
168 | | CFFRML | | CFFRML | | ||
169 | | Framing | | Framing | | ||
170 | +---------+ +---------+ | ||
171 | ! ! | ||
172 | +---------+ +---------+ | ||
173 | | | | Serial | | ||
174 | | | | CFSERL | | ||
175 | +---------+ +---------+ | ||
176 | |||
177 | |||
178 | In this layered approach the following "rules" apply. | ||
179 | - All layers embed the same structure "struct cflayer" | ||
180 | - A layer does not depend on any other layer's private data. | ||
181 | - Layers are stacked by setting the pointers | ||
182 | layer->up , layer->dn | ||
183 | - In order to send data upwards, each layer should do | ||
184 | layer->up->receive(layer->up, packet); | ||
185 | - In order to send data downwards, each layer should do | ||
186 | layer->dn->transmit(layer->dn, packet); | ||
187 | |||
188 | |||
189 | Linux Driver Implementation | ||
190 | =========================== | ||
191 | |||
192 | Linux GPRS Net Device and CAIF socket are implemented on top of the | ||
193 | CAIF Core protocol. The Net device and CAIF socket have an instance of | ||
194 | 'struct cflayer', just like the CAIF Core protocol stack. | ||
195 | Net device and Socket implement the 'receive()' function defined by | ||
196 | 'struct cflayer', just like the rest of the CAIF stack. In this way, transmit and | ||
197 | receive of packets is handled as by the rest of the layers: the 'dn->transmit()' | ||
198 | function is called in order to transmit data. | ||
199 | |||
200 | The layer on top of the CAIF Core implementation is | ||
201 | sometimes referred to as the "Client layer". | ||
202 | |||
203 | |||
204 | Configuration of Link Layer | ||
205 | --------------------------- | ||
206 | The Link Layer is implemented as Linux net devices (struct net_device). | ||
207 | Payload handling and registration is done using standard Linux mechanisms. | ||
208 | |||
209 | The CAIF Protocol relies on a loss-less link layer without implementing | ||
210 | retransmission. This implies that packet drops must not happen. | ||
211 | Therefore a flow-control mechanism is implemented where the physical | ||
212 | interface can initiate flow stop for all CAIF Channels. | ||
diff --git a/Documentation/networking/caif/README b/Documentation/networking/caif/README new file mode 100644 index 000000000000..757ccfaa1385 --- /dev/null +++ b/Documentation/networking/caif/README | |||
@@ -0,0 +1,109 @@ | |||
1 | Copyright (C) ST-Ericsson AB 2010 | ||
2 | Author: Sjur Brendeland/ sjur.brandeland@stericsson.com | ||
3 | License terms: GNU General Public License (GPL) version 2 | ||
4 | --------------------------------------------------------- | ||
5 | |||
6 | === Start === | ||
7 | If you have compiled CAIF for modules do: | ||
8 | |||
9 | $modprobe crc_ccitt | ||
10 | $modprobe caif | ||
11 | $modprobe caif_socket | ||
12 | $modprobe chnl_net | ||
13 | |||
14 | |||
15 | === Preparing the setup with a STE modem === | ||
16 | |||
17 | If you are working on integration of CAIF you should make sure | ||
18 | that the kernel is built with module support. | ||
19 | |||
20 | There are some things that need to be tweaked to get the host TTY correctly | ||
21 | set up to talk to the modem. | ||
22 | Since the CAIF stack is running in the kernel and we want to use the existing | ||
23 | TTY, we are installing our physical serial driver as a line discipline above | ||
24 | the TTY device. | ||
25 | |||
26 | To achieve this we need to install the N_CAIF ldisc from user space. | ||
27 | The benefit is that we can hook up to any TTY. | ||
28 | |||
29 | The use of Start-of-frame-extension (STX) must also be set as | ||
30 | module parameter "ser_use_stx". | ||
31 | |||
32 | Normally Frame Checksum is always used on UART, but this is also provided as a | ||
33 | module parameter "ser_use_fcs". | ||
34 | |||
35 | $ modprobe caif_serial ser_ttyname=/dev/ttyS0 ser_use_stx=yes | ||
36 | $ ifconfig caif_ttyS0 up | ||
37 | |||
38 | PLEASE NOTE: There is a limitation in Android shell. | ||
39 | It only accepts one argument to insmod/modprobe! | ||
40 | |||
41 | === Trouble shooting === | ||
42 | |||
43 | There are debugfs parameters provided for serial communication. | ||
44 | /sys/kernel/debug/caif_serial/<tty-name>/ | ||
45 | |||
46 | * ser_state: Prints the bit-mask status where | ||
47 | - 0x02 means SENDING, this is a transient state. | ||
48 | - 0x10 means FLOW_OFF_SENT, i.e. the previous frame has not been sent | ||
49 | and is blocking further send operation. Flow OFF has been propagated | ||
50 | to all CAIF Channels using this TTY. | ||
51 | |||
52 | * tty_status: Prints the bit-mask tty status information | ||
53 | - 0x01 - tty->warned is on. | ||
54 | - 0x02 - tty->low_latency is on. | ||
55 | - 0x04 - tty->packed is on. | ||
56 | - 0x08 - tty->flow_stopped is on. | ||
57 | - 0x10 - tty->hw_stopped is on. | ||
58 | - 0x20 - tty->stopped is on. | ||
59 | |||
60 | * last_tx_msg: Binary blob Prints the last transmitted frame. | ||
61 | This can be printed with | ||
62 | $od --format=x1 /sys/kernel/debug/caif_serial/<tty>/last_rx_msg. | ||
63 | The first two tx messages sent look like this. Note: The initial | ||
64 | byte 02 is start of frame extension (STX) used for re-syncing | ||
65 | upon errors. | ||
66 | |||
67 | - Enumeration: | ||
68 | 0000000 02 05 00 00 03 01 d2 02 | ||
69 | | | | | | | | ||
70 | STX(1) | | | | | ||
71 | Length(2)| | | | ||
72 | Control Channel(1) | ||
73 | Command:Enumeration(1) | ||
74 | Link-ID(1) | ||
75 | Checksum(2) | ||
76 | - Channel Setup: | ||
77 | 0000000 02 07 00 00 00 21 a1 00 48 df | ||
78 | | | | | | | | | | ||
79 | STX(1) | | | | | | | ||
80 | Length(2)| | | | | | ||
81 | Control Channel(1) | ||
82 | Command:Channel Setup(1) | ||
83 | Channel Type(1) | ||
84 | Priority and Link-ID(1) | ||
85 | Endpoint(1) | ||
86 | Checksum(2) | ||
87 | |||
88 | * last_rx_msg: Prints the last transmitted frame. | ||
89 | The RX messages for LinkSetup look almost identical but they have the | ||
90 | bit 0x20 set in the command bit, and Channel Setup has added one byte | ||
91 | before Checksum containing Channel ID. | ||
92 | NOTE: Several CAIF Messages might be concatenated. The maximum debug | ||
93 | buffer size is 128 bytes. | ||
94 | |||
95 | == Error Scenarios: | ||
96 | - last_tx_msg contains channel setup message and last_rx_msg is empty -> | ||
97 | The host seems to be able to send over the UART, at least the CAIF ldisc get | ||
98 | notified that sending is completed. | ||
99 | |||
100 | - last_tx_msg contains enumeration message and last_rx_msg is empty -> | ||
101 | The host is not able to send the message from UART, the tty has not been | ||
102 | able to complete the transmit operation. | ||
103 | |||
104 | - if /sys/kernel/debug/caif_serial/<tty>/tty_status is non-zero there | ||
105 | might be problems transmitting over UART. | ||
106 | E.g. host and modem wiring is not correct you will typically see | ||
107 | tty_status = 0x10 (hw_stopped) and ser_state = 0x10 (FLOW_OFF_SENT). | ||
108 | You will probably see the enumeration message in last_tx_message | ||
109 | and empty last_rx_message. | ||
diff --git a/Documentation/networking/cxacru-cf.py b/Documentation/networking/cxacru-cf.py new file mode 100644 index 000000000000..b41d298398c8 --- /dev/null +++ b/Documentation/networking/cxacru-cf.py | |||
@@ -0,0 +1,48 @@ | |||
1 | #!/usr/bin/env python | ||
2 | # Copyright 2009 Simon Arlott | ||
3 | # | ||
4 | # This program is free software; you can redistribute it and/or modify it | ||
5 | # under the terms of the GNU General Public License as published by the Free | ||
6 | # Software Foundation; either version 2 of the License, or (at your option) | ||
7 | # any later version. | ||
8 | # | ||
9 | # This program is distributed in the hope that it will be useful, but WITHOUT | ||
10 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | # more details. | ||
13 | # | ||
14 | # You should have received a copy of the GNU General Public License along with | ||
15 | # this program; if not, write to the Free Software Foundation, Inc., 59 | ||
16 | # Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | # | ||
18 | # Usage: cxacru-cf.py < cxacru-cf.bin | ||
19 | # Output: values string suitable for the sysfs adsl_config attribute | ||
20 | # | ||
21 | # Warning: cxacru-cf.bin with MD5 hash cdbac2689969d5ed5d4850f117702110 | ||
22 | # contains mis-aligned values which will stop the modem from being able | ||
23 | # to make a connection. If the first and last two bytes are removed then | ||
24 | # the values become valid, but the modulation will be forced to ANSI | ||
25 | # T1.413 only which may not be appropriate. | ||
26 | # | ||
27 | # The original binary format is a packed list of le32 values. | ||
28 | |||
29 | import sys | ||
30 | import struct | ||
31 | |||
32 | i = 0 | ||
33 | while True: | ||
34 | buf = sys.stdin.read(4) | ||
35 | |||
36 | if len(buf) == 0: | ||
37 | break | ||
38 | elif len(buf) != 4: | ||
39 | sys.stdout.write("\n") | ||
40 | sys.stderr.write("Error: read {0} not 4 bytes\n".format(len(buf))) | ||
41 | sys.exit(1) | ||
42 | |||
43 | if i > 0: | ||
44 | sys.stdout.write(" ") | ||
45 | sys.stdout.write("{0:x}={1}".format(i, struct.unpack("<I", buf)[0])) | ||
46 | i += 1 | ||
47 | |||
48 | sys.stdout.write("\n") | ||
diff --git a/Documentation/networking/cxacru.txt b/Documentation/networking/cxacru.txt index b074681a963e..2cce04457b4d 100644 --- a/Documentation/networking/cxacru.txt +++ b/Documentation/networking/cxacru.txt | |||
@@ -4,6 +4,12 @@ While it is capable of managing/maintaining the ADSL connection without the | |||
4 | module loaded, the device will sometimes stop responding after unloading the | 4 | module loaded, the device will sometimes stop responding after unloading the |
5 | driver and it is necessary to unplug/remove power to the device to fix this. | 5 | driver and it is necessary to unplug/remove power to the device to fix this. |
6 | 6 | ||
7 | Note: support for cxacru-cf.bin has been removed. It was not loaded correctly | ||
8 | so it had no effect on the device configuration. Fixing it could have stopped | ||
9 | existing devices working when an invalid configuration is supplied. | ||
10 | |||
11 | There is a script cxacru-cf.py to convert an existing file to the sysfs form. | ||
12 | |||
7 | Detected devices will appear as ATM devices named "cxacru". In /sys/class/atm/ | 13 | Detected devices will appear as ATM devices named "cxacru". In /sys/class/atm/ |
8 | these are directories named cxacruN where N is the device number. A symlink | 14 | these are directories named cxacruN where N is the device number. A symlink |
9 | named device points to the USB interface device's directory which contains | 15 | named device points to the USB interface device's directory which contains |
@@ -15,6 +21,15 @@ several sysfs attribute files for retrieving device statistics: | |||
15 | * adsl_headend_environment | 21 | * adsl_headend_environment |
16 | Information about the remote headend. | 22 | Information about the remote headend. |
17 | 23 | ||
24 | * adsl_config | ||
25 | Configuration writing interface. | ||
26 | Write parameters in hexadecimal format <index>=<value>, | ||
27 | separated by whitespace, e.g.: | ||
28 | "1=0 a=5" | ||
29 | Up to 7 parameters at a time will be sent and the modem will restart | ||
30 | the ADSL connection when any value is set. These are logged for future | ||
31 | reference. | ||
32 | |||
18 | * downstream_attenuation (dB) | 33 | * downstream_attenuation (dB) |
19 | * downstream_bits_per_frame | 34 | * downstream_bits_per_frame |
20 | * downstream_rate (kbps) | 35 | * downstream_rate (kbps) |
@@ -61,6 +76,7 @@ several sysfs attribute files for retrieving device statistics: | |||
61 | * mac_address | 76 | * mac_address |
62 | 77 | ||
63 | * modulation | 78 | * modulation |
79 | "" (when not connected) | ||
64 | "ANSI T1.413" | 80 | "ANSI T1.413" |
65 | "ITU-T G.992.1 (G.DMT)" | 81 | "ITU-T G.992.1 (G.DMT)" |
66 | "ITU-T G.992.2 (G.LITE)" | 82 | "ITU-T G.992.2 (G.LITE)" |
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index b132e4a3cf0f..a62fdf7a6bff 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt | |||
@@ -58,8 +58,10 @@ DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet | |||
58 | size (application payload size) in bytes, see RFC 4340, section 14. | 58 | size (application payload size) in bytes, see RFC 4340, section 14. |
59 | 59 | ||
60 | DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs | 60 | DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs |
61 | supported by the endpoint (see include/linux/dccp.h for symbolic constants). | 61 | supported by the endpoint. The option value is an array of type uint8_t whose |
62 | The caller needs to provide a sufficiently large (> 2) array of type uint8_t. | 62 | size is passed as option length. The minimum array size is 4 elements, the |
63 | value returned in the optlen argument always reflects the true number of | ||
64 | built-in CCIDs. | ||
63 | 65 | ||
64 | DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same | 66 | DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same |
65 | time, combining the operation of the next two socket options. This option is | 67 | time, combining the operation of the next two socket options. This option is |
diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c index 1b96ccda3836..2bac9618c345 100644 --- a/Documentation/networking/ifenslave.c +++ b/Documentation/networking/ifenslave.c | |||
@@ -756,7 +756,7 @@ static int enslave(char *master_ifname, char *slave_ifname) | |||
756 | */ | 756 | */ |
757 | if (abi_ver < 1) { | 757 | if (abi_ver < 1) { |
758 | /* For old ABI, the master needs to be | 758 | /* For old ABI, the master needs to be |
759 | * down before setting it's hwaddr | 759 | * down before setting its hwaddr |
760 | */ | 760 | */ |
761 | res = set_if_down(master_ifname, master_flags.ifr_flags); | 761 | res = set_if_down(master_ifname, master_flags.ifr_flags); |
762 | if (res) { | 762 | if (res) { |
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e87f3cdc8a6a..d0536b5a4e01 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -487,6 +487,30 @@ tcp_dma_copybreak - INTEGER | |||
487 | and CONFIG_NET_DMA is enabled. | 487 | and CONFIG_NET_DMA is enabled. |
488 | Default: 4096 | 488 | Default: 4096 |
489 | 489 | ||
490 | tcp_thin_linear_timeouts - BOOLEAN | ||
491 | Enable dynamic triggering of linear timeouts for thin streams. | ||
492 | If set, a check is performed upon retransmission by timeout to | ||
493 | determine if the stream is thin (less than 4 packets in flight). | ||
494 | As long as the stream is found to be thin, up to 6 linear | ||
495 | timeouts may be performed before exponential backoff mode is | ||
496 | initiated. This improves retransmission latency for | ||
497 | non-aggressive thin streams, often found to be time-dependent. | ||
498 | For more information on thin streams, see | ||
499 | Documentation/networking/tcp-thin.txt | ||
500 | Default: 0 | ||
501 | |||
502 | tcp_thin_dupack - BOOLEAN | ||
503 | Enable dynamic triggering of retransmissions after one dupACK | ||
504 | for thin streams. If set, a check is performed upon reception | ||
505 | of a dupACK to determine if the stream is thin (less than 4 | ||
506 | packets in flight). As long as the stream is found to be thin, | ||
507 | data is retransmitted on the first received dupACK. This | ||
508 | improves retransmission latency for non-aggressive thin | ||
509 | streams, often found to be time-dependent. | ||
510 | For more information on thin streams, see | ||
511 | Documentation/networking/tcp-thin.txt | ||
512 | Default: 0 | ||
513 | |||
490 | UDP variables: | 514 | UDP variables: |
491 | 515 | ||
492 | udp_mem - vector of 3 INTEGERs: min, pressure, max | 516 | udp_mem - vector of 3 INTEGERs: min, pressure, max |
@@ -564,6 +588,37 @@ ip_local_port_range - 2 INTEGERS | |||
564 | (i.e. by default) range 1024-4999 is enough to issue up to | 588 | (i.e. by default) range 1024-4999 is enough to issue up to |
565 | 2000 connections per second to systems supporting timestamps. | 589 | 2000 connections per second to systems supporting timestamps. |
566 | 590 | ||
591 | ip_local_reserved_ports - list of comma separated ranges | ||
592 | Specify the ports which are reserved for known third-party | ||
593 | applications. These ports will not be used by automatic port | ||
594 | assignments (e.g. when calling connect() or bind() with port | ||
595 | number 0). Explicit port allocation behavior is unchanged. | ||
596 | |||
597 | The format used for both input and output is a comma separated | ||
598 | list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and | ||
599 | 10). Writing to the file will clear all previously reserved | ||
600 | ports and update the current list with the one given in the | ||
601 | input. | ||
602 | |||
603 | Note that ip_local_port_range and ip_local_reserved_ports | ||
604 | settings are independent and both are considered by the kernel | ||
605 | when determining which ports are available for automatic port | ||
606 | assignments. | ||
607 | |||
608 | You can reserve ports which are not in the current | ||
609 | ip_local_port_range, e.g.: | ||
610 | |||
611 | $ cat /proc/sys/net/ipv4/ip_local_port_range | ||
612 | 32000 61000 | ||
613 | $ cat /proc/sys/net/ipv4/ip_local_reserved_ports | ||
614 | 8080,9148 | ||
615 | |||
616 | although this is redundant. However such a setting is useful | ||
617 | if later the port range is changed to a value that will | ||
618 | include the reserved ports. | ||
619 | |||
620 | Default: Empty | ||
621 | |||
567 | ip_nonlocal_bind - BOOLEAN | 622 | ip_nonlocal_bind - BOOLEAN |
568 | If set, allows processes to bind() to non-local IP addresses, | 623 | If set, allows processes to bind() to non-local IP addresses, |
569 | which can be quite useful - but may break some applications. | 624 | which can be quite useful - but may break some applications. |
@@ -692,6 +747,25 @@ proxy_arp - BOOLEAN | |||
692 | conf/{all,interface}/proxy_arp is set to TRUE, | 747 | conf/{all,interface}/proxy_arp is set to TRUE, |
693 | it will be disabled otherwise | 748 | it will be disabled otherwise |
694 | 749 | ||
750 | proxy_arp_pvlan - BOOLEAN | ||
751 | Private VLAN proxy arp. | ||
752 | Basically allow proxy arp replies back to the same interface | ||
753 | (from which the ARP request/solicitation was received). | ||
754 | |||
755 | This is done to support (ethernet) switch features, like RFC | ||
756 | 3069, where the individual ports are NOT allowed to | ||
757 | communicate with each other, but they are allowed to talk to | ||
758 | the upstream router. As described in RFC 3069, it is possible | ||
759 | to allow these hosts to communicate through the upstream | ||
760 | router by proxy_arp'ing. Don't need to be used together with | ||
761 | proxy_arp. | ||
762 | |||
763 | This technology is known by different names: | ||
764 | In RFC 3069 it is called VLAN Aggregation. | ||
765 | Cisco and Allied Telesyn call it Private VLAN. | ||
766 | Hewlett-Packard call it Source-Port filtering or port-isolation. | ||
767 | Ericsson call it MAC-Forced Forwarding (RFC Draft). | ||
768 | |||
695 | shared_media - BOOLEAN | 769 | shared_media - BOOLEAN |
696 | Send(router) or accept(host) RFC1620 shared media redirects. | 770 | Send(router) or accept(host) RFC1620 shared media redirects. |
697 | Overrides ip_secure_redirects. | 771 | Overrides ip_secure_redirects. |
@@ -833,9 +907,18 @@ arp_notify - BOOLEAN | |||
833 | or hardware address changes. | 907 | or hardware address changes. |
834 | 908 | ||
835 | arp_accept - BOOLEAN | 909 | arp_accept - BOOLEAN |
836 | Define behavior when gratuitous arp replies are received: | 910 | Define behavior for gratuitous ARP frames who's IP is not |
837 | 0 - drop gratuitous arp frames | 911 | already present in the ARP table: |
838 | 1 - accept gratuitous arp frames | 912 | 0 - don't create new entries in the ARP table |
913 | 1 - create new entries in the ARP table | ||
914 | |||
915 | Both replies and requests type gratuitous arp will trigger the | ||
916 | ARP table to be updated, if this setting is on. | ||
917 | |||
918 | If the ARP table already contains the IP address of the | ||
919 | gratuitous arp frame, the arp table will be updated regardless | ||
920 | if this setting is on or off. | ||
921 | |||
839 | 922 | ||
840 | app_solicit - INTEGER | 923 | app_solicit - INTEGER |
841 | The maximum number of probes to send to the user space ARP daemon | 924 | The maximum number of probes to send to the user space ARP daemon |
diff --git a/Documentation/networking/ixgbevf.txt b/Documentation/networking/ixgbevf.txt new file mode 100755 index 000000000000..19015de6725f --- /dev/null +++ b/Documentation/networking/ixgbevf.txt | |||
@@ -0,0 +1,90 @@ | |||
1 | Linux* Base Driver for Intel(R) Network Connection | ||
2 | ================================================== | ||
3 | |||
4 | November 24, 2009 | ||
5 | |||
6 | Contents | ||
7 | ======== | ||
8 | |||
9 | - In This Release | ||
10 | - Identifying Your Adapter | ||
11 | - Known Issues/Troubleshooting | ||
12 | - Support | ||
13 | |||
14 | In This Release | ||
15 | =============== | ||
16 | |||
17 | This file describes the ixgbevf Linux* Base Driver for Intel Network | ||
18 | Connection. | ||
19 | |||
20 | The ixgbevf driver supports 82599-based virtual function devices that can only | ||
21 | be activated on kernels with CONFIG_PCI_IOV enabled. | ||
22 | |||
23 | The ixgbevf driver supports virtual functions generated by the ixgbe driver | ||
24 | with a max_vfs value of 1 or greater. | ||
25 | |||
26 | The guest OS loading the ixgbevf driver must support MSI-X interrupts. | ||
27 | |||
28 | VLANs: There is a limit of a total of 32 shared VLANs to 1 or more VFs. | ||
29 | |||
30 | Identifying Your Adapter | ||
31 | ======================== | ||
32 | |||
33 | For more information on how to identify your adapter, go to the Adapter & | ||
34 | Driver ID Guide at: | ||
35 | |||
36 | http://support.intel.com/support/network/sb/CS-008441.htm | ||
37 | |||
38 | Known Issues/Troubleshooting | ||
39 | ============================ | ||
40 | |||
41 | Unloading Physical Function (PF) Driver Causes System Reboots When VM is | ||
42 | Running and VF is Loaded on the VM | ||
43 | ------------------------------------------------------------------------ | ||
44 | Do not unload the PF driver (ixgbe) while VFs are assigned to guests. | ||
45 | |||
46 | Support | ||
47 | ======= | ||
48 | |||
49 | For general information, go to the Intel support website at: | ||
50 | |||
51 | http://support.intel.com | ||
52 | |||
53 | or the Intel Wired Networking project hosted by Sourceforge at: | ||
54 | |||
55 | http://sourceforge.net/projects/e1000 | ||
56 | |||
57 | If an issue is identified with the released source code on the supported | ||
58 | kernel with a supported adapter, email the specific information related | ||
59 | to the issue to e1000-devel@lists.sf.net | ||
60 | |||
61 | License | ||
62 | ======= | ||
63 | |||
64 | Intel 10 Gigabit Linux driver. | ||
65 | Copyright(c) 1999 - 2009 Intel Corporation. | ||
66 | |||
67 | This program is free software; you can redistribute it and/or modify it | ||
68 | under the terms and conditions of the GNU General Public License, | ||
69 | version 2, as published by the Free Software Foundation. | ||
70 | |||
71 | This program is distributed in the hope it will be useful, but WITHOUT | ||
72 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
73 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
74 | more details. | ||
75 | |||
76 | You should have received a copy of the GNU General Public License along with | ||
77 | this program; if not, write to the Free Software Foundation, Inc., | ||
78 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
79 | |||
80 | The full GNU General Public License is included in this distribution in | ||
81 | the file called "COPYING". | ||
82 | |||
83 | Trademarks | ||
84 | ========== | ||
85 | |||
86 | Intel, Itanium, and Pentium are trademarks or registered trademarks of | ||
87 | Intel Corporation or its subsidiaries in the United States and other | ||
88 | countries. | ||
89 | |||
90 | * Other names and brands may be claimed as the property of others. | ||
diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt index 63214b280e00..e7bf3979facb 100644 --- a/Documentation/networking/l2tp.txt +++ b/Documentation/networking/l2tp.txt | |||
@@ -1,44 +1,95 @@ | |||
1 | This brief document describes how to use the kernel's PPPoL2TP driver | 1 | This document describes how to use the kernel's L2TP drivers to |
2 | to provide L2TP functionality. L2TP is a protocol that tunnels one or | 2 | provide L2TP functionality. L2TP is a protocol that tunnels one or |
3 | more PPP sessions over a UDP tunnel. It is commonly used for VPNs | 3 | more sessions over an IP tunnel. It is commonly used for VPNs |
4 | (L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP | 4 | (L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP |
5 | network infrastructure. | 5 | network infrastructure. With L2TPv3, it is also useful as a Layer-2 |
6 | tunneling infrastructure. | ||
7 | |||
8 | Features | ||
9 | ======== | ||
10 | |||
11 | L2TPv2 (PPP over L2TP (UDP tunnels)). | ||
12 | L2TPv3 ethernet pseudowires. | ||
13 | L2TPv3 PPP pseudowires. | ||
14 | L2TPv3 IP encapsulation. | ||
15 | Netlink sockets for L2TPv3 configuration management. | ||
16 | |||
17 | History | ||
18 | ======= | ||
19 | |||
20 | The original pppol2tp driver was introduced in 2.6.23 and provided | ||
21 | L2TPv2 functionality (rfc2661). L2TPv2 is used to tunnel one or more PPP | ||
22 | sessions over a UDP tunnel. | ||
23 | |||
24 | L2TPv3 (rfc3931) changes the protocol to allow different frame types | ||
25 | to be passed over an L2TP tunnel by moving the PPP-specific parts of | ||
26 | the protocol out of the core L2TP packet headers. Each frame type is | ||
27 | known as a pseudowire type. Ethernet, PPP, HDLC, Frame Relay and ATM | ||
28 | pseudowires for L2TP are defined in separate RFC standards. Another | ||
29 | change for L2TPv3 is that it can be carried directly over IP with no | ||
30 | UDP header (UDP is optional). It is also possible to create static | ||
31 | unmanaged L2TPv3 tunnels manually without a control protocol | ||
32 | (userspace daemon) to manage them. | ||
33 | |||
34 | To support L2TPv3, the original pppol2tp driver was split up to | ||
35 | separate the L2TP and PPP functionality. Existing L2TPv2 userspace | ||
36 | apps should be unaffected as the original pppol2tp sockets API is | ||
37 | retained. L2TPv3, however, uses netlink to manage L2TPv3 tunnels and | ||
38 | sessions. | ||
6 | 39 | ||
7 | Design | 40 | Design |
8 | ====== | 41 | ====== |
9 | 42 | ||
10 | The PPPoL2TP driver, drivers/net/pppol2tp.c, provides a mechanism by | 43 | The L2TP protocol separates control and data frames. The L2TP kernel |
11 | which PPP frames carried through an L2TP session are passed through | 44 | drivers handle only L2TP data frames; control frames are always |
12 | the kernel's PPP subsystem. The standard PPP daemon, pppd, handles all | 45 | handled by userspace. L2TP control frames carry messages between L2TP |
13 | PPP interaction with the peer. PPP network interfaces are created for | 46 | clients/servers and are used to setup / teardown tunnels and |
14 | each local PPP endpoint. | 47 | sessions. An L2TP client or server is implemented in userspace. |
15 | 48 | ||
16 | The L2TP protocol http://www.faqs.org/rfcs/rfc2661.html defines L2TP | 49 | Each L2TP tunnel is implemented using a UDP or L2TPIP socket; L2TPIP |
17 | control and data frames. L2TP control frames carry messages between | 50 | provides L2TPv3 IP encapsulation (no UDP) and is implemented using a |
18 | L2TP clients/servers and are used to setup / teardown tunnels and | 51 | new l2tpip socket family. The tunnel socket is typically created by |
19 | sessions. An L2TP client or server is implemented in userspace and | 52 | userspace, though for unmanaged L2TPv3 tunnels, the socket can also be |
20 | will use a regular UDP socket per tunnel. L2TP data frames carry PPP | 53 | created by the kernel. Each L2TP session (pseudowire) gets a network |
21 | frames, which may be PPP control or PPP data. The kernel's PPP | 54 | interface instance. In the case of PPP, these interfaces are created |
55 | indirectly by pppd using a pppol2tp socket. In the case of ethernet, | ||
56 | the netdevice is created upon a netlink request to create an L2TPv3 | ||
57 | ethernet pseudowire. | ||
58 | |||
59 | For PPP, the PPPoL2TP driver, net/l2tp/l2tp_ppp.c, provides a | ||
60 | mechanism by which PPP frames carried through an L2TP session are | ||
61 | passed through the kernel's PPP subsystem. The standard PPP daemon, | ||
62 | pppd, handles all PPP interaction with the peer. PPP network | ||
63 | interfaces are created for each local PPP endpoint. The kernel's PPP | ||
22 | subsystem arranges for PPP control frames to be delivered to pppd, | 64 | subsystem arranges for PPP control frames to be delivered to pppd, |
23 | while data frames are forwarded as usual. | 65 | while data frames are forwarded as usual. |
24 | 66 | ||
67 | For ethernet, the L2TPETH driver, net/l2tp/l2tp_eth.c, implements a | ||
68 | netdevice driver, managing virtual ethernet devices, one per | ||
69 | pseudowire. These interfaces can be managed using standard Linux tools | ||
70 | such as "ip" and "ifconfig". If only IP frames are passed over the | ||
71 | tunnel, the interface can be given an IP addresses of itself and its | ||
72 | peer. If non-IP frames are to be passed over the tunnel, the interface | ||
73 | can be added to a bridge using brctl. All L2TP datapath protocol | ||
74 | functions are handled by the L2TP core driver. | ||
75 | |||
25 | Each tunnel and session within a tunnel is assigned a unique tunnel_id | 76 | Each tunnel and session within a tunnel is assigned a unique tunnel_id |
26 | and session_id. These ids are carried in the L2TP header of every | 77 | and session_id. These ids are carried in the L2TP header of every |
27 | control and data packet. The pppol2tp driver uses them to lookup | 78 | control and data packet. (Actually, in L2TPv3, the tunnel_id isn't |
28 | internal tunnel and/or session contexts. Zero tunnel / session ids are | 79 | present in data frames - it is inferred from the IP connection on |
29 | treated specially - zero ids are never assigned to tunnels or sessions | 80 | which the packet was received.) The L2TP driver uses the ids to lookup |
30 | in the network. In the driver, the tunnel context keeps a pointer to | 81 | internal tunnel and/or session contexts to determine how to handle the |
31 | the tunnel UDP socket. The session context keeps a pointer to the | 82 | packet. Zero tunnel / session ids are treated specially - zero ids are |
32 | PPPoL2TP socket, as well as other data that lets the driver interface | 83 | never assigned to tunnels or sessions in the network. In the driver, |
33 | to the kernel PPP subsystem. | 84 | the tunnel context keeps a reference to the tunnel UDP or L2TPIP |
34 | 85 | socket. The session context holds data that lets the driver interface | |
35 | Note that the pppol2tp kernel driver handles only L2TP data frames; | 86 | to the kernel's network frame type subsystems, i.e. PPP, ethernet. |
36 | L2TP control frames are simply passed up to userspace in the UDP | 87 | |
37 | tunnel socket. The kernel handles all datapath aspects of the | 88 | Userspace Programming |
38 | protocol, including data packet resequencing (if enabled). | 89 | ===================== |
39 | 90 | ||
40 | There are a number of requirements on the userspace L2TP daemon in | 91 | For L2TPv2, there are a number of requirements on the userspace L2TP |
41 | order to use the pppol2tp driver. | 92 | daemon in order to use the pppol2tp driver. |
42 | 93 | ||
43 | 1. Use a UDP socket per tunnel. | 94 | 1. Use a UDP socket per tunnel. |
44 | 95 | ||
@@ -86,6 +137,35 @@ In addition to the standard PPP ioctls, a PPPIOCGL2TPSTATS is provided | |||
86 | to retrieve tunnel and session statistics from the kernel using the | 137 | to retrieve tunnel and session statistics from the kernel using the |
87 | PPPoX socket of the appropriate tunnel or session. | 138 | PPPoX socket of the appropriate tunnel or session. |
88 | 139 | ||
140 | For L2TPv3, userspace must use the netlink API defined in | ||
141 | include/linux/l2tp.h to manage tunnel and session contexts. The | ||
142 | general procedure to create a new L2TP tunnel with one session is:- | ||
143 | |||
144 | 1. Open a GENL socket using L2TP_GENL_NAME for configuring the kernel | ||
145 | using netlink. | ||
146 | |||
147 | 2. Create a UDP or L2TPIP socket for the tunnel. | ||
148 | |||
149 | 3. Create a new L2TP tunnel using a L2TP_CMD_TUNNEL_CREATE | ||
150 | request. Set attributes according to desired tunnel parameters, | ||
151 | referencing the UDP or L2TPIP socket created in the previous step. | ||
152 | |||
153 | 4. Create a new L2TP session in the tunnel using a | ||
154 | L2TP_CMD_SESSION_CREATE request. | ||
155 | |||
156 | The tunnel and all of its sessions are closed when the tunnel socket | ||
157 | is closed. The netlink API may also be used to delete sessions and | ||
158 | tunnels. Configuration and status info may be set or read using netlink. | ||
159 | |||
160 | The L2TP driver also supports static (unmanaged) L2TPv3 tunnels. These | ||
161 | are where there is no L2TP control message exchange with the peer to | ||
162 | setup the tunnel; the tunnel is configured manually at each end of the | ||
163 | tunnel. There is no need for an L2TP userspace application in this | ||
164 | case -- the tunnel socket is created by the kernel and configured | ||
165 | using parameters sent in the L2TP_CMD_TUNNEL_CREATE netlink | ||
166 | request. The "ip" utility of iproute2 has commands for managing static | ||
167 | L2TPv3 tunnels; do "ip l2tp help" for more information. | ||
168 | |||
89 | Debugging | 169 | Debugging |
90 | ========= | 170 | ========= |
91 | 171 | ||
@@ -102,6 +182,69 @@ PPPOL2TP_MSG_CONTROL userspace - kernel interface | |||
102 | PPPOL2TP_MSG_SEQ sequence numbers handling | 182 | PPPOL2TP_MSG_SEQ sequence numbers handling |
103 | PPPOL2TP_MSG_DATA data packets | 183 | PPPOL2TP_MSG_DATA data packets |
104 | 184 | ||
185 | If enabled, files under a l2tp debugfs directory can be used to dump | ||
186 | kernel state about L2TP tunnels and sessions. To access it, the | ||
187 | debugfs filesystem must first be mounted. | ||
188 | |||
189 | # mount -t debugfs debugfs /debug | ||
190 | |||
191 | Files under the l2tp directory can then be accessed. | ||
192 | |||
193 | # cat /debug/l2tp/tunnels | ||
194 | |||
195 | The debugfs files should not be used by applications to obtain L2TP | ||
196 | state information because the file format is subject to change. It is | ||
197 | implemented to provide extra debug information to help diagnose | ||
198 | problems.) Users should use the netlink API. | ||
199 | |||
200 | /proc/net/pppol2tp is also provided for backwards compaibility with | ||
201 | the original pppol2tp driver. It lists information about L2TPv2 | ||
202 | tunnels and sessions only. Its use is discouraged. | ||
203 | |||
204 | Unmanaged L2TPv3 Tunnels | ||
205 | ======================== | ||
206 | |||
207 | Some commercial L2TP products support unmanaged L2TPv3 ethernet | ||
208 | tunnels, where there is no L2TP control protocol; tunnels are | ||
209 | configured at each side manually. New commands are available in | ||
210 | iproute2's ip utility to support this. | ||
211 | |||
212 | To create an L2TPv3 ethernet pseudowire between local host 192.168.1.1 | ||
213 | and peer 192.168.1.2, using IP addresses 10.5.1.1 and 10.5.1.2 for the | ||
214 | tunnel endpoints:- | ||
215 | |||
216 | # modprobe l2tp_eth | ||
217 | # modprobe l2tp_netlink | ||
218 | |||
219 | # ip l2tp add tunnel tunnel_id 1 peer_tunnel_id 1 udp_sport 5000 \ | ||
220 | udp_dport 5000 encap udp local 192.168.1.1 remote 192.168.1.2 | ||
221 | # ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1 | ||
222 | # ifconfig -a | ||
223 | # ip addr add 10.5.1.2/32 peer 10.5.1.1/32 dev l2tpeth0 | ||
224 | # ifconfig l2tpeth0 up | ||
225 | |||
226 | Choose IP addresses to be the address of a local IP interface and that | ||
227 | of the remote system. The IP addresses of the l2tpeth0 interface can be | ||
228 | anything suitable. | ||
229 | |||
230 | Repeat the above at the peer, with ports, tunnel/session ids and IP | ||
231 | addresses reversed. The tunnel and session IDs can be any non-zero | ||
232 | 32-bit number, but the values must be reversed at the peer. | ||
233 | |||
234 | Host 1 Host2 | ||
235 | udp_sport=5000 udp_sport=5001 | ||
236 | udp_dport=5001 udp_dport=5000 | ||
237 | tunnel_id=42 tunnel_id=45 | ||
238 | peer_tunnel_id=45 peer_tunnel_id=42 | ||
239 | session_id=128 session_id=5196755 | ||
240 | peer_session_id=5196755 peer_session_id=128 | ||
241 | |||
242 | When done at both ends of the tunnel, it should be possible to send | ||
243 | data over the network. e.g. | ||
244 | |||
245 | # ping 10.5.1.1 | ||
246 | |||
247 | |||
105 | Sample Userspace Code | 248 | Sample Userspace Code |
106 | ===================== | 249 | ===================== |
107 | 250 | ||
@@ -158,12 +301,48 @@ Sample Userspace Code | |||
158 | } | 301 | } |
159 | return 0; | 302 | return 0; |
160 | 303 | ||
304 | Internal Implementation | ||
305 | ======================= | ||
306 | |||
307 | The driver keeps a struct l2tp_tunnel context per L2TP tunnel and a | ||
308 | struct l2tp_session context for each session. The l2tp_tunnel is | ||
309 | always associated with a UDP or L2TP/IP socket and keeps a list of | ||
310 | sessions in the tunnel. The l2tp_session context keeps kernel state | ||
311 | about the session. It has private data which is used for data specific | ||
312 | to the session type. With L2TPv2, the session always carried PPP | ||
313 | traffic. With L2TPv3, the session can also carry ethernet frames | ||
314 | (ethernet pseudowire) or other data types such as ATM, HDLC or Frame | ||
315 | Relay. | ||
316 | |||
317 | When a tunnel is first opened, the reference count on the socket is | ||
318 | increased using sock_hold(). This ensures that the kernel socket | ||
319 | cannot be removed while L2TP's data structures reference it. | ||
320 | |||
321 | Some L2TP sessions also have a socket (PPP pseudowires) while others | ||
322 | do not (ethernet pseudowires). We can't use the socket reference count | ||
323 | as the reference count for session contexts. The L2TP implementation | ||
324 | therefore has its own internal reference counts on the session | ||
325 | contexts. | ||
326 | |||
327 | To Do | ||
328 | ===== | ||
329 | |||
330 | Add L2TP tunnel switching support. This would route tunneled traffic | ||
331 | from one L2TP tunnel into another. Specified in | ||
332 | http://tools.ietf.org/html/draft-ietf-l2tpext-tunnel-switching-08 | ||
333 | |||
334 | Add L2TPv3 VLAN pseudowire support. | ||
335 | |||
336 | Add L2TPv3 IP pseudowire support. | ||
337 | |||
338 | Add L2TPv3 ATM pseudowire support. | ||
339 | |||
161 | Miscellaneous | 340 | Miscellaneous |
162 | ============ | 341 | ============= |
163 | 342 | ||
164 | The PPPoL2TP driver was developed as part of the OpenL2TP project by | 343 | The L2TP drivers were developed as part of the OpenL2TP project by |
165 | Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server, | 344 | Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server, |
166 | designed from the ground up to have the L2TP datapath in the | 345 | designed from the ground up to have the L2TP datapath in the |
167 | kernel. The project also implemented the pppol2tp plugin for pppd | 346 | kernel. The project also implemented the pppol2tp plugin for pppd |
168 | which allows pppd to use the kernel driver. Details can be found at | 347 | which allows pppd to use the kernel driver. Details can be found at |
169 | http://openl2tp.sourceforge.net. | 348 | http://www.openl2tp.org. |
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index a22fd85e3796..98f71a5cef00 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt | |||
@@ -2,7 +2,7 @@ | |||
2 | + ABSTRACT | 2 | + ABSTRACT |
3 | -------------------------------------------------------------------------------- | 3 | -------------------------------------------------------------------------------- |
4 | 4 | ||
5 | This file documents the CONFIG_PACKET_MMAP option available with the PACKET | 5 | This file documents the mmap() facility available with the PACKET |
6 | socket interface on 2.4 and 2.6 kernels. This type of sockets is used for | 6 | socket interface on 2.4 and 2.6 kernels. This type of sockets is used for |
7 | capture network traffic with utilities like tcpdump or any other that needs | 7 | capture network traffic with utilities like tcpdump or any other that needs |
8 | raw access to network interface. | 8 | raw access to network interface. |
@@ -44,7 +44,7 @@ enabled. For transmission, check the MTU (Maximum Transmission Unit) used and | |||
44 | supported by devices of your network. | 44 | supported by devices of your network. |
45 | 45 | ||
46 | -------------------------------------------------------------------------------- | 46 | -------------------------------------------------------------------------------- |
47 | + How to use CONFIG_PACKET_MMAP to improve capture process | 47 | + How to use mmap() to improve capture process |
48 | -------------------------------------------------------------------------------- | 48 | -------------------------------------------------------------------------------- |
49 | 49 | ||
50 | From the user standpoint, you should use the higher level libpcap library, which | 50 | From the user standpoint, you should use the higher level libpcap library, which |
@@ -64,7 +64,7 @@ the low level details or want to improve libpcap by including PACKET_MMAP | |||
64 | support. | 64 | support. |
65 | 65 | ||
66 | -------------------------------------------------------------------------------- | 66 | -------------------------------------------------------------------------------- |
67 | + How to use CONFIG_PACKET_MMAP directly to improve capture process | 67 | + How to use mmap() directly to improve capture process |
68 | -------------------------------------------------------------------------------- | 68 | -------------------------------------------------------------------------------- |
69 | 69 | ||
70 | From the system calls stand point, the use of PACKET_MMAP involves | 70 | From the system calls stand point, the use of PACKET_MMAP involves |
@@ -100,12 +100,12 @@ by the kernel. | |||
100 | The destruction of the socket and all associated resources | 100 | The destruction of the socket and all associated resources |
101 | is done by a simple call to close(fd). | 101 | is done by a simple call to close(fd). |
102 | 102 | ||
103 | Next I will describe PACKET_MMAP settings and it's constraints, | 103 | Next I will describe PACKET_MMAP settings and its constraints, |
104 | also the mapping of the circular buffer in the user process and | 104 | also the mapping of the circular buffer in the user process and |
105 | the use of this buffer. | 105 | the use of this buffer. |
106 | 106 | ||
107 | -------------------------------------------------------------------------------- | 107 | -------------------------------------------------------------------------------- |
108 | + How to use CONFIG_PACKET_MMAP directly to improve transmission process | 108 | + How to use mmap() directly to improve transmission process |
109 | -------------------------------------------------------------------------------- | 109 | -------------------------------------------------------------------------------- |
110 | Transmission process is similar to capture as shown below. | 110 | Transmission process is similar to capture as shown below. |
111 | 111 | ||
@@ -432,7 +432,7 @@ TP_STATUS_LOSING : indicates there were packet drops from last time | |||
432 | the PACKET_STATISTICS option. | 432 | the PACKET_STATISTICS option. |
433 | 433 | ||
434 | TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which | 434 | TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which |
435 | it's checksum will be done in hardware. So while | 435 | its checksum will be done in hardware. So while |
436 | reading the packet we should not try to check the | 436 | reading the packet we should not try to check the |
437 | checksum. | 437 | checksum. |
438 | 438 | ||
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt index ee31369e9e5b..9551622d0a7b 100644 --- a/Documentation/networking/regulatory.txt +++ b/Documentation/networking/regulatory.txt | |||
@@ -188,3 +188,27 @@ Then in some part of your code after your wiphy has been registered: | |||
188 | &mydriver_jp_regdom.reg_rules[i], | 188 | &mydriver_jp_regdom.reg_rules[i], |
189 | sizeof(struct ieee80211_reg_rule)); | 189 | sizeof(struct ieee80211_reg_rule)); |
190 | regulatory_struct_hint(rd); | 190 | regulatory_struct_hint(rd); |
191 | |||
192 | Statically compiled regulatory database | ||
193 | --------------------------------------- | ||
194 | |||
195 | In most situations the userland solution using CRDA as described | ||
196 | above is the preferred solution. However in some cases a set of | ||
197 | rules built into the kernel itself may be desirable. To account | ||
198 | for this situation, a configuration option has been provided | ||
199 | (i.e. CONFIG_CFG80211_INTERNAL_REGDB). With this option enabled, | ||
200 | the wireless database information contained in net/wireless/db.txt is | ||
201 | used to generate a data structure encoded in net/wireless/regdb.c. | ||
202 | That option also enables code in net/wireless/reg.c which queries | ||
203 | the data in regdb.c as an alternative to using CRDA. | ||
204 | |||
205 | The file net/wireless/db.txt should be kept up-to-date with the db.txt | ||
206 | file available in the git repository here: | ||
207 | |||
208 | git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-regdb.git | ||
209 | |||
210 | Again, most users in most situations should be using the CRDA package | ||
211 | provided with their distribution, and in most other situations users | ||
212 | should be building and using CRDA on their own rather than using | ||
213 | this option. If you are not absolutely sure that you should be using | ||
214 | CONFIG_CFG80211_INTERNAL_REGDB then _DO_NOT_USE_IT_. | ||
diff --git a/Documentation/networking/skfp.txt b/Documentation/networking/skfp.txt index abfddf81e34a..203ec66c9fb4 100644 --- a/Documentation/networking/skfp.txt +++ b/Documentation/networking/skfp.txt | |||
@@ -68,7 +68,7 @@ Compaq adapters (not tested): | |||
68 | ======================= | 68 | ======================= |
69 | 69 | ||
70 | From v2.01 on, the driver is integrated in the linux kernel sources. | 70 | From v2.01 on, the driver is integrated in the linux kernel sources. |
71 | Therefor, the installation is the same as for any other adapter | 71 | Therefore, the installation is the same as for any other adapter |
72 | supported by the kernel. | 72 | supported by the kernel. |
73 | Refer to the manual of your distribution about the installation | 73 | Refer to the manual of your distribution about the installation |
74 | of network adapters. | 74 | of network adapters. |
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt new file mode 100644 index 000000000000..7ee770b5ef5f --- /dev/null +++ b/Documentation/networking/stmmac.txt | |||
@@ -0,0 +1,143 @@ | |||
1 | STMicroelectronics 10/100/1000 Synopsys Ethernet driver | ||
2 | |||
3 | Copyright (C) 2007-2010 STMicroelectronics Ltd | ||
4 | Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> | ||
5 | |||
6 | This is the driver for the MAC 10/100/1000 on-chip Ethernet controllers | ||
7 | (Synopsys IP blocks); it has been fully tested on STLinux platforms. | ||
8 | |||
9 | Currently this network device driver is for all STM embedded MAC/GMAC | ||
10 | (7xxx SoCs). | ||
11 | |||
12 | DWC Ether MAC 10/100/1000 Universal version 3.41a and DWC Ether MAC 10/100 | ||
13 | Universal version 4.0 have been used for developing the first code | ||
14 | implementation. | ||
15 | |||
16 | Please, for more information also visit: www.stlinux.com | ||
17 | |||
18 | 1) Kernel Configuration | ||
19 | The kernel configuration option is STMMAC_ETH: | ||
20 | Device Drivers ---> Network device support ---> Ethernet (1000 Mbit) ---> | ||
21 | STMicroelectronics 10/100/1000 Ethernet driver (STMMAC_ETH) | ||
22 | |||
23 | 2) Driver parameters list: | ||
24 | debug: message level (0: no output, 16: all); | ||
25 | phyaddr: to manually provide the physical address to the PHY device; | ||
26 | dma_rxsize: DMA rx ring size; | ||
27 | dma_txsize: DMA tx ring size; | ||
28 | buf_sz: DMA buffer size; | ||
29 | tc: control the HW FIFO threshold; | ||
30 | tx_coe: Enable/Disable Tx Checksum Offload engine; | ||
31 | watchdog: transmit timeout (in milliseconds); | ||
32 | flow_ctrl: Flow control ability [on/off]; | ||
33 | pause: Flow Control Pause Time; | ||
34 | tmrate: timer period (only if timer optimisation is configured). | ||
35 | |||
36 | 3) Command line options | ||
37 | Driver parameters can be also passed in command line by using: | ||
38 | stmmaceth=dma_rxsize:128,dma_txsize:512 | ||
39 | |||
40 | 4) Driver information and notes | ||
41 | |||
42 | 4.1) Transmit process | ||
43 | The xmit method is invoked when the kernel needs to transmit a packet; it sets | ||
44 | the descriptors in the ring and informs the DMA engine that there is a packet | ||
45 | ready to be transmitted. | ||
46 | Once the controller has finished transmitting the packet, an interrupt is | ||
47 | triggered; So the driver will be able to release the socket buffers. | ||
48 | By default, the driver sets the NETIF_F_SG bit in the features field of the | ||
49 | net_device structure enabling the scatter/gather feature. | ||
50 | |||
51 | 4.2) Receive process | ||
52 | When one or more packets are received, an interrupt happens. The interrupts | ||
53 | are not queued so the driver has to scan all the descriptors in the ring during | ||
54 | the receive process. | ||
55 | This is based on NAPI so the interrupt handler signals only if there is work to be | ||
56 | done, and it exits. | ||
57 | Then the poll method will be scheduled at some future point. | ||
58 | The incoming packets are stored, by the DMA, in a list of pre-allocated socket | ||
59 | buffers in order to avoid the memcpy (Zero-copy). | ||
60 | |||
61 | 4.3) Timer-Driver Interrupt | ||
62 | Instead of having the device that asynchronously notifies the frame receptions, the | ||
63 | driver configures a timer to generate an interrupt at regular intervals. | ||
64 | Based on the granularity of the timer, the frames that are received by the device | ||
65 | will experience different levels of latency. Some NICs have dedicated timer | ||
66 | device to perform this task. STMMAC can use either the RTC device or the TMU | ||
67 | channel 2 on STLinux platforms. | ||
68 | The timers frequency can be passed to the driver as parameter; when change it, | ||
69 | take care of both hardware capability and network stability/performance impact. | ||
70 | Several performance tests on STM platforms showed this optimisation allows to spare | ||
71 | the CPU while having the maximum throughput. | ||
72 | |||
73 | 4.4) WOL | ||
74 | Wake up on Lan feature through Magic Frame is only supported for the GMAC | ||
75 | core. | ||
76 | |||
77 | 4.5) DMA descriptors | ||
78 | Driver handles both normal and enhanced descriptors. The latter has been only | ||
79 | tested on DWC Ether MAC 10/100/1000 Universal version 3.41a. | ||
80 | |||
81 | 4.6) Ethtool support | ||
82 | Ethtool is supported. Driver statistics and internal errors can be taken using: | ||
83 | ethtool -S ethX command. It is possible to dump registers etc. | ||
84 | |||
85 | 4.7) Jumbo and Segmentation Offloading | ||
86 | Jumbo frames are supported and tested for the GMAC. | ||
87 | The GSO has been also added but it's performed in software. | ||
88 | LRO is not supported. | ||
89 | |||
90 | 4.8) Physical | ||
91 | The driver is compatible with PAL to work with PHY and GPHY devices. | ||
92 | |||
93 | 4.9) Platform information | ||
94 | Several information came from the platform; please refer to the | ||
95 | driver's Header file in include/linux directory. | ||
96 | |||
97 | struct plat_stmmacenet_data { | ||
98 | int bus_id; | ||
99 | int pbl; | ||
100 | int has_gmac; | ||
101 | void (*fix_mac_speed)(void *priv, unsigned int speed); | ||
102 | void (*bus_setup)(unsigned long ioaddr); | ||
103 | #ifdef CONFIG_STM_DRIVERS | ||
104 | struct stm_pad_config *pad_config; | ||
105 | #endif | ||
106 | void *bsp_priv; | ||
107 | }; | ||
108 | |||
109 | Where: | ||
110 | - pbl (Programmable Burst Length) is maximum number of | ||
111 | beats to be transferred in one DMA transaction. | ||
112 | GMAC also enables the 4xPBL by default. | ||
113 | - fix_mac_speed and bus_setup are used to configure internal target | ||
114 | registers (on STM platforms); | ||
115 | - has_gmac: GMAC core is on board (get it at run-time in the next step); | ||
116 | - bus_id: bus identifier. | ||
117 | |||
118 | struct plat_stmmacphy_data { | ||
119 | int bus_id; | ||
120 | int phy_addr; | ||
121 | unsigned int phy_mask; | ||
122 | int interface; | ||
123 | int (*phy_reset)(void *priv); | ||
124 | void *priv; | ||
125 | }; | ||
126 | |||
127 | Where: | ||
128 | - bus_id: bus identifier; | ||
129 | - phy_addr: physical address used for the attached phy device; | ||
130 | set it to -1 to get it at run-time; | ||
131 | - interface: physical MII interface mode; | ||
132 | - phy_reset: hook to reset HW function. | ||
133 | |||
134 | TODO: | ||
135 | - Continue to make the driver more generic and suitable for other Synopsys | ||
136 | Ethernet controllers used on other architectures (i.e. ARM). | ||
137 | - 10G controllers are not supported. | ||
138 | - MAC uses Normal descriptors and GMAC uses enhanced ones. | ||
139 | This is a limit that should be reviewed. MAC could want to | ||
140 | use the enhanced structure. | ||
141 | - Checksumming: Rx/Tx csum is done in HW in case of GMAC only. | ||
142 | - Review the timer optimisation code to use an embedded device that seems to be | ||
143 | available in new chip generations. | ||
diff --git a/Documentation/networking/tcp-thin.txt b/Documentation/networking/tcp-thin.txt new file mode 100644 index 000000000000..151e229980f1 --- /dev/null +++ b/Documentation/networking/tcp-thin.txt | |||
@@ -0,0 +1,47 @@ | |||
1 | Thin-streams and TCP | ||
2 | ==================== | ||
3 | A wide range of Internet-based services that use reliable transport | ||
4 | protocols display what we call thin-stream properties. This means | ||
5 | that the application sends data with such a low rate that the | ||
6 | retransmission mechanisms of the transport protocol are not fully | ||
7 | effective. In time-dependent scenarios (like online games, control | ||
8 | systems, stock trading etc.) where the user experience depends | ||
9 | on the data delivery latency, packet loss can be devastating for | ||
10 | the service quality. Extreme latencies are caused by TCP's | ||
11 | dependency on the arrival of new data from the application to trigger | ||
12 | retransmissions effectively through fast retransmit instead of | ||
13 | waiting for long timeouts. | ||
14 | |||
15 | After analysing a large number of time-dependent interactive | ||
16 | applications, we have seen that they often produce thin streams | ||
17 | and also stay with this traffic pattern throughout its entire | ||
18 | lifespan. The combination of time-dependency and the fact that the | ||
19 | streams provoke high latencies when using TCP is unfortunate. | ||
20 | |||
21 | In order to reduce application-layer latency when packets are lost, | ||
22 | a set of mechanisms has been made, which address these latency issues | ||
23 | for thin streams. In short, if the kernel detects a thin stream, | ||
24 | the retransmission mechanisms are modified in the following manner: | ||
25 | |||
26 | 1) If the stream is thin, fast retransmit on the first dupACK. | ||
27 | 2) If the stream is thin, do not apply exponential backoff. | ||
28 | |||
29 | These enhancements are applied only if the stream is detected as | ||
30 | thin. This is accomplished by defining a threshold for the number | ||
31 | of packets in flight. If there are less than 4 packets in flight, | ||
32 | fast retransmissions can not be triggered, and the stream is prone | ||
33 | to experience high retransmission latencies. | ||
34 | |||
35 | Since these mechanisms are targeted at time-dependent applications, | ||
36 | they must be specifically activated by the application using the | ||
37 | TCP_THIN_LINEAR_TIMEOUTS and TCP_THIN_DUPACK IOCTLS or the | ||
38 | tcp_thin_linear_timeouts and tcp_thin_dupack sysctls. Both | ||
39 | modifications are turned off by default. | ||
40 | |||
41 | References | ||
42 | ========== | ||
43 | More information on the modifications, as well as a wide range of | ||
44 | experimental data can be found here: | ||
45 | "Improving latency for interactive, thin-stream applications over | ||
46 | reliable transport" | ||
47 | http://simula.no/research/nd/publications/Simula.nd.477/simula_pdf_file | ||
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 0e58b4539176..e8c8f4f06c67 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt | |||
@@ -41,11 +41,12 @@ SOF_TIMESTAMPING_SOFTWARE: return system time stamp generated in | |||
41 | SOF_TIMESTAMPING_TX/RX determine how time stamps are generated. | 41 | SOF_TIMESTAMPING_TX/RX determine how time stamps are generated. |
42 | SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the | 42 | SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the |
43 | following control message: | 43 | following control message: |
44 | struct scm_timestamping { | 44 | |
45 | struct timespec systime; | 45 | struct scm_timestamping { |
46 | struct timespec hwtimetrans; | 46 | struct timespec systime; |
47 | struct timespec hwtimeraw; | 47 | struct timespec hwtimetrans; |
48 | }; | 48 | struct timespec hwtimeraw; |
49 | }; | ||
49 | 50 | ||
50 | recvmsg() can be used to get this control message for regular incoming | 51 | recvmsg() can be used to get this control message for regular incoming |
51 | packets. For send time stamps the outgoing packet is looped back to | 52 | packets. For send time stamps the outgoing packet is looped back to |
@@ -87,12 +88,13 @@ by the network device and will be empty without that support. | |||
87 | SIOCSHWTSTAMP: | 88 | SIOCSHWTSTAMP: |
88 | 89 | ||
89 | Hardware time stamping must also be initialized for each device driver | 90 | Hardware time stamping must also be initialized for each device driver |
90 | that is expected to do hardware time stamping. The parameter is: | 91 | that is expected to do hardware time stamping. The parameter is defined in |
92 | /include/linux/net_tstamp.h as: | ||
91 | 93 | ||
92 | struct hwtstamp_config { | 94 | struct hwtstamp_config { |
93 | int flags; /* no flags defined right now, must be zero */ | 95 | int flags; /* no flags defined right now, must be zero */ |
94 | int tx_type; /* HWTSTAMP_TX_* */ | 96 | int tx_type; /* HWTSTAMP_TX_* */ |
95 | int rx_filter; /* HWTSTAMP_FILTER_* */ | 97 | int rx_filter; /* HWTSTAMP_FILTER_* */ |
96 | }; | 98 | }; |
97 | 99 | ||
98 | Desired behavior is passed into the kernel and to a specific device by | 100 | Desired behavior is passed into the kernel and to a specific device by |
@@ -139,42 +141,56 @@ enum { | |||
139 | /* time stamp any incoming packet */ | 141 | /* time stamp any incoming packet */ |
140 | HWTSTAMP_FILTER_ALL, | 142 | HWTSTAMP_FILTER_ALL, |
141 | 143 | ||
142 | /* return value: time stamp all packets requested plus some others */ | 144 | /* return value: time stamp all packets requested plus some others */ |
143 | HWTSTAMP_FILTER_SOME, | 145 | HWTSTAMP_FILTER_SOME, |
144 | 146 | ||
145 | /* PTP v1, UDP, any kind of event packet */ | 147 | /* PTP v1, UDP, any kind of event packet */ |
146 | HWTSTAMP_FILTER_PTP_V1_L4_EVENT, | 148 | HWTSTAMP_FILTER_PTP_V1_L4_EVENT, |
147 | 149 | ||
148 | ... | 150 | /* for the complete list of values, please check |
151 | * the include file /include/linux/net_tstamp.h | ||
152 | */ | ||
149 | }; | 153 | }; |
150 | 154 | ||
151 | 155 | ||
152 | DEVICE IMPLEMENTATION | 156 | DEVICE IMPLEMENTATION |
153 | 157 | ||
154 | A driver which supports hardware time stamping must support the | 158 | A driver which supports hardware time stamping must support the |
155 | SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored | 159 | SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with |
156 | in the skb with skb_hwtstamp_set(). | 160 | the actual values as described in the section on SIOCSHWTSTAMP. |
161 | |||
162 | Time stamps for received packets must be stored in the skb. To get a pointer | ||
163 | to the shared time stamp structure of the skb call skb_hwtstamps(). Then | ||
164 | set the time stamps in the structure: | ||
165 | |||
166 | struct skb_shared_hwtstamps { | ||
167 | /* hardware time stamp transformed into duration | ||
168 | * since arbitrary point in time | ||
169 | */ | ||
170 | ktime_t hwtstamp; | ||
171 | ktime_t syststamp; /* hwtstamp transformed to system time base */ | ||
172 | }; | ||
157 | 173 | ||
158 | Time stamps for outgoing packets are to be generated as follows: | 174 | Time stamps for outgoing packets are to be generated as follows: |
159 | - In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware() | 175 | - In hard_start_xmit(), check if skb_tx(skb)->hardware is set no-zero. |
160 | returns non-zero. If yes, then the driver is expected | 176 | If yes, then the driver is expected to do hardware time stamping. |
161 | to do hardware time stamping. | ||
162 | - If this is possible for the skb and requested, then declare | 177 | - If this is possible for the skb and requested, then declare |
163 | that the driver is doing the time stamping by calling | 178 | that the driver is doing the time stamping by setting the field |
164 | skb_hwtstamp_tx_in_progress(). A driver not supporting | 179 | skb_tx(skb)->in_progress non-zero. You might want to keep a pointer |
165 | hardware time stamping doesn't do that. A driver must never | 180 | to the associated skb for the next step and not free the skb. A driver |
166 | touch sk_buff::tstamp! It is used to store how time stamping | 181 | not supporting hardware time stamping doesn't do that. A driver must |
167 | for an outgoing packets is to be done. | 182 | never touch sk_buff::tstamp! It is used to store software generated |
183 | time stamps by the network subsystem. | ||
168 | - As soon as the driver has sent the packet and/or obtained a | 184 | - As soon as the driver has sent the packet and/or obtained a |
169 | hardware time stamp for it, it passes the time stamp back by | 185 | hardware time stamp for it, it passes the time stamp back by |
170 | calling skb_hwtstamp_tx() with the original skb, the raw | 186 | calling skb_hwtstamp_tx() with the original skb, the raw |
171 | hardware time stamp and a handle to the device (necessary | 187 | hardware time stamp. skb_hwtstamp_tx() clones the original skb and |
172 | to convert the hardware time stamp to system time). If obtaining | 188 | adds the timestamps, therefore the original skb has to be freed now. |
173 | the hardware time stamp somehow fails, then the driver should | 189 | If obtaining the hardware time stamp somehow fails, then the driver |
174 | not fall back to software time stamping. The rationale is that | 190 | should not fall back to software time stamping. The rationale is that |
175 | this would occur at a later time in the processing pipeline | 191 | this would occur at a later time in the processing pipeline than other |
176 | than other software time stamping and therefore could lead | 192 | software time stamping and therefore could lead to unexpected deltas |
177 | to unexpected deltas between time stamps. | 193 | between time stamps. |
178 | - If the driver did not call skb_hwtstamp_tx_in_progress(), then | 194 | - If the driver did not call set skb_tx(skb)->in_progress, then |
179 | dev_hard_start_xmit() checks whether software time stamping | 195 | dev_hard_start_xmit() checks whether software time stamping |
180 | is wanted as fallback and potentially generates the time stamp. | 196 | is wanted as fallback and potentially generates the time stamp. |
diff --git a/Documentation/networking/timestamping/Makefile b/Documentation/networking/timestamping/Makefile index 2a1489fdc036..e79973443e9f 100644 --- a/Documentation/networking/timestamping/Makefile +++ b/Documentation/networking/timestamping/Makefile | |||
@@ -1,6 +1,13 @@ | |||
1 | CPPFLAGS = -I../../../include | 1 | # kbuild trick to avoid linker error. Can be omitted if a module is built. |
2 | obj- := dummy.o | ||
2 | 3 | ||
3 | timestamping: timestamping.c | 4 | # List of programs to build |
5 | hostprogs-y := timestamping | ||
6 | |||
7 | # Tell kbuild to always build the programs | ||
8 | always := $(hostprogs-y) | ||
9 | |||
10 | HOSTCFLAGS_timestamping.o += -I$(objtree)/usr/include | ||
4 | 11 | ||
5 | clean: | 12 | clean: |
6 | rm -f timestamping | 13 | rm -f timestamping |
diff --git a/Documentation/networking/timestamping/timestamping.c b/Documentation/networking/timestamping/timestamping.c index a7936fe8444a..8ba82bfe6a33 100644 --- a/Documentation/networking/timestamping/timestamping.c +++ b/Documentation/networking/timestamping/timestamping.c | |||
@@ -41,9 +41,9 @@ | |||
41 | #include <arpa/inet.h> | 41 | #include <arpa/inet.h> |
42 | #include <net/if.h> | 42 | #include <net/if.h> |
43 | 43 | ||
44 | #include "asm/types.h" | 44 | #include <asm/types.h> |
45 | #include "linux/net_tstamp.h" | 45 | #include <linux/net_tstamp.h> |
46 | #include "linux/errqueue.h" | 46 | #include <linux/errqueue.h> |
47 | 47 | ||
48 | #ifndef SO_TIMESTAMPING | 48 | #ifndef SO_TIMESTAMPING |
49 | # define SO_TIMESTAMPING 37 | 49 | # define SO_TIMESTAMPING 37 |
@@ -164,7 +164,7 @@ static void printpacket(struct msghdr *msg, int res, | |||
164 | 164 | ||
165 | gettimeofday(&now, 0); | 165 | gettimeofday(&now, 0); |
166 | 166 | ||
167 | printf("%ld.%06ld: received %s data, %d bytes from %s, %d bytes control messages\n", | 167 | printf("%ld.%06ld: received %s data, %d bytes from %s, %zu bytes control messages\n", |
168 | (long)now.tv_sec, (long)now.tv_usec, | 168 | (long)now.tv_sec, (long)now.tv_usec, |
169 | (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular", | 169 | (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular", |
170 | res, | 170 | res, |
@@ -173,7 +173,7 @@ static void printpacket(struct msghdr *msg, int res, | |||
173 | for (cmsg = CMSG_FIRSTHDR(msg); | 173 | for (cmsg = CMSG_FIRSTHDR(msg); |
174 | cmsg; | 174 | cmsg; |
175 | cmsg = CMSG_NXTHDR(msg, cmsg)) { | 175 | cmsg = CMSG_NXTHDR(msg, cmsg)) { |
176 | printf(" cmsg len %d: ", cmsg->cmsg_len); | 176 | printf(" cmsg len %zu: ", cmsg->cmsg_len); |
177 | switch (cmsg->cmsg_level) { | 177 | switch (cmsg->cmsg_level) { |
178 | case SOL_SOCKET: | 178 | case SOL_SOCKET: |
179 | printf("SOL_SOCKET "); | 179 | printf("SOL_SOCKET "); |
@@ -370,7 +370,7 @@ int main(int argc, char **argv) | |||
370 | } | 370 | } |
371 | 371 | ||
372 | sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); | 372 | sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); |
373 | if (socket < 0) | 373 | if (sock < 0) |
374 | bail("socket"); | 374 | bail("socket"); |
375 | 375 | ||
376 | memset(&device, 0, sizeof(device)); | 376 | memset(&device, 0, sizeof(device)); |
diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt index 975cc87ebdd1..78f662ee0622 100644 --- a/Documentation/networking/x25-iface.txt +++ b/Documentation/networking/x25-iface.txt | |||
@@ -20,23 +20,23 @@ the rest of the skbuff, if any more information does exist. | |||
20 | Packet Layer to Device Driver | 20 | Packet Layer to Device Driver |
21 | ----------------------------- | 21 | ----------------------------- |
22 | 22 | ||
23 | First Byte = 0x00 | 23 | First Byte = 0x00 (X25_IFACE_DATA) |
24 | 24 | ||
25 | This indicates that the rest of the skbuff contains data to be transmitted | 25 | This indicates that the rest of the skbuff contains data to be transmitted |
26 | over the LAPB link. The LAPB link should already exist before any data is | 26 | over the LAPB link. The LAPB link should already exist before any data is |
27 | passed down. | 27 | passed down. |
28 | 28 | ||
29 | First Byte = 0x01 | 29 | First Byte = 0x01 (X25_IFACE_CONNECT) |
30 | 30 | ||
31 | Establish the LAPB link. If the link is already established then the connect | 31 | Establish the LAPB link. If the link is already established then the connect |
32 | confirmation message should be returned as soon as possible. | 32 | confirmation message should be returned as soon as possible. |
33 | 33 | ||
34 | First Byte = 0x02 | 34 | First Byte = 0x02 (X25_IFACE_DISCONNECT) |
35 | 35 | ||
36 | Terminate the LAPB link. If it is already disconnected then the disconnect | 36 | Terminate the LAPB link. If it is already disconnected then the disconnect |
37 | confirmation message should be returned as soon as possible. | 37 | confirmation message should be returned as soon as possible. |
38 | 38 | ||
39 | First Byte = 0x03 | 39 | First Byte = 0x03 (X25_IFACE_PARAMS) |
40 | 40 | ||
41 | LAPB parameters. To be defined. | 41 | LAPB parameters. To be defined. |
42 | 42 | ||
@@ -44,22 +44,22 @@ LAPB parameters. To be defined. | |||
44 | Device Driver to Packet Layer | 44 | Device Driver to Packet Layer |
45 | ----------------------------- | 45 | ----------------------------- |
46 | 46 | ||
47 | First Byte = 0x00 | 47 | First Byte = 0x00 (X25_IFACE_DATA) |
48 | 48 | ||
49 | This indicates that the rest of the skbuff contains data that has been | 49 | This indicates that the rest of the skbuff contains data that has been |
50 | received over the LAPB link. | 50 | received over the LAPB link. |
51 | 51 | ||
52 | First Byte = 0x01 | 52 | First Byte = 0x01 (X25_IFACE_CONNECT) |
53 | 53 | ||
54 | LAPB link has been established. The same message is used for both a LAPB | 54 | LAPB link has been established. The same message is used for both a LAPB |
55 | link connect_confirmation and a connect_indication. | 55 | link connect_confirmation and a connect_indication. |
56 | 56 | ||
57 | First Byte = 0x02 | 57 | First Byte = 0x02 (X25_IFACE_DISCONNECT) |
58 | 58 | ||
59 | LAPB link has been terminated. This same message is used for both a LAPB | 59 | LAPB link has been terminated. This same message is used for both a LAPB |
60 | link disconnect_confirmation and a disconnect_indication. | 60 | link disconnect_confirmation and a disconnect_indication. |
61 | 61 | ||
62 | First Byte = 0x03 | 62 | First Byte = 0x03 (X25_IFACE_PARAMS) |
63 | 63 | ||
64 | LAPB parameters. To be defined. | 64 | LAPB parameters. To be defined. |
65 | 65 | ||
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index c10c022b911c..6fe9001b9263 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt | |||
@@ -256,9 +256,13 @@ characters, each representing a particular tainted value. | |||
256 | 9: 'A' if the ACPI table has been overridden. | 256 | 9: 'A' if the ACPI table has been overridden. |
257 | 257 | ||
258 | 10: 'W' if a warning has previously been issued by the kernel. | 258 | 10: 'W' if a warning has previously been issued by the kernel. |
259 | (Though some warnings may set more specific taint flags.) | ||
259 | 260 | ||
260 | 11: 'C' if a staging driver has been loaded. | 261 | 11: 'C' if a staging driver has been loaded. |
261 | 262 | ||
263 | 12: 'I' if the kernel is working around a severe bug in the platform | ||
264 | firmware (BIOS or similar). | ||
265 | |||
262 | The primary reason for the 'Tainted: ' string is to tell kernel | 266 | The primary reason for the 'Tainted: ' string is to tell kernel |
263 | debuggers if this is a clean kernel or if anything unusual has | 267 | debuggers if this is a clean kernel or if anything unusual has |
264 | occurred. Tainting is permanent: even if an offending module is | 268 | occurred. Tainting is permanent: even if an offending module is |
diff --git a/Documentation/padata.txt b/Documentation/padata.txt new file mode 100644 index 000000000000..269d7d0d8335 --- /dev/null +++ b/Documentation/padata.txt | |||
@@ -0,0 +1,107 @@ | |||
1 | The padata parallel execution mechanism | ||
2 | Last updated for 2.6.34 | ||
3 | |||
4 | Padata is a mechanism by which the kernel can farm work out to be done in | ||
5 | parallel on multiple CPUs while retaining the ordering of tasks. It was | ||
6 | developed for use with the IPsec code, which needs to be able to perform | ||
7 | encryption and decryption on large numbers of packets without reordering | ||
8 | those packets. The crypto developers made a point of writing padata in a | ||
9 | sufficiently general fashion that it could be put to other uses as well. | ||
10 | |||
11 | The first step in using padata is to set up a padata_instance structure for | ||
12 | overall control of how tasks are to be run: | ||
13 | |||
14 | #include <linux/padata.h> | ||
15 | |||
16 | struct padata_instance *padata_alloc(const struct cpumask *cpumask, | ||
17 | struct workqueue_struct *wq); | ||
18 | |||
19 | The cpumask describes which processors will be used to execute work | ||
20 | submitted to this instance. The workqueue wq is where the work will | ||
21 | actually be done; it should be a multithreaded queue, naturally. | ||
22 | |||
23 | There are functions for enabling and disabling the instance: | ||
24 | |||
25 | void padata_start(struct padata_instance *pinst); | ||
26 | void padata_stop(struct padata_instance *pinst); | ||
27 | |||
28 | These functions literally do nothing beyond setting or clearing the | ||
29 | "padata_start() was called" flag; if that flag is not set, other functions | ||
30 | will refuse to work. | ||
31 | |||
32 | The list of CPUs to be used can be adjusted with these functions: | ||
33 | |||
34 | int padata_set_cpumask(struct padata_instance *pinst, | ||
35 | cpumask_var_t cpumask); | ||
36 | int padata_add_cpu(struct padata_instance *pinst, int cpu); | ||
37 | int padata_remove_cpu(struct padata_instance *pinst, int cpu); | ||
38 | |||
39 | Changing the CPU mask has the look of an expensive operation, though, so it | ||
40 | probably should not be done with great frequency. | ||
41 | |||
42 | Actually submitting work to the padata instance requires the creation of a | ||
43 | padata_priv structure: | ||
44 | |||
45 | struct padata_priv { | ||
46 | /* Other stuff here... */ | ||
47 | void (*parallel)(struct padata_priv *padata); | ||
48 | void (*serial)(struct padata_priv *padata); | ||
49 | }; | ||
50 | |||
51 | This structure will almost certainly be embedded within some larger | ||
52 | structure specific to the work to be done. Most its fields are private to | ||
53 | padata, but the structure should be zeroed at initialization time, and the | ||
54 | parallel() and serial() functions should be provided. Those functions will | ||
55 | be called in the process of getting the work done as we will see | ||
56 | momentarily. | ||
57 | |||
58 | The submission of work is done with: | ||
59 | |||
60 | int padata_do_parallel(struct padata_instance *pinst, | ||
61 | struct padata_priv *padata, int cb_cpu); | ||
62 | |||
63 | The pinst and padata structures must be set up as described above; cb_cpu | ||
64 | specifies which CPU will be used for the final callback when the work is | ||
65 | done; it must be in the current instance's CPU mask. The return value from | ||
66 | padata_do_parallel() is a little strange; zero is an error return | ||
67 | indicating that the caller forgot the padata_start() formalities. -EBUSY | ||
68 | means that somebody, somewhere else is messing with the instance's CPU | ||
69 | mask, while -EINVAL is a complaint about cb_cpu not being in that CPU mask. | ||
70 | If all goes well, this function will return -EINPROGRESS, indicating that | ||
71 | the work is in progress. | ||
72 | |||
73 | Each task submitted to padata_do_parallel() will, in turn, be passed to | ||
74 | exactly one call to the above-mentioned parallel() function, on one CPU, so | ||
75 | true parallelism is achieved by submitting multiple tasks. Despite the | ||
76 | fact that the workqueue is used to make these calls, parallel() is run with | ||
77 | software interrupts disabled and thus cannot sleep. The parallel() | ||
78 | function gets the padata_priv structure pointer as its lone parameter; | ||
79 | information about the actual work to be done is probably obtained by using | ||
80 | container_of() to find the enclosing structure. | ||
81 | |||
82 | Note that parallel() has no return value; the padata subsystem assumes that | ||
83 | parallel() will take responsibility for the task from this point. The work | ||
84 | need not be completed during this call, but, if parallel() leaves work | ||
85 | outstanding, it should be prepared to be called again with a new job before | ||
86 | the previous one completes. When a task does complete, parallel() (or | ||
87 | whatever function actually finishes the job) should inform padata of the | ||
88 | fact with a call to: | ||
89 | |||
90 | void padata_do_serial(struct padata_priv *padata); | ||
91 | |||
92 | At some point in the future, padata_do_serial() will trigger a call to the | ||
93 | serial() function in the padata_priv structure. That call will happen on | ||
94 | the CPU requested in the initial call to padata_do_parallel(); it, too, is | ||
95 | done through the workqueue, but with local software interrupts disabled. | ||
96 | Note that this call may be deferred for a while since the padata code takes | ||
97 | pains to ensure that tasks are completed in the order in which they were | ||
98 | submitted. | ||
99 | |||
100 | The one remaining function in the padata API should be called to clean up | ||
101 | when a padata instance is no longer needed: | ||
102 | |||
103 | void padata_free(struct padata_instance *pinst); | ||
104 | |||
105 | This function will busy-wait while any remaining tasks are completed, so it | ||
106 | might be best not to call it while there is work outstanding. Shutting | ||
107 | down the workqueue, if necessary, should be done separately. | ||
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt index 446f43b309df..61bc4e943116 100644 --- a/Documentation/pcmcia/driver-changes.txt +++ b/Documentation/pcmcia/driver-changes.txt | |||
@@ -1,4 +1,17 @@ | |||
1 | This file details changes in 2.6 which affect PCMCIA card driver authors: | 1 | This file details changes in 2.6 which affect PCMCIA card driver authors: |
2 | * No dev_node_t (as of 2.6.35) | ||
3 | There is no more need to fill out a "dev_node_t" structure. | ||
4 | |||
5 | * New IRQ request rules (as of 2.6.35) | ||
6 | Instead of the old pcmcia_request_irq() interface, drivers may now | ||
7 | choose between: | ||
8 | - calling request_irq/free_irq directly. Use the IRQ from *p_dev->irq. | ||
9 | - use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will | ||
10 | clean up automatically on calls to pcmcia_disable_device() or | ||
11 | device ejection. | ||
12 | - drivers still not capable of IRQF_SHARED (or not telling us so) may | ||
13 | use the deprecated pcmcia_request_exclusive_irq() for the time | ||
14 | being; they might receive a shared IRQ nonetheless. | ||
2 | 15 | ||
3 | * no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) | 16 | * no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) |
4 | Instead of the cs_error() callback or the CS_CHECK() macro, please use | 17 | Instead of the cs_error() callback or the CS_CHECK() macro, please use |
diff --git a/Documentation/pcmcia/locking.txt b/Documentation/pcmcia/locking.txt new file mode 100644 index 000000000000..68f622bc4064 --- /dev/null +++ b/Documentation/pcmcia/locking.txt | |||
@@ -0,0 +1,118 @@ | |||
1 | This file explains the locking and exclusion scheme used in the PCCARD | ||
2 | and PCMCIA subsystems. | ||
3 | |||
4 | |||
5 | A) Overview, Locking Hierarchy: | ||
6 | =============================== | ||
7 | |||
8 | pcmcia_socket_list_rwsem - protects only the list of sockets | ||
9 | - skt_mutex - serializes card insert / ejection | ||
10 | - ops_mutex - serializes socket operation | ||
11 | |||
12 | |||
13 | B) Exclusion | ||
14 | ============ | ||
15 | |||
16 | The following functions and callbacks to struct pcmcia_socket must | ||
17 | be called with "skt_mutex" held: | ||
18 | |||
19 | socket_detect_change() | ||
20 | send_event() | ||
21 | socket_reset() | ||
22 | socket_shutdown() | ||
23 | socket_setup() | ||
24 | socket_remove() | ||
25 | socket_insert() | ||
26 | socket_early_resume() | ||
27 | socket_late_resume() | ||
28 | socket_resume() | ||
29 | socket_suspend() | ||
30 | |||
31 | struct pcmcia_callback *callback | ||
32 | |||
33 | The following functions and callbacks to struct pcmcia_socket must | ||
34 | be called with "ops_mutex" held: | ||
35 | |||
36 | socket_reset() | ||
37 | socket_setup() | ||
38 | |||
39 | struct pccard_operations *ops | ||
40 | struct pccard_resource_ops *resource_ops; | ||
41 | |||
42 | Note that send_event() and struct pcmcia_callback *callback must not be | ||
43 | called with "ops_mutex" held. | ||
44 | |||
45 | |||
46 | C) Protection | ||
47 | ============= | ||
48 | |||
49 | 1. Global Data: | ||
50 | --------------- | ||
51 | struct list_head pcmcia_socket_list; | ||
52 | |||
53 | protected by pcmcia_socket_list_rwsem; | ||
54 | |||
55 | |||
56 | 2. Per-Socket Data: | ||
57 | ------------------- | ||
58 | The resource_ops and their data are protected by ops_mutex. | ||
59 | |||
60 | The "main" struct pcmcia_socket is protected as follows (read-only fields | ||
61 | or single-use fields not mentioned): | ||
62 | |||
63 | - by pcmcia_socket_list_rwsem: | ||
64 | struct list_head socket_list; | ||
65 | |||
66 | - by thread_lock: | ||
67 | unsigned int thread_events; | ||
68 | |||
69 | - by skt_mutex: | ||
70 | u_int suspended_state; | ||
71 | void (*tune_bridge); | ||
72 | struct pcmcia_callback *callback; | ||
73 | int resume_status; | ||
74 | |||
75 | - by ops_mutex: | ||
76 | socket_state_t socket; | ||
77 | u_int state; | ||
78 | u_short lock_count; | ||
79 | pccard_mem_map cis_mem; | ||
80 | void __iomem *cis_virt; | ||
81 | struct { } irq; | ||
82 | io_window_t io[]; | ||
83 | pccard_mem_map win[]; | ||
84 | struct list_head cis_cache; | ||
85 | size_t fake_cis_len; | ||
86 | u8 *fake_cis; | ||
87 | u_int irq_mask; | ||
88 | void (*zoom_video); | ||
89 | int (*power_hook); | ||
90 | u8 resource...; | ||
91 | struct list_head devices_list; | ||
92 | u8 device_count; | ||
93 | struct pcmcia_state; | ||
94 | |||
95 | |||
96 | 3. Per PCMCIA-device Data: | ||
97 | -------------------------- | ||
98 | |||
99 | The "main" struct pcmcia_devie is protected as follows (read-only fields | ||
100 | or single-use fields not mentioned): | ||
101 | |||
102 | |||
103 | - by pcmcia_socket->ops_mutex: | ||
104 | struct list_head socket_device_list; | ||
105 | struct config_t *function_config; | ||
106 | u16 _irq:1; | ||
107 | u16 _io:1; | ||
108 | u16 _win:4; | ||
109 | u16 _locked:1; | ||
110 | u16 allow_func_id_match:1; | ||
111 | u16 suspended:1; | ||
112 | u16 _removed:1; | ||
113 | |||
114 | - by the PCMCIA driver: | ||
115 | io_req_t io; | ||
116 | irq_req_t irq; | ||
117 | config_req_t conf; | ||
118 | window_handle_t win; | ||
diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt index a327db67782a..763e4659bf18 100644 --- a/Documentation/pnp.txt +++ b/Documentation/pnp.txt | |||
@@ -57,7 +57,7 @@ PC standard floppy disk controller | |||
57 | # cat resources | 57 | # cat resources |
58 | DISABLED | 58 | DISABLED |
59 | 59 | ||
60 | - Notice the string "DISABLED". THis means the device is not active. | 60 | - Notice the string "DISABLED". This means the device is not active. |
61 | 61 | ||
62 | 3.) check the device's possible configurations (optional) | 62 | 3.) check the device's possible configurations (optional) |
63 | # cat options | 63 | # cat options |
@@ -139,7 +139,7 @@ Plug and Play but it is planned to be in the near future. | |||
139 | 139 | ||
140 | Requirements for a Linux PnP protocol: | 140 | Requirements for a Linux PnP protocol: |
141 | 1.) the protocol must use EISA IDs | 141 | 1.) the protocol must use EISA IDs |
142 | 2.) the protocol must inform the PnP Layer of a devices current configuration | 142 | 2.) the protocol must inform the PnP Layer of a device's current configuration |
143 | - the ability to set resources is optional but preferred. | 143 | - the ability to set resources is optional but preferred. |
144 | 144 | ||
145 | The following are PnP protocol related functions: | 145 | The following are PnP protocol related functions: |
@@ -158,7 +158,7 @@ pnp_remove_device | |||
158 | - automatically will free mem used by the device and related structures | 158 | - automatically will free mem used by the device and related structures |
159 | 159 | ||
160 | pnp_add_id | 160 | pnp_add_id |
161 | - adds a EISA ID to the list of supported IDs for the specified device | 161 | - adds an EISA ID to the list of supported IDs for the specified device |
162 | 162 | ||
163 | For more information consult the source of a protocol such as | 163 | For more information consult the source of a protocol such as |
164 | /drivers/pnp/pnpbios/core.c. | 164 | /drivers/pnp/pnpbios/core.c. |
@@ -167,7 +167,7 @@ For more information consult the source of a protocol such as | |||
167 | 167 | ||
168 | Linux Plug and Play Drivers | 168 | Linux Plug and Play Drivers |
169 | --------------------------- | 169 | --------------------------- |
170 | This section contains information for linux PnP driver developers. | 170 | This section contains information for Linux PnP driver developers. |
171 | 171 | ||
172 | The New Way | 172 | The New Way |
173 | ........... | 173 | ........... |
@@ -235,11 +235,10 @@ static int __init serial8250_pnp_init(void) | |||
235 | The Old Way | 235 | The Old Way |
236 | ........... | 236 | ........... |
237 | 237 | ||
238 | a series of compatibility functions have been created to make it easy to convert | 238 | A series of compatibility functions have been created to make it easy to convert |
239 | |||
240 | ISAPNP drivers. They should serve as a temporary solution only. | 239 | ISAPNP drivers. They should serve as a temporary solution only. |
241 | 240 | ||
242 | they are as follows: | 241 | They are as follows: |
243 | 242 | ||
244 | struct pnp_card *pnp_find_card(unsigned short vendor, | 243 | struct pnp_card *pnp_find_card(unsigned short vendor, |
245 | unsigned short device, | 244 | unsigned short device, |
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index c9abbd86bc18..57080cd74575 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt | |||
@@ -1,7 +1,13 @@ | |||
1 | Device Power Management | ||
2 | |||
3 | Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. | ||
4 | Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu> | ||
5 | |||
6 | |||
1 | Most of the code in Linux is device drivers, so most of the Linux power | 7 | Most of the code in Linux is device drivers, so most of the Linux power |
2 | management code is also driver-specific. Most drivers will do very little; | 8 | management (PM) code is also driver-specific. Most drivers will do very |
3 | others, especially for platforms with small batteries (like cell phones), | 9 | little; others, especially for platforms with small batteries (like cell |
4 | will do a lot. | 10 | phones), will do a lot. |
5 | 11 | ||
6 | This writeup gives an overview of how drivers interact with system-wide | 12 | This writeup gives an overview of how drivers interact with system-wide |
7 | power management goals, emphasizing the models and interfaces that are | 13 | power management goals, emphasizing the models and interfaces that are |
@@ -15,9 +21,10 @@ Drivers will use one or both of these models to put devices into low-power | |||
15 | states: | 21 | states: |
16 | 22 | ||
17 | System Sleep model: | 23 | System Sleep model: |
18 | Drivers can enter low power states as part of entering system-wide | 24 | Drivers can enter low-power states as part of entering system-wide |
19 | low-power states like "suspend-to-ram", or (mostly for systems with | 25 | low-power states like "suspend" (also known as "suspend-to-RAM"), or |
20 | disks) "hibernate" (suspend-to-disk). | 26 | (mostly for systems with disks) "hibernation" (also known as |
27 | "suspend-to-disk"). | ||
21 | 28 | ||
22 | This is something that device, bus, and class drivers collaborate on | 29 | This is something that device, bus, and class drivers collaborate on |
23 | by implementing various role-specific suspend and resume methods to | 30 | by implementing various role-specific suspend and resume methods to |
@@ -25,33 +32,41 @@ states: | |||
25 | them without loss of data. | 32 | them without loss of data. |
26 | 33 | ||
27 | Some drivers can manage hardware wakeup events, which make the system | 34 | Some drivers can manage hardware wakeup events, which make the system |
28 | leave that low-power state. This feature may be disabled using the | 35 | leave the low-power state. This feature may be enabled or disabled |
29 | relevant /sys/devices/.../power/wakeup file; enabling it may cost some | 36 | using the relevant /sys/devices/.../power/wakeup file (for Ethernet |
30 | power usage, but let the whole system enter low power states more often. | 37 | drivers the ioctl interface used by ethtool may also be used for this |
38 | purpose); enabling it may cost some power usage, but let the whole | ||
39 | system enter low-power states more often. | ||
31 | 40 | ||
32 | Runtime Power Management model: | 41 | Runtime Power Management model: |
33 | Drivers may also enter low power states while the system is running, | 42 | Devices may also be put into low-power states while the system is |
34 | independently of other power management activity. Upstream drivers | 43 | running, independently of other power management activity in principle. |
35 | will normally not know (or care) if the device is in some low power | 44 | However, devices are not generally independent of each other (for |
36 | state when issuing requests; the driver will auto-resume anything | 45 | example, a parent device cannot be suspended unless all of its child |
37 | that's needed when it gets a request. | 46 | devices have been suspended). Moreover, depending on the bus type the |
38 | 47 | device is on, it may be necessary to carry out some bus-specific | |
39 | This doesn't have, or need much infrastructure; it's just something you | 48 | operations on the device for this purpose. Devices put into low power |
40 | should do when writing your drivers. For example, clk_disable() unused | 49 | states at run time may require special handling during system-wide power |
41 | clocks as part of minimizing power drain for currently-unused hardware. | 50 | transitions (suspend or hibernation). |
42 | Of course, sometimes clusters of drivers will collaborate with each | 51 | |
43 | other, which could involve task-specific power management. | 52 | For these reasons not only the device driver itself, but also the |
44 | 53 | appropriate subsystem (bus type, device type or device class) driver and | |
45 | There's not a lot to be said about those low power states except that they | 54 | the PM core are involved in runtime power management. As in the system |
46 | are very system-specific, and often device-specific. Also, that if enough | 55 | sleep power management case, they need to collaborate by implementing |
47 | drivers put themselves into low power states (at "runtime"), the effect may be | 56 | various role-specific suspend and resume methods, so that the hardware |
48 | the same as entering some system-wide low-power state (system sleep) ... and | 57 | is cleanly powered down and reactivated without data or service loss. |
49 | that synergies exist, so that several drivers using runtime pm might put the | 58 | |
50 | system into a state where even deeper power saving options are available. | 59 | There's not a lot to be said about those low-power states except that they are |
51 | 60 | very system-specific, and often device-specific. Also, that if enough devices | |
52 | Most suspended devices will have quiesced all I/O: no more DMA or irqs, no | 61 | have been put into low-power states (at runtime), the effect may be very similar |
53 | more data read or written, and requests from upstream drivers are no longer | 62 | to entering some system-wide low-power state (system sleep) ... and that |
54 | accepted. A given bus or platform may have different requirements though. | 63 | synergies exist, so that several drivers using runtime PM might put the system |
64 | into a state where even deeper power saving options are available. | ||
65 | |||
66 | Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except | ||
67 | for wakeup events), no more data read or written, and requests from upstream | ||
68 | drivers are no longer accepted. A given bus or platform may have different | ||
69 | requirements though. | ||
55 | 70 | ||
56 | Examples of hardware wakeup events include an alarm from a real time clock, | 71 | Examples of hardware wakeup events include an alarm from a real time clock, |
57 | network wake-on-LAN packets, keyboard or mouse activity, and media insertion | 72 | network wake-on-LAN packets, keyboard or mouse activity, and media insertion |
@@ -60,129 +75,152 @@ or removal (for PCMCIA, MMC/SD, USB, and so on). | |||
60 | 75 | ||
61 | Interfaces for Entering System Sleep States | 76 | Interfaces for Entering System Sleep States |
62 | =========================================== | 77 | =========================================== |
63 | Most of the programming interfaces a device driver needs to know about | 78 | There are programming interfaces provided for subsystems (bus type, device type, |
64 | relate to that first model: entering a system-wide low power state, | 79 | device class) and device drivers to allow them to participate in the power |
65 | rather than just minimizing power consumption by one device. | 80 | management of devices they are concerned with. These interfaces cover both |
66 | 81 | system sleep and runtime power management. | |
67 | 82 | ||
68 | Bus Driver Methods | 83 | |
69 | ------------------ | 84 | Device Power Management Operations |
70 | The core methods to suspend and resume devices reside in struct bus_type. | 85 | ---------------------------------- |
71 | These are mostly of interest to people writing infrastructure for busses | 86 | Device power management operations, at the subsystem level as well as at the |
72 | like PCI or USB, or because they define the primitives that device drivers | 87 | device driver level, are implemented by defining and populating objects of type |
73 | may need to apply in domain-specific ways to their devices: | 88 | struct dev_pm_ops: |
74 | 89 | ||
75 | struct bus_type { | 90 | struct dev_pm_ops { |
76 | ... | 91 | int (*prepare)(struct device *dev); |
77 | int (*suspend)(struct device *dev, pm_message_t state); | 92 | void (*complete)(struct device *dev); |
78 | int (*resume)(struct device *dev); | 93 | int (*suspend)(struct device *dev); |
94 | int (*resume)(struct device *dev); | ||
95 | int (*freeze)(struct device *dev); | ||
96 | int (*thaw)(struct device *dev); | ||
97 | int (*poweroff)(struct device *dev); | ||
98 | int (*restore)(struct device *dev); | ||
99 | int (*suspend_noirq)(struct device *dev); | ||
100 | int (*resume_noirq)(struct device *dev); | ||
101 | int (*freeze_noirq)(struct device *dev); | ||
102 | int (*thaw_noirq)(struct device *dev); | ||
103 | int (*poweroff_noirq)(struct device *dev); | ||
104 | int (*restore_noirq)(struct device *dev); | ||
105 | int (*runtime_suspend)(struct device *dev); | ||
106 | int (*runtime_resume)(struct device *dev); | ||
107 | int (*runtime_idle)(struct device *dev); | ||
79 | }; | 108 | }; |
80 | 109 | ||
81 | Bus drivers implement those methods as appropriate for the hardware and | 110 | This structure is defined in include/linux/pm.h and the methods included in it |
82 | the drivers using it; PCI works differently from USB, and so on. Not many | 111 | are also described in that file. Their roles will be explained in what follows. |
83 | people write bus drivers; most driver code is a "device driver" that | 112 | For now, it should be sufficient to remember that the last three methods are |
84 | builds on top of bus-specific framework code. | 113 | specific to runtime power management while the remaining ones are used during |
114 | system-wide power transitions. | ||
85 | 115 | ||
86 | For more information on these driver calls, see the description later; | 116 | There also is a deprecated "old" or "legacy" interface for power management |
87 | they are called in phases for every device, respecting the parent-child | 117 | operations available at least for some subsystems. This approach does not use |
88 | sequencing in the driver model tree. Note that as this is being written, | 118 | struct dev_pm_ops objects and it is suitable only for implementing system sleep |
89 | only the suspend() and resume() are widely available; not many bus drivers | 119 | power management methods. Therefore it is not described in this document, so |
90 | leverage all of those phases, or pass them down to lower driver levels. | 120 | please refer directly to the source code for more information about it. |
91 | 121 | ||
92 | 122 | ||
93 | /sys/devices/.../power/wakeup files | 123 | Subsystem-Level Methods |
94 | ----------------------------------- | 124 | ----------------------- |
95 | All devices in the driver model have two flags to control handling of | 125 | The core methods to suspend and resume devices reside in struct dev_pm_ops |
96 | wakeup events, which are hardware signals that can force the device and/or | 126 | pointed to by the pm member of struct bus_type, struct device_type and |
97 | system out of a low power state. These are initialized by bus or device | 127 | struct class. They are mostly of interest to the people writing infrastructure |
98 | driver code using device_init_wakeup(dev,can_wakeup). | 128 | for buses, like PCI or USB, or device type and device class drivers. |
99 | 129 | ||
100 | The "can_wakeup" flag just records whether the device (and its driver) can | 130 | Bus drivers implement these methods as appropriate for the hardware and the |
101 | physically support wakeup events. When that flag is clear, the sysfs | 131 | drivers using it; PCI works differently from USB, and so on. Not many people |
102 | "wakeup" file is empty, and device_may_wakeup() returns false. | 132 | write subsystem-level drivers; most driver code is a "device driver" that builds |
133 | on top of bus-specific framework code. | ||
103 | 134 | ||
104 | For devices that can issue wakeup events, a separate flag controls whether | 135 | For more information on these driver calls, see the description later; |
105 | that device should try to use its wakeup mechanism. The initial value of | 136 | they are called in phases for every device, respecting the parent-child |
106 | device_may_wakeup() will be true, so that the device's "wakeup" file holds | 137 | sequencing in the driver model tree. |
107 | the value "enabled". Userspace can change that to "disabled" so that | ||
108 | device_may_wakeup() returns false; or change it back to "enabled" (so that | ||
109 | it returns true again). | ||
110 | 138 | ||
111 | 139 | ||
112 | EXAMPLE: PCI Device Driver Methods | 140 | /sys/devices/.../power/wakeup files |
113 | ----------------------------------- | 141 | ----------------------------------- |
114 | PCI framework software calls these methods when the PCI device driver bound | 142 | All devices in the driver model have two flags to control handling of wakeup |
115 | to a device device has provided them: | 143 | events (hardware signals that can force the device and/or system out of a low |
116 | 144 | power state). These flags are initialized by bus or device driver code using | |
117 | struct pci_driver { | 145 | device_set_wakeup_capable() and device_set_wakeup_enable(), defined in |
118 | ... | 146 | include/linux/pm_wakeup.h. |
119 | int (*suspend)(struct pci_device *pdev, pm_message_t state); | ||
120 | int (*suspend_late)(struct pci_device *pdev, pm_message_t state); | ||
121 | 147 | ||
122 | int (*resume_early)(struct pci_device *pdev); | 148 | The "can_wakeup" flag just records whether the device (and its driver) can |
123 | int (*resume)(struct pci_device *pdev); | 149 | physically support wakeup events. The device_set_wakeup_capable() routine |
124 | }; | 150 | affects this flag. The "should_wakeup" flag controls whether the device should |
125 | 151 | try to use its wakeup mechanism. device_set_wakeup_enable() affects this flag; | |
126 | Drivers will implement those methods, and call PCI-specific procedures | 152 | for the most part drivers should not change its value. The initial value of |
127 | like pci_set_power_state(), pci_enable_wake(), pci_save_state(), and | 153 | should_wakeup is supposed to be false for the majority of devices; the major |
128 | pci_restore_state() to manage PCI-specific mechanisms. (PCI config space | 154 | exceptions are power buttons, keyboards, and Ethernet adapters whose WoL |
129 | could be saved during driver probe, if it weren't for the fact that some | 155 | (wake-on-LAN) feature has been set up with ethtool. |
130 | systems rely on userspace tweaking using setpci.) Devices are suspended | 156 | |
131 | before their bridges enter low power states, and likewise bridges resume | 157 | Whether or not a device is capable of issuing wakeup events is a hardware |
132 | before their devices. | 158 | matter, and the kernel is responsible for keeping track of it. By contrast, |
133 | 159 | whether or not a wakeup-capable device should issue wakeup events is a policy | |
134 | 160 | decision, and it is managed by user space through a sysfs attribute: the | |
135 | Upper Layers of Driver Stacks | 161 | power/wakeup file. User space can write the strings "enabled" or "disabled" to |
136 | ----------------------------- | 162 | set or clear the should_wakeup flag, respectively. Reads from the file will |
137 | Device drivers generally have at least two interfaces, and the methods | 163 | return the corresponding string if can_wakeup is true, but if can_wakeup is |
138 | sketched above are the ones which apply to the lower level (nearer PCI, USB, | 164 | false then reads will return an empty string, to indicate that the device |
139 | or other bus hardware). The network and block layers are examples of upper | 165 | doesn't support wakeup events. (But even though the file appears empty, writes |
140 | level interfaces, as is a character device talking to userspace. | 166 | will still affect the should_wakeup flag.) |
141 | 167 | ||
142 | Power management requests normally need to flow through those upper levels, | 168 | The device_may_wakeup() routine returns true only if both flags are set. |
143 | which often use domain-oriented requests like "blank that screen". In | 169 | Drivers should check this routine when putting devices in a low-power state |
144 | some cases those upper levels will have power management intelligence that | 170 | during a system sleep transition, to see whether or not to enable the devices' |
145 | relates to end-user activity, or other devices that work in cooperation. | 171 | wakeup mechanisms. However for runtime power management, wakeup events should |
146 | 172 | be enabled whenever the device and driver both support them, regardless of the | |
147 | When those interfaces are structured using class interfaces, there is a | 173 | should_wakeup flag. |
148 | standard way to have the upper layer stop issuing requests to a given | 174 | |
149 | class device (and restart later): | 175 | |
150 | 176 | /sys/devices/.../power/control files | |
151 | struct class { | 177 | ------------------------------------ |
152 | ... | 178 | Each device in the driver model has a flag to control whether it is subject to |
153 | int (*suspend)(struct device *dev, pm_message_t state); | 179 | runtime power management. This flag, called runtime_auto, is initialized by the |
154 | int (*resume)(struct device *dev); | 180 | bus type (or generally subsystem) code using pm_runtime_allow() or |
155 | }; | 181 | pm_runtime_forbid(); the default is to allow runtime power management. |
156 | 182 | ||
157 | Those calls are issued in specific phases of the process by which the | 183 | The setting can be adjusted by user space by writing either "on" or "auto" to |
158 | system enters a low power "suspend" state, or resumes from it. | 184 | the device's power/control sysfs file. Writing "auto" calls pm_runtime_allow(), |
159 | 185 | setting the flag and allowing the device to be runtime power-managed by its | |
160 | 186 | driver. Writing "on" calls pm_runtime_forbid(), clearing the flag, returning | |
161 | Calling Drivers to Enter System Sleep States | 187 | the device to full power if it was in a low-power state, and preventing the |
162 | ============================================ | 188 | device from being runtime power-managed. User space can check the current value |
163 | When the system enters a low power state, each device's driver is asked | 189 | of the runtime_auto flag by reading the file. |
164 | to suspend the device by putting it into state compatible with the target | 190 | |
191 | The device's runtime_auto flag has no effect on the handling of system-wide | ||
192 | power transitions. In particular, the device can (and in the majority of cases | ||
193 | should and will) be put into a low-power state during a system-wide transition | ||
194 | to a sleep state even though its runtime_auto flag is clear. | ||
195 | |||
196 | For more information about the runtime power management framework, refer to | ||
197 | Documentation/power/runtime_pm.txt. | ||
198 | |||
199 | |||
200 | Calling Drivers to Enter and Leave System Sleep States | ||
201 | ====================================================== | ||
202 | When the system goes into a sleep state, each device's driver is asked to | ||
203 | suspend the device by putting it into a state compatible with the target | ||
165 | system state. That's usually some version of "off", but the details are | 204 | system state. That's usually some version of "off", but the details are |
166 | system-specific. Also, wakeup-enabled devices will usually stay partly | 205 | system-specific. Also, wakeup-enabled devices will usually stay partly |
167 | functional in order to wake the system. | 206 | functional in order to wake the system. |
168 | 207 | ||
169 | When the system leaves that low power state, the device's driver is asked | 208 | When the system leaves that low-power state, the device's driver is asked to |
170 | to resume it. The suspend and resume operations always go together, and | 209 | resume it by returning it to full power. The suspend and resume operations |
171 | both are multi-phase operations. | 210 | always go together, and both are multi-phase operations. |
172 | 211 | ||
173 | For simple drivers, suspend might quiesce the device using the class code | 212 | For simple drivers, suspend might quiesce the device using class code |
174 | and then turn its hardware as "off" as possible with late_suspend. The | 213 | and then turn its hardware as "off" as possible during suspend_noirq. The |
175 | matching resume calls would then completely reinitialize the hardware | 214 | matching resume calls would then completely reinitialize the hardware |
176 | before reactivating its class I/O queues. | 215 | before reactivating its class I/O queues. |
177 | 216 | ||
178 | More power-aware drivers drivers will use more than one device low power | 217 | More power-aware drivers might prepare the devices for triggering system wakeup |
179 | state, either at runtime or during system sleep states, and might trigger | 218 | events. |
180 | system wakeup events. | ||
181 | 219 | ||
182 | 220 | ||
183 | Call Sequence Guarantees | 221 | Call Sequence Guarantees |
184 | ------------------------ | 222 | ------------------------ |
185 | To ensure that bridges and similar links needed to talk to a device are | 223 | To ensure that bridges and similar links needing to talk to a device are |
186 | available when the device is suspended or resumed, the device tree is | 224 | available when the device is suspended or resumed, the device tree is |
187 | walked in a bottom-up order to suspend devices. A top-down order is | 225 | walked in a bottom-up order to suspend devices. A top-down order is |
188 | used to resume those devices. | 226 | used to resume those devices. |
@@ -194,67 +232,310 @@ its parent; and can't be removed or suspended after that parent. | |||
194 | The policy is that the device tree should match hardware bus topology. | 232 | The policy is that the device tree should match hardware bus topology. |
195 | (Or at least the control bus, for devices which use multiple busses.) | 233 | (Or at least the control bus, for devices which use multiple busses.) |
196 | In particular, this means that a device registration may fail if the parent of | 234 | In particular, this means that a device registration may fail if the parent of |
197 | the device is suspending (ie. has been chosen by the PM core as the next | 235 | the device is suspending (i.e. has been chosen by the PM core as the next |
198 | device to suspend) or has already suspended, as well as after all of the other | 236 | device to suspend) or has already suspended, as well as after all of the other |
199 | devices have been suspended. Device drivers must be prepared to cope with such | 237 | devices have been suspended. Device drivers must be prepared to cope with such |
200 | situations. | 238 | situations. |
201 | 239 | ||
202 | 240 | ||
203 | Suspending Devices | 241 | System Power Management Phases |
204 | ------------------ | 242 | ------------------------------ |
205 | Suspending a given device is done in several phases. Suspending the | 243 | Suspending or resuming the system is done in several phases. Different phases |
206 | system always includes every phase, executing calls for every device | 244 | are used for standby or memory sleep states ("suspend-to-RAM") and the |
207 | before the next phase begins. Not all busses or classes support all | 245 | hibernation state ("suspend-to-disk"). Each phase involves executing callbacks |
208 | these callbacks; and not all drivers use all the callbacks. | 246 | for every device before the next phase begins. Not all busses or classes |
247 | support all these callbacks and not all drivers use all the callbacks. The | ||
248 | various phases always run after tasks have been frozen and before they are | ||
249 | unfrozen. Furthermore, the *_noirq phases run at a time when IRQ handlers have | ||
250 | been disabled (except for those marked with the IRQ_WAKEUP flag). | ||
209 | 251 | ||
210 | The phases are seen by driver notifications issued in this order: | 252 | Most phases use bus, type, and class callbacks (that is, methods defined in |
253 | dev->bus->pm, dev->type->pm, and dev->class->pm). The prepare and complete | ||
254 | phases are exceptions; they use only bus callbacks. When multiple callbacks | ||
255 | are used in a phase, they are invoked in the order: <class, type, bus> during | ||
256 | power-down transitions and in the opposite order during power-up transitions. | ||
257 | For example, during the suspend phase the PM core invokes | ||
211 | 258 | ||
212 | 1 class.suspend(dev, message) is called after tasks are frozen, for | 259 | dev->class->pm.suspend(dev); |
213 | devices associated with a class that has such a method. This | 260 | dev->type->pm.suspend(dev); |
214 | method may sleep. | 261 | dev->bus->pm.suspend(dev); |
215 | 262 | ||
216 | Since I/O activity usually comes from such higher layers, this is | 263 | before moving on to the next device, whereas during the resume phase the core |
217 | a good place to quiesce all drivers of a given type (and keep such | 264 | invokes |
218 | code out of those drivers). | ||
219 | 265 | ||
220 | 2 bus.suspend(dev, message) is called next. This method may sleep, | 266 | dev->bus->pm.resume(dev); |
221 | and is often morphed into a device driver call with bus-specific | 267 | dev->type->pm.resume(dev); |
222 | parameters and/or rules. | 268 | dev->class->pm.resume(dev); |
223 | 269 | ||
224 | This call should handle parts of device suspend logic that require | 270 | These callbacks may in turn invoke device- or driver-specific methods stored in |
225 | sleeping. It probably does work to quiesce the device which hasn't | 271 | dev->driver->pm, but they don't have to. |
226 | been abstracted into class.suspend(). | ||
227 | 272 | ||
228 | The pm_message_t parameter is currently used to refine those semantics | ||
229 | (described later). | ||
230 | 273 | ||
231 | At the end of those phases, drivers should normally have stopped all I/O | 274 | Entering System Suspend |
232 | transactions (DMA, IRQs), saved enough state that they can re-initialize | 275 | ----------------------- |
233 | or restore previous state (as needed by the hardware), and placed the | 276 | When the system goes into the standby or memory sleep state, the phases are: |
234 | device into a low-power state. On many platforms they will also use | 277 | |
235 | clk_disable() to gate off one or more clock sources; sometimes they will | 278 | prepare, suspend, suspend_noirq. |
236 | also switch off power supplies, or reduce voltages. Drivers which have | 279 | |
237 | runtime PM support may already have performed some or all of the steps | 280 | 1. The prepare phase is meant to prevent races by preventing new devices |
238 | needed to prepare for the upcoming system sleep state. | 281 | from being registered; the PM core would never know that all the |
282 | children of a device had been suspended if new children could be | ||
283 | registered at will. (By contrast, devices may be unregistered at any | ||
284 | time.) Unlike the other suspend-related phases, during the prepare | ||
285 | phase the device tree is traversed top-down. | ||
286 | |||
287 | The prepare phase uses only a bus callback. After the callback method | ||
288 | returns, no new children may be registered below the device. The method | ||
289 | may also prepare the device or driver in some way for the upcoming | ||
290 | system power transition, but it should not put the device into a | ||
291 | low-power state. | ||
292 | |||
293 | 2. The suspend methods should quiesce the device to stop it from performing | ||
294 | I/O. They also may save the device registers and put it into the | ||
295 | appropriate low-power state, depending on the bus type the device is on, | ||
296 | and they may enable wakeup events. | ||
297 | |||
298 | 3. The suspend_noirq phase occurs after IRQ handlers have been disabled, | ||
299 | which means that the driver's interrupt handler will not be called while | ||
300 | the callback method is running. The methods should save the values of | ||
301 | the device's registers that weren't saved previously and finally put the | ||
302 | device into the appropriate low-power state. | ||
303 | |||
304 | The majority of subsystems and device drivers need not implement this | ||
305 | callback. However, bus types allowing devices to share interrupt | ||
306 | vectors, like PCI, generally need it; otherwise a driver might encounter | ||
307 | an error during the suspend phase by fielding a shared interrupt | ||
308 | generated by some other device after its own device had been set to low | ||
309 | power. | ||
310 | |||
311 | At the end of these phases, drivers should have stopped all I/O transactions | ||
312 | (DMA, IRQs), saved enough state that they can re-initialize or restore previous | ||
313 | state (as needed by the hardware), and placed the device into a low-power state. | ||
314 | On many platforms they will gate off one or more clock sources; sometimes they | ||
315 | will also switch off power supplies or reduce voltages. (Drivers supporting | ||
316 | runtime PM may already have performed some or all of these steps.) | ||
317 | |||
318 | If device_may_wakeup(dev) returns true, the device should be prepared for | ||
319 | generating hardware wakeup signals to trigger a system wakeup event when the | ||
320 | system is in the sleep state. For example, enable_irq_wake() might identify | ||
321 | GPIO signals hooked up to a switch or other external hardware, and | ||
322 | pci_enable_wake() does something similar for the PCI PME signal. | ||
323 | |||
324 | If any of these callbacks returns an error, the system won't enter the desired | ||
325 | low-power state. Instead the PM core will unwind its actions by resuming all | ||
326 | the devices that were suspended. | ||
327 | |||
328 | |||
329 | Leaving System Suspend | ||
330 | ---------------------- | ||
331 | When resuming from standby or memory sleep, the phases are: | ||
332 | |||
333 | resume_noirq, resume, complete. | ||
334 | |||
335 | 1. The resume_noirq callback methods should perform any actions needed | ||
336 | before the driver's interrupt handlers are invoked. This generally | ||
337 | means undoing the actions of the suspend_noirq phase. If the bus type | ||
338 | permits devices to share interrupt vectors, like PCI, the method should | ||
339 | bring the device and its driver into a state in which the driver can | ||
340 | recognize if the device is the source of incoming interrupts, if any, | ||
341 | and handle them correctly. | ||
342 | |||
343 | For example, the PCI bus type's ->pm.resume_noirq() puts the device into | ||
344 | the full-power state (D0 in the PCI terminology) and restores the | ||
345 | standard configuration registers of the device. Then it calls the | ||
346 | device driver's ->pm.resume_noirq() method to perform device-specific | ||
347 | actions. | ||
348 | |||
349 | 2. The resume methods should bring the the device back to its operating | ||
350 | state, so that it can perform normal I/O. This generally involves | ||
351 | undoing the actions of the suspend phase. | ||
352 | |||
353 | 3. The complete phase uses only a bus callback. The method should undo the | ||
354 | actions of the prepare phase. Note, however, that new children may be | ||
355 | registered below the device as soon as the resume callbacks occur; it's | ||
356 | not necessary to wait until the complete phase. | ||
357 | |||
358 | At the end of these phases, drivers should be as functional as they were before | ||
359 | suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are | ||
360 | gated on. Even if the device was in a low-power state before the system sleep | ||
361 | because of runtime power management, afterwards it should be back in its | ||
362 | full-power state. There are multiple reasons why it's best to do this; they are | ||
363 | discussed in more detail in Documentation/power/runtime_pm.txt. | ||
239 | 364 | ||
240 | When any driver sees that its device_can_wakeup(dev), it should make sure | 365 | However, the details here may again be platform-specific. For example, |
241 | to use the relevant hardware signals to trigger a system wakeup event. | 366 | some systems support multiple "run" states, and the mode in effect at |
242 | For example, enable_irq_wake() might identify GPIO signals hooked up to | 367 | the end of resume might not be the one which preceded suspension. |
243 | a switch or other external hardware, and pci_enable_wake() does something | 368 | That means availability of certain clocks or power supplies changed, |
244 | similar for PCI's PME# signal. | 369 | which could easily affect how a driver works. |
370 | |||
371 | Drivers need to be able to handle hardware which has been reset since the | ||
372 | suspend methods were called, for example by complete reinitialization. | ||
373 | This may be the hardest part, and the one most protected by NDA'd documents | ||
374 | and chip errata. It's simplest if the hardware state hasn't changed since | ||
375 | the suspend was carried out, but that can't be guaranteed (in fact, it ususally | ||
376 | is not the case). | ||
377 | |||
378 | Drivers must also be prepared to notice that the device has been removed | ||
379 | while the system was powered down, whenever that's physically possible. | ||
380 | PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses | ||
381 | where common Linux platforms will see such removal. Details of how drivers | ||
382 | will notice and handle such removals are currently bus-specific, and often | ||
383 | involve a separate thread. | ||
384 | |||
385 | These callbacks may return an error value, but the PM core will ignore such | ||
386 | errors since there's nothing it can do about them other than printing them in | ||
387 | the system log. | ||
388 | |||
389 | |||
390 | Entering Hibernation | ||
391 | -------------------- | ||
392 | Hibernating the system is more complicated than putting it into the standby or | ||
393 | memory sleep state, because it involves creating and saving a system image. | ||
394 | Therefore there are more phases for hibernation, with a different set of | ||
395 | callbacks. These phases always run after tasks have been frozen and memory has | ||
396 | been freed. | ||
397 | |||
398 | The general procedure for hibernation is to quiesce all devices (freeze), create | ||
399 | an image of the system memory while everything is stable, reactivate all | ||
400 | devices (thaw), write the image to permanent storage, and finally shut down the | ||
401 | system (poweroff). The phases used to accomplish this are: | ||
402 | |||
403 | prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete, | ||
404 | prepare, poweroff, poweroff_noirq | ||
405 | |||
406 | 1. The prepare phase is discussed in the "Entering System Suspend" section | ||
407 | above. | ||
408 | |||
409 | 2. The freeze methods should quiesce the device so that it doesn't generate | ||
410 | IRQs or DMA, and they may need to save the values of device registers. | ||
411 | However the device does not have to be put in a low-power state, and to | ||
412 | save time it's best not to do so. Also, the device should not be | ||
413 | prepared to generate wakeup events. | ||
414 | |||
415 | 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed | ||
416 | above, except again that the device should not be put in a low-power | ||
417 | state and should not be allowed to generate wakeup events. | ||
418 | |||
419 | At this point the system image is created. All devices should be inactive and | ||
420 | the contents of memory should remain undisturbed while this happens, so that the | ||
421 | image forms an atomic snapshot of the system state. | ||
422 | |||
423 | 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed | ||
424 | above. The main difference is that its methods can assume the device is | ||
425 | in the same state as at the end of the freeze_noirq phase. | ||
426 | |||
427 | 5. The thaw phase is analogous to the resume phase discussed above. Its | ||
428 | methods should bring the device back to an operating state, so that it | ||
429 | can be used for saving the image if necessary. | ||
430 | |||
431 | 6. The complete phase is discussed in the "Leaving System Suspend" section | ||
432 | above. | ||
433 | |||
434 | At this point the system image is saved, and the devices then need to be | ||
435 | prepared for the upcoming system shutdown. This is much like suspending them | ||
436 | before putting the system into the standby or memory sleep state, and the phases | ||
437 | are similar. | ||
438 | |||
439 | 7. The prepare phase is discussed above. | ||
440 | |||
441 | 8. The poweroff phase is analogous to the suspend phase. | ||
442 | |||
443 | 9. The poweroff_noirq phase is analogous to the suspend_noirq phase. | ||
444 | |||
445 | The poweroff and poweroff_noirq callbacks should do essentially the same things | ||
446 | as the suspend and suspend_noirq callbacks. The only notable difference is that | ||
447 | they need not store the device register values, because the registers should | ||
448 | already have been stored during the freeze or freeze_noirq phases. | ||
449 | |||
450 | |||
451 | Leaving Hibernation | ||
452 | ------------------- | ||
453 | Resuming from hibernation is, again, more complicated than resuming from a sleep | ||
454 | state in which the contents of main memory are preserved, because it requires | ||
455 | a system image to be loaded into memory and the pre-hibernation memory contents | ||
456 | to be restored before control can be passed back to the image kernel. | ||
457 | |||
458 | Although in principle, the image might be loaded into memory and the | ||
459 | pre-hibernation memory contents restored by the boot loader, in practice this | ||
460 | can't be done because boot loaders aren't smart enough and there is no | ||
461 | established protocol for passing the necessary information. So instead, the | ||
462 | boot loader loads a fresh instance of the kernel, called the boot kernel, into | ||
463 | memory and passes control to it in the usual way. Then the boot kernel reads | ||
464 | the system image, restores the pre-hibernation memory contents, and passes | ||
465 | control to the image kernel. Thus two different kernels are involved in | ||
466 | resuming from hibernation. In fact, the boot kernel may be completely different | ||
467 | from the image kernel: a different configuration and even a different version. | ||
468 | This has important consequences for device drivers and their subsystems. | ||
469 | |||
470 | To be able to load the system image into memory, the boot kernel needs to | ||
471 | include at least a subset of device drivers allowing it to access the storage | ||
472 | medium containing the image, although it doesn't need to include all of the | ||
473 | drivers present in the image kernel. After the image has been loaded, the | ||
474 | devices managed by the boot kernel need to be prepared for passing control back | ||
475 | to the image kernel. This is very similar to the initial steps involved in | ||
476 | creating a system image, and it is accomplished in the same way, using prepare, | ||
477 | freeze, and freeze_noirq phases. However the devices affected by these phases | ||
478 | are only those having drivers in the boot kernel; other devices will still be in | ||
479 | whatever state the boot loader left them. | ||
480 | |||
481 | Should the restoration of the pre-hibernation memory contents fail, the boot | ||
482 | kernel would go through the "thawing" procedure described above, using the | ||
483 | thaw_noirq, thaw, and complete phases, and then continue running normally. This | ||
484 | happens only rarely. Most often the pre-hibernation memory contents are | ||
485 | restored successfully and control is passed to the image kernel, which then | ||
486 | becomes responsible for bringing the system back to the working state. | ||
487 | |||
488 | To achieve this, the image kernel must restore the devices' pre-hibernation | ||
489 | functionality. The operation is much like waking up from the memory sleep | ||
490 | state, although it involves different phases: | ||
491 | |||
492 | restore_noirq, restore, complete | ||
493 | |||
494 | 1. The restore_noirq phase is analogous to the resume_noirq phase. | ||
495 | |||
496 | 2. The restore phase is analogous to the resume phase. | ||
497 | |||
498 | 3. The complete phase is discussed above. | ||
499 | |||
500 | The main difference from resume[_noirq] is that restore[_noirq] must assume the | ||
501 | device has been accessed and reconfigured by the boot loader or the boot kernel. | ||
502 | Consequently the state of the device may be different from the state remembered | ||
503 | from the freeze and freeze_noirq phases. The device may even need to be reset | ||
504 | and completely re-initialized. In many cases this difference doesn't matter, so | ||
505 | the resume[_noirq] and restore[_norq] method pointers can be set to the same | ||
506 | routines. Nevertheless, different callback pointers are used in case there is a | ||
507 | situation where it actually matters. | ||
245 | 508 | ||
246 | If a driver (or bus, or class) fails it suspend method, the system won't | ||
247 | enter the desired low power state; it will resume all the devices it's | ||
248 | suspended so far. | ||
249 | 509 | ||
250 | Note that drivers may need to perform different actions based on the target | 510 | System Devices |
251 | system lowpower/sleep state. At this writing, there are only platform | 511 | -------------- |
252 | specific APIs through which drivers could determine those target states. | 512 | System devices (sysdevs) follow a slightly different API, which can be found in |
513 | |||
514 | include/linux/sysdev.h | ||
515 | drivers/base/sys.c | ||
516 | |||
517 | System devices will be suspended with interrupts disabled, and after all other | ||
518 | devices have been suspended. On resume, they will be resumed before any other | ||
519 | devices, and also with interrupts disabled. These things occur in special | ||
520 | "sysdev_driver" phases, which affect only system devices. | ||
521 | |||
522 | Thus, after the suspend_noirq (or freeze_noirq or poweroff_noirq) phase, when | ||
523 | the non-boot CPUs are all offline and IRQs are disabled on the remaining online | ||
524 | CPU, then a sysdev_driver.suspend phase is carried out, and the system enters a | ||
525 | sleep state (or a system image is created). During resume (or after the image | ||
526 | has been created or loaded) a sysdev_driver.resume phase is carried out, IRQs | ||
527 | are enabled on the only online CPU, the non-boot CPUs are enabled, and the | ||
528 | resume_noirq (or thaw_noirq or restore_noirq) phase begins. | ||
529 | |||
530 | Code to actually enter and exit the system-wide low power state sometimes | ||
531 | involves hardware details that are only known to the boot firmware, and | ||
532 | may leave a CPU running software (from SRAM or flash memory) that monitors | ||
533 | the system and manages its wakeup sequence. | ||
253 | 534 | ||
254 | 535 | ||
255 | Device Low Power (suspend) States | 536 | Device Low Power (suspend) States |
256 | --------------------------------- | 537 | --------------------------------- |
257 | Device low-power states aren't very standard. One device might only handle | 538 | Device low-power states aren't standard. One device might only handle |
258 | "on" and "off, while another might support a dozen different versions of | 539 | "on" and "off, while another might support a dozen different versions of |
259 | "on" (how many engines are active?), plus a state that gets back to "on" | 540 | "on" (how many engines are active?), plus a state that gets back to "on" |
260 | faster than from a full "off". | 541 | faster than from a full "off". |
@@ -265,7 +546,7 @@ PCI device may not perform DMA or issue IRQs, and any wakeup events it | |||
265 | issues would be issued through the PME# bus signal. Plus, there are | 546 | issues would be issued through the PME# bus signal. Plus, there are |
266 | several PCI-standard device states, some of which are optional. | 547 | several PCI-standard device states, some of which are optional. |
267 | 548 | ||
268 | In contrast, integrated system-on-chip processors often use irqs as the | 549 | In contrast, integrated system-on-chip processors often use IRQs as the |
269 | wakeup event sources (so drivers would call enable_irq_wake) and might | 550 | wakeup event sources (so drivers would call enable_irq_wake) and might |
270 | be able to treat DMA completion as a wakeup event (sometimes DMA can stay | 551 | be able to treat DMA completion as a wakeup event (sometimes DMA can stay |
271 | active too, it'd only be the CPU and some peripherals that sleep). | 552 | active too, it'd only be the CPU and some peripherals that sleep). |
@@ -284,120 +565,17 @@ ways; the aforementioned LCD might be active in one product's "standby", | |||
284 | but a different product using the same SOC might work differently. | 565 | but a different product using the same SOC might work differently. |
285 | 566 | ||
286 | 567 | ||
287 | Meaning of pm_message_t.event | 568 | Power Management Notifiers |
288 | ----------------------------- | 569 | -------------------------- |
289 | Parameters to suspend calls include the device affected and a message of | 570 | There are some operations that cannot be carried out by the power management |
290 | type pm_message_t, which has one field: the event. If driver does not | 571 | callbacks discussed above, because the callbacks occur too late or too early. |
291 | recognize the event code, suspend calls may abort the request and return | 572 | To handle these cases, subsystems and device drivers may register power |
292 | a negative errno. However, most drivers will be fine if they implement | 573 | management notifiers that are called before tasks are frozen and after they have |
293 | PM_EVENT_SUSPEND semantics for all messages. | 574 | been thawed. Generally speaking, the PM notifiers are suitable for performing |
575 | actions that either require user space to be available, or at least won't | ||
576 | interfere with user space. | ||
294 | 577 | ||
295 | The event codes are used to refine the goal of suspending the device, and | 578 | For details refer to Documentation/power/notifiers.txt. |
296 | mostly matter when creating or resuming system memory image snapshots, as | ||
297 | used with suspend-to-disk: | ||
298 | |||
299 | PM_EVENT_SUSPEND -- quiesce the driver and put hardware into a low-power | ||
300 | state. When used with system sleep states like "suspend-to-RAM" or | ||
301 | "standby", the upcoming resume() call will often be able to rely on | ||
302 | state kept in hardware, or issue system wakeup events. | ||
303 | |||
304 | PM_EVENT_HIBERNATE -- Put hardware into a low-power state and enable wakeup | ||
305 | events as appropriate. It is only used with hibernation | ||
306 | (suspend-to-disk) and few devices are able to wake up the system from | ||
307 | this state; most are completely powered off. | ||
308 | |||
309 | PM_EVENT_FREEZE -- quiesce the driver, but don't necessarily change into | ||
310 | any low power mode. A system snapshot is about to be taken, often | ||
311 | followed by a call to the driver's resume() method. Neither wakeup | ||
312 | events nor DMA are allowed. | ||
313 | |||
314 | PM_EVENT_PRETHAW -- quiesce the driver, knowing that the upcoming resume() | ||
315 | will restore a suspend-to-disk snapshot from a different kernel image. | ||
316 | Drivers that are smart enough to look at their hardware state during | ||
317 | resume() processing need that state to be correct ... a PRETHAW could | ||
318 | be used to invalidate that state (by resetting the device), like a | ||
319 | shutdown() invocation would before a kexec() or system halt. Other | ||
320 | drivers might handle this the same way as PM_EVENT_FREEZE. Neither | ||
321 | wakeup events nor DMA are allowed. | ||
322 | |||
323 | To enter "standby" (ACPI S1) or "Suspend to RAM" (STR, ACPI S3) states, or | ||
324 | the similarly named APM states, only PM_EVENT_SUSPEND is used; the other event | ||
325 | codes are used for hibernation ("Suspend to Disk", STD, ACPI S4). | ||
326 | |||
327 | There's also PM_EVENT_ON, a value which never appears as a suspend event | ||
328 | but is sometimes used to record the "not suspended" device state. | ||
329 | |||
330 | |||
331 | Resuming Devices | ||
332 | ---------------- | ||
333 | Resuming is done in multiple phases, much like suspending, with all | ||
334 | devices processing each phase's calls before the next phase begins. | ||
335 | |||
336 | The phases are seen by driver notifications issued in this order: | ||
337 | |||
338 | 1 bus.resume(dev) reverses the effects of bus.suspend(). This may | ||
339 | be morphed into a device driver call with bus-specific parameters; | ||
340 | implementations may sleep. | ||
341 | |||
342 | 2 class.resume(dev) is called for devices associated with a class | ||
343 | that has such a method. Implementations may sleep. | ||
344 | |||
345 | This reverses the effects of class.suspend(), and would usually | ||
346 | reactivate the device's I/O queue. | ||
347 | |||
348 | At the end of those phases, drivers should normally be as functional as | ||
349 | they were before suspending: I/O can be performed using DMA and IRQs, and | ||
350 | the relevant clocks are gated on. The device need not be "fully on"; it | ||
351 | might be in a runtime lowpower/suspend state that acts as if it were. | ||
352 | |||
353 | However, the details here may again be platform-specific. For example, | ||
354 | some systems support multiple "run" states, and the mode in effect at | ||
355 | the end of resume() might not be the one which preceded suspension. | ||
356 | That means availability of certain clocks or power supplies changed, | ||
357 | which could easily affect how a driver works. | ||
358 | |||
359 | |||
360 | Drivers need to be able to handle hardware which has been reset since the | ||
361 | suspend methods were called, for example by complete reinitialization. | ||
362 | This may be the hardest part, and the one most protected by NDA'd documents | ||
363 | and chip errata. It's simplest if the hardware state hasn't changed since | ||
364 | the suspend() was called, but that can't always be guaranteed. | ||
365 | |||
366 | Drivers must also be prepared to notice that the device has been removed | ||
367 | while the system was powered off, whenever that's physically possible. | ||
368 | PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses | ||
369 | where common Linux platforms will see such removal. Details of how drivers | ||
370 | will notice and handle such removals are currently bus-specific, and often | ||
371 | involve a separate thread. | ||
372 | |||
373 | |||
374 | Note that the bus-specific runtime PM wakeup mechanism can exist, and might | ||
375 | be defined to share some of the same driver code as for system wakeup. For | ||
376 | example, a bus-specific device driver's resume() method might be used there, | ||
377 | so it wouldn't only be called from bus.resume() during system-wide wakeup. | ||
378 | See bus-specific information about how runtime wakeup events are handled. | ||
379 | |||
380 | |||
381 | System Devices | ||
382 | -------------- | ||
383 | System devices follow a slightly different API, which can be found in | ||
384 | |||
385 | include/linux/sysdev.h | ||
386 | drivers/base/sys.c | ||
387 | |||
388 | System devices will only be suspended with interrupts disabled, and after | ||
389 | all other devices have been suspended. On resume, they will be resumed | ||
390 | before any other devices, and also with interrupts disabled. | ||
391 | |||
392 | That is, IRQs are disabled, the suspend_late() phase begins, then the | ||
393 | sysdev_driver.suspend() phase, and the system enters a sleep state. Then | ||
394 | the sysdev_driver.resume() phase begins, followed by the resume_early() | ||
395 | phase, after which IRQs are enabled. | ||
396 | |||
397 | Code to actually enter and exit the system-wide low power state sometimes | ||
398 | involves hardware details that are only known to the boot firmware, and | ||
399 | may leave a CPU running software (from SRAM or flash memory) that monitors | ||
400 | the system and manages its wakeup sequence. | ||
401 | 579 | ||
402 | 580 | ||
403 | Runtime Power Management | 581 | Runtime Power Management |
@@ -407,82 +585,23 @@ running. This feature is useful for devices that are not being used, and | |||
407 | can offer significant power savings on a running system. These devices | 585 | can offer significant power savings on a running system. These devices |
408 | often support a range of runtime power states, which might use names such | 586 | often support a range of runtime power states, which might use names such |
409 | as "off", "sleep", "idle", "active", and so on. Those states will in some | 587 | as "off", "sleep", "idle", "active", and so on. Those states will in some |
410 | cases (like PCI) be partially constrained by a bus the device uses, and will | 588 | cases (like PCI) be partially constrained by the bus the device uses, and will |
411 | usually include hardware states that are also used in system sleep states. | 589 | usually include hardware states that are also used in system sleep states. |
412 | 590 | ||
413 | However, note that if a driver puts a device into a runtime low power state | 591 | A system-wide power transition can be started while some devices are in low |
414 | and the system then goes into a system-wide sleep state, it normally ought | 592 | power states due to runtime power management. The system sleep PM callbacks |
415 | to resume into that runtime low power state rather than "full on". Such | 593 | should recognize such situations and react to them appropriately, but the |
416 | distinctions would be part of the driver-internal state machine for that | 594 | necessary actions are subsystem-specific. |
417 | hardware; the whole point of runtime power management is to be sure that | 595 | |
418 | drivers are decoupled in that way from the state machine governing phases | 596 | In some cases the decision may be made at the subsystem level while in other |
419 | of the system-wide power/sleep state transitions. | 597 | cases the device driver may be left to decide. In some cases it may be |
420 | 598 | desirable to leave a suspended device in that state during a system-wide power | |
421 | 599 | transition, but in other cases the device must be put back into the full-power | |
422 | Power Saving Techniques | 600 | state temporarily, for example so that its system wakeup capability can be |
423 | ----------------------- | 601 | disabled. This all depends on the hardware and the design of the subsystem and |
424 | Normally runtime power management is handled by the drivers without specific | 602 | device driver in question. |
425 | userspace or kernel intervention, by device-aware use of techniques like: | 603 | |
426 | 604 | During system-wide resume from a sleep state it's best to put devices into the | |
427 | Using information provided by other system layers | 605 | full-power state, as explained in Documentation/power/runtime_pm.txt. Refer to |
428 | - stay deeply "off" except between open() and close() | 606 | that document for more information regarding this particular issue as well as |
429 | - if transceiver/PHY indicates "nobody connected", stay "off" | 607 | for information on the device runtime power management framework in general. |
430 | - application protocols may include power commands or hints | ||
431 | |||
432 | Using fewer CPU cycles | ||
433 | - using DMA instead of PIO | ||
434 | - removing timers, or making them lower frequency | ||
435 | - shortening "hot" code paths | ||
436 | - eliminating cache misses | ||
437 | - (sometimes) offloading work to device firmware | ||
438 | |||
439 | Reducing other resource costs | ||
440 | - gating off unused clocks in software (or hardware) | ||
441 | - switching off unused power supplies | ||
442 | - eliminating (or delaying/merging) IRQs | ||
443 | - tuning DMA to use word and/or burst modes | ||
444 | |||
445 | Using device-specific low power states | ||
446 | - using lower voltages | ||
447 | - avoiding needless DMA transfers | ||
448 | |||
449 | Read your hardware documentation carefully to see the opportunities that | ||
450 | may be available. If you can, measure the actual power usage and check | ||
451 | it against the budget established for your project. | ||
452 | |||
453 | |||
454 | Examples: USB hosts, system timer, system CPU | ||
455 | ---------------------------------------------- | ||
456 | USB host controllers make interesting, if complex, examples. In many cases | ||
457 | these have no work to do: no USB devices are connected, or all of them are | ||
458 | in the USB "suspend" state. Linux host controller drivers can then disable | ||
459 | periodic DMA transfers that would otherwise be a constant power drain on the | ||
460 | memory subsystem, and enter a suspend state. In power-aware controllers, | ||
461 | entering that suspend state may disable the clock used with USB signaling, | ||
462 | saving a certain amount of power. | ||
463 | |||
464 | The controller will be woken from that state (with an IRQ) by changes to the | ||
465 | signal state on the data lines of a given port, for example by an existing | ||
466 | peripheral requesting "remote wakeup" or by plugging a new peripheral. The | ||
467 | same wakeup mechanism usually works from "standby" sleep states, and on some | ||
468 | systems also from "suspend to RAM" (or even "suspend to disk") states. | ||
469 | (Except that ACPI may be involved instead of normal IRQs, on some hardware.) | ||
470 | |||
471 | System devices like timers and CPUs may have special roles in the platform | ||
472 | power management scheme. For example, system timers using a "dynamic tick" | ||
473 | approach don't just save CPU cycles (by eliminating needless timer IRQs), | ||
474 | but they may also open the door to using lower power CPU "idle" states that | ||
475 | cost more than a jiffie to enter and exit. On x86 systems these are states | ||
476 | like "C3"; note that periodic DMA transfers from a USB host controller will | ||
477 | also prevent entry to a C3 state, much like a periodic timer IRQ. | ||
478 | |||
479 | That kind of runtime mechanism interaction is common. "System On Chip" (SOC) | ||
480 | processors often have low power idle modes that can't be entered unless | ||
481 | certain medium-speed clocks (often 12 or 48 MHz) are gated off. When the | ||
482 | drivers gate those clocks effectively, then the system idle task may be able | ||
483 | to use the lower power idle modes and thereby increase battery life. | ||
484 | |||
485 | If the CPU can have a "cpufreq" driver, there also may be opportunities | ||
486 | to shift to lower voltage settings and reduce the power cost of executing | ||
487 | a given number of instructions. (Without voltage adjustment, it's rare | ||
488 | for cpufreq to save much power; the cost-per-instruction must go down.) | ||
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt index dd8fe43888d3..62328d76b55b 100644 --- a/Documentation/power/pci.txt +++ b/Documentation/power/pci.txt | |||
@@ -1,299 +1,1025 @@ | |||
1 | |||
2 | PCI Power Management | 1 | PCI Power Management |
3 | ~~~~~~~~~~~~~~~~~~~~ | ||
4 | 2 | ||
5 | An overview of the concepts and the related functions in the Linux kernel | 3 | Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. |
4 | |||
5 | An overview of concepts and the Linux kernel's interfaces related to PCI power | ||
6 | management. Based on previous work by Patrick Mochel <mochel@transmeta.com> | ||
7 | (and others). | ||
6 | 8 | ||
7 | Patrick Mochel <mochel@transmeta.com> | 9 | This document only covers the aspects of power management specific to PCI |
8 | (and others) | 10 | devices. For general description of the kernel's interfaces related to device |
11 | power management refer to Documentation/power/devices.txt and | ||
12 | Documentation/power/runtime_pm.txt. | ||
9 | 13 | ||
10 | --------------------------------------------------------------------------- | 14 | --------------------------------------------------------------------------- |
11 | 15 | ||
12 | 1. Overview | 16 | 1. Hardware and Platform Support for PCI Power Management |
13 | 2. How the PCI Subsystem Does Power Management | 17 | 2. PCI Subsystem and Device Power Management |
14 | 3. PCI Utility Functions | 18 | 3. PCI Device Drivers and Power Management |
15 | 4. PCI Device Drivers | 19 | 4. Resources |
16 | 5. Resources | 20 | |
17 | 21 | ||
18 | 1. Overview | 22 | 1. Hardware and Platform Support for PCI Power Management |
19 | ~~~~~~~~~~~ | 23 | ========================================================= |
20 | 24 | ||
21 | The PCI Power Management Specification was introduced between the PCI 2.1 and | 25 | 1.1. Native and Platform-Based Power Management |
22 | PCI 2.2 Specifications. It a standard interface for controlling various | 26 | ----------------------------------------------- |
23 | power management operations. | 27 | In general, power management is a feature allowing one to save energy by putting |
24 | 28 | devices into states in which they draw less power (low-power states) at the | |
25 | Implementation of the PCI PM Spec is optional, as are several sub-components of | 29 | price of reduced functionality or performance. |
26 | it. If a device supports the PCI PM Spec, the device will have an 8 byte | 30 | |
27 | capability field in its PCI configuration space. This field is used to describe | 31 | Usually, a device is put into a low-power state when it is underutilized or |
28 | and control the standard PCI power management features. | 32 | completely inactive. However, when it is necessary to use the device once |
29 | 33 | again, it has to be put back into the "fully functional" state (full-power | |
30 | The PCI PM spec defines 4 operating states for devices (D0 - D3) and for buses | 34 | state). This may happen when there are some data for the device to handle or |
31 | (B0 - B3). The higher the number, the less power the device consumes. However, | 35 | as a result of an external event requiring the device to be active, which may |
32 | the higher the number, the longer the latency is for the device to return to | 36 | be signaled by the device itself. |
33 | an operational state (D0). | 37 | |
34 | 38 | PCI devices may be put into low-power states in two ways, by using the device | |
35 | There are actually two D3 states. When someone talks about D3, they usually | 39 | capabilities introduced by the PCI Bus Power Management Interface Specification, |
36 | mean D3hot, which corresponds to an ACPI D2 state (power is reduced, the | 40 | or with the help of platform firmware, such as an ACPI BIOS. In the first |
37 | device may lose some context). But they may also mean D3cold, which is an | 41 | approach, that is referred to as the native PCI power management (native PCI PM) |
38 | ACPI D3 state (power is fully off, all state was discarded); or both. | 42 | in what follows, the device power state is changed as a result of writing a |
39 | 43 | specific value into one of its standard configuration registers. The second | |
40 | Bus power management is not covered in this version of this document. | 44 | approach requires the platform firmware to provide special methods that may be |
41 | 45 | used by the kernel to change the device's power state. | |
42 | Note that all PCI devices support D0 and D3cold by default, regardless of | 46 | |
43 | whether or not they implement any of the PCI PM spec. | 47 | Devices supporting the native PCI PM usually can generate wakeup signals called |
44 | 48 | Power Management Events (PMEs) to let the kernel know about external events | |
45 | The possible state transitions that a device can undergo are: | 49 | requiring the device to be active. After receiving a PME the kernel is supposed |
46 | 50 | to put the device that sent it into the full-power state. However, the PCI Bus | |
47 | +---------------------------+ | 51 | Power Management Interface Specification doesn't define any standard method of |
48 | | Current State | New State | | 52 | delivering the PME from the device to the CPU and the operating system kernel. |
49 | +---------------------------+ | 53 | It is assumed that the platform firmware will perform this task and therefore, |
50 | | D0 | D1, D2, D3| | 54 | even though a PCI device is set up to generate PMEs, it also may be necessary to |
51 | +---------------------------+ | 55 | prepare the platform firmware for notifying the CPU of the PMEs coming from the |
52 | | D1 | D2, D3 | | 56 | device (e.g. by generating interrupts). |
53 | +---------------------------+ | 57 | |
54 | | D2 | D3 | | 58 | In turn, if the methods provided by the platform firmware are used for changing |
55 | +---------------------------+ | 59 | the power state of a device, usually the platform also provides a method for |
56 | | D1, D2, D3 | D0 | | 60 | preparing the device to generate wakeup signals. In that case, however, it |
57 | +---------------------------+ | 61 | often also is necessary to prepare the device for generating PMEs using the |
58 | 62 | native PCI PM mechanism, because the method provided by the platform depends on | |
59 | Note that when the system is entering a global suspend state, all devices will | 63 | that. |
60 | be placed into D3 and when resuming, all devices will be placed into D0. | 64 | |
61 | However, when the system is running, other state transitions are possible. | 65 | Thus in many situations both the native and the platform-based power management |
62 | 66 | mechanisms have to be used simultaneously to obtain the desired result. | |
63 | 2. How The PCI Subsystem Handles Power Management | 67 | |
64 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 68 | 1.2. Native PCI Power Management |
65 | 69 | -------------------------------- | |
66 | The PCI suspend/resume functionality is accessed indirectly via the Power | 70 | The PCI Bus Power Management Interface Specification (PCI PM Spec) was |
67 | Management subsystem. At boot, the PCI driver registers a power management | 71 | introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a |
68 | callback with that layer. Upon entering a suspend state, the PM layer iterates | 72 | standard interface for performing various operations related to power |
69 | through all of its registered callbacks. This currently takes place only during | 73 | management. |
70 | APM state transitions. | 74 | |
71 | 75 | The implementation of the PCI PM Spec is optional for conventional PCI devices, | |
72 | Upon going to sleep, the PCI subsystem walks its device tree twice. Both times, | 76 | but it is mandatory for PCI Express devices. If a device supports the PCI PM |
73 | it does a depth first walk of the device tree. The first walk saves each of the | 77 | Spec, it has an 8 byte power management capability field in its PCI |
74 | device's state and checks for devices that will prevent the system from entering | 78 | configuration space. This field is used to describe and control the standard |
75 | a global power state. The next walk then places the devices in a low power | 79 | features related to the native PCI power management. |
80 | |||
81 | The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses | ||
82 | (B0-B3). The higher the number, the less power is drawn by the device or bus | ||
83 | in that state. However, the higher the number, the longer the latency for | ||
84 | the device or bus to return to the full-power state (D0 or B0, respectively). | ||
85 | |||
86 | There are two variants of the D3 state defined by the specification. The first | ||
87 | one is D3hot, referred to as the software accessible D3, because devices can be | ||
88 | programmed to go into it. The second one, D3cold, is the state that PCI devices | ||
89 | are in when the supply voltage (Vcc) is removed from them. It is not possible | ||
90 | to program a PCI device to go into D3cold, although there may be a programmable | ||
91 | interface for putting the bus the device is on into a state in which Vcc is | ||
92 | removed from all devices on the bus. | ||
93 | |||
94 | PCI bus power management, however, is not supported by the Linux kernel at the | ||
95 | time of this writing and therefore it is not covered by this document. | ||
96 | |||
97 | Note that every PCI device can be in the full-power state (D0) or in D3cold, | ||
98 | regardless of whether or not it implements the PCI PM Spec. In addition to | ||
99 | that, if the PCI PM Spec is implemented by the device, it must support D3hot | ||
100 | as well as D0. The support for the D1 and D2 power states is optional. | ||
101 | |||
102 | PCI devices supporting the PCI PM Spec can be programmed to go to any of the | ||
103 | supported low-power states (except for D3cold). While in D1-D3hot the | ||
104 | standard configuration registers of the device must be accessible to software | ||
105 | (i.e. the device is required to respond to PCI configuration accesses), although | ||
106 | its I/O and memory spaces are then disabled. This allows the device to be | ||
107 | programmatically put into D0. Thus the kernel can switch the device back and | ||
108 | forth between D0 and the supported low-power states (except for D3cold) and the | ||
109 | possible power state transitions the device can undergo are the following: | ||
110 | |||
111 | +----------------------------+ | ||
112 | | Current State | New State | | ||
113 | +----------------------------+ | ||
114 | | D0 | D1, D2, D3 | | ||
115 | +----------------------------+ | ||
116 | | D1 | D2, D3 | | ||
117 | +----------------------------+ | ||
118 | | D2 | D3 | | ||
119 | +----------------------------+ | ||
120 | | D1, D2, D3 | D0 | | ||
121 | +----------------------------+ | ||
122 | |||
123 | The transition from D3cold to D0 occurs when the supply voltage is provided to | ||
124 | the device (i.e. power is restored). In that case the device returns to D0 with | ||
125 | a full power-on reset sequence and the power-on defaults are restored to the | ||
126 | device by hardware just as at initial power up. | ||
127 | |||
128 | PCI devices supporting the PCI PM Spec can be programmed to generate PMEs | ||
129 | while in a low-power state (D1-D3), but they are not required to be capable | ||
130 | of generating PMEs from all supported low-power states. In particular, the | ||
131 | capability of generating PMEs from D3cold is optional and depends on the | ||
132 | presence of additional voltage (3.3Vaux) allowing the device to remain | ||
133 | sufficiently active to generate a wakeup signal. | ||
134 | |||
135 | 1.3. ACPI Device Power Management | ||
136 | --------------------------------- | ||
137 | The platform firmware support for the power management of PCI devices is | ||
138 | system-specific. However, if the system in question is compliant with the | ||
139 | Advanced Configuration and Power Interface (ACPI) Specification, like the | ||
140 | majority of x86-based systems, it is supposed to implement device power | ||
141 | management interfaces defined by the ACPI standard. | ||
142 | |||
143 | For this purpose the ACPI BIOS provides special functions called "control | ||
144 | methods" that may be executed by the kernel to perform specific tasks, such as | ||
145 | putting a device into a low-power state. These control methods are encoded | ||
146 | using special byte-code language called the ACPI Machine Language (AML) and | ||
147 | stored in the machine's BIOS. The kernel loads them from the BIOS and executes | ||
148 | them as needed using an AML interpreter that translates the AML byte code into | ||
149 | computations and memory or I/O space accesses. This way, in theory, a BIOS | ||
150 | writer can provide the kernel with a means to perform actions depending | ||
151 | on the system design in a system-specific fashion. | ||
152 | |||
153 | ACPI control methods may be divided into global control methods, that are not | ||
154 | associated with any particular devices, and device control methods, that have | ||
155 | to be defined separately for each device supposed to be handled with the help of | ||
156 | the platform. This means, in particular, that ACPI device control methods can | ||
157 | only be used to handle devices that the BIOS writer knew about in advance. The | ||
158 | ACPI methods used for device power management fall into that category. | ||
159 | |||
160 | The ACPI specification assumes that devices can be in one of four power states | ||
161 | labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM | ||
162 | D0-D3 states (although the difference between D3hot and D3cold is not taken | ||
163 | into account by ACPI). Moreover, for each power state of a device there is a | ||
164 | set of power resources that have to be enabled for the device to be put into | ||
165 | that state. These power resources are controlled (i.e. enabled or disabled) | ||
166 | with the help of their own control methods, _ON and _OFF, that have to be | ||
167 | defined individually for each of them. | ||
168 | |||
169 | To put a device into the ACPI power state Dx (where x is a number between 0 and | ||
170 | 3 inclusive) the kernel is supposed to (1) enable the power resources required | ||
171 | by the device in this state using their _ON control methods and (2) execute the | ||
172 | _PSx control method defined for the device. In addition to that, if the device | ||
173 | is going to be put into a low-power state (D1-D3) and is supposed to generate | ||
174 | wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI | ||
175 | 3.0) control method defined for it has to be executed before _PSx. Power | ||
176 | resources that are not required by the device in the target power state and are | ||
177 | not required any more by any other device should be disabled (by executing their | ||
178 | _OFF control methods). If the current power state of the device is D3, it can | ||
179 | only be put into D0 this way. | ||
180 | |||
181 | However, quite often the power states of devices are changed during a | ||
182 | system-wide transition into a sleep state or back into the working state. ACPI | ||
183 | defines four system sleep states, S1, S2, S3, and S4, and denotes the system | ||
184 | working state as S0. In general, the target system sleep (or working) state | ||
185 | determines the highest power (lowest number) state the device can be put | ||
186 | into and the kernel is supposed to obtain this information by executing the | ||
187 | device's _SxD control method (where x is a number between 0 and 4 inclusive). | ||
188 | If the device is required to wake up the system from the target sleep state, the | ||
189 | lowest power (highest number) state it can be put into is also determined by the | ||
190 | target state of the system. The kernel is then supposed to use the device's | ||
191 | _SxW control method to obtain the number of that state. It also is supposed to | ||
192 | use the device's _PRW control method to learn which power resources need to be | ||
193 | enabled for the device to be able to generate wakeup signals. | ||
194 | |||
195 | 1.4. Wakeup Signaling | ||
196 | --------------------- | ||
197 | Wakeup signals generated by PCI devices, either as native PCI PMEs, or as | ||
198 | a result of the execution of the _DSW (or _PSW) ACPI control method before | ||
199 | putting the device into a low-power state, have to be caught and handled as | ||
200 | appropriate. If they are sent while the system is in the working state | ||
201 | (ACPI S0), they should be translated into interrupts so that the kernel can | ||
202 | put the devices generating them into the full-power state and take care of the | ||
203 | events that triggered them. In turn, if they are sent while the system is | ||
204 | sleeping, they should cause the system's core logic to trigger wakeup. | ||
205 | |||
206 | On ACPI-based systems wakeup signals sent by conventional PCI devices are | ||
207 | converted into ACPI General-Purpose Events (GPEs) which are hardware signals | ||
208 | from the system core logic generated in response to various events that need to | ||
209 | be acted upon. Every GPE is associated with one or more sources of potentially | ||
210 | interesting events. In particular, a GPE may be associated with a PCI device | ||
211 | capable of signaling wakeup. The information on the connections between GPEs | ||
212 | and event sources is recorded in the system's ACPI BIOS from where it can be | ||
213 | read by the kernel. | ||
214 | |||
215 | If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE | ||
216 | associated with it (if there is one) is triggered. The GPEs associated with PCI | ||
217 | bridges may also be triggered in response to a wakeup signal from one of the | ||
218 | devices below the bridge (this also is the case for root bridges) and, for | ||
219 | example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be | ||
220 | handled this way. | ||
221 | |||
222 | A GPE may be triggered when the system is sleeping (i.e. when it is in one of | ||
223 | the ACPI S1-S4 states), in which case system wakeup is started by its core logic | ||
224 | (the device that was the source of the signal causing the system wakeup to occur | ||
225 | may be identified later). The GPEs used in such situations are referred to as | ||
226 | wakeup GPEs. | ||
227 | |||
228 | Usually, however, GPEs are also triggered when the system is in the working | ||
229 | state (ACPI S0) and in that case the system's core logic generates a System | ||
230 | Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI | ||
231 | handler identifies the GPE that caused the interrupt to be generated which, | ||
232 | in turn, allows the kernel to identify the source of the event (that may be | ||
233 | a PCI device signaling wakeup). The GPEs used for notifying the kernel of | ||
234 | events occurring while the system is in the working state are referred to as | ||
235 | runtime GPEs. | ||
236 | |||
237 | Unfortunately, there is no standard way of handling wakeup signals sent by | ||
238 | conventional PCI devices on systems that are not ACPI-based, but there is one | ||
239 | for PCI Express devices. Namely, the PCI Express Base Specification introduced | ||
240 | a native mechanism for converting native PCI PMEs into interrupts generated by | ||
241 | root ports. For conventional PCI devices native PMEs are out-of-band, so they | ||
242 | are routed separately and they need not pass through bridges (in principle they | ||
243 | may be routed directly to the system's core logic), but for PCI Express devices | ||
244 | they are in-band messages that have to pass through the PCI Express hierarchy, | ||
245 | including the root port on the path from the device to the Root Complex. Thus | ||
246 | it was possible to introduce a mechanism by which a root port generates an | ||
247 | interrupt whenever it receives a PME message from one of the devices below it. | ||
248 | The PCI Express Requester ID of the device that sent the PME message is then | ||
249 | recorded in one of the root port's configuration registers from where it may be | ||
250 | read by the interrupt handler allowing the device to be identified. [PME | ||
251 | messages sent by PCI Express endpoints integrated with the Root Complex don't | ||
252 | pass through root ports, but instead they cause a Root Complex Event Collector | ||
253 | (if there is one) to generate interrupts.] | ||
254 | |||
255 | In principle the native PCI Express PME signaling may also be used on ACPI-based | ||
256 | systems along with the GPEs, but to use it the kernel has to ask the system's | ||
257 | ACPI BIOS to release control of root port configuration registers. The ACPI | ||
258 | BIOS, however, is not required to allow the kernel to control these registers | ||
259 | and if it doesn't do that, the kernel must not modify their contents. Of course | ||
260 | the native PCI Express PME signaling cannot be used by the kernel in that case. | ||
261 | |||
262 | |||
263 | 2. PCI Subsystem and Device Power Management | ||
264 | ============================================ | ||
265 | |||
266 | 2.1. Device Power Management Callbacks | ||
267 | -------------------------------------- | ||
268 | The PCI Subsystem participates in the power management of PCI devices in a | ||
269 | number of ways. First of all, it provides an intermediate code layer between | ||
270 | the device power management core (PM core) and PCI device drivers. | ||
271 | Specifically, the pm field of the PCI subsystem's struct bus_type object, | ||
272 | pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing | ||
273 | pointers to several device power management callbacks: | ||
274 | |||
275 | const struct dev_pm_ops pci_dev_pm_ops = { | ||
276 | .prepare = pci_pm_prepare, | ||
277 | .complete = pci_pm_complete, | ||
278 | .suspend = pci_pm_suspend, | ||
279 | .resume = pci_pm_resume, | ||
280 | .freeze = pci_pm_freeze, | ||
281 | .thaw = pci_pm_thaw, | ||
282 | .poweroff = pci_pm_poweroff, | ||
283 | .restore = pci_pm_restore, | ||
284 | .suspend_noirq = pci_pm_suspend_noirq, | ||
285 | .resume_noirq = pci_pm_resume_noirq, | ||
286 | .freeze_noirq = pci_pm_freeze_noirq, | ||
287 | .thaw_noirq = pci_pm_thaw_noirq, | ||
288 | .poweroff_noirq = pci_pm_poweroff_noirq, | ||
289 | .restore_noirq = pci_pm_restore_noirq, | ||
290 | .runtime_suspend = pci_pm_runtime_suspend, | ||
291 | .runtime_resume = pci_pm_runtime_resume, | ||
292 | .runtime_idle = pci_pm_runtime_idle, | ||
293 | }; | ||
294 | |||
295 | These callbacks are executed by the PM core in various situations related to | ||
296 | device power management and they, in turn, execute power management callbacks | ||
297 | provided by PCI device drivers. They also perform power management operations | ||
298 | involving some standard configuration registers of PCI devices that device | ||
299 | drivers need not know or care about. | ||
300 | |||
301 | The structure representing a PCI device, struct pci_dev, contains several fields | ||
302 | that these callbacks operate on: | ||
303 | |||
304 | struct pci_dev { | ||
305 | ... | ||
306 | pci_power_t current_state; /* Current operating state. */ | ||
307 | int pm_cap; /* PM capability offset in the | ||
308 | configuration space */ | ||
309 | unsigned int pme_support:5; /* Bitmask of states from which PME# | ||
310 | can be generated */ | ||
311 | unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ | ||
312 | unsigned int d1_support:1; /* Low power state D1 is supported */ | ||
313 | unsigned int d2_support:1; /* Low power state D2 is supported */ | ||
314 | unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ | ||
315 | unsigned int wakeup_prepared:1; /* Device prepared for wake up */ | ||
316 | unsigned int d3_delay; /* D3->D0 transition time in ms */ | ||
317 | ... | ||
318 | }; | ||
319 | |||
320 | They also indirectly use some fields of the struct device that is embedded in | ||
321 | struct pci_dev. | ||
322 | |||
323 | 2.2. Device Initialization | ||
324 | -------------------------- | ||
325 | The PCI subsystem's first task related to device power management is to | ||
326 | prepare the device for power management and initialize the fields of struct | ||
327 | pci_dev used for this purpose. This happens in two functions defined in | ||
328 | drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). | ||
329 | |||
330 | The first of these functions checks if the device supports native PCI PM | ||
331 | and if that's the case the offset of its power management capability structure | ||
332 | in the configuration space is stored in the pm_cap field of the device's struct | ||
333 | pci_dev object. Next, the function checks which PCI low-power states are | ||
334 | supported by the device and from which low-power states the device can generate | ||
335 | native PCI PMEs. The power management fields of the device's struct pci_dev and | ||
336 | the struct device embedded in it are updated accordingly and the generation of | ||
337 | PMEs by the device is disabled. | ||
338 | |||
339 | The second function checks if the device can be prepared to signal wakeup with | ||
340 | the help of the platform firmware, such as the ACPI BIOS. If that is the case, | ||
341 | the function updates the wakeup fields in struct device embedded in the | ||
342 | device's struct pci_dev and uses the firmware-provided method to prevent the | ||
343 | device from signaling wakeup. | ||
344 | |||
345 | At this point the device is ready for power management. For driverless devices, | ||
346 | however, this functionality is limited to a few basic operations carried out | ||
347 | during system-wide transitions to a sleep state and back to the working state. | ||
348 | |||
349 | 2.3. Runtime Device Power Management | ||
350 | ------------------------------------ | ||
351 | The PCI subsystem plays a vital role in the runtime power management of PCI | ||
352 | devices. For this purpose it uses the general runtime power management | ||
353 | (runtime PM) framework described in Documentation/power/runtime_pm.txt. | ||
354 | Namely, it provides subsystem-level callbacks: | ||
355 | |||
356 | pci_pm_runtime_suspend() | ||
357 | pci_pm_runtime_resume() | ||
358 | pci_pm_runtime_idle() | ||
359 | |||
360 | that are executed by the core runtime PM routines. It also implements the | ||
361 | entire mechanics necessary for handling runtime wakeup signals from PCI devices | ||
362 | in low-power states, which at the time of this writing works for both the native | ||
363 | PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in | ||
364 | Section 1. | ||
365 | |||
366 | First, a PCI device is put into a low-power state, or suspended, with the help | ||
367 | of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call | ||
368 | pci_pm_runtime_suspend() to do the actual job. For this to work, the device's | ||
369 | driver has to provide a pm->runtime_suspend() callback (see below), which is | ||
370 | run by pci_pm_runtime_suspend() as the first action. If the driver's callback | ||
371 | returns successfully, the device's standard configuration registers are saved, | ||
372 | the device is prepared to generate wakeup signals and, finally, it is put into | ||
373 | the target low-power state. | ||
374 | |||
375 | The low-power state to put the device into is the lowest-power (highest number) | ||
376 | state from which it can signal wakeup. The exact method of signaling wakeup is | ||
377 | system-dependent and is determined by the PCI subsystem on the basis of the | ||
378 | reported capabilities of the device and the platform firmware. To prepare the | ||
379 | device for signaling wakeup and put it into the selected low-power state, the | ||
380 | PCI subsystem can use the platform firmware as well as the device's native PCI | ||
381 | PM capabilities, if supported. | ||
382 | |||
383 | It is expected that the device driver's pm->runtime_suspend() callback will | ||
384 | not attempt to prepare the device for signaling wakeup or to put it into a | ||
385 | low-power state. The driver ought to leave these tasks to the PCI subsystem | ||
386 | that has all of the information necessary to perform them. | ||
387 | |||
388 | A suspended device is brought back into the "active" state, or resumed, | ||
389 | with the help of pm_request_resume() or pm_runtime_resume() which both call | ||
390 | pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's | ||
391 | driver provides a pm->runtime_resume() callback (see below). However, before | ||
392 | the driver's callback is executed, pci_pm_runtime_resume() brings the device | ||
393 | back into the full-power state, prevents it from signaling wakeup while in that | ||
394 | state and restores its standard configuration registers. Thus the driver's | ||
395 | callback need not worry about the PCI-specific aspects of the device resume. | ||
396 | |||
397 | Note that generally pci_pm_runtime_resume() may be called in two different | ||
398 | situations. First, it may be called at the request of the device's driver, for | ||
399 | example if there are some data for it to process. Second, it may be called | ||
400 | as a result of a wakeup signal from the device itself (this sometimes is | ||
401 | referred to as "remote wakeup"). Of course, for this purpose the wakeup signal | ||
402 | is handled in one of the ways described in Section 1 and finally converted into | ||
403 | a notification for the PCI subsystem after the source device has been | ||
404 | identified. | ||
405 | |||
406 | The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle() | ||
407 | and pm_request_idle(), executes the device driver's pm->runtime_idle() | ||
408 | callback, if defined, and if that callback doesn't return error code (or is not | ||
409 | present at all), suspends the device with the help of pm_runtime_suspend(). | ||
410 | Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for | ||
411 | example, it is called right after the device has just been resumed), in which | ||
412 | cases it is expected to suspend the device if that makes sense. Usually, | ||
413 | however, the PCI subsystem doesn't really know if the device really can be | ||
414 | suspended, so it lets the device's driver decide by running its | ||
415 | pm->runtime_idle() callback. | ||
416 | |||
417 | 2.4. System-Wide Power Transitions | ||
418 | ---------------------------------- | ||
419 | There are a few different types of system-wide power transitions, described in | ||
420 | Documentation/power/devices.txt. Each of them requires devices to be handled | ||
421 | in a specific way and the PM core executes subsystem-level power management | ||
422 | callbacks for this purpose. They are executed in phases such that each phase | ||
423 | involves executing the same subsystem-level callback for every device belonging | ||
424 | to the given subsystem before the next phase begins. These phases always run | ||
425 | after tasks have been frozen. | ||
426 | |||
427 | 2.4.1. System Suspend | ||
428 | |||
429 | When the system is going into a sleep state in which the contents of memory will | ||
430 | be preserved, such as one of the ACPI sleep states S1-S3, the phases are: | ||
431 | |||
432 | prepare, suspend, suspend_noirq. | ||
433 | |||
434 | The following PCI bus type's callbacks, respectively, are used in these phases: | ||
435 | |||
436 | pci_pm_prepare() | ||
437 | pci_pm_suspend() | ||
438 | pci_pm_suspend_noirq() | ||
439 | |||
440 | The pci_pm_prepare() routine first puts the device into the "fully functional" | ||
441 | state with the help of pm_runtime_resume(). Then, it executes the device | ||
442 | driver's pm->prepare() callback if defined (i.e. if the driver's struct | ||
443 | dev_pm_ops object is present and the prepare pointer in that object is valid). | ||
444 | |||
445 | The pci_pm_suspend() routine first checks if the device's driver implements | ||
446 | legacy PCI suspend routines (see Section 3), in which case the driver's legacy | ||
447 | suspend callback is executed, if present, and its result is returned. Next, if | ||
448 | the device's driver doesn't provide a struct dev_pm_ops object (containing | ||
449 | pointers to the driver's callbacks), pci_pm_default_suspend() is called, which | ||
450 | simply turns off the device's bus master capability and runs | ||
451 | pcibios_disable_device() to disable it, unless the device is a bridge (PCI | ||
452 | bridges are ignored by this routine). Next, the device driver's pm->suspend() | ||
453 | callback is executed, if defined, and its result is returned if it fails. | ||
454 | Finally, pci_fixup_device() is called to apply hardware suspend quirks related | ||
455 | to the device if necessary. | ||
456 | |||
457 | Note that the suspend phase is carried out asynchronously for PCI devices, so | ||
458 | the pci_pm_suspend() callback may be executed in parallel for any pair of PCI | ||
459 | devices that don't depend on each other in a known way (i.e. none of the paths | ||
460 | in the device tree from the root bridge to a leaf device contains both of them). | ||
461 | |||
462 | The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has | ||
463 | been called, which means that the device driver's interrupt handler won't be | ||
464 | invoked while this routine is running. It first checks if the device's driver | ||
465 | implements legacy PCI suspends routines (Section 3), in which case the legacy | ||
466 | late suspend routine is called and its result is returned (the standard | ||
467 | configuration registers of the device are saved if the driver's callback hasn't | ||
468 | done that). Second, if the device driver's struct dev_pm_ops object is not | ||
469 | present, the device's standard configuration registers are saved and the routine | ||
470 | returns success. Otherwise the device driver's pm->suspend_noirq() callback is | ||
471 | executed, if present, and its result is returned if it fails. Next, if the | ||
472 | device's standard configuration registers haven't been saved yet (one of the | ||
473 | device driver's callbacks executed before might do that), pci_pm_suspend_noirq() | ||
474 | saves them, prepares the device to signal wakeup (if necessary) and puts it into | ||
475 | a low-power state. | ||
476 | |||
477 | The low-power state to put the device into is the lowest-power (highest number) | ||
478 | state from which it can signal wakeup while the system is in the target sleep | ||
479 | state. Just like in the runtime PM case described above, the mechanism of | ||
480 | signaling wakeup is system-dependent and determined by the PCI subsystem, which | ||
481 | is also responsible for preparing the device to signal wakeup from the system's | ||
482 | target sleep state as appropriate. | ||
483 | |||
484 | PCI device drivers (that don't implement legacy power management callbacks) are | ||
485 | generally not expected to prepare devices for signaling wakeup or to put them | ||
486 | into low-power states. However, if one of the driver's suspend callbacks | ||
487 | (pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration | ||
488 | registers, pci_pm_suspend_noirq() will assume that the device has been prepared | ||
489 | to signal wakeup and put into a low-power state by the driver (the driver is | ||
490 | then assumed to have used the helper functions provided by the PCI subsystem for | ||
491 | this purpose). PCI device drivers are not encouraged to do that, but in some | ||
492 | rare cases doing that in the driver may be the optimum approach. | ||
493 | |||
494 | 2.4.2. System Resume | ||
495 | |||
496 | When the system is undergoing a transition from a sleep state in which the | ||
497 | contents of memory have been preserved, such as one of the ACPI sleep states | ||
498 | S1-S3, into the working state (ACPI S0), the phases are: | ||
499 | |||
500 | resume_noirq, resume, complete. | ||
501 | |||
502 | The following PCI bus type's callbacks, respectively, are executed in these | ||
503 | phases: | ||
504 | |||
505 | pci_pm_resume_noirq() | ||
506 | pci_pm_resume() | ||
507 | pci_pm_complete() | ||
508 | |||
509 | The pci_pm_resume_noirq() routine first puts the device into the full-power | ||
510 | state, restores its standard configuration registers and applies early resume | ||
511 | hardware quirks related to the device, if necessary. This is done | ||
512 | unconditionally, regardless of whether or not the device's driver implements | ||
513 | legacy PCI power management callbacks (this way all PCI devices are in the | ||
514 | full-power state and their standard configuration registers have been restored | ||
515 | when their interrupt handlers are invoked for the first time during resume, | ||
516 | which allows the kernel to avoid problems with the handling of shared interrupts | ||
517 | by drivers whose devices are still suspended). If legacy PCI power management | ||
518 | callbacks (see Section 3) are implemented by the device's driver, the legacy | ||
519 | early resume callback is executed and its result is returned. Otherwise, the | ||
520 | device driver's pm->resume_noirq() callback is executed, if defined, and its | ||
521 | result is returned. | ||
522 | |||
523 | The pci_pm_resume() routine first checks if the device's standard configuration | ||
524 | registers have been restored and restores them if that's not the case (this | ||
525 | only is necessary in the error path during a failing suspend). Next, resume | ||
526 | hardware quirks related to the device are applied, if necessary, and if the | ||
527 | device's driver implements legacy PCI power management callbacks (see | ||
528 | Section 3), the driver's legacy resume callback is executed and its result is | ||
529 | returned. Otherwise, the device's wakeup signaling mechanisms are blocked and | ||
530 | its driver's pm->resume() callback is executed, if defined (the callback's | ||
531 | result is then returned). | ||
532 | |||
533 | The resume phase is carried out asynchronously for PCI devices, like the | ||
534 | suspend phase described above, which means that if two PCI devices don't depend | ||
535 | on each other in a known way, the pci_pm_resume() routine may be executed for | ||
536 | the both of them in parallel. | ||
537 | |||
538 | The pci_pm_complete() routine only executes the device driver's pm->complete() | ||
539 | callback, if defined. | ||
540 | |||
541 | 2.4.3. System Hibernation | ||
542 | |||
543 | System hibernation is more complicated than system suspend, because it requires | ||
544 | a system image to be created and written into a persistent storage medium. The | ||
545 | image is created atomically and all devices are quiesced, or frozen, before that | ||
546 | happens. | ||
547 | |||
548 | The freezing of devices is carried out after enough memory has been freed (at | ||
549 | the time of this writing the image creation requires at least 50% of system RAM | ||
550 | to be free) in the following three phases: | ||
551 | |||
552 | prepare, freeze, freeze_noirq | ||
553 | |||
554 | that correspond to the PCI bus type's callbacks: | ||
555 | |||
556 | pci_pm_prepare() | ||
557 | pci_pm_freeze() | ||
558 | pci_pm_freeze_noirq() | ||
559 | |||
560 | This means that the prepare phase is exactly the same as for system suspend. | ||
561 | The other two phases, however, are different. | ||
562 | |||
563 | The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs | ||
564 | the device driver's pm->freeze() callback, if defined, instead of pm->suspend(), | ||
565 | and it doesn't apply the suspend-related hardware quirks. It is executed | ||
566 | asynchronously for different PCI devices that don't depend on each other in a | ||
567 | known way. | ||
568 | |||
569 | The pci_pm_freeze_noirq() routine, in turn, is similar to | ||
570 | pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq() | ||
571 | routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the | ||
572 | device for signaling wakeup and put it into a low-power state. Still, it saves | ||
573 | the device's standard configuration registers if they haven't been saved by one | ||
574 | of the driver's callbacks. | ||
575 | |||
576 | Once the image has been created, it has to be saved. However, at this point all | ||
577 | devices are frozen and they cannot handle I/O, while their ability to handle | ||
578 | I/O is obviously necessary for the image saving. Thus they have to be brought | ||
579 | back to the fully functional state and this is done in the following phases: | ||
580 | |||
581 | thaw_noirq, thaw, complete | ||
582 | |||
583 | using the following PCI bus type's callbacks: | ||
584 | |||
585 | pci_pm_thaw_noirq() | ||
586 | pci_pm_thaw() | ||
587 | pci_pm_complete() | ||
588 | |||
589 | respectively. | ||
590 | |||
591 | The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(), | ||
592 | but it doesn't put the device into the full power state and doesn't attempt to | ||
593 | restore its standard configuration registers. It also executes the device | ||
594 | driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq(). | ||
595 | |||
596 | The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device | ||
597 | driver's pm->thaw() callback instead of pm->resume(). It is executed | ||
598 | asynchronously for different PCI devices that don't depend on each other in a | ||
599 | known way. | ||
600 | |||
601 | The complete phase it the same as for system resume. | ||
602 | |||
603 | After saving the image, devices need to be powered down before the system can | ||
604 | enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in | ||
605 | three phases: | ||
606 | |||
607 | prepare, poweroff, poweroff_noirq | ||
608 | |||
609 | where the prepare phase is exactly the same as for system suspend. The other | ||
610 | two phases are analogous to the suspend and suspend_noirq phases, respectively. | ||
611 | The PCI subsystem-level callbacks they correspond to | ||
612 | |||
613 | pci_pm_poweroff() | ||
614 | pci_pm_poweroff_noirq() | ||
615 | |||
616 | work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively, | ||
617 | although they don't attempt to save the device's standard configuration | ||
618 | registers. | ||
619 | |||
620 | 2.4.4. System Restore | ||
621 | |||
622 | System restore requires a hibernation image to be loaded into memory and the | ||
623 | pre-hibernation memory contents to be restored before the pre-hibernation system | ||
624 | activity can be resumed. | ||
625 | |||
626 | As described in Documentation/power/devices.txt, the hibernation image is loaded | ||
627 | into memory by a fresh instance of the kernel, called the boot kernel, which in | ||
628 | turn is loaded and run by a boot loader in the usual way. After the boot kernel | ||
629 | has loaded the image, it needs to replace its own code and data with the code | ||
630 | and data of the "hibernated" kernel stored within the image, called the image | ||
631 | kernel. For this purpose all devices are frozen just like before creating | ||
632 | the image during hibernation, in the | ||
633 | |||
634 | prepare, freeze, freeze_noirq | ||
635 | |||
636 | phases described above. However, the devices affected by these phases are only | ||
637 | those having drivers in the boot kernel; other devices will still be in whatever | ||
638 | state the boot loader left them. | ||
639 | |||
640 | Should the restoration of the pre-hibernation memory contents fail, the boot | ||
641 | kernel would go through the "thawing" procedure described above, using the | ||
642 | thaw_noirq, thaw, and complete phases (that will only affect the devices having | ||
643 | drivers in the boot kernel), and then continue running normally. | ||
644 | |||
645 | If the pre-hibernation memory contents are restored successfully, which is the | ||
646 | usual situation, control is passed to the image kernel, which then becomes | ||
647 | responsible for bringing the system back to the working state. To achieve this, | ||
648 | it must restore the devices' pre-hibernation functionality, which is done much | ||
649 | like waking up from the memory sleep state, although it involves different | ||
650 | phases: | ||
651 | |||
652 | restore_noirq, restore, complete | ||
653 | |||
654 | The first two of these are analogous to the resume_noirq and resume phases | ||
655 | described above, respectively, and correspond to the following PCI subsystem | ||
656 | callbacks: | ||
657 | |||
658 | pci_pm_restore_noirq() | ||
659 | pci_pm_restore() | ||
660 | |||
661 | These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(), | ||
662 | respectively, but they execute the device driver's pm->restore_noirq() and | ||
663 | pm->restore() callbacks, if available. | ||
664 | |||
665 | The complete phase is carried out in exactly the same way as during system | ||
666 | resume. | ||
667 | |||
668 | |||
669 | 3. PCI Device Drivers and Power Management | ||
670 | ========================================== | ||
671 | |||
672 | 3.1. Power Management Callbacks | ||
673 | ------------------------------- | ||
674 | PCI device drivers participate in power management by providing callbacks to be | ||
675 | executed by the PCI subsystem's power management routines described above and by | ||
676 | controlling the runtime power management of their devices. | ||
677 | |||
678 | At the time of this writing there are two ways to define power management | ||
679 | callbacks for a PCI device driver, the recommended one, based on using a | ||
680 | dev_pm_ops structure described in Documentation/power/devices.txt, and the | ||
681 | "legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and | ||
682 | .resume() callbacks from struct pci_driver are used. The legacy approach, | ||
683 | however, doesn't allow one to define runtime power management callbacks and is | ||
684 | not really suitable for any new drivers. Therefore it is not covered by this | ||
685 | document (refer to the source code to learn more about it). | ||
686 | |||
687 | It is recommended that all PCI device drivers define a struct dev_pm_ops object | ||
688 | containing pointers to power management (PM) callbacks that will be executed by | ||
689 | the PCI subsystem's PM routines in various circumstances. A pointer to the | ||
690 | driver's struct dev_pm_ops object has to be assigned to the driver.pm field in | ||
691 | its struct pci_driver object. Once that has happened, the "legacy" PM callbacks | ||
692 | in struct pci_driver are ignored (even if they are not NULL). | ||
693 | |||
694 | The PM callbacks in struct dev_pm_ops are not mandatory and if they are not | ||
695 | defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI | ||
696 | subsystem will handle the device in a simplified default manner. If they are | ||
697 | defined, though, they are expected to behave as described in the following | ||
698 | subsections. | ||
699 | |||
700 | 3.1.1. prepare() | ||
701 | |||
702 | The prepare() callback is executed during system suspend, during hibernation | ||
703 | (when a hibernation image is about to be created), during power-off after | ||
704 | saving a hibernation image and during system restore, when a hibernation image | ||
705 | has just been loaded into memory. | ||
706 | |||
707 | This callback is only necessary if the driver's device has children that in | ||
708 | general may be registered at any time. In that case the role of the prepare() | ||
709 | callback is to prevent new children of the device from being registered until | ||
710 | one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run. | ||
711 | |||
712 | In addition to that the prepare() callback may carry out some operations | ||
713 | preparing the device to be suspended, although it should not allocate memory | ||
714 | (if additional memory is required to suspend the device, it has to be | ||
715 | preallocated earlier, for example in a suspend/hibernate notifier as described | ||
716 | in Documentation/power/notifiers.txt). | ||
717 | |||
718 | 3.1.2. suspend() | ||
719 | |||
720 | The suspend() callback is only executed during system suspend, after prepare() | ||
721 | callbacks have been executed for all devices in the system. | ||
722 | |||
723 | This callback is expected to quiesce the device and prepare it to be put into a | ||
724 | low-power state by the PCI subsystem. It is not required (in fact it even is | ||
725 | not recommended) that a PCI driver's suspend() callback save the standard | ||
726 | configuration registers of the device, prepare it for waking up the system, or | ||
727 | put it into a low-power state. All of these operations can very well be taken | ||
728 | care of by the PCI subsystem, without the driver's participation. | ||
729 | |||
730 | However, in some rare case it is convenient to carry out these operations in | ||
731 | a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and | ||
732 | pci_set_power_state() should be used to save the device's standard configuration | ||
733 | registers, to prepare it for system wakeup (if necessary), and to put it into a | ||
734 | low-power state, respectively. Moreover, if the driver calls pci_save_state(), | ||
735 | the PCI subsystem will not execute either pci_prepare_to_sleep(), or | ||
736 | pci_set_power_state() for its device, so the driver is then responsible for | ||
737 | handling the device as appropriate. | ||
738 | |||
739 | While the suspend() callback is being executed, the driver's interrupt handler | ||
740 | can be invoked to handle an interrupt from the device, so all suspend-related | ||
741 | operations relying on the driver's ability to handle interrupts should be | ||
742 | carried out in this callback. | ||
743 | |||
744 | 3.1.3. suspend_noirq() | ||
745 | |||
746 | The suspend_noirq() callback is only executed during system suspend, after | ||
747 | suspend() callbacks have been executed for all devices in the system and | ||
748 | after device interrupts have been disabled by the PM core. | ||
749 | |||
750 | The difference between suspend_noirq() and suspend() is that the driver's | ||
751 | interrupt handler will not be invoked while suspend_noirq() is running. Thus | ||
752 | suspend_noirq() can carry out operations that would cause race conditions to | ||
753 | arise if they were performed in suspend(). | ||
754 | |||
755 | 3.1.4. freeze() | ||
756 | |||
757 | The freeze() callback is hibernation-specific and is executed in two situations, | ||
758 | during hibernation, after prepare() callbacks have been executed for all devices | ||
759 | in preparation for the creation of a system image, and during restore, | ||
760 | after a system image has been loaded into memory from persistent storage and the | ||
761 | prepare() callbacks have been executed for all devices. | ||
762 | |||
763 | The role of this callback is analogous to the role of the suspend() callback | ||
764 | described above. In fact, they only need to be different in the rare cases when | ||
765 | the driver takes the responsibility for putting the device into a low-power | ||
76 | state. | 766 | state. |
77 | 767 | ||
78 | The first walk allows a graceful recovery in the event of a failure, since none | 768 | In that cases the freeze() callback should not prepare the device system wakeup |
79 | of the devices have actually been powered down. | 769 | or put it into a low-power state. Still, either it or freeze_noirq() should |
80 | 770 | save the device's standard configuration registers using pci_save_state(). | |
81 | In both walks, in particular the second, all children of a bridge are touched | ||
82 | before the actual bridge itself. This allows the bridge to retain power while | ||
83 | its children are being accessed. | ||
84 | |||
85 | Upon resuming from sleep, just the opposite must be true: all bridges must be | ||
86 | powered on and restored before their children are powered on. This is easily | ||
87 | accomplished with a breadth-first walk of the PCI device tree. | ||
88 | |||
89 | |||
90 | 3. PCI Utility Functions | ||
91 | ~~~~~~~~~~~~~~~~~~~~~~~~ | ||
92 | |||
93 | These are helper functions designed to be called by individual device drivers. | ||
94 | Assuming that a device behaves as advertised, these should be applicable in most | ||
95 | cases. However, results may vary. | ||
96 | |||
97 | Note that these functions are never implicitly called for the driver. The driver | ||
98 | is always responsible for deciding when and if to call these. | ||
99 | |||
100 | |||
101 | pci_save_state | ||
102 | -------------- | ||
103 | |||
104 | Usage: | ||
105 | pci_save_state(struct pci_dev *dev); | ||
106 | |||
107 | Description: | ||
108 | Save first 64 bytes of PCI config space, along with any additional | ||
109 | PCI-Express or PCI-X information. | ||
110 | |||
111 | |||
112 | pci_restore_state | ||
113 | ----------------- | ||
114 | |||
115 | Usage: | ||
116 | pci_restore_state(struct pci_dev *dev); | ||
117 | |||
118 | Description: | ||
119 | Restore previously saved config space. | ||
120 | |||
121 | |||
122 | pci_set_power_state | ||
123 | ------------------- | ||
124 | |||
125 | Usage: | ||
126 | pci_set_power_state(struct pci_dev *dev, pci_power_t state); | ||
127 | |||
128 | Description: | ||
129 | Transition device to low power state using PCI PM Capabilities | ||
130 | registers. | ||
131 | |||
132 | Will fail under one of the following conditions: | ||
133 | - If state is less than current state, but not D0 (illegal transition) | ||
134 | - Device doesn't support PM Capabilities | ||
135 | - Device does not support requested state | ||
136 | |||
137 | |||
138 | pci_enable_wake | ||
139 | --------------- | ||
140 | |||
141 | Usage: | ||
142 | pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); | ||
143 | |||
144 | Description: | ||
145 | Enable device to generate PME# during low power state using PCI PM | ||
146 | Capabilities. | ||
147 | |||
148 | Checks whether if device supports generating PME# from requested state | ||
149 | and fail if it does not, unless enable == 0 (request is to disable wake | ||
150 | events, which is implicit if it doesn't even support it in the first | ||
151 | place). | ||
152 | |||
153 | Note that the PMC Register in the device's PM Capabilities has a bitmask | ||
154 | of the states it supports generating PME# from. D3hot is bit 3 and | ||
155 | D3cold is bit 4. So, while a value of 4 as the state may not seem | ||
156 | semantically correct, it is. | ||
157 | |||
158 | |||
159 | 4. PCI Device Drivers | ||
160 | ~~~~~~~~~~~~~~~~~~~~~ | ||
161 | |||
162 | These functions are intended for use by individual drivers, and are defined in | ||
163 | struct pci_driver: | ||
164 | |||
165 | int (*suspend) (struct pci_dev *dev, pm_message_t state); | ||
166 | int (*resume) (struct pci_dev *dev); | ||
167 | |||
168 | |||
169 | suspend | ||
170 | ------- | ||
171 | |||
172 | Usage: | ||
173 | |||
174 | if (dev->driver && dev->driver->suspend) | ||
175 | dev->driver->suspend(dev,state); | ||
176 | |||
177 | A driver uses this function to actually transition the device into a low power | ||
178 | state. This should include disabling I/O, IRQs, and bus-mastering, as well as | ||
179 | physically transitioning the device to a lower power state; it may also include | ||
180 | calls to pci_enable_wake(). | ||
181 | |||
182 | Bus mastering may be disabled by doing: | ||
183 | |||
184 | pci_disable_device(dev); | ||
185 | |||
186 | For devices that support the PCI PM Spec, this may be used to set the device's | ||
187 | power state to match the suspend() parameter: | ||
188 | |||
189 | pci_set_power_state(dev,state); | ||
190 | |||
191 | The driver is also responsible for disabling any other device-specific features | ||
192 | (e.g blanking screen, turning off on-card memory, etc). | ||
193 | |||
194 | The driver should be sure to track the current state of the device, as it may | ||
195 | obviate the need for some operations. | ||
196 | |||
197 | The driver should update the current_state field in its pci_dev structure in | ||
198 | this function, except for PM-capable devices when pci_set_power_state is used. | ||
199 | |||
200 | resume | ||
201 | ------ | ||
202 | |||
203 | Usage: | ||
204 | |||
205 | if (dev->driver && dev->driver->resume) | ||
206 | dev->driver->resume(dev) | ||
207 | 771 | ||
208 | The resume callback may be called from any power state, and is always meant to | 772 | 3.1.5. freeze_noirq() |
209 | transition the device to the D0 state. | ||
210 | 773 | ||
211 | The driver is responsible for reenabling any features of the device that had | 774 | The freeze_noirq() callback is hibernation-specific. It is executed during |
212 | been disabled during previous suspend calls, such as IRQs and bus mastering, | 775 | hibernation, after prepare() and freeze() callbacks have been executed for all |
213 | as well as calling pci_restore_state(). | 776 | devices in preparation for the creation of a system image, and during restore, |
777 | after a system image has been loaded into memory and after prepare() and | ||
778 | freeze() callbacks have been executed for all devices. It is always executed | ||
779 | after device interrupts have been disabled by the PM core. | ||
214 | 780 | ||
215 | If the device is currently in D3, it may need to be reinitialized in resume(). | 781 | The role of this callback is analogous to the role of the suspend_noirq() |
782 | callback described above and it very rarely is necessary to define | ||
783 | freeze_noirq(). | ||
216 | 784 | ||
217 | * Some types of devices, like bus controllers, will preserve context in D3hot | 785 | The difference between freeze_noirq() and freeze() is analogous to the |
218 | (using Vcc power). Their drivers will often want to avoid re-initializing | 786 | difference between suspend_noirq() and suspend(). |
219 | them after re-entering D0 (perhaps to avoid resetting downstream devices). | ||
220 | 787 | ||
221 | * Other kinds of devices in D3hot will discard device context as part of a | 788 | 3.1.6. poweroff() |
222 | soft reset when re-entering the D0 state. | ||
223 | |||
224 | * Devices resuming from D3cold always go through a power-on reset. Some | ||
225 | device context can also be preserved using Vaux power. | ||
226 | 789 | ||
227 | * Some systems hide D3cold resume paths from drivers. For example, on PCs | 790 | The poweroff() callback is hibernation-specific. It is executed when the system |
228 | the resume path for suspend-to-disk often runs BIOS powerup code, which | 791 | is about to be powered off after saving a hibernation image to a persistent |
229 | will sometimes re-initialize the device. | 792 | storage. prepare() callbacks are executed for all devices before poweroff() is |
793 | called. | ||
230 | 794 | ||
231 | To handle resets during D3 to D0 transitions, it may be convenient to share | 795 | The role of this callback is analogous to the role of the suspend() and freeze() |
232 | device initialization code between probe() and resume(). Device parameters | 796 | callbacks described above, although it does not need to save the contents of |
233 | can also be saved before the driver suspends into D3, avoiding re-probe. | 797 | the device's registers. In particular, if the driver wants to put the device |
798 | into a low-power state itself instead of allowing the PCI subsystem to do that, | ||
799 | the poweroff() callback should use pci_prepare_to_sleep() and | ||
800 | pci_set_power_state() to prepare the device for system wakeup and to put it | ||
801 | into a low-power state, respectively, but it need not save the device's standard | ||
802 | configuration registers. | ||
234 | 803 | ||
235 | If the device supports the PCI PM Spec, it can use this to physically transition | 804 | 3.1.7. poweroff_noirq() |
236 | the device to D0: | ||
237 | 805 | ||
238 | pci_set_power_state(dev,0); | 806 | The poweroff_noirq() callback is hibernation-specific. It is executed after |
807 | poweroff() callbacks have been executed for all devices in the system. | ||
239 | 808 | ||
240 | Note that if the entire system is transitioning out of a global sleep state, all | 809 | The role of this callback is analogous to the role of the suspend_noirq() and |
241 | devices will be placed in the D0 state, so this is not necessary. However, in | 810 | freeze_noirq() callbacks described above, but it does not need to save the |
242 | the event that the device is placed in the D3 state during normal operation, | 811 | contents of the device's registers. |
243 | this call is necessary. It is impossible to determine which of the two events is | ||
244 | taking place in the driver, so it is always a good idea to make that call. | ||
245 | 812 | ||
246 | The driver should take note of the state that it is resuming from in order to | 813 | The difference between poweroff_noirq() and poweroff() is analogous to the |
247 | ensure correct (and speedy) operation. | 814 | difference between suspend_noirq() and suspend(). |
248 | 815 | ||
249 | The driver should update the current_state field in its pci_dev structure in | 816 | 3.1.8. resume_noirq() |
250 | this function, except for PM-capable devices when pci_set_power_state is used. | ||
251 | 817 | ||
818 | The resume_noirq() callback is only executed during system resume, after the | ||
819 | PM core has enabled the non-boot CPUs. The driver's interrupt handler will not | ||
820 | be invoked while resume_noirq() is running, so this callback can carry out | ||
821 | operations that might race with the interrupt handler. | ||
252 | 822 | ||
823 | Since the PCI subsystem unconditionally puts all devices into the full power | ||
824 | state in the resume_noirq phase of system resume and restores their standard | ||
825 | configuration registers, resume_noirq() is usually not necessary. In general | ||
826 | it should only be used for performing operations that would lead to race | ||
827 | conditions if carried out by resume(). | ||
253 | 828 | ||
254 | A reference implementation | 829 | 3.1.9. resume() |
255 | ------------------------- | ||
256 | .suspend() | ||
257 | { | ||
258 | /* driver specific operations */ | ||
259 | 830 | ||
260 | /* Disable IRQ */ | 831 | The resume() callback is only executed during system resume, after |
261 | free_irq(); | 832 | resume_noirq() callbacks have been executed for all devices in the system and |
262 | /* If using MSI */ | 833 | device interrupts have been enabled by the PM core. |
263 | pci_disable_msi(); | ||
264 | 834 | ||
265 | pci_save_state(); | 835 | This callback is responsible for restoring the pre-suspend configuration of the |
266 | pci_enable_wake(); | 836 | device and bringing it back to the fully functional state. The device should be |
267 | /* Disable IO/bus master/irq router */ | 837 | able to process I/O in a usual way after resume() has returned. |
268 | pci_disable_device(); | ||
269 | pci_set_power_state(pci_choose_state()); | ||
270 | } | ||
271 | 838 | ||
272 | .resume() | 839 | 3.1.10. thaw_noirq() |
273 | { | ||
274 | pci_set_power_state(PCI_D0); | ||
275 | pci_restore_state(); | ||
276 | /* device's irq possibly is changed, driver should take care */ | ||
277 | pci_enable_device(); | ||
278 | pci_set_master(); | ||
279 | 840 | ||
280 | /* if using MSI, device's vector possibly is changed */ | 841 | The thaw_noirq() callback is hibernation-specific. It is executed after a |
281 | pci_enable_msi(); | 842 | system image has been created and the non-boot CPUs have been enabled by the PM |
843 | core, in the thaw_noirq phase of hibernation. It also may be executed if the | ||
844 | loading of a hibernation image fails during system restore (it is then executed | ||
845 | after enabling the non-boot CPUs). The driver's interrupt handler will not be | ||
846 | invoked while thaw_noirq() is running. | ||
282 | 847 | ||
283 | request_irq(); | 848 | The role of this callback is analogous to the role of resume_noirq(). The |
284 | /* driver specific operations; */ | 849 | difference between these two callbacks is that thaw_noirq() is executed after |
285 | } | 850 | freeze() and freeze_noirq(), so in general it does not need to modify the |
851 | contents of the device's registers. | ||
286 | 852 | ||
287 | This is a typical implementation. Drivers can slightly change the order | 853 | 3.1.11. thaw() |
288 | of the operations in the implementation, ignore some operations or add | ||
289 | more driver specific operations in it, but drivers should do something like | ||
290 | this on the whole. | ||
291 | 854 | ||
292 | 5. Resources | 855 | The thaw() callback is hibernation-specific. It is executed after thaw_noirq() |
293 | ~~~~~~~~~~~~ | 856 | callbacks have been executed for all devices in the system and after device |
857 | interrupts have been enabled by the PM core. | ||
294 | 858 | ||
295 | PCI Local Bus Specification | 859 | This callback is responsible for restoring the pre-freeze configuration of |
296 | PCI Bus Power Management Interface Specification | 860 | the device, so that it will work in a usual way after thaw() has returned. |
297 | 861 | ||
298 | http://www.pcisig.com | 862 | 3.1.12. restore_noirq() |
299 | 863 | ||
864 | The restore_noirq() callback is hibernation-specific. It is executed in the | ||
865 | restore_noirq phase of hibernation, when the boot kernel has passed control to | ||
866 | the image kernel and the non-boot CPUs have been enabled by the image kernel's | ||
867 | PM core. | ||
868 | |||
869 | This callback is analogous to resume_noirq() with the exception that it cannot | ||
870 | make any assumption on the previous state of the device, even if the BIOS (or | ||
871 | generally the platform firmware) is known to preserve that state over a | ||
872 | suspend-resume cycle. | ||
873 | |||
874 | For the vast majority of PCI device drivers there is no difference between | ||
875 | resume_noirq() and restore_noirq(). | ||
876 | |||
877 | 3.1.13. restore() | ||
878 | |||
879 | The restore() callback is hibernation-specific. It is executed after | ||
880 | restore_noirq() callbacks have been executed for all devices in the system and | ||
881 | after the PM core has enabled device drivers' interrupt handlers to be invoked. | ||
882 | |||
883 | This callback is analogous to resume(), just like restore_noirq() is analogous | ||
884 | to resume_noirq(). Consequently, the difference between restore_noirq() and | ||
885 | restore() is analogous to the difference between resume_noirq() and resume(). | ||
886 | |||
887 | For the vast majority of PCI device drivers there is no difference between | ||
888 | resume() and restore(). | ||
889 | |||
890 | 3.1.14. complete() | ||
891 | |||
892 | The complete() callback is executed in the following situations: | ||
893 | - during system resume, after resume() callbacks have been executed for all | ||
894 | devices, | ||
895 | - during hibernation, before saving the system image, after thaw() callbacks | ||
896 | have been executed for all devices, | ||
897 | - during system restore, when the system is going back to its pre-hibernation | ||
898 | state, after restore() callbacks have been executed for all devices. | ||
899 | It also may be executed if the loading of a hibernation image into memory fails | ||
900 | (in that case it is run after thaw() callbacks have been executed for all | ||
901 | devices that have drivers in the boot kernel). | ||
902 | |||
903 | This callback is entirely optional, although it may be necessary if the | ||
904 | prepare() callback performs operations that need to be reversed. | ||
905 | |||
906 | 3.1.15. runtime_suspend() | ||
907 | |||
908 | The runtime_suspend() callback is specific to device runtime power management | ||
909 | (runtime PM). It is executed by the PM core's runtime PM framework when the | ||
910 | device is about to be suspended (i.e. quiesced and put into a low-power state) | ||
911 | at run time. | ||
912 | |||
913 | This callback is responsible for freezing the device and preparing it to be | ||
914 | put into a low-power state, but it must allow the PCI subsystem to perform all | ||
915 | of the PCI-specific actions necessary for suspending the device. | ||
916 | |||
917 | 3.1.16. runtime_resume() | ||
918 | |||
919 | The runtime_resume() callback is specific to device runtime PM. It is executed | ||
920 | by the PM core's runtime PM framework when the device is about to be resumed | ||
921 | (i.e. put into the full-power state and programmed to process I/O normally) at | ||
922 | run time. | ||
923 | |||
924 | This callback is responsible for restoring the normal functionality of the | ||
925 | device after it has been put into the full-power state by the PCI subsystem. | ||
926 | The device is expected to be able to process I/O in the usual way after | ||
927 | runtime_resume() has returned. | ||
928 | |||
929 | 3.1.17. runtime_idle() | ||
930 | |||
931 | The runtime_idle() callback is specific to device runtime PM. It is executed | ||
932 | by the PM core's runtime PM framework whenever it may be desirable to suspend | ||
933 | the device according to the PM core's information. In particular, it is | ||
934 | automatically executed right after runtime_resume() has returned in case the | ||
935 | resume of the device has happened as a result of a spurious event. | ||
936 | |||
937 | This callback is optional, but if it is not implemented or if it returns 0, the | ||
938 | PCI subsystem will call pm_runtime_suspend() for the device, which in turn will | ||
939 | cause the driver's runtime_suspend() callback to be executed. | ||
940 | |||
941 | 3.1.18. Pointing Multiple Callback Pointers to One Routine | ||
942 | |||
943 | Although in principle each of the callbacks described in the previous | ||
944 | subsections can be defined as a separate function, it often is convenient to | ||
945 | point two or more members of struct dev_pm_ops to the same routine. There are | ||
946 | a few convenience macros that can be used for this purpose. | ||
947 | |||
948 | The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one | ||
949 | suspend routine pointed to by the .suspend(), .freeze(), and .poweroff() | ||
950 | members and one resume routine pointed to by the .resume(), .thaw(), and | ||
951 | .restore() members. The other function pointers in this struct dev_pm_ops are | ||
952 | unset. | ||
953 | |||
954 | The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it | ||
955 | additionally sets the .runtime_resume() pointer to the same value as | ||
956 | .resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to | ||
957 | the same value as .suspend() (and .freeze() and .poweroff()). | ||
958 | |||
959 | The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct | ||
960 | dev_pm_ops to indicate that one suspend routine is to be pointed to by the | ||
961 | .suspend(), .freeze(), and .poweroff() members and one resume routine is to | ||
962 | be pointed to by the .resume(), .thaw(), and .restore() members. | ||
963 | |||
964 | 3.2. Device Runtime Power Management | ||
965 | ------------------------------------ | ||
966 | In addition to providing device power management callbacks PCI device drivers | ||
967 | are responsible for controlling the runtime power management (runtime PM) of | ||
968 | their devices. | ||
969 | |||
970 | The PCI device runtime PM is optional, but it is recommended that PCI device | ||
971 | drivers implement it at least in the cases where there is a reliable way of | ||
972 | verifying that the device is not used (like when the network cable is detached | ||
973 | from an Ethernet adapter or there are no devices attached to a USB controller). | ||
974 | |||
975 | To support the PCI runtime PM the driver first needs to implement the | ||
976 | runtime_suspend() and runtime_resume() callbacks. It also may need to implement | ||
977 | the runtime_idle() callback to prevent the device from being suspended again | ||
978 | every time right after the runtime_resume() callback has returned | ||
979 | (alternatively, the runtime_suspend() callback will have to check if the | ||
980 | device should really be suspended and return -EAGAIN if that is not the case). | ||
981 | |||
982 | The runtime PM of PCI devices is disabled by default. It is also blocked by | ||
983 | pci_pm_init() that runs the pm_runtime_forbid() helper function. If a PCI | ||
984 | driver implements the runtime PM callbacks and intends to use the runtime PM | ||
985 | framework provided by the PM core and the PCI subsystem, it should enable this | ||
986 | feature by executing the pm_runtime_enable() helper function. However, the | ||
987 | driver should not call the pm_runtime_allow() helper function unblocking | ||
988 | the runtime PM of the device. Instead, it should allow user space or some | ||
989 | platform-specific code to do that (user space can do it via sysfs), although | ||
990 | once it has called pm_runtime_enable(), it must be prepared to handle the | ||
991 | runtime PM of the device correctly as soon as pm_runtime_allow() is called | ||
992 | (which may happen at any time). [It also is possible that user space causes | ||
993 | pm_runtime_allow() to be called via sysfs before the driver is loaded, so in | ||
994 | fact the driver has to be prepared to handle the runtime PM of the device as | ||
995 | soon as it calls pm_runtime_enable().] | ||
996 | |||
997 | The runtime PM framework works by processing requests to suspend or resume | ||
998 | devices, or to check if they are idle (in which cases it is reasonable to | ||
999 | subsequently request that they be suspended). These requests are represented | ||
1000 | by work items put into the power management workqueue, pm_wq. Although there | ||
1001 | are a few situations in which power management requests are automatically | ||
1002 | queued by the PM core (for example, after processing a request to resume a | ||
1003 | device the PM core automatically queues a request to check if the device is | ||
1004 | idle), device drivers are generally responsible for queuing power management | ||
1005 | requests for their devices. For this purpose they should use the runtime PM | ||
1006 | helper functions provided by the PM core, discussed in | ||
1007 | Documentation/power/runtime_pm.txt. | ||
1008 | |||
1009 | Devices can also be suspended and resumed synchronously, without placing a | ||
1010 | request into pm_wq. In the majority of cases this also is done by their | ||
1011 | drivers that use helper functions provided by the PM core for this purpose. | ||
1012 | |||
1013 | For more information on the runtime PM of devices refer to | ||
1014 | Documentation/power/runtime_pm.txt. | ||
1015 | |||
1016 | |||
1017 | 4. Resources | ||
1018 | ============ | ||
1019 | |||
1020 | PCI Local Bus Specification, Rev. 3.0 | ||
1021 | PCI Bus Power Management Interface Specification, Rev. 1.2 | ||
1022 | Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b | ||
1023 | PCI Express Base Specification, Rev. 2.0 | ||
1024 | Documentation/power/devices.txt | ||
1025 | Documentation/power/runtime_pm.txt | ||
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index c40866e8b957..bfed898a03fc 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt | |||
@@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters | |||
18 | being runtime configurable or changeable from a driver was seen as too easy to | 18 | being runtime configurable or changeable from a driver was seen as too easy to |
19 | abuse. | 19 | abuse. |
20 | 20 | ||
21 | For each parameter a list of performance requirements is maintained along with | 21 | For each parameter a list of performance requests is maintained along with |
22 | an aggregated target value. The aggregated target value is updated with | 22 | an aggregated target value. The aggregated target value is updated with |
23 | changes to the requirement list or elements of the list. Typically the | 23 | changes to the request list or elements of the list. Typically the |
24 | aggregated target value is simply the max or min of the requirement values held | 24 | aggregated target value is simply the max or min of the request values held |
25 | in the parameter list elements. | 25 | in the parameter list elements. |
26 | 26 | ||
27 | From kernel mode the use of this interface is simple: | 27 | From kernel mode the use of this interface is simple: |
28 | pm_qos_add_requirement(param_id, name, target_value): | ||
29 | Will insert a named element in the list for that identified PM_QOS parameter | ||
30 | with the target value. Upon change to this list the new target is recomputed | ||
31 | and any registered notifiers are called only if the target value is now | ||
32 | different. | ||
33 | 28 | ||
34 | pm_qos_update_requirement(param_id, name, new_target_value): | 29 | handle = pm_qos_add_request(param_class, target_value): |
35 | Will search the list identified by the param_id for the named list element and | 30 | Will insert an element into the list for that identified PM_QOS class with the |
36 | then update its target value, calling the notification tree if the aggregated | 31 | target value. Upon change to this list the new target is recomputed and any |
37 | target is changed. with that name is already registered. | 32 | registered notifiers are called only if the target value is now different. |
33 | Clients of pm_qos need to save the returned handle. | ||
38 | 34 | ||
39 | pm_qos_remove_requirement(param_id, name): | 35 | void pm_qos_update_request(handle, new_target_value): |
40 | Will search the identified list for the named element and remove it, after | 36 | Will update the list element pointed to by the handle with the new target value |
41 | removal it will update the aggregate target and call the notification tree if | 37 | and recompute the new aggregated target, calling the notification tree if the |
42 | the target was changed as a result of removing the named requirement. | 38 | target is changed. |
39 | |||
40 | void pm_qos_remove_request(handle): | ||
41 | Will remove the element. After removal it will update the aggregate target and | ||
42 | call the notification tree if the target was changed as a result of removing | ||
43 | the request. | ||
43 | 44 | ||
44 | 45 | ||
45 | From user mode: | 46 | From user mode: |
46 | Only processes can register a pm_qos requirement. To provide for automatic | 47 | Only processes can register a pm_qos request. To provide for automatic |
47 | cleanup for process the interface requires the process to register its | 48 | cleanup of a process, the interface requires the process to register its |
48 | parameter requirements in the following way: | 49 | parameter requests in the following way: |
49 | 50 | ||
50 | To register the default pm_qos target for the specific parameter, the process | 51 | To register the default pm_qos target for the specific parameter, the process |
51 | must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] | 52 | must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] |
52 | 53 | ||
53 | As long as the device node is held open that process has a registered | 54 | As long as the device node is held open that process has a registered |
54 | requirement on the parameter. The name of the requirement is "process_<PID>" | 55 | request on the parameter. |
55 | derived from the current->pid from within the open system call. | ||
56 | 56 | ||
57 | To change the requested target value the process needs to write a s32 value to | 57 | To change the requested target value the process needs to write an s32 value to |
58 | the open device node. This translates to a pm_qos_update_requirement call. | 58 | the open device node. Alternatively the user mode program could write a hex |
59 | string for the value using 10 char long format e.g. "0x12345678". This | ||
60 | translates to a pm_qos_update_request call. | ||
59 | 61 | ||
60 | To remove the user mode request for a target value simply close the device | 62 | To remove the user mode request for a target value simply close the device |
61 | node. | 63 | node. |
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt index cdebb5145c25..55c4175d8099 100644 --- a/Documentation/power/regulator/consumer.txt +++ b/Documentation/power/regulator/consumer.txt | |||
@@ -8,11 +8,11 @@ Please see overview.txt for a description of the terms used in this text. | |||
8 | 1. Consumer Regulator Access (static & dynamic drivers) | 8 | 1. Consumer Regulator Access (static & dynamic drivers) |
9 | ======================================================= | 9 | ======================================================= |
10 | 10 | ||
11 | A consumer driver can get access to it's supply regulator by calling :- | 11 | A consumer driver can get access to its supply regulator by calling :- |
12 | 12 | ||
13 | regulator = regulator_get(dev, "Vcc"); | 13 | regulator = regulator_get(dev, "Vcc"); |
14 | 14 | ||
15 | The consumer passes in it's struct device pointer and power supply ID. The core | 15 | The consumer passes in its struct device pointer and power supply ID. The core |
16 | then finds the correct regulator by consulting a machine specific lookup table. | 16 | then finds the correct regulator by consulting a machine specific lookup table. |
17 | If the lookup is successful then this call will return a pointer to the struct | 17 | If the lookup is successful then this call will return a pointer to the struct |
18 | regulator that supplies this consumer. | 18 | regulator that supplies this consumer. |
@@ -34,7 +34,7 @@ usually be called in your device drivers probe() and remove() respectively. | |||
34 | 2. Regulator Output Enable & Disable (static & dynamic drivers) | 34 | 2. Regulator Output Enable & Disable (static & dynamic drivers) |
35 | ==================================================================== | 35 | ==================================================================== |
36 | 36 | ||
37 | A consumer can enable it's power supply by calling:- | 37 | A consumer can enable its power supply by calling:- |
38 | 38 | ||
39 | int regulator_enable(regulator); | 39 | int regulator_enable(regulator); |
40 | 40 | ||
@@ -49,7 +49,7 @@ int regulator_is_enabled(regulator); | |||
49 | This will return > zero when the regulator is enabled. | 49 | This will return > zero when the regulator is enabled. |
50 | 50 | ||
51 | 51 | ||
52 | A consumer can disable it's supply when no longer needed by calling :- | 52 | A consumer can disable its supply when no longer needed by calling :- |
53 | 53 | ||
54 | int regulator_disable(regulator); | 54 | int regulator_disable(regulator); |
55 | 55 | ||
@@ -140,7 +140,7 @@ by calling :- | |||
140 | int regulator_set_optimum_mode(struct regulator *regulator, int load_uA); | 140 | int regulator_set_optimum_mode(struct regulator *regulator, int load_uA); |
141 | 141 | ||
142 | This will cause the core to recalculate the total load on the regulator (based | 142 | This will cause the core to recalculate the total load on the regulator (based |
143 | on all it's consumers) and change operating mode (if necessary and permitted) | 143 | on all its consumers) and change operating mode (if necessary and permitted) |
144 | to best match the current operating load. | 144 | to best match the current operating load. |
145 | 145 | ||
146 | The load_uA value can be determined from the consumers datasheet. e.g.most | 146 | The load_uA value can be determined from the consumers datasheet. e.g.most |
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt index 63728fed620b..bdec39b9bd75 100644 --- a/Documentation/power/regulator/machine.txt +++ b/Documentation/power/regulator/machine.txt | |||
@@ -52,7 +52,7 @@ static struct regulator_init_data regulator1_data = { | |||
52 | }; | 52 | }; |
53 | 53 | ||
54 | Regulator-1 supplies power to Regulator-2. This relationship must be registered | 54 | Regulator-1 supplies power to Regulator-2. This relationship must be registered |
55 | with the core so that Regulator-1 is also enabled when Consumer A enables it's | 55 | with the core so that Regulator-1 is also enabled when Consumer A enables its |
56 | supply (Regulator-2). The supply regulator is set by the supply_regulator_dev | 56 | supply (Regulator-2). The supply regulator is set by the supply_regulator_dev |
57 | field below:- | 57 | field below:- |
58 | 58 | ||
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt index ffd185bb6054..9363e056188a 100644 --- a/Documentation/power/regulator/overview.txt +++ b/Documentation/power/regulator/overview.txt | |||
@@ -35,16 +35,16 @@ Some terms used in this document:- | |||
35 | o Consumer - Electronic device that is supplied power by a regulator. | 35 | o Consumer - Electronic device that is supplied power by a regulator. |
36 | Consumers can be classified into two types:- | 36 | Consumers can be classified into two types:- |
37 | 37 | ||
38 | Static: consumer does not change it's supply voltage or | 38 | Static: consumer does not change its supply voltage or |
39 | current limit. It only needs to enable or disable it's | 39 | current limit. It only needs to enable or disable it's |
40 | power supply. It's supply voltage is set by the hardware, | 40 | power supply. Its supply voltage is set by the hardware, |
41 | bootloader, firmware or kernel board initialisation code. | 41 | bootloader, firmware or kernel board initialisation code. |
42 | 42 | ||
43 | Dynamic: consumer needs to change it's supply voltage or | 43 | Dynamic: consumer needs to change it's supply voltage or |
44 | current limit to meet operation demands. | 44 | current limit to meet operation demands. |
45 | 45 | ||
46 | 46 | ||
47 | o Power Domain - Electronic circuit that is supplied it's input power by the | 47 | o Power Domain - Electronic circuit that is supplied its input power by the |
48 | output power of a regulator, switch or by another power | 48 | output power of a regulator, switch or by another power |
49 | domain. | 49 | domain. |
50 | 50 | ||
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 356fd86f4ea8..55b859b3bc72 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt | |||
@@ -224,6 +224,12 @@ defined in include/linux/pm.h: | |||
224 | RPM_SUSPENDED, which means that each device is initially regarded by the | 224 | RPM_SUSPENDED, which means that each device is initially regarded by the |
225 | PM core as 'suspended', regardless of its real hardware status | 225 | PM core as 'suspended', regardless of its real hardware status |
226 | 226 | ||
227 | unsigned int runtime_auto; | ||
228 | - if set, indicates that the user space has allowed the device driver to | ||
229 | power manage the device at run time via the /sys/devices/.../power/control | ||
230 | interface; it may only be modified with the help of the pm_runtime_allow() | ||
231 | and pm_runtime_forbid() helper functions | ||
232 | |||
227 | All of the above fields are members of the 'power' member of 'struct device'. | 233 | All of the above fields are members of the 'power' member of 'struct device'. |
228 | 234 | ||
229 | 4. Run-time PM Device Helper Functions | 235 | 4. Run-time PM Device Helper Functions |
@@ -250,7 +256,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: | |||
250 | to suspend the device again in future | 256 | to suspend the device again in future |
251 | 257 | ||
252 | int pm_runtime_resume(struct device *dev); | 258 | int pm_runtime_resume(struct device *dev); |
253 | - execute the subsystem-leve resume callback for the device; returns 0 on | 259 | - execute the subsystem-level resume callback for the device; returns 0 on |
254 | success, 1 if the device's run-time PM status was already 'active' or | 260 | success, 1 if the device's run-time PM status was already 'active' or |
255 | error code on failure, where -EAGAIN means it may be safe to attempt to | 261 | error code on failure, where -EAGAIN means it may be safe to attempt to |
256 | resume the device again in future, but 'power.runtime_error' should be | 262 | resume the device again in future, but 'power.runtime_error' should be |
@@ -329,6 +335,20 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: | |||
329 | 'power.runtime_error' is set or 'power.disable_depth' is greater than | 335 | 'power.runtime_error' is set or 'power.disable_depth' is greater than |
330 | zero) | 336 | zero) |
331 | 337 | ||
338 | bool pm_runtime_suspended(struct device *dev); | ||
339 | - return true if the device's runtime PM status is 'suspended', or false | ||
340 | otherwise | ||
341 | |||
342 | void pm_runtime_allow(struct device *dev); | ||
343 | - set the power.runtime_auto flag for the device and decrease its usage | ||
344 | counter (used by the /sys/devices/.../power/control interface to | ||
345 | effectively allow the device to be power managed at run time) | ||
346 | |||
347 | void pm_runtime_forbid(struct device *dev); | ||
348 | - unset the power.runtime_auto flag for the device and increase its usage | ||
349 | counter (used by the /sys/devices/.../power/control interface to | ||
350 | effectively prevent the device from being power managed at run time) | ||
351 | |||
332 | It is safe to execute the following helper functions from interrupt context: | 352 | It is safe to execute the following helper functions from interrupt context: |
333 | 353 | ||
334 | pm_request_idle() | 354 | pm_request_idle() |
@@ -382,6 +402,18 @@ may be desirable to suspend the device as soon as ->probe() or ->remove() has | |||
382 | finished, so the PM core uses pm_runtime_idle_sync() to invoke the | 402 | finished, so the PM core uses pm_runtime_idle_sync() to invoke the |
383 | subsystem-level idle callback for the device at that time. | 403 | subsystem-level idle callback for the device at that time. |
384 | 404 | ||
405 | The user space can effectively disallow the driver of the device to power manage | ||
406 | it at run time by changing the value of its /sys/devices/.../power/control | ||
407 | attribute to "on", which causes pm_runtime_forbid() to be called. In principle, | ||
408 | this mechanism may also be used by the driver to effectively turn off the | ||
409 | run-time power management of the device until the user space turns it on. | ||
410 | Namely, during the initialization the driver can make sure that the run-time PM | ||
411 | status of the device is 'active' and call pm_runtime_forbid(). It should be | ||
412 | noted, however, that if the user space has already intentionally changed the | ||
413 | value of /sys/devices/.../power/control to "auto" to allow the driver to power | ||
414 | manage the device at run time, the driver may confuse it by using | ||
415 | pm_runtime_forbid() this way. | ||
416 | |||
385 | 6. Run-time PM and System Sleep | 417 | 6. Run-time PM and System Sleep |
386 | 418 | ||
387 | Run-time PM and system sleep (i.e., system suspend and hibernation, also known | 419 | Run-time PM and system sleep (i.e., system suspend and hibernation, also known |
@@ -431,3 +463,64 @@ The PM core always increments the run-time usage counter before calling the | |||
431 | ->prepare() callback and decrements it after calling the ->complete() callback. | 463 | ->prepare() callback and decrements it after calling the ->complete() callback. |
432 | Hence disabling run-time PM temporarily like this will not cause any run-time | 464 | Hence disabling run-time PM temporarily like this will not cause any run-time |
433 | suspend callbacks to be lost. | 465 | suspend callbacks to be lost. |
466 | |||
467 | 7. Generic subsystem callbacks | ||
468 | |||
469 | Subsystems may wish to conserve code space by using the set of generic power | ||
470 | management callbacks provided by the PM core, defined in | ||
471 | driver/base/power/generic_ops.c: | ||
472 | |||
473 | int pm_generic_runtime_idle(struct device *dev); | ||
474 | - invoke the ->runtime_idle() callback provided by the driver of this | ||
475 | device, if defined, and call pm_runtime_suspend() for this device if the | ||
476 | return value is 0 or the callback is not defined | ||
477 | |||
478 | int pm_generic_runtime_suspend(struct device *dev); | ||
479 | - invoke the ->runtime_suspend() callback provided by the driver of this | ||
480 | device and return its result, or return -EINVAL if not defined | ||
481 | |||
482 | int pm_generic_runtime_resume(struct device *dev); | ||
483 | - invoke the ->runtime_resume() callback provided by the driver of this | ||
484 | device and return its result, or return -EINVAL if not defined | ||
485 | |||
486 | int pm_generic_suspend(struct device *dev); | ||
487 | - if the device has not been suspended at run time, invoke the ->suspend() | ||
488 | callback provided by its driver and return its result, or return 0 if not | ||
489 | defined | ||
490 | |||
491 | int pm_generic_resume(struct device *dev); | ||
492 | - invoke the ->resume() callback provided by the driver of this device and, | ||
493 | if successful, change the device's runtime PM status to 'active' | ||
494 | |||
495 | int pm_generic_freeze(struct device *dev); | ||
496 | - if the device has not been suspended at run time, invoke the ->freeze() | ||
497 | callback provided by its driver and return its result, or return 0 if not | ||
498 | defined | ||
499 | |||
500 | int pm_generic_thaw(struct device *dev); | ||
501 | - if the device has not been suspended at run time, invoke the ->thaw() | ||
502 | callback provided by its driver and return its result, or return 0 if not | ||
503 | defined | ||
504 | |||
505 | int pm_generic_poweroff(struct device *dev); | ||
506 | - if the device has not been suspended at run time, invoke the ->poweroff() | ||
507 | callback provided by its driver and return its result, or return 0 if not | ||
508 | defined | ||
509 | |||
510 | int pm_generic_restore(struct device *dev); | ||
511 | - invoke the ->restore() callback provided by the driver of this device and, | ||
512 | if successful, change the device's runtime PM status to 'active' | ||
513 | |||
514 | These functions can be assigned to the ->runtime_idle(), ->runtime_suspend(), | ||
515 | ->runtime_resume(), ->suspend(), ->resume(), ->freeze(), ->thaw(), ->poweroff(), | ||
516 | or ->restore() callback pointers in the subsystem-level dev_pm_ops structures. | ||
517 | |||
518 | If a subsystem wishes to use all of them at the same time, it can simply assign | ||
519 | the GENERIC_SUBSYS_PM_OPS macro, defined in include/linux/pm.h, to its | ||
520 | dev_pm_ops structure pointer. | ||
521 | |||
522 | Device drivers that wish to use the same function as a system suspend, freeze, | ||
523 | poweroff and run-time suspend callback, and similarly for system resume, thaw, | ||
524 | restore, and run-time resume, can achieve this with the help of the | ||
525 | UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its | ||
526 | last argument to NULL). | ||
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt index b967cd9137d6..81680f9f5909 100644 --- a/Documentation/power/userland-swsusp.txt +++ b/Documentation/power/userland-swsusp.txt | |||
@@ -24,6 +24,10 @@ assumed to be in the resume mode. The device cannot be open for simultaneous | |||
24 | reading and writing. It is also impossible to have the device open more than | 24 | reading and writing. It is also impossible to have the device open more than |
25 | once at a time. | 25 | once at a time. |
26 | 26 | ||
27 | Even opening the device has side effects. Data structures are | ||
28 | allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are | ||
29 | called. | ||
30 | |||
27 | The ioctl() commands recognized by the device are: | 31 | The ioctl() commands recognized by the device are: |
28 | 32 | ||
29 | SNAPSHOT_FREEZE - freeze user space processes (the current process is | 33 | SNAPSHOT_FREEZE - freeze user space processes (the current process is |
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index 79f533f38c61..46d22105aa07 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt | |||
@@ -1289,7 +1289,7 @@ link between a device node and its interrupt parent in | |||
1289 | the interrupt tree. The value of interrupt-parent is the | 1289 | the interrupt tree. The value of interrupt-parent is the |
1290 | phandle of the parent node. | 1290 | phandle of the parent node. |
1291 | 1291 | ||
1292 | If the interrupt-parent property is not defined for a node, it's | 1292 | If the interrupt-parent property is not defined for a node, its |
1293 | interrupt parent is assumed to be an ancestor in the node's | 1293 | interrupt parent is assumed to be an ancestor in the node's |
1294 | _device tree_ hierarchy. | 1294 | _device tree_ hierarchy. |
1295 | 1295 | ||
diff --git a/Documentation/powerpc/dts-bindings/4xx/reboot.txt b/Documentation/powerpc/dts-bindings/4xx/reboot.txt new file mode 100644 index 000000000000..d7217260589c --- /dev/null +++ b/Documentation/powerpc/dts-bindings/4xx/reboot.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | Reboot property to control system reboot on PPC4xx systems: | ||
2 | |||
3 | By setting "reset_type" to one of the following values, the default | ||
4 | software reset mechanism may be overidden. Here the possible values of | ||
5 | "reset_type": | ||
6 | |||
7 | 1 - PPC4xx core reset | ||
8 | 2 - PPC4xx chip reset | ||
9 | 3 - PPC4xx system reset (default) | ||
10 | |||
11 | Example: | ||
12 | |||
13 | cpu@0 { | ||
14 | device_type = "cpu"; | ||
15 | model = "PowerPC,440SPe"; | ||
16 | ... | ||
17 | reset-type = <2>; /* Use chip-reset */ | ||
18 | }; | ||
diff --git a/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt index d015dcec4011..b0019eb5330e 100644 --- a/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt +++ b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt | |||
@@ -11,7 +11,7 @@ Required properties: | |||
11 | 83xx, "fsl,mpc8572-gpio" for 85xx and "fsl,mpc8610-gpio" for 86xx. | 11 | 83xx, "fsl,mpc8572-gpio" for 85xx and "fsl,mpc8610-gpio" for 86xx. |
12 | - #gpio-cells : Should be two. The first cell is the pin number and the | 12 | - #gpio-cells : Should be two. The first cell is the pin number and the |
13 | second cell is used to specify optional parameters (currently unused). | 13 | second cell is used to specify optional parameters (currently unused). |
14 | - interrupts : Interrupt mapping for GPIO IRQ (currently unused). | 14 | - interrupts : Interrupt mapping for GPIO IRQ. |
15 | - interrupt-parent : Phandle for the interrupt controller that | 15 | - interrupt-parent : Phandle for the interrupt controller that |
16 | services interrupts for this device. | 16 | services interrupts for this device. |
17 | - gpio-controller : Marks the port as GPIO controller. | 17 | - gpio-controller : Marks the port as GPIO controller. |
@@ -38,3 +38,23 @@ Example of gpio-controller nodes for a MPC8347 SoC: | |||
38 | 38 | ||
39 | See booting-without-of.txt for details of how to specify GPIO | 39 | See booting-without-of.txt for details of how to specify GPIO |
40 | information for devices. | 40 | information for devices. |
41 | |||
42 | To use GPIO pins as interrupt sources for peripherals, specify the | ||
43 | GPIO controller as the interrupt parent and define GPIO number + | ||
44 | trigger mode using the interrupts property, which is defined like | ||
45 | this: | ||
46 | |||
47 | interrupts = <number trigger>, where: | ||
48 | - number: GPIO pin (0..31) | ||
49 | - trigger: trigger mode: | ||
50 | 2 = trigger on falling edge | ||
51 | 3 = trigger on both edges | ||
52 | |||
53 | Example of device using this is: | ||
54 | |||
55 | funkyfpga@0 { | ||
56 | compatible = "funky-fpga"; | ||
57 | ... | ||
58 | interrupts = <4 3>; | ||
59 | interrupt-parent = <&gpio1>; | ||
60 | }; | ||
diff --git a/Documentation/powerpc/dts-bindings/fsl/can.txt b/Documentation/powerpc/dts-bindings/fsl/can.txt new file mode 100644 index 000000000000..2fa4fcd38fd6 --- /dev/null +++ b/Documentation/powerpc/dts-bindings/fsl/can.txt | |||
@@ -0,0 +1,53 @@ | |||
1 | CAN Device Tree Bindings | ||
2 | ------------------------ | ||
3 | |||
4 | (c) 2006-2009 Secret Lab Technologies Ltd | ||
5 | Grant Likely <grant.likely@secretlab.ca> | ||
6 | |||
7 | fsl,mpc5200-mscan nodes | ||
8 | ----------------------- | ||
9 | In addition to the required compatible-, reg- and interrupt-properties, you can | ||
10 | also specify which clock source shall be used for the controller: | ||
11 | |||
12 | - fsl,mscan-clock-source : a string describing the clock source. Valid values | ||
13 | are: "ip" for ip bus clock | ||
14 | "ref" for reference clock (XTAL) | ||
15 | "ref" is default in case this property is not | ||
16 | present. | ||
17 | |||
18 | fsl,mpc5121-mscan nodes | ||
19 | ----------------------- | ||
20 | In addition to the required compatible-, reg- and interrupt-properties, you can | ||
21 | also specify which clock source and divider shall be used for the controller: | ||
22 | |||
23 | - fsl,mscan-clock-source : a string describing the clock source. Valid values | ||
24 | are: "ip" for ip bus clock | ||
25 | "ref" for reference clock | ||
26 | "sys" for system clock | ||
27 | If this property is not present, an optimal CAN | ||
28 | clock source and frequency based on the system | ||
29 | clock will be selected. If this is not possible, | ||
30 | the reference clock will be used. | ||
31 | |||
32 | - fsl,mscan-clock-divider: for the reference and system clock, an additional | ||
33 | clock divider can be specified. By default, a | ||
34 | value of 1 is used. | ||
35 | |||
36 | Note that the MPC5121 Rev. 1 processor is not supported. | ||
37 | |||
38 | Examples: | ||
39 | can@1300 { | ||
40 | compatible = "fsl,mpc5121-mscan"; | ||
41 | interrupts = <12 0x8>; | ||
42 | interrupt-parent = <&ipic>; | ||
43 | reg = <0x1300 0x80>; | ||
44 | }; | ||
45 | |||
46 | can@1380 { | ||
47 | compatible = "fsl,mpc5121-mscan"; | ||
48 | interrupts = <13 0x8>; | ||
49 | interrupt-parent = <&ipic>; | ||
50 | reg = <0x1380 0x80>; | ||
51 | fsl,mscan-clock-source = "ref"; | ||
52 | fsl,mscan-clock-divider = <3>; | ||
53 | }; | ||
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt index 6e37be1eeb2d..4f8930263dd9 100644 --- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt +++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt | |||
@@ -21,6 +21,15 @@ Required properties: | |||
21 | - fsl,qe-num-snums: define how many serial number(SNUM) the QE can use for the | 21 | - fsl,qe-num-snums: define how many serial number(SNUM) the QE can use for the |
22 | threads. | 22 | threads. |
23 | 23 | ||
24 | Optional properties: | ||
25 | - fsl,firmware-phandle: | ||
26 | Usage: required only if there is no fsl,qe-firmware child node | ||
27 | Value type: <phandle> | ||
28 | Definition: Points to a firmware node (see "QE Firmware Node" below) | ||
29 | that contains the firmware that should be uploaded for this QE. | ||
30 | The compatible property for the firmware node should say, | ||
31 | "fsl,qe-firmware". | ||
32 | |||
24 | Recommended properties | 33 | Recommended properties |
25 | - brg-frequency : the internal clock source frequency for baud-rate | 34 | - brg-frequency : the internal clock source frequency for baud-rate |
26 | generators in Hz. | 35 | generators in Hz. |
@@ -59,3 +68,48 @@ Example: | |||
59 | reg = <0 c000>; | 68 | reg = <0 c000>; |
60 | }; | 69 | }; |
61 | }; | 70 | }; |
71 | |||
72 | * QE Firmware Node | ||
73 | |||
74 | This node defines a firmware binary that is embedded in the device tree, for | ||
75 | the purpose of passing the firmware from bootloader to the kernel, or from | ||
76 | the hypervisor to the guest. | ||
77 | |||
78 | The firmware node itself contains the firmware binary contents, a compatible | ||
79 | property, and any firmware-specific properties. The node should be placed | ||
80 | inside a QE node that needs it. Doing so eliminates the need for a | ||
81 | fsl,firmware-phandle property. Other QE nodes that need the same firmware | ||
82 | should define an fsl,firmware-phandle property that points to the firmware node | ||
83 | in the first QE node. | ||
84 | |||
85 | The fsl,firmware property can be specified in the DTS (possibly using incbin) | ||
86 | or can be inserted by the boot loader at boot time. | ||
87 | |||
88 | Required properties: | ||
89 | - compatible | ||
90 | Usage: required | ||
91 | Value type: <string> | ||
92 | Definition: A standard property. Specify a string that indicates what | ||
93 | kind of firmware it is. For QE, this should be "fsl,qe-firmware". | ||
94 | |||
95 | - fsl,firmware | ||
96 | Usage: required | ||
97 | Value type: <prop-encoded-array>, encoded as an array of bytes | ||
98 | Definition: A standard property. This property contains the firmware | ||
99 | binary "blob". | ||
100 | |||
101 | Example: | ||
102 | qe1@e0080000 { | ||
103 | compatible = "fsl,qe"; | ||
104 | qe_firmware:qe-firmware { | ||
105 | compatible = "fsl,qe-firmware"; | ||
106 | fsl,firmware = [0x70 0xcd 0x00 0x00 0x01 0x46 0x45 ...]; | ||
107 | }; | ||
108 | ... | ||
109 | }; | ||
110 | |||
111 | qe2@e0090000 { | ||
112 | compatible = "fsl,qe"; | ||
113 | fsl,firmware-phandle = <&qe_firmware>; | ||
114 | ... | ||
115 | }; | ||
diff --git a/Documentation/powerpc/dts-bindings/fsl/dma.txt b/Documentation/powerpc/dts-bindings/fsl/dma.txt index 0732cdd05ba1..2a4b4bce6110 100644 --- a/Documentation/powerpc/dts-bindings/fsl/dma.txt +++ b/Documentation/powerpc/dts-bindings/fsl/dma.txt | |||
@@ -44,21 +44,29 @@ Example: | |||
44 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; | 44 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; |
45 | cell-index = <0>; | 45 | cell-index = <0>; |
46 | reg = <0 0x80>; | 46 | reg = <0 0x80>; |
47 | interrupt-parent = <&ipic>; | ||
48 | interrupts = <71 8>; | ||
47 | }; | 49 | }; |
48 | dma-channel@80 { | 50 | dma-channel@80 { |
49 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; | 51 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; |
50 | cell-index = <1>; | 52 | cell-index = <1>; |
51 | reg = <0x80 0x80>; | 53 | reg = <0x80 0x80>; |
54 | interrupt-parent = <&ipic>; | ||
55 | interrupts = <71 8>; | ||
52 | }; | 56 | }; |
53 | dma-channel@100 { | 57 | dma-channel@100 { |
54 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; | 58 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; |
55 | cell-index = <2>; | 59 | cell-index = <2>; |
56 | reg = <0x100 0x80>; | 60 | reg = <0x100 0x80>; |
61 | interrupt-parent = <&ipic>; | ||
62 | interrupts = <71 8>; | ||
57 | }; | 63 | }; |
58 | dma-channel@180 { | 64 | dma-channel@180 { |
59 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; | 65 | compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; |
60 | cell-index = <3>; | 66 | cell-index = <3>; |
61 | reg = <0x180 0x80>; | 67 | reg = <0x180 0x80>; |
68 | interrupt-parent = <&ipic>; | ||
69 | interrupts = <71 8>; | ||
62 | }; | 70 | }; |
63 | }; | 71 | }; |
64 | 72 | ||
diff --git a/Documentation/powerpc/dts-bindings/fsl/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/i2c.txt index b6d2e21474f9..50da20310585 100644 --- a/Documentation/powerpc/dts-bindings/fsl/i2c.txt +++ b/Documentation/powerpc/dts-bindings/fsl/i2c.txt | |||
@@ -2,15 +2,14 @@ | |||
2 | 2 | ||
3 | Required properties : | 3 | Required properties : |
4 | 4 | ||
5 | - device_type : Should be "i2c" | ||
6 | - reg : Offset and length of the register set for the device | 5 | - reg : Offset and length of the register set for the device |
6 | - compatible : should be "fsl,CHIP-i2c" where CHIP is the name of a | ||
7 | compatible processor, e.g. mpc8313, mpc8543, mpc8544, mpc5121, | ||
8 | mpc5200 or mpc5200b. For the mpc5121, an additional node | ||
9 | "fsl,mpc5121-i2c-ctrl" is required as shown in the example below. | ||
7 | 10 | ||
8 | Recommended properties : | 11 | Recommended properties : |
9 | 12 | ||
10 | - compatible : compatibility list with 2 entries, the first should | ||
11 | be "fsl,CHIP-i2c" where CHIP is the name of a compatible processor, | ||
12 | e.g. mpc8313, mpc8543, mpc8544, mpc5200 or mpc5200b. The second one | ||
13 | should be "fsl-i2c". | ||
14 | - interrupts : <a b> where a is the interrupt number and b is a | 13 | - interrupts : <a b> where a is the interrupt number and b is a |
15 | field that represents an encoding of the sense and level | 14 | field that represents an encoding of the sense and level |
16 | information for the interrupt. This should be encoded based on | 15 | information for the interrupt. This should be encoded based on |
@@ -24,25 +23,40 @@ Recommended properties : | |||
24 | 23 | ||
25 | Examples : | 24 | Examples : |
26 | 25 | ||
26 | /* MPC5121 based board */ | ||
27 | i2c@1740 { | ||
28 | #address-cells = <1>; | ||
29 | #size-cells = <0>; | ||
30 | compatible = "fsl,mpc5121-i2c", "fsl-i2c"; | ||
31 | reg = <0x1740 0x20>; | ||
32 | interrupts = <11 0x8>; | ||
33 | interrupt-parent = <&ipic>; | ||
34 | clock-frequency = <100000>; | ||
35 | }; | ||
36 | |||
37 | i2ccontrol@1760 { | ||
38 | compatible = "fsl,mpc5121-i2c-ctrl"; | ||
39 | reg = <0x1760 0x8>; | ||
40 | }; | ||
41 | |||
42 | /* MPC5200B based board */ | ||
27 | i2c@3d00 { | 43 | i2c@3d00 { |
28 | #address-cells = <1>; | 44 | #address-cells = <1>; |
29 | #size-cells = <0>; | 45 | #size-cells = <0>; |
30 | compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c"; | 46 | compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c"; |
31 | cell-index = <0>; | ||
32 | reg = <0x3d00 0x40>; | 47 | reg = <0x3d00 0x40>; |
33 | interrupts = <2 15 0>; | 48 | interrupts = <2 15 0>; |
34 | interrupt-parent = <&mpc5200_pic>; | 49 | interrupt-parent = <&mpc5200_pic>; |
35 | fsl,preserve-clocking; | 50 | fsl,preserve-clocking; |
36 | }; | 51 | }; |
37 | 52 | ||
53 | /* MPC8544 base board */ | ||
38 | i2c@3100 { | 54 | i2c@3100 { |
39 | #address-cells = <1>; | 55 | #address-cells = <1>; |
40 | #size-cells = <0>; | 56 | #size-cells = <0>; |
41 | cell-index = <1>; | ||
42 | compatible = "fsl,mpc8544-i2c", "fsl-i2c"; | 57 | compatible = "fsl,mpc8544-i2c", "fsl-i2c"; |
43 | reg = <0x3100 0x100>; | 58 | reg = <0x3100 0x100>; |
44 | interrupts = <43 2>; | 59 | interrupts = <43 2>; |
45 | interrupt-parent = <&mpic>; | 60 | interrupt-parent = <&mpic>; |
46 | clock-frequency = <400000>; | 61 | clock-frequency = <400000>; |
47 | }; | 62 | }; |
48 | |||
diff --git a/Documentation/powerpc/dts-bindings/fsl/mpc5121-psc.txt b/Documentation/powerpc/dts-bindings/fsl/mpc5121-psc.txt new file mode 100644 index 000000000000..8832e8798912 --- /dev/null +++ b/Documentation/powerpc/dts-bindings/fsl/mpc5121-psc.txt | |||
@@ -0,0 +1,70 @@ | |||
1 | MPC5121 PSC Device Tree Bindings | ||
2 | |||
3 | PSC in UART mode | ||
4 | ---------------- | ||
5 | |||
6 | For PSC in UART mode the needed PSC serial devices | ||
7 | are specified by fsl,mpc5121-psc-uart nodes in the | ||
8 | fsl,mpc5121-immr SoC node. Additionally the PSC FIFO | ||
9 | Controller node fsl,mpc5121-psc-fifo is requered there: | ||
10 | |||
11 | fsl,mpc5121-psc-uart nodes | ||
12 | -------------------------- | ||
13 | |||
14 | Required properties : | ||
15 | - compatible : Should contain "fsl,mpc5121-psc-uart" and "fsl,mpc5121-psc" | ||
16 | - cell-index : Index of the PSC in hardware | ||
17 | - reg : Offset and length of the register set for the PSC device | ||
18 | - interrupts : <a b> where a is the interrupt number of the | ||
19 | PSC FIFO Controller and b is a field that represents an | ||
20 | encoding of the sense and level information for the interrupt. | ||
21 | - interrupt-parent : the phandle for the interrupt controller that | ||
22 | services interrupts for this device. | ||
23 | |||
24 | Recommended properties : | ||
25 | - fsl,rx-fifo-size : the size of the RX fifo slice (a multiple of 4) | ||
26 | - fsl,tx-fifo-size : the size of the TX fifo slice (a multiple of 4) | ||
27 | |||
28 | |||
29 | fsl,mpc5121-psc-fifo node | ||
30 | ------------------------- | ||
31 | |||
32 | Required properties : | ||
33 | - compatible : Should be "fsl,mpc5121-psc-fifo" | ||
34 | - reg : Offset and length of the register set for the PSC | ||
35 | FIFO Controller | ||
36 | - interrupts : <a b> where a is the interrupt number of the | ||
37 | PSC FIFO Controller and b is a field that represents an | ||
38 | encoding of the sense and level information for the interrupt. | ||
39 | - interrupt-parent : the phandle for the interrupt controller that | ||
40 | services interrupts for this device. | ||
41 | |||
42 | |||
43 | Example for a board using PSC0 and PSC1 devices in serial mode: | ||
44 | |||
45 | serial@11000 { | ||
46 | compatible = "fsl,mpc5121-psc-uart", "fsl,mpc5121-psc"; | ||
47 | cell-index = <0>; | ||
48 | reg = <0x11000 0x100>; | ||
49 | interrupts = <40 0x8>; | ||
50 | interrupt-parent = < &ipic >; | ||
51 | fsl,rx-fifo-size = <16>; | ||
52 | fsl,tx-fifo-size = <16>; | ||
53 | }; | ||
54 | |||
55 | serial@11100 { | ||
56 | compatible = "fsl,mpc5121-psc-uart", "fsl,mpc5121-psc"; | ||
57 | cell-index = <1>; | ||
58 | reg = <0x11100 0x100>; | ||
59 | interrupts = <40 0x8>; | ||
60 | interrupt-parent = < &ipic >; | ||
61 | fsl,rx-fifo-size = <16>; | ||
62 | fsl,tx-fifo-size = <16>; | ||
63 | }; | ||
64 | |||
65 | pscfifo@11f00 { | ||
66 | compatible = "fsl,mpc5121-psc-fifo"; | ||
67 | reg = <0x11f00 0x100>; | ||
68 | interrupts = <40 0x8>; | ||
69 | interrupt-parent = < &ipic >; | ||
70 | }; | ||
diff --git a/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt b/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt index 5c6602dbfdc2..4ccb2cd5df94 100644 --- a/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt +++ b/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt | |||
@@ -195,11 +195,4 @@ External interrupts: | |||
195 | 195 | ||
196 | fsl,mpc5200-mscan nodes | 196 | fsl,mpc5200-mscan nodes |
197 | ----------------------- | 197 | ----------------------- |
198 | In addition to the required compatible-, reg- and interrupt-properites, you can | 198 | See file can.txt in this directory. |
199 | also specify which clock source shall be used for the controller: | ||
200 | |||
201 | - fsl,mscan-clock-source- a string describing the clock source. Valid values | ||
202 | are: "ip" for ip bus clock | ||
203 | "ref" for reference clock (XTAL) | ||
204 | "ref" is default in case this property is not | ||
205 | present. | ||
diff --git a/Documentation/powerpc/dts-bindings/fsl/spi.txt b/Documentation/powerpc/dts-bindings/fsl/spi.txt index e7d9a344c4f4..80510c018eea 100644 --- a/Documentation/powerpc/dts-bindings/fsl/spi.txt +++ b/Documentation/powerpc/dts-bindings/fsl/spi.txt | |||
@@ -13,6 +13,11 @@ Required properties: | |||
13 | - interrupt-parent : the phandle for the interrupt controller that | 13 | - interrupt-parent : the phandle for the interrupt controller that |
14 | services interrupts for this device. | 14 | services interrupts for this device. |
15 | 15 | ||
16 | Optional properties: | ||
17 | - gpios : specifies the gpio pins to be used for chipselects. | ||
18 | The gpios will be referred to as reg = <index> in the SPI child nodes. | ||
19 | If unspecified, a single SPI device without a chip select can be used. | ||
20 | |||
16 | Example: | 21 | Example: |
17 | spi@4c0 { | 22 | spi@4c0 { |
18 | cell-index = <0>; | 23 | cell-index = <0>; |
@@ -21,4 +26,6 @@ Example: | |||
21 | interrupts = <82 0>; | 26 | interrupts = <82 0>; |
22 | interrupt-parent = <700>; | 27 | interrupt-parent = <700>; |
23 | mode = "cpu"; | 28 | mode = "cpu"; |
29 | gpios = <&gpio 18 1 // device reg=<0> | ||
30 | &gpio 19 1>; // device reg=<1> | ||
24 | }; | 31 | }; |
diff --git a/Documentation/powerpc/dts-bindings/xilinx.txt b/Documentation/powerpc/dts-bindings/xilinx.txt index ea68046bb9cb..299d0923537b 100644 --- a/Documentation/powerpc/dts-bindings/xilinx.txt +++ b/Documentation/powerpc/dts-bindings/xilinx.txt | |||
@@ -11,7 +11,7 @@ | |||
11 | control how the core is synthesized. Historically, the EDK tool would | 11 | control how the core is synthesized. Historically, the EDK tool would |
12 | extract the device parameters relevant to device drivers and copy them | 12 | extract the device parameters relevant to device drivers and copy them |
13 | into an 'xparameters.h' in the form of #define symbols. This tells the | 13 | into an 'xparameters.h' in the form of #define symbols. This tells the |
14 | device drivers how the IP cores are configured, but it requres the kernel | 14 | device drivers how the IP cores are configured, but it requires the kernel |
15 | to be recompiled every time the FPGA bitstream is resynthesized. | 15 | to be recompiled every time the FPGA bitstream is resynthesized. |
16 | 16 | ||
17 | The new approach is to export the parameters into the device tree and | 17 | The new approach is to export the parameters into the device tree and |
diff --git a/Documentation/powerpc/phyp-assisted-dump.txt b/Documentation/powerpc/phyp-assisted-dump.txt index c4682b982a2e..ad340205d96a 100644 --- a/Documentation/powerpc/phyp-assisted-dump.txt +++ b/Documentation/powerpc/phyp-assisted-dump.txt | |||
@@ -19,7 +19,7 @@ dump offers several strong, practical advantages: | |||
19 | immediately available to the system for normal use. | 19 | immediately available to the system for normal use. |
20 | -- After the dump is completed, no further reboots are | 20 | -- After the dump is completed, no further reboots are |
21 | required; the system will be fully usable, and running | 21 | required; the system will be fully usable, and running |
22 | in it's normal, production mode on it normal kernel. | 22 | in its normal, production mode on its normal kernel. |
23 | 23 | ||
24 | The above can only be accomplished by coordination with, | 24 | The above can only be accomplished by coordination with, |
25 | and assistance from the hypervisor. The procedure is | 25 | and assistance from the hypervisor. The procedure is |
diff --git a/Documentation/powerpc/ptrace.txt b/Documentation/powerpc/ptrace.txt new file mode 100644 index 000000000000..f4a5499b7bc6 --- /dev/null +++ b/Documentation/powerpc/ptrace.txt | |||
@@ -0,0 +1,134 @@ | |||
1 | GDB intends to support the following hardware debug features of BookE | ||
2 | processors: | ||
3 | |||
4 | 4 hardware breakpoints (IAC) | ||
5 | 2 hardware watchpoints (read, write and read-write) (DAC) | ||
6 | 2 value conditions for the hardware watchpoints (DVC) | ||
7 | |||
8 | For that, we need to extend ptrace so that GDB can query and set these | ||
9 | resources. Since we're extending, we're trying to create an interface | ||
10 | that's extendable and that covers both BookE and server processors, so | ||
11 | that GDB doesn't need to special-case each of them. We added the | ||
12 | following 3 new ptrace requests. | ||
13 | |||
14 | 1. PTRACE_PPC_GETHWDEBUGINFO | ||
15 | |||
16 | Query for GDB to discover the hardware debug features. The main info to | ||
17 | be returned here is the minimum alignment for the hardware watchpoints. | ||
18 | BookE processors don't have restrictions here, but server processors have | ||
19 | an 8-byte alignment restriction for hardware watchpoints. We'd like to avoid | ||
20 | adding special cases to GDB based on what it sees in AUXV. | ||
21 | |||
22 | Since we're at it, we added other useful info that the kernel can return to | ||
23 | GDB: this query will return the number of hardware breakpoints, hardware | ||
24 | watchpoints and whether it supports a range of addresses and a condition. | ||
25 | The query will fill the following structure provided by the requesting process: | ||
26 | |||
27 | struct ppc_debug_info { | ||
28 | unit32_t version; | ||
29 | unit32_t num_instruction_bps; | ||
30 | unit32_t num_data_bps; | ||
31 | unit32_t num_condition_regs; | ||
32 | unit32_t data_bp_alignment; | ||
33 | unit32_t sizeof_condition; /* size of the DVC register */ | ||
34 | uint64_t features; /* bitmask of the individual flags */ | ||
35 | }; | ||
36 | |||
37 | features will have bits indicating whether there is support for: | ||
38 | |||
39 | #define PPC_DEBUG_FEATURE_INSN_BP_RANGE 0x1 | ||
40 | #define PPC_DEBUG_FEATURE_INSN_BP_MASK 0x2 | ||
41 | #define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x4 | ||
42 | #define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x8 | ||
43 | |||
44 | 2. PTRACE_SETHWDEBUG | ||
45 | |||
46 | Sets a hardware breakpoint or watchpoint, according to the provided structure: | ||
47 | |||
48 | struct ppc_hw_breakpoint { | ||
49 | uint32_t version; | ||
50 | #define PPC_BREAKPOINT_TRIGGER_EXECUTE 0x1 | ||
51 | #define PPC_BREAKPOINT_TRIGGER_READ 0x2 | ||
52 | #define PPC_BREAKPOINT_TRIGGER_WRITE 0x4 | ||
53 | uint32_t trigger_type; /* only some combinations allowed */ | ||
54 | #define PPC_BREAKPOINT_MODE_EXACT 0x0 | ||
55 | #define PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE 0x1 | ||
56 | #define PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE 0x2 | ||
57 | #define PPC_BREAKPOINT_MODE_MASK 0x3 | ||
58 | uint32_t addr_mode; /* address match mode */ | ||
59 | |||
60 | #define PPC_BREAKPOINT_CONDITION_MODE 0x3 | ||
61 | #define PPC_BREAKPOINT_CONDITION_NONE 0x0 | ||
62 | #define PPC_BREAKPOINT_CONDITION_AND 0x1 | ||
63 | #define PPC_BREAKPOINT_CONDITION_EXACT 0x1 /* different name for the same thing as above */ | ||
64 | #define PPC_BREAKPOINT_CONDITION_OR 0x2 | ||
65 | #define PPC_BREAKPOINT_CONDITION_AND_OR 0x3 | ||
66 | #define PPC_BREAKPOINT_CONDITION_BE_ALL 0x00ff0000 /* byte enable bits */ | ||
67 | #define PPC_BREAKPOINT_CONDITION_BE(n) (1<<((n)+16)) | ||
68 | uint32_t condition_mode; /* break/watchpoint condition flags */ | ||
69 | |||
70 | uint64_t addr; | ||
71 | uint64_t addr2; | ||
72 | uint64_t condition_value; | ||
73 | }; | ||
74 | |||
75 | A request specifies one event, not necessarily just one register to be set. | ||
76 | For instance, if the request is for a watchpoint with a condition, both the | ||
77 | DAC and DVC registers will be set in the same request. | ||
78 | |||
79 | With this GDB can ask for all kinds of hardware breakpoints and watchpoints | ||
80 | that the BookE supports. COMEFROM breakpoints available in server processors | ||
81 | are not contemplated, but that is out of the scope of this work. | ||
82 | |||
83 | ptrace will return an integer (handle) uniquely identifying the breakpoint or | ||
84 | watchpoint just created. This integer will be used in the PTRACE_DELHWDEBUG | ||
85 | request to ask for its removal. Return -ENOSPC if the requested breakpoint | ||
86 | can't be allocated on the registers. | ||
87 | |||
88 | Some examples of using the structure to: | ||
89 | |||
90 | - set a breakpoint in the first breakpoint register | ||
91 | |||
92 | p.version = PPC_DEBUG_CURRENT_VERSION; | ||
93 | p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE; | ||
94 | p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; | ||
95 | p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; | ||
96 | p.addr = (uint64_t) address; | ||
97 | p.addr2 = 0; | ||
98 | p.condition_value = 0; | ||
99 | |||
100 | - set a watchpoint which triggers on reads in the second watchpoint register | ||
101 | |||
102 | p.version = PPC_DEBUG_CURRENT_VERSION; | ||
103 | p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ; | ||
104 | p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; | ||
105 | p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; | ||
106 | p.addr = (uint64_t) address; | ||
107 | p.addr2 = 0; | ||
108 | p.condition_value = 0; | ||
109 | |||
110 | - set a watchpoint which triggers only with a specific value | ||
111 | |||
112 | p.version = PPC_DEBUG_CURRENT_VERSION; | ||
113 | p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ; | ||
114 | p.addr_mode = PPC_BREAKPOINT_MODE_EXACT; | ||
115 | p.condition_mode = PPC_BREAKPOINT_CONDITION_AND | PPC_BREAKPOINT_CONDITION_BE_ALL; | ||
116 | p.addr = (uint64_t) address; | ||
117 | p.addr2 = 0; | ||
118 | p.condition_value = (uint64_t) condition; | ||
119 | |||
120 | - set a ranged hardware breakpoint | ||
121 | |||
122 | p.version = PPC_DEBUG_CURRENT_VERSION; | ||
123 | p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE; | ||
124 | p.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE; | ||
125 | p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; | ||
126 | p.addr = (uint64_t) begin_range; | ||
127 | p.addr2 = (uint64_t) end_range; | ||
128 | p.condition_value = 0; | ||
129 | |||
130 | 3. PTRACE_DELHWDEBUG | ||
131 | |||
132 | Takes an integer which identifies an existing breakpoint or watchpoint | ||
133 | (i.e., the value returned from PTRACE_SETHWDEBUG), and deletes the | ||
134 | corresponding breakpoint or watchpoint.. | ||
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index aae8355d3166..221f38be98f4 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt | |||
@@ -190,3 +190,61 @@ Example: | |||
190 | for (node = rb_first(&mytree); node; node = rb_next(node)) | 190 | for (node = rb_first(&mytree); node; node = rb_next(node)) |
191 | printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); | 191 | printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); |
192 | 192 | ||
193 | Support for Augmented rbtrees | ||
194 | ----------------------------- | ||
195 | |||
196 | Augmented rbtree is an rbtree with "some" additional data stored in each node. | ||
197 | This data can be used to augment some new functionality to rbtree. | ||
198 | Augmented rbtree is an optional feature built on top of basic rbtree | ||
199 | infrastructure. rbtree user who wants this feature will have an augment | ||
200 | callback function in rb_root initialized. | ||
201 | |||
202 | This callback function will be called from rbtree core routines whenever | ||
203 | a node has a change in one or both of its children. It is the responsibility | ||
204 | of the callback function to recalculate the additional data that is in the | ||
205 | rb node using new children information. Note that if this new additional | ||
206 | data affects the parent node's additional data, then callback function has | ||
207 | to handle it and do the recursive updates. | ||
208 | |||
209 | |||
210 | Interval tree is an example of augmented rb tree. Reference - | ||
211 | "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. | ||
212 | More details about interval trees: | ||
213 | |||
214 | Classical rbtree has a single key and it cannot be directly used to store | ||
215 | interval ranges like [lo:hi] and do a quick lookup for any overlap with a new | ||
216 | lo:hi or to find whether there is an exact match for a new lo:hi. | ||
217 | |||
218 | However, rbtree can be augmented to store such interval ranges in a structured | ||
219 | way making it possible to do efficient lookup and exact match. | ||
220 | |||
221 | This "extra information" stored in each node is the maximum hi | ||
222 | (max_hi) value among all the nodes that are its descendents. This | ||
223 | information can be maintained at each node just be looking at the node | ||
224 | and its immediate children. And this will be used in O(log n) lookup | ||
225 | for lowest match (lowest start address among all possible matches) | ||
226 | with something like: | ||
227 | |||
228 | find_lowest_match(lo, hi, node) | ||
229 | { | ||
230 | lowest_match = NULL; | ||
231 | while (node) { | ||
232 | if (max_hi(node->left) > lo) { | ||
233 | // Lowest overlap if any must be on left side | ||
234 | node = node->left; | ||
235 | } else if (overlap(lo, hi, node)) { | ||
236 | lowest_match = node; | ||
237 | break; | ||
238 | } else if (lo > node->lo) { | ||
239 | // Lowest overlap if any must be on right side | ||
240 | node = node->right; | ||
241 | } else { | ||
242 | break; | ||
243 | } | ||
244 | } | ||
245 | return lowest_match; | ||
246 | } | ||
247 | |||
248 | Finding exact match will be to first find lowest match and then to follow | ||
249 | successor nodes looking for exact match, until the start of a node is beyond | ||
250 | the hi value we are looking for. | ||
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt index b4860509c319..83668e5dd17f 100644 --- a/Documentation/rfkill.txt +++ b/Documentation/rfkill.txt | |||
@@ -99,37 +99,15 @@ system. Also, it is possible to switch all rfkill drivers (or all drivers of | |||
99 | a specified type) into a state which also updates the default state for | 99 | a specified type) into a state which also updates the default state for |
100 | hotplugged devices. | 100 | hotplugged devices. |
101 | 101 | ||
102 | After an application opens /dev/rfkill, it can read the current state of | 102 | After an application opens /dev/rfkill, it can read the current state of all |
103 | all devices, and afterwards can poll the descriptor for hotplug or state | 103 | devices. Changes can be either obtained by either polling the descriptor for |
104 | change events. | 104 | hotplug or state change events or by listening for uevents emitted by the |
105 | 105 | rfkill core framework. | |
106 | Applications must ignore operations (the "op" field) they do not handle, | 106 | |
107 | this allows the API to be extended in the future. | 107 | Additionally, each rfkill device is registered in sysfs and emits uevents. |
108 | 108 | ||
109 | Additionally, each rfkill device is registered in sysfs and there has the | 109 | rfkill devices issue uevents (with an action of "change"), with the following |
110 | following attributes: | 110 | environment variables set: |
111 | |||
112 | name: Name assigned by driver to this key (interface or driver name). | ||
113 | type: Driver type string ("wlan", "bluetooth", etc). | ||
114 | persistent: Whether the soft blocked state is initialised from | ||
115 | non-volatile storage at startup. | ||
116 | state: Current state of the transmitter | ||
117 | 0: RFKILL_STATE_SOFT_BLOCKED | ||
118 | transmitter is turned off by software | ||
119 | 1: RFKILL_STATE_UNBLOCKED | ||
120 | transmitter is (potentially) active | ||
121 | 2: RFKILL_STATE_HARD_BLOCKED | ||
122 | transmitter is forced off by something outside of | ||
123 | the driver's control. | ||
124 | This file is deprecated because it can only properly show | ||
125 | three of the four possible states, soft-and-hard-blocked is | ||
126 | missing. | ||
127 | claim: 0: Kernel handles events | ||
128 | This file is deprecated because there no longer is a way to | ||
129 | claim just control over a single rfkill instance. | ||
130 | |||
131 | rfkill devices also issue uevents (with an action of "change"), with the | ||
132 | following environment variables set: | ||
133 | 111 | ||
134 | RFKILL_NAME | 112 | RFKILL_NAME |
135 | RFKILL_STATE | 113 | RFKILL_STATE |
@@ -137,3 +115,7 @@ RFKILL_TYPE | |||
137 | 115 | ||
138 | The contents of these variables corresponds to the "name", "state" and | 116 | The contents of these variables corresponds to the "name", "state" and |
139 | "type" sysfs files explained above. | 117 | "type" sysfs files explained above. |
118 | |||
119 | |||
120 | For further details consult Documentation/ABI/stable/dev-rfkill and | ||
121 | Documentation/ABI/stable/sysfs-class-rfkill. | ||
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt index 4b736d24da7a..8df0b782c4d7 100644 --- a/Documentation/rt-mutex-design.txt +++ b/Documentation/rt-mutex-design.txt | |||
@@ -657,7 +657,7 @@ here. | |||
657 | 657 | ||
658 | The waiter structure has a "task" field that points to the task that is blocked | 658 | The waiter structure has a "task" field that points to the task that is blocked |
659 | on the mutex. This field can be NULL the first time it goes through the loop | 659 | on the mutex. This field can be NULL the first time it goes through the loop |
660 | or if the task is a pending owner and had it's mutex stolen. If the "task" | 660 | or if the task is a pending owner and had its mutex stolen. If the "task" |
661 | field is NULL then we need to set up the accounting for it. | 661 | field is NULL then we need to set up the accounting for it. |
662 | 662 | ||
663 | Task blocks on mutex | 663 | Task blocks on mutex |
diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO index 339207d11d95..d378cba66456 100644 --- a/Documentation/s390/CommonIO +++ b/Documentation/s390/CommonIO | |||
@@ -87,6 +87,12 @@ Command line parameters | |||
87 | compatibility, by the device number in hexadecimal (0xabcd or abcd). Device | 87 | compatibility, by the device number in hexadecimal (0xabcd or abcd). Device |
88 | numbers given as 0xabcd will be interpreted as 0.0.abcd. | 88 | numbers given as 0xabcd will be interpreted as 0.0.abcd. |
89 | 89 | ||
90 | * /proc/cio_settle | ||
91 | |||
92 | A write request to this file is blocked until all queued cio actions are | ||
93 | handled. This will allow userspace to wait for pending work affecting | ||
94 | device availability after changing cio_ignore or the hardware configuration. | ||
95 | |||
90 | * For some of the information present in the /proc filesystem in 2.4 (namely, | 96 | * For some of the information present in the /proc filesystem in 2.4 (namely, |
91 | /proc/subchannels and /proc/chpids), see driver-model.txt. | 97 | /proc/subchannels and /proc/chpids), see driver-model.txt. |
92 | Information formerly in /proc/irq_count is now in /proc/interrupts. | 98 | Information formerly in /proc/irq_count is now in /proc/interrupts. |
diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt index bde473df748d..ed265cf54cde 100644 --- a/Documentation/s390/driver-model.txt +++ b/Documentation/s390/driver-model.txt | |||
@@ -223,8 +223,8 @@ touched by the driver - it should use the ccwgroup device's driver_data for its | |||
223 | private data. | 223 | private data. |
224 | 224 | ||
225 | To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in | 225 | To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in |
226 | mind that most drivers will need to implement both a ccwgroup and a ccw driver | 226 | mind that most drivers will need to implement both a ccwgroup and a ccw |
227 | (unless you have a meta ccw driver, like cu3088 for lcs and ctc). | 227 | driver. |
228 | 228 | ||
229 | 229 | ||
230 | 2. Channel paths | 230 | 2. Channel paths |
diff --git a/Documentation/s390/kvm.txt b/Documentation/s390/kvm.txt index 6f5ceb0f09fc..85f3280d7ef6 100644 --- a/Documentation/s390/kvm.txt +++ b/Documentation/s390/kvm.txt | |||
@@ -102,7 +102,7 @@ args: unsigned long | |||
102 | see also: include/linux/kvm.h | 102 | see also: include/linux/kvm.h |
103 | This ioctl stores the state of the cpu at the guest real address given as | 103 | This ioctl stores the state of the cpu at the guest real address given as |
104 | argument, unless one of the following values defined in include/linux/kvm.h | 104 | argument, unless one of the following values defined in include/linux/kvm.h |
105 | is given as arguement: | 105 | is given as argument: |
106 | KVM_S390_STORE_STATUS_NOADDR - the CPU stores its status to the save area in | 106 | KVM_S390_STORE_STATUS_NOADDR - the CPU stores its status to the save area in |
107 | absolute lowcore as defined by the principles of operation | 107 | absolute lowcore as defined by the principles of operation |
108 | KVM_S390_STORE_STATUS_PREFIXED - the CPU stores its status to the save area in | 108 | KVM_S390_STORE_STATUS_PREFIXED - the CPU stores its status to the save area in |
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 6f33593e59e2..8239ebbcddce 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt | |||
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be | |||
211 | desirable to first provide fair CPU time to each user on the system and then to | 211 | desirable to first provide fair CPU time to each user on the system and then to |
212 | each task belonging to a user. | 212 | each task belonging to a user. |
213 | 213 | ||
214 | CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be | 214 | CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be |
215 | grouped and divides CPU time fairly among such groups. | 215 | grouped and divides CPU time fairly among such groups. |
216 | 216 | ||
217 | CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and | 217 | CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and |
@@ -220,38 +220,11 @@ SCHED_RR) tasks. | |||
220 | CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and | 220 | CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and |
221 | SCHED_BATCH) tasks. | 221 | SCHED_BATCH) tasks. |
222 | 222 | ||
223 | At present, there are two (mutually exclusive) mechanisms to group tasks for | 223 | These options need CONFIG_CGROUPS to be defined, and let the administrator |
224 | CPU bandwidth control purposes: | ||
225 | |||
226 | - Based on user id (CONFIG_USER_SCHED) | ||
227 | |||
228 | With this option, tasks are grouped according to their user id. | ||
229 | |||
230 | - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED) | ||
231 | |||
232 | This options needs CONFIG_CGROUPS to be defined, and lets the administrator | ||
233 | create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See | 224 | create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See |
234 | Documentation/cgroups/cgroups.txt for more information about this filesystem. | 225 | Documentation/cgroups/cgroups.txt for more information about this filesystem. |
235 | 226 | ||
236 | Only one of these options to group tasks can be chosen and not both. | 227 | When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each |
237 | |||
238 | When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new | ||
239 | user and a "cpu_share" file is added in that directory. | ||
240 | |||
241 | # cd /sys/kernel/uids | ||
242 | # cat 512/cpu_share # Display user 512's CPU share | ||
243 | 1024 | ||
244 | # echo 2048 > 512/cpu_share # Modify user 512's CPU share | ||
245 | # cat 512/cpu_share # Display user 512's CPU share | ||
246 | 2048 | ||
247 | # | ||
248 | |||
249 | CPU bandwidth between two users is divided in the ratio of their CPU shares. | ||
250 | For example: if you would like user "root" to get twice the bandwidth of user | ||
251 | "guest," then set the cpu_share for both the users such that "root"'s cpu_share | ||
252 | is twice "guest"'s cpu_share. | ||
253 | |||
254 | When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each | ||
255 | group created using the pseudo filesystem. See example steps below to create | 228 | group created using the pseudo filesystem. See example steps below to create |
256 | task groups and modify their CPU share using the "cgroups" pseudo filesystem. | 229 | task groups and modify their CPU share using the "cgroups" pseudo filesystem. |
257 | 230 | ||
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem. | |||
273 | 246 | ||
274 | # #Launch gmplayer (or your favourite movie player) | 247 | # #Launch gmplayer (or your favourite movie player) |
275 | # echo <movie_player_pid> > multimedia/tasks | 248 | # echo <movie_player_pid> > multimedia/tasks |
276 | |||
277 | 8. Implementation note: user namespaces | ||
278 | |||
279 | User namespaces are intended to be hierarchical. But they are currently | ||
280 | only partially implemented. Each of those has ramifications for CFS. | ||
281 | |||
282 | First, since user namespaces are hierarchical, the /sys/kernel/uids | ||
283 | presentation is inadequate. Eventually we will likely want to use sysfs | ||
284 | tagging to provide private views of /sys/kernel/uids within each user | ||
285 | namespace. | ||
286 | |||
287 | Second, the hierarchical nature is intended to support completely | ||
288 | unprivileged use of user namespaces. So if using user groups, then | ||
289 | we want the users in a user namespace to be children of the user | ||
290 | who created it. | ||
291 | |||
292 | That is currently unimplemented. So instead, every user in a new | ||
293 | user namespace will receive 1024 shares just like any user in the | ||
294 | initial user namespace. Note that at the moment creation of a new | ||
295 | user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and | ||
296 | CAP_SETGID. | ||
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 86eabe6c3419..605b0d40329d 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
@@ -126,23 +126,12 @@ priority! | |||
126 | 2.3 Basis for grouping tasks | 126 | 2.3 Basis for grouping tasks |
127 | ---------------------------- | 127 | ---------------------------- |
128 | 128 | ||
129 | There are two compile-time settings for allocating CPU bandwidth. These are | 129 | Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real |
130 | configured using the "Basis for grouping tasks" multiple choice menu under | 130 | CPU bandwidth to task groups. |
131 | General setup > Group CPU Scheduler: | ||
132 | |||
133 | a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id") | ||
134 | |||
135 | This lets you use the virtual files under | ||
136 | "/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for | ||
137 | each user . | ||
138 | |||
139 | The other option is: | ||
140 | |||
141 | .o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups") | ||
142 | 131 | ||
143 | This uses the /cgroup virtual file system and | 132 | This uses the /cgroup virtual file system and |
144 | "/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each | 133 | "/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each |
145 | control group instead. | 134 | control group. |
146 | 135 | ||
147 | For more information on working with control groups, you should read | 136 | For more information on working with control groups, you should read |
148 | Documentation/cgroups/cgroups.txt as well. | 137 | Documentation/cgroups/cgroups.txt as well. |
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans): | |||
161 | =============== | 150 | =============== |
162 | 151 | ||
163 | There is work in progress to make the scheduling period for each group | 152 | There is work in progress to make the scheduling period for each group |
164 | ("/sys/kernel/uids/<uid>/cpu_rt_period_us" or | 153 | ("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well. |
165 | "/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well. | ||
166 | 154 | ||
167 | The constraint on the period is that a subgroup must have a smaller or | 155 | The constraint on the period is that a subgroup must have a smaller or |
168 | equal period to its parent. But realistically its not very useful _yet_ | 156 | equal period to its parent. But realistically its not very useful _yet_ |
diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc index ff19a52fe004..e759e92e286d 100644 --- a/Documentation/scsi/ChangeLog.lpfc +++ b/Documentation/scsi/ChangeLog.lpfc | |||
@@ -707,7 +707,7 @@ Changes from 20040920 to 20041018 | |||
707 | * Integrate patches from Christoph Hellwig: two new helpers common | 707 | * Integrate patches from Christoph Hellwig: two new helpers common |
708 | to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant | 708 | to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant |
709 | cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is | 709 | cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is |
710 | gone - lpfc_sli_issue_iocb_wait loses it's flags argument | 710 | gone - lpfc_sli_issue_iocb_wait loses its flags argument |
711 | totally. | 711 | totally. |
712 | * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit | 712 | * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit |
713 | field. | 713 | field. |
@@ -989,8 +989,8 @@ Changes from 20040709 to 20040716 | |||
989 | * Remove redundant port_cmp != 2 check in if | 989 | * Remove redundant port_cmp != 2 check in if |
990 | (!port_cmp) { .... if (port_cmp != 2).... } | 990 | (!port_cmp) { .... if (port_cmp != 2).... } |
991 | * Clock changes: removed struct clk_data and timerList. | 991 | * Clock changes: removed struct clk_data and timerList. |
992 | * Clock changes: seperate nodev_tmo and els_retry_delay into 2 | 992 | * Clock changes: separate nodev_tmo and els_retry_delay into 2 |
993 | seperate timers and convert to 1 argument changed | 993 | separate timers and convert to 1 argument changed |
994 | LPFC_NODE_FARP_PEND_t to struct lpfc_node_farp_pend convert | 994 | LPFC_NODE_FARP_PEND_t to struct lpfc_node_farp_pend convert |
995 | ipfarp_tmo to 1 argument convert target struct tmofunc and | 995 | ipfarp_tmo to 1 argument convert target struct tmofunc and |
996 | rtplunfunc to 1 argument * cr_count, cr_delay and | 996 | rtplunfunc to 1 argument * cr_count, cr_delay and |
@@ -1028,7 +1028,7 @@ Changes from 20040614 to 20040709 | |||
1028 | * Remove the need for buf_tmo. | 1028 | * Remove the need for buf_tmo. |
1029 | * Changed ULP_BDE64 to struct ulp_bde64. | 1029 | * Changed ULP_BDE64 to struct ulp_bde64. |
1030 | * Changed ULP_BDE to struct ulp_bde. | 1030 | * Changed ULP_BDE to struct ulp_bde. |
1031 | * Cleanup lpfc_os_return_scsi_cmd() and it's call path. | 1031 | * Cleanup lpfc_os_return_scsi_cmd() and its call path. |
1032 | * Removed lpfc_no_device_delay. | 1032 | * Removed lpfc_no_device_delay. |
1033 | * Consolidating lpfc_hba_put_event() into lpfc_put_event(). | 1033 | * Consolidating lpfc_hba_put_event() into lpfc_put_event(). |
1034 | * Removed following attributes and their functionality: | 1034 | * Removed following attributes and their functionality: |
@@ -1514,7 +1514,7 @@ Changes from 20040402 to 20040409 | |||
1514 | * Remove unused elxclock declaration in elx_sli.h. | 1514 | * Remove unused elxclock declaration in elx_sli.h. |
1515 | * Since everywhere IOCB_ENTRY is used, the return value is cast, | 1515 | * Since everywhere IOCB_ENTRY is used, the return value is cast, |
1516 | move the cast into the macro. | 1516 | move the cast into the macro. |
1517 | * Split ioctls out into seperate files | 1517 | * Split ioctls out into separate files |
1518 | 1518 | ||
1519 | Changes from 20040326 to 20040402 | 1519 | Changes from 20040326 to 20040402 |
1520 | 1520 | ||
@@ -1534,7 +1534,7 @@ Changes from 20040326 to 20040402 | |||
1534 | * Unused variable cleanup | 1534 | * Unused variable cleanup |
1535 | * Use Linux list macros for DMABUF_t | 1535 | * Use Linux list macros for DMABUF_t |
1536 | * Break up ioctls into 3 sections, dfc, util, hbaapi | 1536 | * Break up ioctls into 3 sections, dfc, util, hbaapi |
1537 | rearranged code so this could be easily seperated into a | 1537 | rearranged code so this could be easily separated into a |
1538 | differnet module later All 3 are currently turned on by | 1538 | differnet module later All 3 are currently turned on by |
1539 | defines in lpfc_ioctl.c LPFC_DFC_IOCTL, LPFC_UTIL_IOCTL, | 1539 | defines in lpfc_ioctl.c LPFC_DFC_IOCTL, LPFC_UTIL_IOCTL, |
1540 | LPFC_HBAAPI_IOCTL | 1540 | LPFC_HBAAPI_IOCTL |
@@ -1551,7 +1551,7 @@ Changes from 20040326 to 20040402 | |||
1551 | started by lpfc_online(). lpfc_offline() only stopped | 1551 | started by lpfc_online(). lpfc_offline() only stopped |
1552 | els_timeout routine. It now stops all timeout routines | 1552 | els_timeout routine. It now stops all timeout routines |
1553 | associated with that hba. | 1553 | associated with that hba. |
1554 | * Replace seperate next and prev pointers in struct | 1554 | * Replace separate next and prev pointers in struct |
1555 | lpfc_bindlist with list_head type. In elxHBA_t, replace | 1555 | lpfc_bindlist with list_head type. In elxHBA_t, replace |
1556 | fc_nlpbind_start and _end with fc_nlpbind_list and use | 1556 | fc_nlpbind_start and _end with fc_nlpbind_list and use |
1557 | list_head macros to access it. | 1557 | list_head macros to access it. |
diff --git a/Documentation/scsi/ChangeLog.megaraid_sas b/Documentation/scsi/ChangeLog.megaraid_sas index 17ffa0607712..30023568805e 100644 --- a/Documentation/scsi/ChangeLog.megaraid_sas +++ b/Documentation/scsi/ChangeLog.megaraid_sas | |||
@@ -1,3 +1,19 @@ | |||
1 | 1 Release Date : Thur. Oct 29, 2009 09:12:45 PST 2009 - | ||
2 | (emaild-id:megaraidlinux@lsi.com) | ||
3 | Bo Yang | ||
4 | |||
5 | 2 Current Version : 00.00.04.17.1-rc1 | ||
6 | 3 Older Version : 00.00.04.12 | ||
7 | |||
8 | 1. Add the pad_0 in mfi frame structure to 0 to fix the | ||
9 | context value larger than 32bit value issue. | ||
10 | |||
11 | 2. Add the logic drive list to the driver. Driver will | ||
12 | keep the logic drive list internal after driver load. | ||
13 | |||
14 | 3. driver fixed the device update issue after get the AEN | ||
15 | PD delete/ADD, LD add/delete from FW. | ||
16 | |||
1 | 1 Release Date : Tues. July 28, 2009 10:12:45 PST 2009 - | 17 | 1 Release Date : Tues. July 28, 2009 10:12:45 PST 2009 - |
2 | (emaild-id:megaraidlinux@lsi.com) | 18 | (emaild-id:megaraidlinux@lsi.com) |
3 | Bo Yang | 19 | Bo Yang |
diff --git a/Documentation/scsi/FlashPoint.txt b/Documentation/scsi/FlashPoint.txt index d5acaa300a46..1540a92f6d2b 100644 --- a/Documentation/scsi/FlashPoint.txt +++ b/Documentation/scsi/FlashPoint.txt | |||
@@ -71,7 +71,7 @@ peters@mylex.com | |||
71 | 71 | ||
72 | Ever since its introduction last October, the BusLogic FlashPoint LT has | 72 | Ever since its introduction last October, the BusLogic FlashPoint LT has |
73 | been problematic for members of the Linux community, in that no Linux | 73 | been problematic for members of the Linux community, in that no Linux |
74 | drivers have been available for this new Ultra SCSI product. Despite it's | 74 | drivers have been available for this new Ultra SCSI product. Despite its |
75 | officially being positioned as a desktop workstation product, and not being | 75 | officially being positioned as a desktop workstation product, and not being |
76 | particularly well suited for a high performance multitasking operating | 76 | particularly well suited for a high performance multitasking operating |
77 | system like Linux, the FlashPoint LT has been touted by computer system | 77 | system like Linux, the FlashPoint LT has been touted by computer system |
diff --git a/Documentation/scsi/dtc3x80.txt b/Documentation/scsi/dtc3x80.txt index e8ae6230ab3e..1d7af9f9a8ed 100644 --- a/Documentation/scsi/dtc3x80.txt +++ b/Documentation/scsi/dtc3x80.txt | |||
@@ -12,7 +12,7 @@ The 3180 does not. Otherwise, they are identical. | |||
12 | The DTC3x80 does not support DMA but it does have Pseudo-DMA which is | 12 | The DTC3x80 does not support DMA but it does have Pseudo-DMA which is |
13 | supported by the driver. | 13 | supported by the driver. |
14 | 14 | ||
15 | It's DTC406 scsi chip is supposedly compatible with the NCR 53C400. | 15 | Its DTC406 scsi chip is supposedly compatible with the NCR 53C400. |
16 | It is memory mapped, uses an IRQ, but no dma or io-port. There is | 16 | It is memory mapped, uses an IRQ, but no dma or io-port. There is |
17 | internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double | 17 | internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double |
18 | buffering is done automagically by the chip. Data is transferred | 18 | buffering is done automagically by the chip. Data is transferred |
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt index 08e2b4d04aab..cda5f8fa2c66 100644 --- a/Documentation/scsi/ncr53c8xx.txt +++ b/Documentation/scsi/ncr53c8xx.txt | |||
@@ -1479,7 +1479,7 @@ Wide16 SCSI. | |||
1479 | Enabling serial NVRAM support enables detection of the serial NVRAM included | 1479 | Enabling serial NVRAM support enables detection of the serial NVRAM included |
1480 | on Symbios and some Symbios compatible host adaptors, and Tekram boards. The | 1480 | on Symbios and some Symbios compatible host adaptors, and Tekram boards. The |
1481 | serial NVRAM is used by Symbios and Tekram to hold set up parameters for the | 1481 | serial NVRAM is used by Symbios and Tekram to hold set up parameters for the |
1482 | host adaptor and it's attached drives. | 1482 | host adaptor and its attached drives. |
1483 | 1483 | ||
1484 | The Symbios NVRAM also holds data on the boot order of host adaptors in a | 1484 | The Symbios NVRAM also holds data on the boot order of host adaptors in a |
1485 | system with more than one host adaptor. This enables the order of scanning | 1485 | system with more than one host adaptor. This enables the order of scanning |
diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt index f536907e241d..2b21890bc983 100644 --- a/Documentation/scsi/osst.txt +++ b/Documentation/scsi/osst.txt | |||
@@ -40,7 +40,7 @@ behavior looks very much the same as st to the userspace applications. | |||
40 | 40 | ||
41 | History | 41 | History |
42 | ------- | 42 | ------- |
43 | In the first place, osst shared it's identity very much with st. That meant | 43 | In the first place, osst shared its identity very much with st. That meant |
44 | that it used the same kernel structures and the same device node as st. | 44 | that it used the same kernel structures and the same device node as st. |
45 | So you could only have either of them being present in the kernel. This has | 45 | So you could only have either of them being present in the kernel. This has |
46 | been fixed by registering an own device, now. | 46 | been fixed by registering an own device, now. |
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt index aec6549ab097..e00192de4d1c 100644 --- a/Documentation/scsi/scsi_fc_transport.txt +++ b/Documentation/scsi/scsi_fc_transport.txt | |||
@@ -70,7 +70,7 @@ Overview: | |||
70 | up to an administrative entity controlling the vport. For example, | 70 | up to an administrative entity controlling the vport. For example, |
71 | if vports are to be associated with virtual machines, a XEN mgmt | 71 | if vports are to be associated with virtual machines, a XEN mgmt |
72 | utility would be responsible for creating wwpn/wwnn's for the vport, | 72 | utility would be responsible for creating wwpn/wwnn's for the vport, |
73 | using it's own naming authority and OUI. (Note: it already does this | 73 | using its own naming authority and OUI. (Note: it already does this |
74 | for virtual MAC addresses). | 74 | for virtual MAC addresses). |
75 | 75 | ||
76 | 76 | ||
@@ -81,7 +81,7 @@ Device Trees and Vport Objects: | |||
81 | with rports and scsi target objects underneath it. Currently the FC | 81 | with rports and scsi target objects underneath it. Currently the FC |
82 | transport creates the vport object and places it under the scsi_host | 82 | transport creates the vport object and places it under the scsi_host |
83 | object corresponding to the physical adapter. The LLDD will allocate | 83 | object corresponding to the physical adapter. The LLDD will allocate |
84 | a new scsi_host for the vport and link it's object under the vport. | 84 | a new scsi_host for the vport and link its object under the vport. |
85 | The remainder of the tree under the vports scsi_host is the same | 85 | The remainder of the tree under the vports scsi_host is the same |
86 | as the non-NPIV case. The transport is written currently to easily | 86 | as the non-NPIV case. The transport is written currently to easily |
87 | allow the parent of the vport to be something other than the scsi_host. | 87 | allow the parent of the vport to be something other than the scsi_host. |
diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt index eb9a7b905b64..6f63b7989679 100644 --- a/Documentation/scsi/sym53c8xx_2.txt +++ b/Documentation/scsi/sym53c8xx_2.txt | |||
@@ -687,7 +687,7 @@ maintain the driver code. | |||
687 | Enabling serial NVRAM support enables detection of the serial NVRAM included | 687 | Enabling serial NVRAM support enables detection of the serial NVRAM included |
688 | on Symbios and some Symbios compatible host adaptors, and Tekram boards. The | 688 | on Symbios and some Symbios compatible host adaptors, and Tekram boards. The |
689 | serial NVRAM is used by Symbios and Tekram to hold set up parameters for the | 689 | serial NVRAM is used by Symbios and Tekram to hold set up parameters for the |
690 | host adaptor and it's attached drives. | 690 | host adaptor and its attached drives. |
691 | 691 | ||
692 | The Symbios NVRAM also holds data on the boot order of host adaptors in a | 692 | The Symbios NVRAM also holds data on the boot order of host adaptors in a |
693 | system with more than one host adaptor. This information is no longer used | 693 | system with more than one host adaptor. This information is no longer used |
diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt index 5e5349a4fcd2..7c900507279f 100644 --- a/Documentation/serial/tty.txt +++ b/Documentation/serial/tty.txt | |||
@@ -105,6 +105,10 @@ write_wakeup() - May be called at any point between open and close. | |||
105 | is permitted to call the driver write method from | 105 | is permitted to call the driver write method from |
106 | this function. In such a situation defer it. | 106 | this function. In such a situation defer it. |
107 | 107 | ||
108 | dcd_change() - Report to the tty line the current DCD pin status | ||
109 | changes and the relative timestamp. The timestamp | ||
110 | can be NULL. | ||
111 | |||
108 | 112 | ||
109 | Driver Access | 113 | Driver Access |
110 | 114 | ||
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 8923597bd2bd..2075bbb8b3e2 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt | |||
@@ -227,6 +227,16 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
227 | 227 | ||
228 | The power-management is supported. | 228 | The power-management is supported. |
229 | 229 | ||
230 | Module snd-asihpi | ||
231 | ----------------- | ||
232 | |||
233 | Module for AudioScience ASI soundcards | ||
234 | |||
235 | enable_hpi_hwdep - enable HPI hwdep for AudioScience soundcard | ||
236 | |||
237 | This module supports multiple cards. | ||
238 | The driver requires the firmware loader support on kernel. | ||
239 | |||
230 | Module snd-atiixp | 240 | Module snd-atiixp |
231 | ----------------- | 241 | ----------------- |
232 | 242 | ||
@@ -482,6 +492,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
482 | 492 | ||
483 | reference_rate - reference sample rate, 44100 or 48000 (default) | 493 | reference_rate - reference sample rate, 44100 or 48000 (default) |
484 | multiple - multiple to ref. sample rate, 1 or 2 (default) | 494 | multiple - multiple to ref. sample rate, 1 or 2 (default) |
495 | subsystem - override the PCI SSID for probing; the value | ||
496 | consists of SSVID << 16 | SSDID. The default is | ||
497 | zero, which means no override. | ||
485 | 498 | ||
486 | This module supports multiple cards. | 499 | This module supports multiple cards. |
487 | 500 | ||
@@ -619,28 +632,23 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
619 | 632 | ||
620 | The power-management is supported. | 633 | The power-management is supported. |
621 | 634 | ||
622 | Module snd-es968 | ||
623 | ---------------- | ||
624 | |||
625 | Module for sound cards based on ESS ES968 chip (PnP only). | ||
626 | |||
627 | This module supports multiple cards, PnP and autoprobe. | ||
628 | |||
629 | The power-management is supported. | ||
630 | |||
631 | Module snd-es1688 | 635 | Module snd-es1688 |
632 | ----------------- | 636 | ----------------- |
633 | 637 | ||
634 | Module for ESS AudioDrive ES-1688 and ES-688 sound cards. | 638 | Module for ESS AudioDrive ES-1688 and ES-688 sound cards. |
635 | 639 | ||
636 | port - port # for ES-1688 chip (0x220,0x240,0x260) | 640 | isapnp - ISA PnP detection - 0 = disable, 1 = enable (default) |
637 | fm_port - port # for OPL3 (option; share the same port as default) | ||
638 | mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default) | 641 | mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default) |
639 | irq - IRQ # for ES-1688 chip (5,7,9,10) | ||
640 | mpu_irq - IRQ # for MPU-401 port (5,7,9,10) | 642 | mpu_irq - IRQ # for MPU-401 port (5,7,9,10) |
643 | fm_port - port # for OPL3 (option; share the same port as default) | ||
644 | |||
645 | with isapnp=0, the following additional options are available: | ||
646 | port - port # for ES-1688 chip (0x220,0x240,0x260) | ||
647 | irq - IRQ # for ES-1688 chip (5,7,9,10) | ||
641 | dma8 - DMA # for ES-1688 chip (0,1,3) | 648 | dma8 - DMA # for ES-1688 chip (0,1,3) |
642 | 649 | ||
643 | This module supports multiple cards and autoprobe (without MPU-401 port). | 650 | This module supports multiple cards and autoprobe (without MPU-401 port) |
651 | and PnP with the ES968 chip. | ||
644 | 652 | ||
645 | Module snd-es18xx | 653 | Module snd-es18xx |
646 | ----------------- | 654 | ----------------- |
@@ -1123,6 +1131,21 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1123 | 1131 | ||
1124 | This module supports multiple cards, autoprobe and ISA PnP. | 1132 | This module supports multiple cards, autoprobe and ISA PnP. |
1125 | 1133 | ||
1134 | Module snd-jazz16 | ||
1135 | ------------------- | ||
1136 | |||
1137 | Module for Media Vision Jazz16 chipset. The chipset consists of 3 chips: | ||
1138 | MVD1216 + MVA416 + MVA514. | ||
1139 | |||
1140 | port - port # for SB DSP chip (0x210,0x220,0x230,0x240,0x250,0x260) | ||
1141 | irq - IRQ # for SB DSP chip (3,5,7,9,10,15) | ||
1142 | dma8 - DMA # for SB DSP chip (1,3) | ||
1143 | dma16 - DMA # for SB DSP chip (5,7) | ||
1144 | mpu_port - MPU-401 port # (0x300,0x310,0x320,0x330) | ||
1145 | mpu_irq - MPU-401 irq # (2,3,5,7) | ||
1146 | |||
1147 | This module supports multiple cards. | ||
1148 | |||
1126 | Module snd-korg1212 | 1149 | Module snd-korg1212 |
1127 | ------------------- | 1150 | ------------------- |
1128 | 1151 | ||
@@ -1791,6 +1814,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1791 | 1814 | ||
1792 | The power-management is supported. | 1815 | The power-management is supported. |
1793 | 1816 | ||
1817 | Module snd-ua101 | ||
1818 | ---------------- | ||
1819 | |||
1820 | Module for the Edirol UA-101/UA-1000 audio/MIDI interfaces. | ||
1821 | |||
1822 | This module supports multiple devices, autoprobe and hotplugging. | ||
1823 | |||
1794 | Module snd-usb-audio | 1824 | Module snd-usb-audio |
1795 | -------------------- | 1825 | -------------------- |
1796 | 1826 | ||
@@ -1923,7 +1953,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
1923 | ------------------- | 1953 | ------------------- |
1924 | 1954 | ||
1925 | Module for sound cards based on the Asus AV100/AV200 chips, | 1955 | Module for sound cards based on the Asus AV100/AV200 chips, |
1926 | i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), Essence ST | 1956 | i.e., Xonar D1, DX, D2, D2X, DS, HDAV1.3 (Deluxe), Essence ST |
1927 | (Deluxe) and Essence STX. | 1957 | (Deluxe) and Essence STX. |
1928 | 1958 | ||
1929 | This module supports autoprobe and multiple cards. | 1959 | This module supports autoprobe and multiple cards. |
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index e72cee9e2a71..1d38b0dfba95 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt | |||
@@ -124,6 +124,8 @@ ALC882/883/885/888/889 | |||
124 | asus-a7m ASUS A7M | 124 | asus-a7m ASUS A7M |
125 | macpro MacPro support | 125 | macpro MacPro support |
126 | mb5 Macbook 5,1 | 126 | mb5 Macbook 5,1 |
127 | macmini3 Macmini 3,1 | ||
128 | mba21 Macbook Air 2,1 | ||
127 | mbp3 Macbook Pro rev3 | 129 | mbp3 Macbook Pro rev3 |
128 | imac24 iMac 24'' with jack detection | 130 | imac24 iMac 24'' with jack detection |
129 | imac91 iMac 9,1 | 131 | imac91 iMac 9,1 |
@@ -279,13 +281,16 @@ Conexant 5051 | |||
279 | laptop Basic Laptop config (default) | 281 | laptop Basic Laptop config (default) |
280 | hp HP Spartan laptop | 282 | hp HP Spartan laptop |
281 | hp-dv6736 HP dv6736 | 283 | hp-dv6736 HP dv6736 |
284 | hp-f700 HP Compaq Presario F700 | ||
282 | lenovo-x200 Lenovo X200 laptop | 285 | lenovo-x200 Lenovo X200 laptop |
286 | toshiba Toshiba Satellite M300 | ||
283 | 287 | ||
284 | Conexant 5066 | 288 | Conexant 5066 |
285 | ============= | 289 | ============= |
286 | laptop Basic Laptop config (default) | 290 | laptop Basic Laptop config (default) |
287 | dell-laptop Dell laptops | 291 | dell-laptop Dell laptops |
288 | olpc-xo-1_5 OLPC XO 1.5 | 292 | olpc-xo-1_5 OLPC XO 1.5 |
293 | ideapad Lenovo IdeaPad U150 | ||
289 | 294 | ||
290 | STAC9200 | 295 | STAC9200 |
291 | ======== | 296 | ======== |
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt index 6325bec06a72..bdafdbd32561 100644 --- a/Documentation/sound/alsa/HD-Audio.txt +++ b/Documentation/sound/alsa/HD-Audio.txt | |||
@@ -119,10 +119,18 @@ the codec slots 0 and 1 no matter what the hardware reports. | |||
119 | 119 | ||
120 | Interrupt Handling | 120 | Interrupt Handling |
121 | ~~~~~~~~~~~~~~~~~~ | 121 | ~~~~~~~~~~~~~~~~~~ |
122 | In rare but some cases, the interrupt isn't properly handled as | 122 | HD-audio driver uses MSI as default (if available) since 2.6.33 |
123 | default. You would notice this by the DMA transfer error reported by | 123 | kernel as MSI works better on some machines, and in general, it's |
124 | ALSA PCM core, for example. Using MSI might help in such a case. | 124 | better for performance. However, Nvidia controllers showed bad |
125 | Pass `enable_msi=1` option for enabling MSI. | 125 | regressions with MSI (especially in a combination with AMD chipset), |
126 | thus we disabled MSI for them. | ||
127 | |||
128 | There seem also still other devices that don't work with MSI. If you | ||
129 | see a regression wrt the sound quality (stuttering, etc) or a lock-up | ||
130 | in the recent kernel, try to pass `enable_msi=0` option to disable | ||
131 | MSI. If it works, you can add the known bad device to the blacklist | ||
132 | defined in hda_intel.c. In such a case, please report and give the | ||
133 | patch back to the upstream developer. | ||
126 | 134 | ||
127 | 135 | ||
128 | HD-AUDIO CODEC | 136 | HD-AUDIO CODEC |
@@ -196,7 +204,6 @@ generic parser regardless of the codec. Usually the codec-specific | |||
196 | parser is much better than the generic parser (as now). Thus this | 204 | parser is much better than the generic parser (as now). Thus this |
197 | option is more about the debugging purpose. | 205 | option is more about the debugging purpose. |
198 | 206 | ||
199 | |||
200 | Speaker and Headphone Output | 207 | Speaker and Headphone Output |
201 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 208 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
202 | One of the most frequent (and obvious) bugs with HD-audio is the | 209 | One of the most frequent (and obvious) bugs with HD-audio is the |
@@ -452,6 +459,33 @@ Similarly, the lines after `[verb]` are parsed as `init_verbs` | |||
452 | sysfs entries, and the lines after `[hint]` are parsed as `hints` | 459 | sysfs entries, and the lines after `[hint]` are parsed as `hints` |
453 | sysfs entries, respectively. | 460 | sysfs entries, respectively. |
454 | 461 | ||
462 | Another example to override the codec vendor id from 0x12345678 to | ||
463 | 0xdeadbeef is like below: | ||
464 | ------------------------------------------------------------------------ | ||
465 | [codec] | ||
466 | 0x12345678 0xabcd1234 2 | ||
467 | |||
468 | [vendor_id] | ||
469 | 0xdeadbeef | ||
470 | ------------------------------------------------------------------------ | ||
471 | |||
472 | In the similar way, you can override the codec subsystem_id via | ||
473 | `[subsystem_id]`, the revision id via `[revision_id]` line. | ||
474 | Also, the codec chip name can be rewritten via `[chip_name]` line. | ||
475 | ------------------------------------------------------------------------ | ||
476 | [codec] | ||
477 | 0x12345678 0xabcd1234 2 | ||
478 | |||
479 | [subsystem_id] | ||
480 | 0xffff1111 | ||
481 | |||
482 | [revision_id] | ||
483 | 0x10 | ||
484 | |||
485 | [chip_name] | ||
486 | My-own NEWS-0002 | ||
487 | ------------------------------------------------------------------------ | ||
488 | |||
455 | The hd-audio driver reads the file via request_firmware(). Thus, | 489 | The hd-audio driver reads the file via request_firmware(). Thus, |
456 | a patch file has to be located on the appropriate firmware path, | 490 | a patch file has to be located on the appropriate firmware path, |
457 | typically, /lib/firmware. For example, when you pass the option | 491 | typically, /lib/firmware. For example, when you pass the option |
@@ -565,6 +599,9 @@ probing, the proc file is available, so you can get the raw codec | |||
565 | information before modified by the driver. Of course, the driver | 599 | information before modified by the driver. Of course, the driver |
566 | isn't usable with `probe_only=1`. But you can continue the | 600 | isn't usable with `probe_only=1`. But you can continue the |
567 | configuration via hwdep sysfs file if hda-reconfig option is enabled. | 601 | configuration via hwdep sysfs file if hda-reconfig option is enabled. |
602 | Using `probe_only` mask 2 skips the reset of HDA codecs (use | ||
603 | `probe_only=3` as module option). The hwdep interface can be used | ||
604 | to determine the BIOS codec initialization. | ||
568 | 605 | ||
569 | 606 | ||
570 | hda-verb | 607 | hda-verb |
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt index 9ac842be9b4f..05bf5a0eee41 100644 --- a/Documentation/sound/alsa/soc/dapm.txt +++ b/Documentation/sound/alsa/soc/dapm.txt | |||
@@ -188,8 +188,8 @@ The WM8731 output mixer has 3 inputs (sources) | |||
188 | 3. Mic Sidetone Input | 188 | 3. Mic Sidetone Input |
189 | 189 | ||
190 | Each input in this example has a kcontrol associated with it (defined in example | 190 | Each input in this example has a kcontrol associated with it (defined in example |
191 | above) and is connected to the output mixer via it's kcontrol name. We can now | 191 | above) and is connected to the output mixer via its kcontrol name. We can now |
192 | connect the destination widget (wrt audio signal) with it's source widgets. | 192 | connect the destination widget (wrt audio signal) with its source widgets. |
193 | 193 | ||
194 | /* output mixer */ | 194 | /* output mixer */ |
195 | {"Output Mixer", "Line Bypass Switch", "Line Input"}, | 195 | {"Output Mixer", "Line Bypass Switch", "Line Input"}, |
diff --git a/Documentation/sound/alsa/soc/machine.txt b/Documentation/sound/alsa/soc/machine.txt index bab7711ce963..2524c75557df 100644 --- a/Documentation/sound/alsa/soc/machine.txt +++ b/Documentation/sound/alsa/soc/machine.txt | |||
@@ -67,7 +67,7 @@ static struct snd_soc_dai_link corgi_dai = { | |||
67 | .ops = &corgi_ops, | 67 | .ops = &corgi_ops, |
68 | }; | 68 | }; |
69 | 69 | ||
70 | struct snd_soc_card then sets up the machine with it's DAIs. e.g. | 70 | struct snd_soc_card then sets up the machine with its DAIs. e.g. |
71 | 71 | ||
72 | /* corgi audio machine driver */ | 72 | /* corgi audio machine driver */ |
73 | static struct snd_soc_card snd_soc_corgi = { | 73 | static struct snd_soc_card snd_soc_corgi = { |
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt index 1e4c6d3655f2..138ac88c1461 100644 --- a/Documentation/sound/alsa/soc/overview.txt +++ b/Documentation/sound/alsa/soc/overview.txt | |||
@@ -33,7 +33,7 @@ features :- | |||
33 | and machines. | 33 | and machines. |
34 | 34 | ||
35 | * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC | 35 | * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC |
36 | interface and codec registers it's audio interface capabilities with the | 36 | interface and codec registers its audio interface capabilities with the |
37 | core and are subsequently matched and configured when the application | 37 | core and are subsequently matched and configured when the application |
38 | hardware parameters are known. | 38 | hardware parameters are known. |
39 | 39 | ||
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt index 34c76a55bc04..9b659c79a547 100644 --- a/Documentation/sparse.txt +++ b/Documentation/sparse.txt | |||
@@ -54,12 +54,12 @@ Getting sparse | |||
54 | ~~~~~~~~~~~~~~ | 54 | ~~~~~~~~~~~~~~ |
55 | 55 | ||
56 | You can get latest released versions from the Sparse homepage at | 56 | You can get latest released versions from the Sparse homepage at |
57 | http://www.kernel.org/pub/linux/kernel/people/josh/sparse/ | 57 | https://sparse.wiki.kernel.org/index.php/Main_Page |
58 | 58 | ||
59 | Alternatively, you can get snapshots of the latest development version | 59 | Alternatively, you can get snapshots of the latest development version |
60 | of sparse using git to clone.. | 60 | of sparse using git to clone.. |
61 | 61 | ||
62 | git://git.kernel.org/pub/scm/linux/kernel/git/josh/sparse.git | 62 | git://git.kernel.org/pub/scm/devel/sparse/sparse.git |
63 | 63 | ||
64 | DaveJ has hourly generated tarballs of the git tree available at.. | 64 | DaveJ has hourly generated tarballs of the git tree available at.. |
65 | 65 | ||
diff --git a/Documentation/spi/ep93xx_spi b/Documentation/spi/ep93xx_spi new file mode 100644 index 000000000000..6325f5b48635 --- /dev/null +++ b/Documentation/spi/ep93xx_spi | |||
@@ -0,0 +1,95 @@ | |||
1 | Cirrus EP93xx SPI controller driver HOWTO | ||
2 | ========================================= | ||
3 | |||
4 | ep93xx_spi driver brings SPI master support for EP93xx SPI controller. Chip | ||
5 | selects are implemented with GPIO lines. | ||
6 | |||
7 | NOTE: If possible, don't use SFRMOUT (SFRM1) signal as a chip select. It will | ||
8 | not work correctly (it cannot be controlled by software). Use GPIO lines | ||
9 | instead. | ||
10 | |||
11 | Sample configuration | ||
12 | ==================== | ||
13 | |||
14 | Typically driver configuration is done in platform board files (the files under | ||
15 | arch/arm/mach-ep93xx/*.c). In this example we configure MMC over SPI through | ||
16 | this driver on TS-7260 board. You can adapt the code to suit your needs. | ||
17 | |||
18 | This example uses EGPIO9 as SD/MMC card chip select (this is wired in DIO1 | ||
19 | header on the board). | ||
20 | |||
21 | You need to select CONFIG_MMC_SPI to use mmc_spi driver. | ||
22 | |||
23 | arch/arm/mach-ep93xx/ts72xx.c: | ||
24 | |||
25 | ... | ||
26 | #include <linux/gpio.h> | ||
27 | #include <linux/spi/spi.h> | ||
28 | |||
29 | #include <mach/ep93xx_spi.h> | ||
30 | |||
31 | /* this is our GPIO line used for chip select */ | ||
32 | #define MMC_CHIP_SELECT_GPIO EP93XX_GPIO_LINE_EGPIO9 | ||
33 | |||
34 | static int ts72xx_mmc_spi_setup(struct spi_device *spi) | ||
35 | { | ||
36 | int err; | ||
37 | |||
38 | err = gpio_request(MMC_CHIP_SELECT_GPIO, spi->modalias); | ||
39 | if (err) | ||
40 | return err; | ||
41 | |||
42 | gpio_direction_output(MMC_CHIP_SELECT_GPIO, 1); | ||
43 | |||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | static void ts72xx_mmc_spi_cleanup(struct spi_device *spi) | ||
48 | { | ||
49 | gpio_set_value(MMC_CHIP_SELECT_GPIO, 1); | ||
50 | gpio_direction_input(MMC_CHIP_SELECT_GPIO); | ||
51 | gpio_free(MMC_CHIP_SELECT_GPIO); | ||
52 | } | ||
53 | |||
54 | static void ts72xx_mmc_spi_cs_control(struct spi_device *spi, int value) | ||
55 | { | ||
56 | gpio_set_value(MMC_CHIP_SELECT_GPIO, value); | ||
57 | } | ||
58 | |||
59 | static struct ep93xx_spi_chip_ops ts72xx_mmc_spi_ops = { | ||
60 | .setup = ts72xx_mmc_spi_setup, | ||
61 | .cleanup = ts72xx_mmc_spi_cleanup, | ||
62 | .cs_control = ts72xx_mmc_spi_cs_control, | ||
63 | }; | ||
64 | |||
65 | static struct spi_board_info ts72xx_spi_devices[] __initdata = { | ||
66 | { | ||
67 | .modalias = "mmc_spi", | ||
68 | .controller_data = &ts72xx_mmc_spi_ops, | ||
69 | /* | ||
70 | * We use 10 MHz even though the maximum is 7.4 MHz. The driver | ||
71 | * will limit it automatically to max. frequency. | ||
72 | */ | ||
73 | .max_speed_hz = 10 * 1000 * 1000, | ||
74 | .bus_num = 0, | ||
75 | .chip_select = 0, | ||
76 | .mode = SPI_MODE_0, | ||
77 | }, | ||
78 | }; | ||
79 | |||
80 | static struct ep93xx_spi_info ts72xx_spi_info = { | ||
81 | .num_chipselect = ARRAY_SIZE(ts72xx_spi_devices), | ||
82 | }; | ||
83 | |||
84 | static void __init ts72xx_init_machine(void) | ||
85 | { | ||
86 | ... | ||
87 | ep93xx_register_spi(&ts72xx_spi_info, ts72xx_spi_devices, | ||
88 | ARRAY_SIZE(ts72xx_spi_devices)); | ||
89 | } | ||
90 | |||
91 | Thanks to | ||
92 | ========= | ||
93 | Martin Guy, H. Hartley Sweeten and others who helped me during development of | ||
94 | the driver. Simplemachines.it donated me a Sim.One board which I used testing | ||
95 | the driver on EP9307. | ||
diff --git a/Documentation/spi/spidev_fdx.c b/Documentation/spi/spidev_fdx.c index fc354f760384..36ec0774ca0b 100644 --- a/Documentation/spi/spidev_fdx.c +++ b/Documentation/spi/spidev_fdx.c | |||
@@ -58,10 +58,10 @@ static void do_msg(int fd, int len) | |||
58 | len = sizeof buf; | 58 | len = sizeof buf; |
59 | 59 | ||
60 | buf[0] = 0xaa; | 60 | buf[0] = 0xaa; |
61 | xfer[0].tx_buf = (__u64) buf; | 61 | xfer[0].tx_buf = (unsigned long)buf; |
62 | xfer[0].len = 1; | 62 | xfer[0].len = 1; |
63 | 63 | ||
64 | xfer[1].rx_buf = (__u64) buf; | 64 | xfer[1].rx_buf = (unsigned long) buf; |
65 | xfer[1].len = len; | 65 | xfer[1].len = len; |
66 | 66 | ||
67 | status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); | 67 | status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); |
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c index 10abd3773e49..16feda901469 100644 --- a/Documentation/spi/spidev_test.c +++ b/Documentation/spi/spidev_test.c | |||
@@ -58,7 +58,7 @@ static void transfer(int fd) | |||
58 | }; | 58 | }; |
59 | 59 | ||
60 | ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); | 60 | ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); |
61 | if (ret == 1) | 61 | if (ret < 1) |
62 | pabort("can't send spi message"); | 62 | pabort("can't send spi message"); |
63 | 63 | ||
64 | for (ret = 0; ret < ARRAY_SIZE(tx); ret++) { | 64 | for (ret = 0; ret < ARRAY_SIZE(tx); ret++) { |
diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt index 5effa5bd993b..e213f45cf9d7 100644 --- a/Documentation/stable_kernel_rules.txt +++ b/Documentation/stable_kernel_rules.txt | |||
@@ -18,16 +18,15 @@ Rules on what kind of patches are accepted, and which ones are not, into the | |||
18 | - It cannot contain any "trivial" fixes in it (spelling changes, | 18 | - It cannot contain any "trivial" fixes in it (spelling changes, |
19 | whitespace cleanups, etc). | 19 | whitespace cleanups, etc). |
20 | - It must follow the Documentation/SubmittingPatches rules. | 20 | - It must follow the Documentation/SubmittingPatches rules. |
21 | - It or an equivalent fix must already exist in Linus' tree. Quote the | 21 | - It or an equivalent fix must already exist in Linus' tree (upstream). |
22 | respective commit ID in Linus' tree in your patch submission to -stable. | ||
23 | 22 | ||
24 | 23 | ||
25 | Procedure for submitting patches to the -stable tree: | 24 | Procedure for submitting patches to the -stable tree: |
26 | 25 | ||
27 | - Send the patch, after verifying that it follows the above rules, to | 26 | - Send the patch, after verifying that it follows the above rules, to |
28 | stable@kernel.org. | 27 | stable@kernel.org. You must note the upstream commit ID in the changelog |
29 | - To have the patch automatically included in the stable tree, add the | 28 | of your submission. |
30 | the tag | 29 | - To have the patch automatically included in the stable tree, add the tag |
31 | Cc: stable@kernel.org | 30 | Cc: stable@kernel.org |
32 | in the sign-off area. Once the patch is merged it will be applied to | 31 | in the sign-off area. Once the patch is merged it will be applied to |
33 | the stable tree without anything else needing to be done by the author | 32 | the stable tree without anything else needing to be done by the author |
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index df38ef046f8d..cbd05ffc606b 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt | |||
@@ -84,6 +84,16 @@ netdev_max_backlog | |||
84 | Maximum number of packets, queued on the INPUT side, when the interface | 84 | Maximum number of packets, queued on the INPUT side, when the interface |
85 | receives packets faster than kernel can process them. | 85 | receives packets faster than kernel can process them. |
86 | 86 | ||
87 | netdev_tstamp_prequeue | ||
88 | ---------------------- | ||
89 | |||
90 | If set to 0, RX packet timestamps can be sampled after RPS processing, when | ||
91 | the target CPU processes packets. It might give some delay on timestamps, but | ||
92 | permit to distribute the load on several cpus. | ||
93 | |||
94 | If set to 1 (default), timestamps are sampled as soon as possible, before | ||
95 | queueing. | ||
96 | |||
87 | optmem_max | 97 | optmem_max |
88 | ---------- | 98 | ---------- |
89 | 99 | ||
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index fc5790d36cd9..5fdbb612aeb8 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -19,6 +19,7 @@ files can be found in mm/swap.c. | |||
19 | Currently, these files are in /proc/sys/vm: | 19 | Currently, these files are in /proc/sys/vm: |
20 | 20 | ||
21 | - block_dump | 21 | - block_dump |
22 | - compact_memory | ||
22 | - dirty_background_bytes | 23 | - dirty_background_bytes |
23 | - dirty_background_ratio | 24 | - dirty_background_ratio |
24 | - dirty_bytes | 25 | - dirty_bytes |
@@ -26,6 +27,7 @@ Currently, these files are in /proc/sys/vm: | |||
26 | - dirty_ratio | 27 | - dirty_ratio |
27 | - dirty_writeback_centisecs | 28 | - dirty_writeback_centisecs |
28 | - drop_caches | 29 | - drop_caches |
30 | - extfrag_threshold | ||
29 | - hugepages_treat_as_movable | 31 | - hugepages_treat_as_movable |
30 | - hugetlb_shm_group | 32 | - hugetlb_shm_group |
31 | - laptop_mode | 33 | - laptop_mode |
@@ -64,6 +66,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. | |||
64 | 66 | ||
65 | ============================================================== | 67 | ============================================================== |
66 | 68 | ||
69 | compact_memory | ||
70 | |||
71 | Available only when CONFIG_COMPACTION is set. When 1 is written to the file, | ||
72 | all zones are compacted such that free memory is available in contiguous | ||
73 | blocks where possible. This can be important for example in the allocation of | ||
74 | huge pages although processes will also directly compact memory as required. | ||
75 | |||
76 | ============================================================== | ||
77 | |||
67 | dirty_background_bytes | 78 | dirty_background_bytes |
68 | 79 | ||
69 | Contains the amount of dirty memory at which the pdflush background writeback | 80 | Contains the amount of dirty memory at which the pdflush background writeback |
@@ -139,6 +150,20 @@ user should run `sync' first. | |||
139 | 150 | ||
140 | ============================================================== | 151 | ============================================================== |
141 | 152 | ||
153 | extfrag_threshold | ||
154 | |||
155 | This parameter affects whether the kernel will compact memory or direct | ||
156 | reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what | ||
157 | the fragmentation index for each order is in each zone in the system. Values | ||
158 | tending towards 0 imply allocations would fail due to lack of memory, | ||
159 | values towards 1000 imply failures are due to fragmentation and -1 implies | ||
160 | that the allocation will succeed as long as watermarks are met. | ||
161 | |||
162 | The kernel will not compact memory in a zone if the | ||
163 | fragmentation index is <= extfrag_threshold. The default value is 500. | ||
164 | |||
165 | ============================================================== | ||
166 | |||
142 | hugepages_treat_as_movable | 167 | hugepages_treat_as_movable |
143 | 168 | ||
144 | This parameter is only useful when kernelcore= is specified at boot time to | 169 | This parameter is only useful when kernelcore= is specified at boot time to |
@@ -573,11 +598,14 @@ Because other nodes' memory may be free. This means system total status | |||
573 | may be not fatal yet. | 598 | may be not fatal yet. |
574 | 599 | ||
575 | If this is set to 2, the kernel panics compulsorily even on the | 600 | If this is set to 2, the kernel panics compulsorily even on the |
576 | above-mentioned. | 601 | above-mentioned. Even oom happens under memory cgroup, the whole |
602 | system panics. | ||
577 | 603 | ||
578 | The default value is 0. | 604 | The default value is 0. |
579 | 1 and 2 are for failover of clustering. Please select either | 605 | 1 and 2 are for failover of clustering. Please select either |
580 | according to your policy of failover. | 606 | according to your policy of failover. |
607 | panic_on_oom=2+kdump gives you very strong tool to investigate | ||
608 | why oom happens. You can get snapshot. | ||
581 | 609 | ||
582 | ============================================================= | 610 | ============================================================= |
583 | 611 | ||
diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt index 5d8bc2cd250c..c1a1fd636bf9 100644 --- a/Documentation/sysfs-rules.txt +++ b/Documentation/sysfs-rules.txt | |||
@@ -125,7 +125,7 @@ versions of the sysfs interface. | |||
125 | - Block | 125 | - Block |
126 | The converted block subsystem at /sys/class/block or | 126 | The converted block subsystem at /sys/class/block or |
127 | /sys/subsystem/block will contain the links for disks and partitions | 127 | /sys/subsystem/block will contain the links for disks and partitions |
128 | at the same level, never in a hierarchy. Assuming the block subsytem to | 128 | at the same level, never in a hierarchy. Assuming the block subsystem to |
129 | contain only disks and not partition devices in the same flat list is | 129 | contain only disks and not partition devices in the same flat list is |
130 | a bug in the application. | 130 | a bug in the application. |
131 | 131 | ||
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index d56a01775423..5c17196c8fe9 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt | |||
@@ -177,13 +177,13 @@ virtual console (ALT+Fn) and then back again should also help. | |||
177 | 177 | ||
178 | * I hit SysRq, but nothing seems to happen, what's wrong? | 178 | * I hit SysRq, but nothing seems to happen, what's wrong? |
179 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 179 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
180 | There are some keyboards that send different scancodes for SysRq than the | 180 | There are some keyboards that produce a different keycode for SysRq than the |
181 | pre-defined 0x54. So if SysRq doesn't work out of the box for a certain | 181 | pre-defined value of 99 (see KEY_SYSRQ in include/linux/input.h), or which |
182 | keyboard, run 'showkey -s' to find out the proper scancode sequence. Then | 182 | don't have a SysRq key at all. In these cases, run 'showkey -s' to find an |
183 | use 'setkeycodes <sequence> 84' to define this sequence to the usual SysRq | 183 | appropriate scancode sequence, and use 'setkeycodes <sequence> 99' to map |
184 | code (84 is decimal for 0x54). It's probably best to put this command in a | 184 | this sequence to the usual SysRq code (e.g., 'setkeycodes e05b 99'). It's |
185 | boot script. Oh, and by the way, you exit 'showkey' by not typing anything | 185 | probably best to put this command in a boot script. Oh, and by the way, you |
186 | for ten seconds. | 186 | exit 'showkey' by not typing anything for ten seconds. |
187 | 187 | ||
188 | * I want to add SysRQ key events to a module, how does it work? | 188 | * I want to add SysRQ key events to a module, how does it work? |
189 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 189 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
diff --git a/Documentation/timers/00-INDEX b/Documentation/timers/00-INDEX index 397dc35e1323..a9248da5cdbc 100644 --- a/Documentation/timers/00-INDEX +++ b/Documentation/timers/00-INDEX | |||
@@ -4,6 +4,8 @@ highres.txt | |||
4 | - High resolution timers and dynamic ticks design notes | 4 | - High resolution timers and dynamic ticks design notes |
5 | hpet.txt | 5 | hpet.txt |
6 | - High Precision Event Timer Driver for Linux | 6 | - High Precision Event Timer Driver for Linux |
7 | hpet_example.c | ||
8 | - sample hpet timer test program | ||
7 | hrtimers.txt | 9 | hrtimers.txt |
8 | - subsystem for high-resolution kernel timers | 10 | - subsystem for high-resolution kernel timers |
9 | timer_stats.txt | 11 | timer_stats.txt |
diff --git a/Documentation/timers/Makefile b/Documentation/timers/Makefile new file mode 100644 index 000000000000..c85625f4ab25 --- /dev/null +++ b/Documentation/timers/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # kbuild trick to avoid linker error. Can be omitted if a module is built. | ||
2 | obj- := dummy.o | ||
3 | |||
4 | # List of programs to build | ||
5 | hostprogs-y := hpet_example | ||
6 | |||
7 | # Tell kbuild to always build the programs | ||
8 | always := $(hostprogs-y) | ||
diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt index 16d25e6b5a00..767392ffd31e 100644 --- a/Documentation/timers/hpet.txt +++ b/Documentation/timers/hpet.txt | |||
@@ -26,274 +26,5 @@ initialization. An example of this initialization can be found in | |||
26 | arch/x86/kernel/hpet.c. | 26 | arch/x86/kernel/hpet.c. |
27 | 27 | ||
28 | The driver provides a userspace API which resembles the API found in the | 28 | The driver provides a userspace API which resembles the API found in the |
29 | RTC driver framework. An example user space program is provided below. | 29 | RTC driver framework. An example user space program is provided in |
30 | 30 | file:Documentation/timers/hpet_example.c | |
31 | #include <stdio.h> | ||
32 | #include <stdlib.h> | ||
33 | #include <unistd.h> | ||
34 | #include <fcntl.h> | ||
35 | #include <string.h> | ||
36 | #include <memory.h> | ||
37 | #include <malloc.h> | ||
38 | #include <time.h> | ||
39 | #include <ctype.h> | ||
40 | #include <sys/types.h> | ||
41 | #include <sys/wait.h> | ||
42 | #include <signal.h> | ||
43 | #include <fcntl.h> | ||
44 | #include <errno.h> | ||
45 | #include <sys/time.h> | ||
46 | #include <linux/hpet.h> | ||
47 | |||
48 | |||
49 | extern void hpet_open_close(int, const char **); | ||
50 | extern void hpet_info(int, const char **); | ||
51 | extern void hpet_poll(int, const char **); | ||
52 | extern void hpet_fasync(int, const char **); | ||
53 | extern void hpet_read(int, const char **); | ||
54 | |||
55 | #include <sys/poll.h> | ||
56 | #include <sys/ioctl.h> | ||
57 | #include <signal.h> | ||
58 | |||
59 | struct hpet_command { | ||
60 | char *command; | ||
61 | void (*func)(int argc, const char ** argv); | ||
62 | } hpet_command[] = { | ||
63 | { | ||
64 | "open-close", | ||
65 | hpet_open_close | ||
66 | }, | ||
67 | { | ||
68 | "info", | ||
69 | hpet_info | ||
70 | }, | ||
71 | { | ||
72 | "poll", | ||
73 | hpet_poll | ||
74 | }, | ||
75 | { | ||
76 | "fasync", | ||
77 | hpet_fasync | ||
78 | }, | ||
79 | }; | ||
80 | |||
81 | int | ||
82 | main(int argc, const char ** argv) | ||
83 | { | ||
84 | int i; | ||
85 | |||
86 | argc--; | ||
87 | argv++; | ||
88 | |||
89 | if (!argc) { | ||
90 | fprintf(stderr, "-hpet: requires command\n"); | ||
91 | return -1; | ||
92 | } | ||
93 | |||
94 | |||
95 | for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++) | ||
96 | if (!strcmp(argv[0], hpet_command[i].command)) { | ||
97 | argc--; | ||
98 | argv++; | ||
99 | fprintf(stderr, "-hpet: executing %s\n", | ||
100 | hpet_command[i].command); | ||
101 | hpet_command[i].func(argc, argv); | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]); | ||
106 | |||
107 | return -1; | ||
108 | } | ||
109 | |||
110 | void | ||
111 | hpet_open_close(int argc, const char **argv) | ||
112 | { | ||
113 | int fd; | ||
114 | |||
115 | if (argc != 1) { | ||
116 | fprintf(stderr, "hpet_open_close: device-name\n"); | ||
117 | return; | ||
118 | } | ||
119 | |||
120 | fd = open(argv[0], O_RDONLY); | ||
121 | if (fd < 0) | ||
122 | fprintf(stderr, "hpet_open_close: open failed\n"); | ||
123 | else | ||
124 | close(fd); | ||
125 | |||
126 | return; | ||
127 | } | ||
128 | |||
129 | void | ||
130 | hpet_info(int argc, const char **argv) | ||
131 | { | ||
132 | } | ||
133 | |||
134 | void | ||
135 | hpet_poll(int argc, const char **argv) | ||
136 | { | ||
137 | unsigned long freq; | ||
138 | int iterations, i, fd; | ||
139 | struct pollfd pfd; | ||
140 | struct hpet_info info; | ||
141 | struct timeval stv, etv; | ||
142 | struct timezone tz; | ||
143 | long usec; | ||
144 | |||
145 | if (argc != 3) { | ||
146 | fprintf(stderr, "hpet_poll: device-name freq iterations\n"); | ||
147 | return; | ||
148 | } | ||
149 | |||
150 | freq = atoi(argv[1]); | ||
151 | iterations = atoi(argv[2]); | ||
152 | |||
153 | fd = open(argv[0], O_RDONLY); | ||
154 | |||
155 | if (fd < 0) { | ||
156 | fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]); | ||
157 | return; | ||
158 | } | ||
159 | |||
160 | if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { | ||
161 | fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n"); | ||
162 | goto out; | ||
163 | } | ||
164 | |||
165 | if (ioctl(fd, HPET_INFO, &info) < 0) { | ||
166 | fprintf(stderr, "hpet_poll: failed to get info\n"); | ||
167 | goto out; | ||
168 | } | ||
169 | |||
170 | fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags); | ||
171 | |||
172 | if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { | ||
173 | fprintf(stderr, "hpet_poll: HPET_EPI failed\n"); | ||
174 | goto out; | ||
175 | } | ||
176 | |||
177 | if (ioctl(fd, HPET_IE_ON, 0) < 0) { | ||
178 | fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n"); | ||
179 | goto out; | ||
180 | } | ||
181 | |||
182 | pfd.fd = fd; | ||
183 | pfd.events = POLLIN; | ||
184 | |||
185 | for (i = 0; i < iterations; i++) { | ||
186 | pfd.revents = 0; | ||
187 | gettimeofday(&stv, &tz); | ||
188 | if (poll(&pfd, 1, -1) < 0) | ||
189 | fprintf(stderr, "hpet_poll: poll failed\n"); | ||
190 | else { | ||
191 | long data; | ||
192 | |||
193 | gettimeofday(&etv, &tz); | ||
194 | usec = stv.tv_sec * 1000000 + stv.tv_usec; | ||
195 | usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec; | ||
196 | |||
197 | fprintf(stderr, | ||
198 | "hpet_poll: expired time = 0x%lx\n", usec); | ||
199 | |||
200 | fprintf(stderr, "hpet_poll: revents = 0x%x\n", | ||
201 | pfd.revents); | ||
202 | |||
203 | if (read(fd, &data, sizeof(data)) != sizeof(data)) { | ||
204 | fprintf(stderr, "hpet_poll: read failed\n"); | ||
205 | } | ||
206 | else | ||
207 | fprintf(stderr, "hpet_poll: data 0x%lx\n", | ||
208 | data); | ||
209 | } | ||
210 | } | ||
211 | |||
212 | out: | ||
213 | close(fd); | ||
214 | return; | ||
215 | } | ||
216 | |||
217 | static int hpet_sigio_count; | ||
218 | |||
219 | static void | ||
220 | hpet_sigio(int val) | ||
221 | { | ||
222 | fprintf(stderr, "hpet_sigio: called\n"); | ||
223 | hpet_sigio_count++; | ||
224 | } | ||
225 | |||
226 | void | ||
227 | hpet_fasync(int argc, const char **argv) | ||
228 | { | ||
229 | unsigned long freq; | ||
230 | int iterations, i, fd, value; | ||
231 | sig_t oldsig; | ||
232 | struct hpet_info info; | ||
233 | |||
234 | hpet_sigio_count = 0; | ||
235 | fd = -1; | ||
236 | |||
237 | if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) { | ||
238 | fprintf(stderr, "hpet_fasync: failed to set signal handler\n"); | ||
239 | return; | ||
240 | } | ||
241 | |||
242 | if (argc != 3) { | ||
243 | fprintf(stderr, "hpet_fasync: device-name freq iterations\n"); | ||
244 | goto out; | ||
245 | } | ||
246 | |||
247 | fd = open(argv[0], O_RDONLY); | ||
248 | |||
249 | if (fd < 0) { | ||
250 | fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]); | ||
251 | return; | ||
252 | } | ||
253 | |||
254 | |||
255 | if ((fcntl(fd, F_SETOWN, getpid()) == 1) || | ||
256 | ((value = fcntl(fd, F_GETFL)) == 1) || | ||
257 | (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) { | ||
258 | fprintf(stderr, "hpet_fasync: fcntl failed\n"); | ||
259 | goto out; | ||
260 | } | ||
261 | |||
262 | freq = atoi(argv[1]); | ||
263 | iterations = atoi(argv[2]); | ||
264 | |||
265 | if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { | ||
266 | fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n"); | ||
267 | goto out; | ||
268 | } | ||
269 | |||
270 | if (ioctl(fd, HPET_INFO, &info) < 0) { | ||
271 | fprintf(stderr, "hpet_fasync: failed to get info\n"); | ||
272 | goto out; | ||
273 | } | ||
274 | |||
275 | fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags); | ||
276 | |||
277 | if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { | ||
278 | fprintf(stderr, "hpet_fasync: HPET_EPI failed\n"); | ||
279 | goto out; | ||
280 | } | ||
281 | |||
282 | if (ioctl(fd, HPET_IE_ON, 0) < 0) { | ||
283 | fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n"); | ||
284 | goto out; | ||
285 | } | ||
286 | |||
287 | for (i = 0; i < iterations; i++) { | ||
288 | (void) pause(); | ||
289 | fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count); | ||
290 | } | ||
291 | |||
292 | out: | ||
293 | signal(SIGIO, oldsig); | ||
294 | |||
295 | if (fd >= 0) | ||
296 | close(fd); | ||
297 | |||
298 | return; | ||
299 | } | ||
diff --git a/Documentation/timers/hpet_example.c b/Documentation/timers/hpet_example.c new file mode 100644 index 000000000000..4bfafb7bc4c5 --- /dev/null +++ b/Documentation/timers/hpet_example.c | |||
@@ -0,0 +1,267 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <stdlib.h> | ||
3 | #include <unistd.h> | ||
4 | #include <fcntl.h> | ||
5 | #include <string.h> | ||
6 | #include <memory.h> | ||
7 | #include <malloc.h> | ||
8 | #include <time.h> | ||
9 | #include <ctype.h> | ||
10 | #include <sys/types.h> | ||
11 | #include <sys/wait.h> | ||
12 | #include <signal.h> | ||
13 | #include <errno.h> | ||
14 | #include <sys/time.h> | ||
15 | #include <linux/hpet.h> | ||
16 | |||
17 | |||
18 | extern void hpet_open_close(int, const char **); | ||
19 | extern void hpet_info(int, const char **); | ||
20 | extern void hpet_poll(int, const char **); | ||
21 | extern void hpet_fasync(int, const char **); | ||
22 | extern void hpet_read(int, const char **); | ||
23 | |||
24 | #include <sys/poll.h> | ||
25 | #include <sys/ioctl.h> | ||
26 | |||
27 | struct hpet_command { | ||
28 | char *command; | ||
29 | void (*func)(int argc, const char ** argv); | ||
30 | } hpet_command[] = { | ||
31 | { | ||
32 | "open-close", | ||
33 | hpet_open_close | ||
34 | }, | ||
35 | { | ||
36 | "info", | ||
37 | hpet_info | ||
38 | }, | ||
39 | { | ||
40 | "poll", | ||
41 | hpet_poll | ||
42 | }, | ||
43 | { | ||
44 | "fasync", | ||
45 | hpet_fasync | ||
46 | }, | ||
47 | }; | ||
48 | |||
49 | int | ||
50 | main(int argc, const char ** argv) | ||
51 | { | ||
52 | int i; | ||
53 | |||
54 | argc--; | ||
55 | argv++; | ||
56 | |||
57 | if (!argc) { | ||
58 | fprintf(stderr, "-hpet: requires command\n"); | ||
59 | return -1; | ||
60 | } | ||
61 | |||
62 | |||
63 | for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++) | ||
64 | if (!strcmp(argv[0], hpet_command[i].command)) { | ||
65 | argc--; | ||
66 | argv++; | ||
67 | fprintf(stderr, "-hpet: executing %s\n", | ||
68 | hpet_command[i].command); | ||
69 | hpet_command[i].func(argc, argv); | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]); | ||
74 | |||
75 | return -1; | ||
76 | } | ||
77 | |||
78 | void | ||
79 | hpet_open_close(int argc, const char **argv) | ||
80 | { | ||
81 | int fd; | ||
82 | |||
83 | if (argc != 1) { | ||
84 | fprintf(stderr, "hpet_open_close: device-name\n"); | ||
85 | return; | ||
86 | } | ||
87 | |||
88 | fd = open(argv[0], O_RDONLY); | ||
89 | if (fd < 0) | ||
90 | fprintf(stderr, "hpet_open_close: open failed\n"); | ||
91 | else | ||
92 | close(fd); | ||
93 | |||
94 | return; | ||
95 | } | ||
96 | |||
97 | void | ||
98 | hpet_info(int argc, const char **argv) | ||
99 | { | ||
100 | } | ||
101 | |||
102 | void | ||
103 | hpet_poll(int argc, const char **argv) | ||
104 | { | ||
105 | unsigned long freq; | ||
106 | int iterations, i, fd; | ||
107 | struct pollfd pfd; | ||
108 | struct hpet_info info; | ||
109 | struct timeval stv, etv; | ||
110 | struct timezone tz; | ||
111 | long usec; | ||
112 | |||
113 | if (argc != 3) { | ||
114 | fprintf(stderr, "hpet_poll: device-name freq iterations\n"); | ||
115 | return; | ||
116 | } | ||
117 | |||
118 | freq = atoi(argv[1]); | ||
119 | iterations = atoi(argv[2]); | ||
120 | |||
121 | fd = open(argv[0], O_RDONLY); | ||
122 | |||
123 | if (fd < 0) { | ||
124 | fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]); | ||
125 | return; | ||
126 | } | ||
127 | |||
128 | if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { | ||
129 | fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n"); | ||
130 | goto out; | ||
131 | } | ||
132 | |||
133 | if (ioctl(fd, HPET_INFO, &info) < 0) { | ||
134 | fprintf(stderr, "hpet_poll: failed to get info\n"); | ||
135 | goto out; | ||
136 | } | ||
137 | |||
138 | fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags); | ||
139 | |||
140 | if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { | ||
141 | fprintf(stderr, "hpet_poll: HPET_EPI failed\n"); | ||
142 | goto out; | ||
143 | } | ||
144 | |||
145 | if (ioctl(fd, HPET_IE_ON, 0) < 0) { | ||
146 | fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n"); | ||
147 | goto out; | ||
148 | } | ||
149 | |||
150 | pfd.fd = fd; | ||
151 | pfd.events = POLLIN; | ||
152 | |||
153 | for (i = 0; i < iterations; i++) { | ||
154 | pfd.revents = 0; | ||
155 | gettimeofday(&stv, &tz); | ||
156 | if (poll(&pfd, 1, -1) < 0) | ||
157 | fprintf(stderr, "hpet_poll: poll failed\n"); | ||
158 | else { | ||
159 | long data; | ||
160 | |||
161 | gettimeofday(&etv, &tz); | ||
162 | usec = stv.tv_sec * 1000000 + stv.tv_usec; | ||
163 | usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec; | ||
164 | |||
165 | fprintf(stderr, | ||
166 | "hpet_poll: expired time = 0x%lx\n", usec); | ||
167 | |||
168 | fprintf(stderr, "hpet_poll: revents = 0x%x\n", | ||
169 | pfd.revents); | ||
170 | |||
171 | if (read(fd, &data, sizeof(data)) != sizeof(data)) { | ||
172 | fprintf(stderr, "hpet_poll: read failed\n"); | ||
173 | } | ||
174 | else | ||
175 | fprintf(stderr, "hpet_poll: data 0x%lx\n", | ||
176 | data); | ||
177 | } | ||
178 | } | ||
179 | |||
180 | out: | ||
181 | close(fd); | ||
182 | return; | ||
183 | } | ||
184 | |||
185 | static int hpet_sigio_count; | ||
186 | |||
187 | static void | ||
188 | hpet_sigio(int val) | ||
189 | { | ||
190 | fprintf(stderr, "hpet_sigio: called\n"); | ||
191 | hpet_sigio_count++; | ||
192 | } | ||
193 | |||
194 | void | ||
195 | hpet_fasync(int argc, const char **argv) | ||
196 | { | ||
197 | unsigned long freq; | ||
198 | int iterations, i, fd, value; | ||
199 | sig_t oldsig; | ||
200 | struct hpet_info info; | ||
201 | |||
202 | hpet_sigio_count = 0; | ||
203 | fd = -1; | ||
204 | |||
205 | if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) { | ||
206 | fprintf(stderr, "hpet_fasync: failed to set signal handler\n"); | ||
207 | return; | ||
208 | } | ||
209 | |||
210 | if (argc != 3) { | ||
211 | fprintf(stderr, "hpet_fasync: device-name freq iterations\n"); | ||
212 | goto out; | ||
213 | } | ||
214 | |||
215 | fd = open(argv[0], O_RDONLY); | ||
216 | |||
217 | if (fd < 0) { | ||
218 | fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]); | ||
219 | return; | ||
220 | } | ||
221 | |||
222 | |||
223 | if ((fcntl(fd, F_SETOWN, getpid()) == 1) || | ||
224 | ((value = fcntl(fd, F_GETFL)) == 1) || | ||
225 | (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) { | ||
226 | fprintf(stderr, "hpet_fasync: fcntl failed\n"); | ||
227 | goto out; | ||
228 | } | ||
229 | |||
230 | freq = atoi(argv[1]); | ||
231 | iterations = atoi(argv[2]); | ||
232 | |||
233 | if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { | ||
234 | fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n"); | ||
235 | goto out; | ||
236 | } | ||
237 | |||
238 | if (ioctl(fd, HPET_INFO, &info) < 0) { | ||
239 | fprintf(stderr, "hpet_fasync: failed to get info\n"); | ||
240 | goto out; | ||
241 | } | ||
242 | |||
243 | fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags); | ||
244 | |||
245 | if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { | ||
246 | fprintf(stderr, "hpet_fasync: HPET_EPI failed\n"); | ||
247 | goto out; | ||
248 | } | ||
249 | |||
250 | if (ioctl(fd, HPET_IE_ON, 0) < 0) { | ||
251 | fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n"); | ||
252 | goto out; | ||
253 | } | ||
254 | |||
255 | for (i = 0; i < iterations; i++) { | ||
256 | (void) pause(); | ||
257 | fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count); | ||
258 | } | ||
259 | |||
260 | out: | ||
261 | signal(SIGIO, oldsig); | ||
262 | |||
263 | if (fd >= 0) | ||
264 | close(fd); | ||
265 | |||
266 | return; | ||
267 | } | ||
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index 02ac6ed38b2d..09bd8e902989 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt | |||
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option: | |||
90 | 90 | ||
91 | trace_event=[event-list] | 91 | trace_event=[event-list] |
92 | 92 | ||
93 | The format of this boot option is the same as described in section 2.1. | 93 | event-list is a comma separated list of events. See section 2.1 for event |
94 | format. | ||
94 | 95 | ||
95 | 3. Defining an event-enabled tracepoint | 96 | 3. Defining an event-enabled tracepoint |
96 | ======================================= | 97 | ======================================= |
@@ -238,7 +239,7 @@ subsystem's filter file. | |||
238 | 239 | ||
239 | For convenience, filters for every event in a subsystem can be set or | 240 | For convenience, filters for every event in a subsystem can be set or |
240 | cleared as a group by writing a filter expression into the filter file | 241 | cleared as a group by writing a filter expression into the filter file |
241 | at the root of the subsytem. Note however, that if a filter for any | 242 | at the root of the subsystem. Note however, that if a filter for any |
242 | event within the subsystem lacks a field specified in the subsystem | 243 | event within the subsystem lacks a field specified in the subsystem |
243 | filter, or if the filter can't be applied for any other reason, the | 244 | filter, or if the filter can't be applied for any other reason, the |
244 | filter for that event will retain its previous setting. This can | 245 | filter for that event will retain its previous setting. This can |
@@ -250,7 +251,7 @@ fields can be guaranteed to propagate successfully to all events. | |||
250 | Here are a few subsystem filter examples that also illustrate the | 251 | Here are a few subsystem filter examples that also illustrate the |
251 | above points: | 252 | above points: |
252 | 253 | ||
253 | Clear the filters on all events in the sched subsytem: | 254 | Clear the filters on all events in the sched subsystem: |
254 | 255 | ||
255 | # cd /sys/kernel/debug/tracing/events/sched | 256 | # cd /sys/kernel/debug/tracing/events/sched |
256 | # echo 0 > filter | 257 | # echo 0 > filter |
@@ -260,7 +261,7 @@ none | |||
260 | none | 261 | none |
261 | 262 | ||
262 | Set a filter using only common fields for all events in the sched | 263 | Set a filter using only common fields for all events in the sched |
263 | subsytem (all events end up with the same filter): | 264 | subsystem (all events end up with the same filter): |
264 | 265 | ||
265 | # cd /sys/kernel/debug/tracing/events/sched | 266 | # cd /sys/kernel/debug/tracing/events/sched |
266 | # echo common_pid == 0 > filter | 267 | # echo common_pid == 0 > filter |
@@ -270,7 +271,7 @@ common_pid == 0 | |||
270 | common_pid == 0 | 271 | common_pid == 0 |
271 | 272 | ||
272 | Attempt to set a filter using a non-common field for all events in the | 273 | Attempt to set a filter using a non-common field for all events in the |
273 | sched subsytem (all events but those that have a prev_pid field retain | 274 | sched subsystem (all events but those that have a prev_pid field retain |
274 | their old filters): | 275 | their old filters): |
275 | 276 | ||
276 | # cd /sys/kernel/debug/tracing/events/sched | 277 | # cd /sys/kernel/debug/tracing/events/sched |
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 6a5a579126b0..f1f81afee8a0 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt | |||
@@ -238,11 +238,10 @@ HAVE_SYSCALL_TRACEPOINTS | |||
238 | 238 | ||
239 | You need very few things to get the syscalls tracing in an arch. | 239 | You need very few things to get the syscalls tracing in an arch. |
240 | 240 | ||
241 | - Support HAVE_ARCH_TRACEHOOK (see arch/Kconfig). | ||
241 | - Have a NR_syscalls variable in <asm/unistd.h> that provides the number | 242 | - Have a NR_syscalls variable in <asm/unistd.h> that provides the number |
242 | of syscalls supported by the arch. | 243 | of syscalls supported by the arch. |
243 | - Implement arch_syscall_addr() that resolves a syscall address from a | 244 | - Support the TIF_SYSCALL_TRACEPOINT thread flags. |
244 | syscall number. | ||
245 | - Support the TIF_SYSCALL_TRACEPOINT thread flags | ||
246 | - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace | 245 | - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace |
247 | in the ptrace syscalls tracing path. | 246 | in the ptrace syscalls tracing path. |
248 | - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. | 247 | - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. |
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index bab3040da548..557c1edeccaf 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files: | |||
155 | to be traced. Echoing names of functions into this file | 155 | to be traced. Echoing names of functions into this file |
156 | will limit the trace to only those functions. | 156 | will limit the trace to only those functions. |
157 | 157 | ||
158 | This interface also allows for commands to be used. See the | ||
159 | "Filter commands" section for more details. | ||
160 | |||
158 | set_ftrace_notrace: | 161 | set_ftrace_notrace: |
159 | 162 | ||
160 | This has an effect opposite to that of | 163 | This has an effect opposite to that of |
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one | |||
1337 | can either use the sysctl function or set it via the proc system | 1340 | can either use the sysctl function or set it via the proc system |
1338 | interface. | 1341 | interface. |
1339 | 1342 | ||
1340 | sysctl kernel.ftrace_dump_on_oops=1 | 1343 | sysctl kernel.ftrace_dump_on_oops=n |
1341 | 1344 | ||
1342 | or | 1345 | or |
1343 | 1346 | ||
1344 | echo 1 > /proc/sys/kernel/ftrace_dump_on_oops | 1347 | echo n > /proc/sys/kernel/ftrace_dump_on_oops |
1345 | 1348 | ||
1349 | If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will | ||
1350 | only dump the buffer of the CPU that triggered the oops. | ||
1346 | 1351 | ||
1347 | Here's an example of such a dump after a null pointer | 1352 | Here's an example of such a dump after a null pointer |
1348 | dereference in a kernel module: | 1353 | dereference in a kernel module: |
@@ -1588,7 +1593,7 @@ module author does not need to worry about it. | |||
1588 | 1593 | ||
1589 | When tracing is enabled, kstop_machine is called to prevent | 1594 | When tracing is enabled, kstop_machine is called to prevent |
1590 | races with the CPUS executing code being modified (which can | 1595 | races with the CPUS executing code being modified (which can |
1591 | cause the CPU to do undesireable things), and the nops are | 1596 | cause the CPU to do undesirable things), and the nops are |
1592 | patched back to calls. But this time, they do not call mcount | 1597 | patched back to calls. But this time, they do not call mcount |
1593 | (which is just a function stub). They now call into the ftrace | 1598 | (which is just a function stub). They now call into the ftrace |
1594 | infrastructure. | 1599 | infrastructure. |
@@ -1822,6 +1827,47 @@ this special filter via: | |||
1822 | echo > set_graph_function | 1827 | echo > set_graph_function |
1823 | 1828 | ||
1824 | 1829 | ||
1830 | Filter commands | ||
1831 | --------------- | ||
1832 | |||
1833 | A few commands are supported by the set_ftrace_filter interface. | ||
1834 | Trace commands have the following format: | ||
1835 | |||
1836 | <function>:<command>:<parameter> | ||
1837 | |||
1838 | The following commands are supported: | ||
1839 | |||
1840 | - mod | ||
1841 | This command enables function filtering per module. The | ||
1842 | parameter defines the module. For example, if only the write* | ||
1843 | functions in the ext3 module are desired, run: | ||
1844 | |||
1845 | echo 'write*:mod:ext3' > set_ftrace_filter | ||
1846 | |||
1847 | This command interacts with the filter in the same way as | ||
1848 | filtering based on function names. Thus, adding more functions | ||
1849 | in a different module is accomplished by appending (>>) to the | ||
1850 | filter file. Remove specific module functions by prepending | ||
1851 | '!': | ||
1852 | |||
1853 | echo '!writeback*:mod:ext3' >> set_ftrace_filter | ||
1854 | |||
1855 | - traceon/traceoff | ||
1856 | These commands turn tracing on and off when the specified | ||
1857 | functions are hit. The parameter determines how many times the | ||
1858 | tracing system is turned on and off. If unspecified, there is | ||
1859 | no limit. For example, to disable tracing when a schedule bug | ||
1860 | is hit the first 5 times, run: | ||
1861 | |||
1862 | echo '__schedule_bug:traceoff:5' > set_ftrace_filter | ||
1863 | |||
1864 | These commands are cumulative whether or not they are appended | ||
1865 | to set_ftrace_filter. To remove a command, prepend it by '!' | ||
1866 | and drop the parameter: | ||
1867 | |||
1868 | echo '!__schedule_bug:traceoff' > set_ftrace_filter | ||
1869 | |||
1870 | |||
1825 | trace_pipe | 1871 | trace_pipe |
1826 | ---------- | 1872 | ---------- |
1827 | 1873 | ||
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 47aabeebbdf6..ec94748ae65b 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt | |||
@@ -24,6 +24,7 @@ Synopsis of kprobe_events | |||
24 | ------------------------- | 24 | ------------------------- |
25 | p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe | 25 | p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe |
26 | r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe | 26 | r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe |
27 | -:[GRP/]EVENT : Clear a probe | ||
27 | 28 | ||
28 | GRP : Group name. If omitted, use "kprobes" for it. | 29 | GRP : Group name. If omitted, use "kprobes" for it. |
29 | EVENT : Event name. If omitted, the event name is generated | 30 | EVENT : Event name. If omitted, the event name is generated |
@@ -37,15 +38,14 @@ Synopsis of kprobe_events | |||
37 | @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) | 38 | @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) |
38 | $stackN : Fetch Nth entry of stack (N >= 0) | 39 | $stackN : Fetch Nth entry of stack (N >= 0) |
39 | $stack : Fetch stack address. | 40 | $stack : Fetch stack address. |
40 | $argN : Fetch function argument. (N >= 0)(*) | 41 | $retval : Fetch return value.(*) |
41 | $retval : Fetch return value.(**) | 42 | +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) |
42 | +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***) | 43 | NAME=FETCHARG : Set NAME as the argument name of FETCHARG. |
43 | NAME=FETCHARG: Set NAME as the argument name of FETCHARG. | 44 | FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types |
45 | (u8/u16/u32/u64/s8/s16/s32/s64) are supported. | ||
44 | 46 | ||
45 | (*) aN may not correct on asmlinkaged functions and at the middle of | 47 | (*) only for return probe. |
46 | function body. | 48 | (**) this is useful for fetching a field of data structures. |
47 | (**) only for return probe. | ||
48 | (***) this is useful for fetching a field of data structures. | ||
49 | 49 | ||
50 | 50 | ||
51 | Per-Probe Event Filtering | 51 | Per-Probe Event Filtering |
@@ -82,13 +82,16 @@ Usage examples | |||
82 | To add a probe as a new event, write a new definition to kprobe_events | 82 | To add a probe as a new event, write a new definition to kprobe_events |
83 | as below. | 83 | as below. |
84 | 84 | ||
85 | echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events | 85 | echo 'p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events |
86 | 86 | ||
87 | This sets a kprobe on the top of do_sys_open() function with recording | 87 | This sets a kprobe on the top of do_sys_open() function with recording |
88 | 1st to 4th arguments as "myprobe" event. As this example shows, users can | 88 | 1st to 4th arguments as "myprobe" event. Note, which register/stack entry is |
89 | choose more familiar names for each arguments. | 89 | assigned to each function argument depends on arch-specific ABI. If you unsure |
90 | the ABI, please try to use probe subcommand of perf-tools (you can find it | ||
91 | under tools/perf/). | ||
92 | As this example shows, users can choose more familiar names for each arguments. | ||
90 | 93 | ||
91 | echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events | 94 | echo 'r:myretprobe do_sys_open $retval' >> /sys/kernel/debug/tracing/kprobe_events |
92 | 95 | ||
93 | This sets a kretprobe on the return point of do_sys_open() function with | 96 | This sets a kretprobe on the return point of do_sys_open() function with |
94 | recording return value as "myretprobe" event. | 97 | recording return value as "myretprobe" event. |
@@ -97,23 +100,24 @@ recording return value as "myretprobe" event. | |||
97 | 100 | ||
98 | cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format | 101 | cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format |
99 | name: myprobe | 102 | name: myprobe |
100 | ID: 75 | 103 | ID: 780 |
101 | format: | 104 | format: |
102 | field:unsigned short common_type; offset:0; size:2; | 105 | field:unsigned short common_type; offset:0; size:2; signed:0; |
103 | field:unsigned char common_flags; offset:2; size:1; | 106 | field:unsigned char common_flags; offset:2; size:1; signed:0; |
104 | field:unsigned char common_preempt_count; offset:3; size:1; | 107 | field:unsigned char common_preempt_count; offset:3; size:1;signed:0; |
105 | field:int common_pid; offset:4; size:4; | 108 | field:int common_pid; offset:4; size:4; signed:1; |
106 | field:int common_tgid; offset:8; size:4; | 109 | field:int common_lock_depth; offset:8; size:4; signed:1; |
107 | 110 | ||
108 | field: unsigned long ip; offset:16;tsize:8; | 111 | field:unsigned long __probe_ip; offset:12; size:4; signed:0; |
109 | field: int nargs; offset:24;tsize:4; | 112 | field:int __probe_nargs; offset:16; size:4; signed:1; |
110 | field: unsigned long dfd; offset:32;tsize:8; | 113 | field:unsigned long dfd; offset:20; size:4; signed:0; |
111 | field: unsigned long filename; offset:40;tsize:8; | 114 | field:unsigned long filename; offset:24; size:4; signed:0; |
112 | field: unsigned long flags; offset:48;tsize:8; | 115 | field:unsigned long flags; offset:28; size:4; signed:0; |
113 | field: unsigned long mode; offset:56;tsize:8; | 116 | field:unsigned long mode; offset:32; size:4; signed:0; |
114 | 117 | ||
115 | print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode | ||
116 | 118 | ||
119 | print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->__probe_ip, | ||
120 | REC->dfd, REC->filename, REC->flags, REC->mode | ||
117 | 121 | ||
118 | You can see that the event has 4 arguments as in the expressions you specified. | 122 | You can see that the event has 4 arguments as in the expressions you specified. |
119 | 123 | ||
@@ -121,6 +125,12 @@ print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, R | |||
121 | 125 | ||
122 | This clears all probe points. | 126 | This clears all probe points. |
123 | 127 | ||
128 | Or, | ||
129 | |||
130 | echo -:myprobe >> kprobe_events | ||
131 | |||
132 | This clears probe points selectively. | ||
133 | |||
124 | Right after definition, each event is disabled by default. For tracing these | 134 | Right after definition, each event is disabled by default. For tracing these |
125 | events, you need to enable it. | 135 | events, you need to enable it. |
126 | 136 | ||
@@ -146,4 +156,3 @@ events, you need to enable it. | |||
146 | returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel | 156 | returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel |
147 | returns from do_sys_open to sys_open+0x1b). | 157 | returns from do_sys_open to sys_open+0x1b). |
148 | 158 | ||
149 | |||
diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/WUSB-Design-overview.txt index c480e9c32dbd..4c5e37939344 100644 --- a/Documentation/usb/WUSB-Design-overview.txt +++ b/Documentation/usb/WUSB-Design-overview.txt | |||
@@ -381,7 +381,7 @@ descriptor that gives us the status of the transfer, its identification | |||
381 | we issue another URB to read into the destination buffer the chunk of | 381 | we issue another URB to read into the destination buffer the chunk of |
382 | data coming out of the remote endpoint. Done, wait for the next guy. The | 382 | data coming out of the remote endpoint. Done, wait for the next guy. The |
383 | callbacks for the URBs issued from here are the ones that will declare | 383 | callbacks for the URBs issued from here are the ones that will declare |
384 | the xfer complete at some point and call it's callback. | 384 | the xfer complete at some point and call its callback. |
385 | 385 | ||
386 | Seems simple, but the implementation is not trivial. | 386 | Seems simple, but the implementation is not trivial. |
387 | 387 | ||
diff --git a/Documentation/usb/bulk-streams.txt b/Documentation/usb/bulk-streams.txt new file mode 100644 index 000000000000..ffc02021863e --- /dev/null +++ b/Documentation/usb/bulk-streams.txt | |||
@@ -0,0 +1,78 @@ | |||
1 | Background | ||
2 | ========== | ||
3 | |||
4 | Bulk endpoint streams were added in the USB 3.0 specification. Streams allow a | ||
5 | device driver to overload a bulk endpoint so that multiple transfers can be | ||
6 | queued at once. | ||
7 | |||
8 | Streams are defined in sections 4.4.6.4 and 8.12.1.4 of the Universal Serial Bus | ||
9 | 3.0 specification at http://www.usb.org/developers/docs/ The USB Attached SCSI | ||
10 | Protocol, which uses streams to queue multiple SCSI commands, can be found on | ||
11 | the T10 website (http://t10.org/). | ||
12 | |||
13 | |||
14 | Device-side implications | ||
15 | ======================== | ||
16 | |||
17 | Once a buffer has been queued to a stream ring, the device is notified (through | ||
18 | an out-of-band mechanism on another endpoint) that data is ready for that stream | ||
19 | ID. The device then tells the host which "stream" it wants to start. The host | ||
20 | can also initiate a transfer on a stream without the device asking, but the | ||
21 | device can refuse that transfer. Devices can switch between streams at any | ||
22 | time. | ||
23 | |||
24 | |||
25 | Driver implications | ||
26 | =================== | ||
27 | |||
28 | int usb_alloc_streams(struct usb_interface *interface, | ||
29 | struct usb_host_endpoint **eps, unsigned int num_eps, | ||
30 | unsigned int num_streams, gfp_t mem_flags); | ||
31 | |||
32 | Device drivers will call this API to request that the host controller driver | ||
33 | allocate memory so the driver can use up to num_streams stream IDs. They must | ||
34 | pass an array of usb_host_endpoints that need to be setup with similar stream | ||
35 | IDs. This is to ensure that a UASP driver will be able to use the same stream | ||
36 | ID for the bulk IN and OUT endpoints used in a Bi-directional command sequence. | ||
37 | |||
38 | The return value is an error condition (if one of the endpoints doesn't support | ||
39 | streams, or the xHCI driver ran out of memory), or the number of streams the | ||
40 | host controller allocated for this endpoint. The xHCI host controller hardware | ||
41 | declares how many stream IDs it can support, and each bulk endpoint on a | ||
42 | SuperSpeed device will say how many stream IDs it can handle. Therefore, | ||
43 | drivers should be able to deal with being allocated less stream IDs than they | ||
44 | requested. | ||
45 | |||
46 | Do NOT call this function if you have URBs enqueued for any of the endpoints | ||
47 | passed in as arguments. Do not call this function to request less than two | ||
48 | streams. | ||
49 | |||
50 | Drivers will only be allowed to call this API once for the same endpoint | ||
51 | without calling usb_free_streams(). This is a simplification for the xHCI host | ||
52 | controller driver, and may change in the future. | ||
53 | |||
54 | |||
55 | Picking new Stream IDs to use | ||
56 | ============================ | ||
57 | |||
58 | Stream ID 0 is reserved, and should not be used to communicate with devices. If | ||
59 | usb_alloc_streams() returns with a value of N, you may use streams 1 though N. | ||
60 | To queue an URB for a specific stream, set the urb->stream_id value. If the | ||
61 | endpoint does not support streams, an error will be returned. | ||
62 | |||
63 | Note that new API to choose the next stream ID will have to be added if the xHCI | ||
64 | driver supports secondary stream IDs. | ||
65 | |||
66 | |||
67 | Clean up | ||
68 | ======== | ||
69 | |||
70 | If a driver wishes to stop using streams to communicate with the device, it | ||
71 | should call | ||
72 | |||
73 | void usb_free_streams(struct usb_interface *interface, | ||
74 | struct usb_host_endpoint **eps, unsigned int num_eps, | ||
75 | gfp_t mem_flags); | ||
76 | |||
77 | All stream IDs will be deallocated when the driver releases the interface, to | ||
78 | ensure that drivers that don't support streams will be able to use the endpoint. | ||
diff --git a/Documentation/usb/dma.txt b/Documentation/usb/dma.txt index cfdcd16e3abf..84ef865237db 100644 --- a/Documentation/usb/dma.txt +++ b/Documentation/usb/dma.txt | |||
@@ -16,11 +16,11 @@ OR: they can now be DMA-aware. | |||
16 | manage dma mappings for existing dma-ready buffers (see below). | 16 | manage dma mappings for existing dma-ready buffers (see below). |
17 | 17 | ||
18 | - URBs have an additional "transfer_dma" field, as well as a transfer_flags | 18 | - URBs have an additional "transfer_dma" field, as well as a transfer_flags |
19 | bit saying if it's valid. (Control requests also have "setup_dma" and a | 19 | bit saying if it's valid. (Control requests also have "setup_dma", but |
20 | corresponding transfer_flags bit.) | 20 | drivers must not use it.) |
21 | 21 | ||
22 | - "usbcore" will map those DMA addresses, if a DMA-aware driver didn't do | 22 | - "usbcore" will map this DMA address, if a DMA-aware driver didn't do |
23 | it first and set URB_NO_TRANSFER_DMA_MAP or URB_NO_SETUP_DMA_MAP. HCDs | 23 | it first and set URB_NO_TRANSFER_DMA_MAP. HCDs |
24 | don't manage dma mappings for URBs. | 24 | don't manage dma mappings for URBs. |
25 | 25 | ||
26 | - There's a new "generic DMA API", parts of which are usable by USB device | 26 | - There's a new "generic DMA API", parts of which are usable by USB device |
@@ -43,22 +43,16 @@ and effects like cache-trashing can impose subtle penalties. | |||
43 | kind of addresses to store in urb->transfer_buffer and urb->transfer_dma. | 43 | kind of addresses to store in urb->transfer_buffer and urb->transfer_dma. |
44 | You'd also set URB_NO_TRANSFER_DMA_MAP in urb->transfer_flags: | 44 | You'd also set URB_NO_TRANSFER_DMA_MAP in urb->transfer_flags: |
45 | 45 | ||
46 | void *usb_buffer_alloc (struct usb_device *dev, size_t size, | 46 | void *usb_alloc_coherent (struct usb_device *dev, size_t size, |
47 | int mem_flags, dma_addr_t *dma); | 47 | int mem_flags, dma_addr_t *dma); |
48 | 48 | ||
49 | void usb_buffer_free (struct usb_device *dev, size_t size, | 49 | void usb_free_coherent (struct usb_device *dev, size_t size, |
50 | void *addr, dma_addr_t dma); | 50 | void *addr, dma_addr_t dma); |
51 | 51 | ||
52 | Most drivers should *NOT* be using these primitives; they don't need | 52 | Most drivers should *NOT* be using these primitives; they don't need |
53 | to use this type of memory ("dma-coherent"), and memory returned from | 53 | to use this type of memory ("dma-coherent"), and memory returned from |
54 | kmalloc() will work just fine. | 54 | kmalloc() will work just fine. |
55 | 55 | ||
56 | For control transfers you can use the buffer primitives or not for each | ||
57 | of the transfer buffer and setup buffer independently. Set the flag bits | ||
58 | URB_NO_TRANSFER_DMA_MAP and URB_NO_SETUP_DMA_MAP to indicate which | ||
59 | buffers you have prepared. For non-control transfers URB_NO_SETUP_DMA_MAP | ||
60 | is ignored. | ||
61 | |||
62 | The memory buffer returned is "dma-coherent"; sometimes you might need to | 56 | The memory buffer returned is "dma-coherent"; sometimes you might need to |
63 | force a consistent memory access ordering by using memory barriers. It's | 57 | force a consistent memory access ordering by using memory barriers. It's |
64 | not using a streaming DMA mapping, so it's good for small transfers on | 58 | not using a streaming DMA mapping, so it's good for small transfers on |
@@ -130,8 +124,8 @@ of Documentation/PCI/PCI-DMA-mapping.txt, titled "What memory is DMA-able?") | |||
130 | void usb_buffer_unmap (struct urb *urb); | 124 | void usb_buffer_unmap (struct urb *urb); |
131 | 125 | ||
132 | The calls manage urb->transfer_dma for you, and set URB_NO_TRANSFER_DMA_MAP | 126 | The calls manage urb->transfer_dma for you, and set URB_NO_TRANSFER_DMA_MAP |
133 | so that usbcore won't map or unmap the buffer. The same goes for | 127 | so that usbcore won't map or unmap the buffer. They cannot be used for |
134 | urb->setup_dma and URB_NO_SETUP_DMA_MAP for control requests. | 128 | setup_packet buffers in control requests. |
135 | 129 | ||
136 | Note that several of those interfaces are currently commented out, since | 130 | Note that several of those interfaces are currently commented out, since |
137 | they don't have current users. See the source code. Other than the dmasync | 131 | they don't have current users. See the source code. Other than the dmasync |
diff --git a/Documentation/usb/error-codes.txt b/Documentation/usb/error-codes.txt index 9cf83e8c27b8..d83703ea74b2 100644 --- a/Documentation/usb/error-codes.txt +++ b/Documentation/usb/error-codes.txt | |||
@@ -41,8 +41,8 @@ USB-specific: | |||
41 | 41 | ||
42 | -EFBIG Host controller driver can't schedule that many ISO frames. | 42 | -EFBIG Host controller driver can't schedule that many ISO frames. |
43 | 43 | ||
44 | -EPIPE Specified endpoint is stalled. For non-control endpoints, | 44 | -EPIPE The pipe type specified in the URB doesn't match the |
45 | reset this status with usb_clear_halt(). | 45 | endpoint's actual type. |
46 | 46 | ||
47 | -EMSGSIZE (a) endpoint maxpacket size is zero; it is not usable | 47 | -EMSGSIZE (a) endpoint maxpacket size is zero; it is not usable |
48 | in the current interface altsetting. | 48 | in the current interface altsetting. |
@@ -60,6 +60,8 @@ USB-specific: | |||
60 | 60 | ||
61 | -EHOSTUNREACH URB was rejected because the device is suspended. | 61 | -EHOSTUNREACH URB was rejected because the device is suspended. |
62 | 62 | ||
63 | -ENOEXEC A control URB doesn't contain a Setup packet. | ||
64 | |||
63 | 65 | ||
64 | ************************************************************************** | 66 | ************************************************************************** |
65 | * Error codes returned by in urb->status * | 67 | * Error codes returned by in urb->status * |
diff --git a/Documentation/usb/gadget_hid.txt b/Documentation/usb/gadget_hid.txt new file mode 100644 index 000000000000..f4a51f567427 --- /dev/null +++ b/Documentation/usb/gadget_hid.txt | |||
@@ -0,0 +1,445 @@ | |||
1 | |||
2 | Linux USB HID gadget driver | ||
3 | |||
4 | Introduction | ||
5 | |||
6 | The HID Gadget driver provides emulation of USB Human Interface | ||
7 | Devices (HID). The basic HID handling is done in the kernel, | ||
8 | and HID reports can be sent/received through I/O on the | ||
9 | /dev/hidgX character devices. | ||
10 | |||
11 | For more details about HID, see the developer page on | ||
12 | http://www.usb.org/developers/hidpage/ | ||
13 | |||
14 | Configuration | ||
15 | |||
16 | g_hid is a platform driver, so to use it you need to add | ||
17 | struct platform_device(s) to your platform code defining the | ||
18 | HID function descriptors you want to use - E.G. something | ||
19 | like: | ||
20 | |||
21 | #include <linux/platform_device.h> | ||
22 | #include <linux/usb/g_hid.h> | ||
23 | |||
24 | /* hid descriptor for a keyboard */ | ||
25 | static struct hidg_func_descriptor my_hid_data = { | ||
26 | .subclass = 0, /* No subclass */ | ||
27 | .protocol = 1, /* Keyboard */ | ||
28 | .report_length = 8, | ||
29 | .report_desc_length = 63, | ||
30 | .report_desc = { | ||
31 | 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ | ||
32 | 0x09, 0x06, /* USAGE (Keyboard) */ | ||
33 | 0xa1, 0x01, /* COLLECTION (Application) */ | ||
34 | 0x05, 0x07, /* USAGE_PAGE (Keyboard) */ | ||
35 | 0x19, 0xe0, /* USAGE_MINIMUM (Keyboard LeftControl) */ | ||
36 | 0x29, 0xe7, /* USAGE_MAXIMUM (Keyboard Right GUI) */ | ||
37 | 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ | ||
38 | 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ | ||
39 | 0x75, 0x01, /* REPORT_SIZE (1) */ | ||
40 | 0x95, 0x08, /* REPORT_COUNT (8) */ | ||
41 | 0x81, 0x02, /* INPUT (Data,Var,Abs) */ | ||
42 | 0x95, 0x01, /* REPORT_COUNT (1) */ | ||
43 | 0x75, 0x08, /* REPORT_SIZE (8) */ | ||
44 | 0x81, 0x03, /* INPUT (Cnst,Var,Abs) */ | ||
45 | 0x95, 0x05, /* REPORT_COUNT (5) */ | ||
46 | 0x75, 0x01, /* REPORT_SIZE (1) */ | ||
47 | 0x05, 0x08, /* USAGE_PAGE (LEDs) */ | ||
48 | 0x19, 0x01, /* USAGE_MINIMUM (Num Lock) */ | ||
49 | 0x29, 0x05, /* USAGE_MAXIMUM (Kana) */ | ||
50 | 0x91, 0x02, /* OUTPUT (Data,Var,Abs) */ | ||
51 | 0x95, 0x01, /* REPORT_COUNT (1) */ | ||
52 | 0x75, 0x03, /* REPORT_SIZE (3) */ | ||
53 | 0x91, 0x03, /* OUTPUT (Cnst,Var,Abs) */ | ||
54 | 0x95, 0x06, /* REPORT_COUNT (6) */ | ||
55 | 0x75, 0x08, /* REPORT_SIZE (8) */ | ||
56 | 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ | ||
57 | 0x25, 0x65, /* LOGICAL_MAXIMUM (101) */ | ||
58 | 0x05, 0x07, /* USAGE_PAGE (Keyboard) */ | ||
59 | 0x19, 0x00, /* USAGE_MINIMUM (Reserved) */ | ||
60 | 0x29, 0x65, /* USAGE_MAXIMUM (Keyboard Application) */ | ||
61 | 0x81, 0x00, /* INPUT (Data,Ary,Abs) */ | ||
62 | 0xc0 /* END_COLLECTION */ | ||
63 | } | ||
64 | }; | ||
65 | |||
66 | static struct platform_device my_hid = { | ||
67 | .name = "hidg", | ||
68 | .id = 0, | ||
69 | .num_resources = 0, | ||
70 | .resource = 0, | ||
71 | .dev.platform_data = &my_hid_data, | ||
72 | }; | ||
73 | |||
74 | You can add as many HID functions as you want, only limited by | ||
75 | the amount of interrupt endpoints your gadget driver supports. | ||
76 | |||
77 | Send and receive HID reports | ||
78 | |||
79 | HID reports can be sent/received using read/write on the | ||
80 | /dev/hidgX character devices. See below for an example program | ||
81 | to do this. | ||
82 | |||
83 | hid_gadget_test is a small interactive program to test the HID | ||
84 | gadget driver. To use, point it at a hidg device and set the | ||
85 | device type (keyboard / mouse / joystick) - E.G.: | ||
86 | |||
87 | # hid_gadget_test /dev/hidg0 keyboard | ||
88 | |||
89 | You are now in the prompt of hid_gadget_test. You can type any | ||
90 | combination of options and values. Available options and | ||
91 | values are listed at program start. In keyboard mode you can | ||
92 | send up to six values. | ||
93 | |||
94 | For example type: g i s t r --left-shift | ||
95 | |||
96 | Hit return and the corresponding report will be sent by the | ||
97 | HID gadget. | ||
98 | |||
99 | Another interesting example is the caps lock test. Type | ||
100 | -–caps-lock and hit return. A report is then sent by the | ||
101 | gadget and you should receive the host answer, corresponding | ||
102 | to the caps lock LED status. | ||
103 | |||
104 | --caps-lock | ||
105 | recv report:2 | ||
106 | |||
107 | With this command: | ||
108 | |||
109 | # hid_gadget_test /dev/hidg1 mouse | ||
110 | |||
111 | You can test the mouse emulation. Values are two signed numbers. | ||
112 | |||
113 | |||
114 | Sample code | ||
115 | |||
116 | /* hid_gadget_test */ | ||
117 | |||
118 | #include <pthread.h> | ||
119 | #include <string.h> | ||
120 | #include <stdio.h> | ||
121 | #include <ctype.h> | ||
122 | #include <fcntl.h> | ||
123 | #include <errno.h> | ||
124 | #include <stdio.h> | ||
125 | #include <stdlib.h> | ||
126 | #include <unistd.h> | ||
127 | |||
128 | #define BUF_LEN 512 | ||
129 | |||
130 | struct options { | ||
131 | const char *opt; | ||
132 | unsigned char val; | ||
133 | }; | ||
134 | |||
135 | static struct options kmod[] = { | ||
136 | {.opt = "--left-ctrl", .val = 0x01}, | ||
137 | {.opt = "--right-ctrl", .val = 0x10}, | ||
138 | {.opt = "--left-shift", .val = 0x02}, | ||
139 | {.opt = "--right-shift", .val = 0x20}, | ||
140 | {.opt = "--left-alt", .val = 0x04}, | ||
141 | {.opt = "--right-alt", .val = 0x40}, | ||
142 | {.opt = "--left-meta", .val = 0x08}, | ||
143 | {.opt = "--right-meta", .val = 0x80}, | ||
144 | {.opt = NULL} | ||
145 | }; | ||
146 | |||
147 | static struct options kval[] = { | ||
148 | {.opt = "--return", .val = 0x28}, | ||
149 | {.opt = "--esc", .val = 0x29}, | ||
150 | {.opt = "--bckspc", .val = 0x2a}, | ||
151 | {.opt = "--tab", .val = 0x2b}, | ||
152 | {.opt = "--spacebar", .val = 0x2c}, | ||
153 | {.opt = "--caps-lock", .val = 0x39}, | ||
154 | {.opt = "--f1", .val = 0x3a}, | ||
155 | {.opt = "--f2", .val = 0x3b}, | ||
156 | {.opt = "--f3", .val = 0x3c}, | ||
157 | {.opt = "--f4", .val = 0x3d}, | ||
158 | {.opt = "--f5", .val = 0x3e}, | ||
159 | {.opt = "--f6", .val = 0x3f}, | ||
160 | {.opt = "--f7", .val = 0x40}, | ||
161 | {.opt = "--f8", .val = 0x41}, | ||
162 | {.opt = "--f9", .val = 0x42}, | ||
163 | {.opt = "--f10", .val = 0x43}, | ||
164 | {.opt = "--f11", .val = 0x44}, | ||
165 | {.opt = "--f12", .val = 0x45}, | ||
166 | {.opt = "--insert", .val = 0x49}, | ||
167 | {.opt = "--home", .val = 0x4a}, | ||
168 | {.opt = "--pageup", .val = 0x4b}, | ||
169 | {.opt = "--del", .val = 0x4c}, | ||
170 | {.opt = "--end", .val = 0x4d}, | ||
171 | {.opt = "--pagedown", .val = 0x4e}, | ||
172 | {.opt = "--right", .val = 0x4f}, | ||
173 | {.opt = "--left", .val = 0x50}, | ||
174 | {.opt = "--down", .val = 0x51}, | ||
175 | {.opt = "--kp-enter", .val = 0x58}, | ||
176 | {.opt = "--up", .val = 0x52}, | ||
177 | {.opt = "--num-lock", .val = 0x53}, | ||
178 | {.opt = NULL} | ||
179 | }; | ||
180 | |||
181 | int keyboard_fill_report(char report[8], char buf[BUF_LEN], int *hold) | ||
182 | { | ||
183 | char *tok = strtok(buf, " "); | ||
184 | int key = 0; | ||
185 | int i = 0; | ||
186 | |||
187 | for (; tok != NULL; tok = strtok(NULL, " ")) { | ||
188 | |||
189 | if (strcmp(tok, "--quit") == 0) | ||
190 | return -1; | ||
191 | |||
192 | if (strcmp(tok, "--hold") == 0) { | ||
193 | *hold = 1; | ||
194 | continue; | ||
195 | } | ||
196 | |||
197 | if (key < 6) { | ||
198 | for (i = 0; kval[i].opt != NULL; i++) | ||
199 | if (strcmp(tok, kval[i].opt) == 0) { | ||
200 | report[2 + key++] = kval[i].val; | ||
201 | break; | ||
202 | } | ||
203 | if (kval[i].opt != NULL) | ||
204 | continue; | ||
205 | } | ||
206 | |||
207 | if (key < 6) | ||
208 | if (islower(tok[0])) { | ||
209 | report[2 + key++] = (tok[0] - ('a' - 0x04)); | ||
210 | continue; | ||
211 | } | ||
212 | |||
213 | for (i = 0; kmod[i].opt != NULL; i++) | ||
214 | if (strcmp(tok, kmod[i].opt) == 0) { | ||
215 | report[0] = report[0] | kmod[i].val; | ||
216 | break; | ||
217 | } | ||
218 | if (kmod[i].opt != NULL) | ||
219 | continue; | ||
220 | |||
221 | if (key < 6) | ||
222 | fprintf(stderr, "unknown option: %s\n", tok); | ||
223 | } | ||
224 | return 8; | ||
225 | } | ||
226 | |||
227 | static struct options mmod[] = { | ||
228 | {.opt = "--b1", .val = 0x01}, | ||
229 | {.opt = "--b2", .val = 0x02}, | ||
230 | {.opt = "--b3", .val = 0x04}, | ||
231 | {.opt = NULL} | ||
232 | }; | ||
233 | |||
234 | int mouse_fill_report(char report[8], char buf[BUF_LEN], int *hold) | ||
235 | { | ||
236 | char *tok = strtok(buf, " "); | ||
237 | int mvt = 0; | ||
238 | int i = 0; | ||
239 | for (; tok != NULL; tok = strtok(NULL, " ")) { | ||
240 | |||
241 | if (strcmp(tok, "--quit") == 0) | ||
242 | return -1; | ||
243 | |||
244 | if (strcmp(tok, "--hold") == 0) { | ||
245 | *hold = 1; | ||
246 | continue; | ||
247 | } | ||
248 | |||
249 | for (i = 0; mmod[i].opt != NULL; i++) | ||
250 | if (strcmp(tok, mmod[i].opt) == 0) { | ||
251 | report[0] = report[0] | mmod[i].val; | ||
252 | break; | ||
253 | } | ||
254 | if (mmod[i].opt != NULL) | ||
255 | continue; | ||
256 | |||
257 | if (!(tok[0] == '-' && tok[1] == '-') && mvt < 2) { | ||
258 | errno = 0; | ||
259 | report[1 + mvt++] = (char)strtol(tok, NULL, 0); | ||
260 | if (errno != 0) { | ||
261 | fprintf(stderr, "Bad value:'%s'\n", tok); | ||
262 | report[1 + mvt--] = 0; | ||
263 | } | ||
264 | continue; | ||
265 | } | ||
266 | |||
267 | fprintf(stderr, "unknown option: %s\n", tok); | ||
268 | } | ||
269 | return 3; | ||
270 | } | ||
271 | |||
272 | static struct options jmod[] = { | ||
273 | {.opt = "--b1", .val = 0x10}, | ||
274 | {.opt = "--b2", .val = 0x20}, | ||
275 | {.opt = "--b3", .val = 0x40}, | ||
276 | {.opt = "--b4", .val = 0x80}, | ||
277 | {.opt = "--hat1", .val = 0x00}, | ||
278 | {.opt = "--hat2", .val = 0x01}, | ||
279 | {.opt = "--hat3", .val = 0x02}, | ||
280 | {.opt = "--hat4", .val = 0x03}, | ||
281 | {.opt = "--hatneutral", .val = 0x04}, | ||
282 | {.opt = NULL} | ||
283 | }; | ||
284 | |||
285 | int joystick_fill_report(char report[8], char buf[BUF_LEN], int *hold) | ||
286 | { | ||
287 | char *tok = strtok(buf, " "); | ||
288 | int mvt = 0; | ||
289 | int i = 0; | ||
290 | |||
291 | *hold = 1; | ||
292 | |||
293 | /* set default hat position: neutral */ | ||
294 | report[3] = 0x04; | ||
295 | |||
296 | for (; tok != NULL; tok = strtok(NULL, " ")) { | ||
297 | |||
298 | if (strcmp(tok, "--quit") == 0) | ||
299 | return -1; | ||
300 | |||
301 | for (i = 0; jmod[i].opt != NULL; i++) | ||
302 | if (strcmp(tok, jmod[i].opt) == 0) { | ||
303 | report[3] = (report[3] & 0xF0) | jmod[i].val; | ||
304 | break; | ||
305 | } | ||
306 | if (jmod[i].opt != NULL) | ||
307 | continue; | ||
308 | |||
309 | if (!(tok[0] == '-' && tok[1] == '-') && mvt < 3) { | ||
310 | errno = 0; | ||
311 | report[mvt++] = (char)strtol(tok, NULL, 0); | ||
312 | if (errno != 0) { | ||
313 | fprintf(stderr, "Bad value:'%s'\n", tok); | ||
314 | report[mvt--] = 0; | ||
315 | } | ||
316 | continue; | ||
317 | } | ||
318 | |||
319 | fprintf(stderr, "unknown option: %s\n", tok); | ||
320 | } | ||
321 | return 4; | ||
322 | } | ||
323 | |||
324 | void print_options(char c) | ||
325 | { | ||
326 | int i = 0; | ||
327 | |||
328 | if (c == 'k') { | ||
329 | printf(" keyboard options:\n" | ||
330 | " --hold\n"); | ||
331 | for (i = 0; kmod[i].opt != NULL; i++) | ||
332 | printf("\t\t%s\n", kmod[i].opt); | ||
333 | printf("\n keyboard values:\n" | ||
334 | " [a-z] or\n"); | ||
335 | for (i = 0; kval[i].opt != NULL; i++) | ||
336 | printf("\t\t%-8s%s", kval[i].opt, i % 2 ? "\n" : ""); | ||
337 | printf("\n"); | ||
338 | } else if (c == 'm') { | ||
339 | printf(" mouse options:\n" | ||
340 | " --hold\n"); | ||
341 | for (i = 0; mmod[i].opt != NULL; i++) | ||
342 | printf("\t\t%s\n", mmod[i].opt); | ||
343 | printf("\n mouse values:\n" | ||
344 | " Two signed numbers\n" | ||
345 | "--quit to close\n"); | ||
346 | } else { | ||
347 | printf(" joystick options:\n"); | ||
348 | for (i = 0; jmod[i].opt != NULL; i++) | ||
349 | printf("\t\t%s\n", jmod[i].opt); | ||
350 | printf("\n joystick values:\n" | ||
351 | " three signed numbers\n" | ||
352 | "--quit to close\n"); | ||
353 | } | ||
354 | } | ||
355 | |||
356 | int main(int argc, const char *argv[]) | ||
357 | { | ||
358 | const char *filename = NULL; | ||
359 | int fd = 0; | ||
360 | char buf[BUF_LEN]; | ||
361 | int cmd_len; | ||
362 | char report[8]; | ||
363 | int to_send = 8; | ||
364 | int hold = 0; | ||
365 | fd_set rfds; | ||
366 | int retval, i; | ||
367 | |||
368 | if (argc < 3) { | ||
369 | fprintf(stderr, "Usage: %s devname mouse|keyboard|joystick\n", | ||
370 | argv[0]); | ||
371 | return 1; | ||
372 | } | ||
373 | |||
374 | if (argv[2][0] != 'k' && argv[2][0] != 'm' && argv[2][0] != 'j') | ||
375 | return 2; | ||
376 | |||
377 | filename = argv[1]; | ||
378 | |||
379 | if ((fd = open(filename, O_RDWR, 0666)) == -1) { | ||
380 | perror(filename); | ||
381 | return 3; | ||
382 | } | ||
383 | |||
384 | print_options(argv[2][0]); | ||
385 | |||
386 | while (42) { | ||
387 | |||
388 | FD_ZERO(&rfds); | ||
389 | FD_SET(STDIN_FILENO, &rfds); | ||
390 | FD_SET(fd, &rfds); | ||
391 | |||
392 | retval = select(fd + 1, &rfds, NULL, NULL, NULL); | ||
393 | if (retval == -1 && errno == EINTR) | ||
394 | continue; | ||
395 | if (retval < 0) { | ||
396 | perror("select()"); | ||
397 | return 4; | ||
398 | } | ||
399 | |||
400 | if (FD_ISSET(fd, &rfds)) { | ||
401 | cmd_len = read(fd, buf, BUF_LEN - 1); | ||
402 | printf("recv report:"); | ||
403 | for (i = 0; i < cmd_len; i++) | ||
404 | printf(" %02x", buf[i]); | ||
405 | printf("\n"); | ||
406 | } | ||
407 | |||
408 | if (FD_ISSET(STDIN_FILENO, &rfds)) { | ||
409 | memset(report, 0x0, sizeof(report)); | ||
410 | cmd_len = read(STDIN_FILENO, buf, BUF_LEN - 1); | ||
411 | |||
412 | if (cmd_len == 0) | ||
413 | break; | ||
414 | |||
415 | buf[cmd_len - 1] = '\0'; | ||
416 | hold = 0; | ||
417 | |||
418 | memset(report, 0x0, sizeof(report)); | ||
419 | if (argv[2][0] == 'k') | ||
420 | to_send = keyboard_fill_report(report, buf, &hold); | ||
421 | else if (argv[2][0] == 'm') | ||
422 | to_send = mouse_fill_report(report, buf, &hold); | ||
423 | else | ||
424 | to_send = joystick_fill_report(report, buf, &hold); | ||
425 | |||
426 | if (to_send == -1) | ||
427 | break; | ||
428 | |||
429 | if (write(fd, report, to_send) != to_send) { | ||
430 | perror(filename); | ||
431 | return 5; | ||
432 | } | ||
433 | if (!hold) { | ||
434 | memset(report, 0x0, sizeof(report)); | ||
435 | if (write(fd, report, to_send) != to_send) { | ||
436 | perror(filename); | ||
437 | return 6; | ||
438 | } | ||
439 | } | ||
440 | } | ||
441 | } | ||
442 | |||
443 | close(fd); | ||
444 | return 0; | ||
445 | } | ||
diff --git a/Documentation/usb/power-management.txt b/Documentation/usb/power-management.txt index 3bf6818c8cf5..b29d8e56cf28 100644 --- a/Documentation/usb/power-management.txt +++ b/Documentation/usb/power-management.txt | |||
@@ -2,7 +2,7 @@ | |||
2 | 2 | ||
3 | Alan Stern <stern@rowland.harvard.edu> | 3 | Alan Stern <stern@rowland.harvard.edu> |
4 | 4 | ||
5 | November 10, 2009 | 5 | December 11, 2009 |
6 | 6 | ||
7 | 7 | ||
8 | 8 | ||
@@ -29,9 +29,9 @@ covered to some extent (see Documentation/power/*.txt for more | |||
29 | information about system PM). | 29 | information about system PM). |
30 | 30 | ||
31 | Note: Dynamic PM support for USB is present only if the kernel was | 31 | Note: Dynamic PM support for USB is present only if the kernel was |
32 | built with CONFIG_USB_SUSPEND enabled. System PM support is present | 32 | built with CONFIG_USB_SUSPEND enabled (which depends on |
33 | only if the kernel was built with CONFIG_SUSPEND or CONFIG_HIBERNATION | 33 | CONFIG_PM_RUNTIME). System PM support is present only if the kernel |
34 | enabled. | 34 | was built with CONFIG_SUSPEND or CONFIG_HIBERNATION enabled. |
35 | 35 | ||
36 | 36 | ||
37 | What is Remote Wakeup? | 37 | What is Remote Wakeup? |
@@ -107,7 +107,9 @@ allowed to issue dynamic suspends. | |||
107 | The user interface for controlling dynamic PM is located in the power/ | 107 | The user interface for controlling dynamic PM is located in the power/ |
108 | subdirectory of each USB device's sysfs directory, that is, in | 108 | subdirectory of each USB device's sysfs directory, that is, in |
109 | /sys/bus/usb/devices/.../power/ where "..." is the device's ID. The | 109 | /sys/bus/usb/devices/.../power/ where "..." is the device's ID. The |
110 | relevant attribute files are: wakeup, level, and autosuspend. | 110 | relevant attribute files are: wakeup, control, and autosuspend. |
111 | (There may also be a file named "level"; this file was deprecated | ||
112 | as of the 2.6.35 kernel and replaced by the "control" file.) | ||
111 | 113 | ||
112 | power/wakeup | 114 | power/wakeup |
113 | 115 | ||
@@ -120,7 +122,7 @@ relevant attribute files are: wakeup, level, and autosuspend. | |||
120 | while the device is suspended, the change won't take | 122 | while the device is suspended, the change won't take |
121 | effect until the following suspend.) | 123 | effect until the following suspend.) |
122 | 124 | ||
123 | power/level | 125 | power/control |
124 | 126 | ||
125 | This file contains one of two words: "on" or "auto". | 127 | This file contains one of two words: "on" or "auto". |
126 | You can write those words to the file to change the | 128 | You can write those words to the file to change the |
@@ -148,14 +150,15 @@ relevant attribute files are: wakeup, level, and autosuspend. | |||
148 | never to autosuspend. You can write a number to the | 150 | never to autosuspend. You can write a number to the |
149 | file to change the autosuspend idle-delay time. | 151 | file to change the autosuspend idle-delay time. |
150 | 152 | ||
151 | Writing "-1" to power/autosuspend and writing "on" to power/level do | 153 | Writing "-1" to power/autosuspend and writing "on" to power/control do |
152 | essentially the same thing -- they both prevent the device from being | 154 | essentially the same thing -- they both prevent the device from being |
153 | autosuspended. Yes, this is a redundancy in the API. | 155 | autosuspended. Yes, this is a redundancy in the API. |
154 | 156 | ||
155 | (In 2.6.21 writing "0" to power/autosuspend would prevent the device | 157 | (In 2.6.21 writing "0" to power/autosuspend would prevent the device |
156 | from being autosuspended; the behavior was changed in 2.6.22. The | 158 | from being autosuspended; the behavior was changed in 2.6.22. The |
157 | power/autosuspend attribute did not exist prior to 2.6.21, and the | 159 | power/autosuspend attribute did not exist prior to 2.6.21, and the |
158 | power/level attribute did not exist prior to 2.6.22.) | 160 | power/level attribute did not exist prior to 2.6.22. power/control |
161 | was added in 2.6.34.) | ||
159 | 162 | ||
160 | 163 | ||
161 | Changing the default idle-delay time | 164 | Changing the default idle-delay time |
@@ -212,7 +215,7 @@ among printers and scanners, but plenty of other types of device have | |||
212 | the same deficiency. | 215 | the same deficiency. |
213 | 216 | ||
214 | For this reason, by default the kernel disables autosuspend (the | 217 | For this reason, by default the kernel disables autosuspend (the |
215 | power/level attribute is initialized to "on") for all devices other | 218 | power/control attribute is initialized to "on") for all devices other |
216 | than hubs. Hubs, at least, appear to be reasonably well-behaved in | 219 | than hubs. Hubs, at least, appear to be reasonably well-behaved in |
217 | this regard. | 220 | this regard. |
218 | 221 | ||
@@ -229,6 +232,11 @@ necessary operations by hand or add them to a udev script. You can | |||
229 | also change the idle-delay time; 2 seconds is not the best choice for | 232 | also change the idle-delay time; 2 seconds is not the best choice for |
230 | every device. | 233 | every device. |
231 | 234 | ||
235 | If a driver knows that its device has proper suspend/resume support, | ||
236 | it can enable autosuspend all by itself. For example, the video | ||
237 | driver for a laptop's webcam might do this, since these devices are | ||
238 | rarely used and so should normally be autosuspended. | ||
239 | |||
232 | Sometimes it turns out that even when a device does work okay with | 240 | Sometimes it turns out that even when a device does work okay with |
233 | autosuspend there are still problems. For example, there are | 241 | autosuspend there are still problems. For example, there are |
234 | experimental patches adding autosuspend support to the usbhid driver, | 242 | experimental patches adding autosuspend support to the usbhid driver, |
@@ -321,69 +329,81 @@ driver does so by calling these six functions: | |||
321 | void usb_autopm_get_interface_no_resume(struct usb_interface *intf); | 329 | void usb_autopm_get_interface_no_resume(struct usb_interface *intf); |
322 | void usb_autopm_put_interface_no_suspend(struct usb_interface *intf); | 330 | void usb_autopm_put_interface_no_suspend(struct usb_interface *intf); |
323 | 331 | ||
324 | The functions work by maintaining a counter in the usb_interface | 332 | The functions work by maintaining a usage counter in the |
325 | structure. When intf->pm_usage_count is > 0 then the interface is | 333 | usb_interface's embedded device structure. When the counter is > 0 |
326 | deemed to be busy, and the kernel will not autosuspend the interface's | 334 | then the interface is deemed to be busy, and the kernel will not |
327 | device. When intf->pm_usage_count is <= 0 then the interface is | 335 | autosuspend the interface's device. When the usage counter is = 0 |
328 | considered to be idle, and the kernel may autosuspend the device. | 336 | then the interface is considered to be idle, and the kernel may |
337 | autosuspend the device. | ||
329 | 338 | ||
330 | (There is a similar pm_usage_count field in struct usb_device, | 339 | (There is a similar usage counter field in struct usb_device, |
331 | associated with the device itself rather than any of its interfaces. | 340 | associated with the device itself rather than any of its interfaces. |
332 | This field is used only by the USB core.) | 341 | This counter is used only by the USB core.) |
333 | 342 | ||
334 | Drivers must not modify intf->pm_usage_count directly; its value | 343 | Drivers need not be concerned about balancing changes to the usage |
335 | should be changed only be using the functions listed above. Drivers | 344 | counter; the USB core will undo any remaining "get"s when a driver |
336 | are responsible for insuring that the overall change to pm_usage_count | 345 | is unbound from its interface. As a corollary, drivers must not call |
337 | during their lifetime balances out to 0 (it may be necessary for the | 346 | any of the usb_autopm_* functions after their diconnect() routine has |
338 | disconnect method to call usb_autopm_put_interface() one or more times | 347 | returned. |
339 | to fulfill this requirement). The first two routines use the PM mutex | 348 | |
340 | in struct usb_device for mutual exclusion; drivers using the async | 349 | Drivers using the async routines are responsible for their own |
341 | routines are responsible for their own synchronization and mutual | 350 | synchronization and mutual exclusion. |
342 | exclusion. | 351 | |
343 | 352 | usb_autopm_get_interface() increments the usage counter and | |
344 | usb_autopm_get_interface() increments pm_usage_count and | 353 | does an autoresume if the device is suspended. If the |
345 | attempts an autoresume if the new value is > 0 and the | 354 | autoresume fails, the counter is decremented back. |
346 | device is suspended. | 355 | |
347 | 356 | usb_autopm_put_interface() decrements the usage counter and | |
348 | usb_autopm_put_interface() decrements pm_usage_count and | 357 | attempts an autosuspend if the new value is = 0. |
349 | attempts an autosuspend if the new value is <= 0 and the | ||
350 | device isn't suspended. | ||
351 | 358 | ||
352 | usb_autopm_get_interface_async() and | 359 | usb_autopm_get_interface_async() and |
353 | usb_autopm_put_interface_async() do almost the same things as | 360 | usb_autopm_put_interface_async() do almost the same things as |
354 | their non-async counterparts. The differences are: they do | 361 | their non-async counterparts. The big difference is that they |
355 | not acquire the PM mutex, and they use a workqueue to do their | 362 | use a workqueue to do the resume or suspend part of their |
356 | jobs. As a result they can be called in an atomic context, | 363 | jobs. As a result they can be called in an atomic context, |
357 | such as an URB's completion handler, but when they return the | 364 | such as an URB's completion handler, but when they return the |
358 | device will not generally not yet be in the desired state. | 365 | device will generally not yet be in the desired state. |
359 | 366 | ||
360 | usb_autopm_get_interface_no_resume() and | 367 | usb_autopm_get_interface_no_resume() and |
361 | usb_autopm_put_interface_no_suspend() merely increment or | 368 | usb_autopm_put_interface_no_suspend() merely increment or |
362 | decrement the pm_usage_count value; they do not attempt to | 369 | decrement the usage counter; they do not attempt to carry out |
363 | carry out an autoresume or an autosuspend. Hence they can be | 370 | an autoresume or an autosuspend. Hence they can be called in |
364 | called in an atomic context. | 371 | an atomic context. |
365 | 372 | ||
366 | The conventional usage pattern is that a driver calls | 373 | The simplest usage pattern is that a driver calls |
367 | usb_autopm_get_interface() in its open routine and | 374 | usb_autopm_get_interface() in its open routine and |
368 | usb_autopm_put_interface() in its close or release routine. But | 375 | usb_autopm_put_interface() in its close or release routine. But other |
369 | other patterns are possible. | 376 | patterns are possible. |
370 | 377 | ||
371 | The autosuspend attempts mentioned above will often fail for one | 378 | The autosuspend attempts mentioned above will often fail for one |
372 | reason or another. For example, the power/level attribute might be | 379 | reason or another. For example, the power/control attribute might be |
373 | set to "on", or another interface in the same device might not be | 380 | set to "on", or another interface in the same device might not be |
374 | idle. This is perfectly normal. If the reason for failure was that | 381 | idle. This is perfectly normal. If the reason for failure was that |
375 | the device hasn't been idle for long enough, a delayed workqueue | 382 | the device hasn't been idle for long enough, a timer is scheduled to |
376 | routine is automatically set up to carry out the operation when the | 383 | carry out the operation automatically when the autosuspend idle-delay |
377 | autosuspend idle-delay has expired. | 384 | has expired. |
378 | 385 | ||
379 | Autoresume attempts also can fail, although failure would mean that | 386 | Autoresume attempts also can fail, although failure would mean that |
380 | the device is no longer present or operating properly. Unlike | 387 | the device is no longer present or operating properly. Unlike |
381 | autosuspend, there's no delay for an autoresume. | 388 | autosuspend, there's no idle-delay for an autoresume. |
382 | 389 | ||
383 | 390 | ||
384 | Other parts of the driver interface | 391 | Other parts of the driver interface |
385 | ----------------------------------- | 392 | ----------------------------------- |
386 | 393 | ||
394 | Drivers can enable autosuspend for their devices by calling | ||
395 | |||
396 | usb_enable_autosuspend(struct usb_device *udev); | ||
397 | |||
398 | in their probe() routine, if they know that the device is capable of | ||
399 | suspending and resuming correctly. This is exactly equivalent to | ||
400 | writing "auto" to the device's power/control attribute. Likewise, | ||
401 | drivers can disable autosuspend by calling | ||
402 | |||
403 | usb_disable_autosuspend(struct usb_device *udev); | ||
404 | |||
405 | This is exactly the same as writing "on" to the power/control attribute. | ||
406 | |||
387 | Sometimes a driver needs to make sure that remote wakeup is enabled | 407 | Sometimes a driver needs to make sure that remote wakeup is enabled |
388 | during autosuspend. For example, there's not much point | 408 | during autosuspend. For example, there's not much point |
389 | autosuspending a keyboard if the user can't cause the keyboard to do a | 409 | autosuspending a keyboard if the user can't cause the keyboard to do a |
@@ -395,26 +415,27 @@ though, setting this flag won't cause the kernel to autoresume it. | |||
395 | Normally a driver would set this flag in its probe method, at which | 415 | Normally a driver would set this flag in its probe method, at which |
396 | time the device is guaranteed not to be autosuspended.) | 416 | time the device is guaranteed not to be autosuspended.) |
397 | 417 | ||
398 | The synchronous usb_autopm_* routines have to run in a sleepable | 418 | If a driver does its I/O asynchronously in interrupt context, it |
399 | process context; they must not be called from an interrupt handler or | 419 | should call usb_autopm_get_interface_async() before starting output and |
400 | while holding a spinlock. In fact, the entire autosuspend mechanism | 420 | usb_autopm_put_interface_async() when the output queue drains. When |
401 | is not well geared toward interrupt-driven operation. However there | 421 | it receives an input event, it should call |
402 | is one thing a driver can do in an interrupt handler: | ||
403 | 422 | ||
404 | usb_mark_last_busy(struct usb_device *udev); | 423 | usb_mark_last_busy(struct usb_device *udev); |
405 | 424 | ||
406 | This sets udev->last_busy to the current time. udev->last_busy is the | 425 | in the event handler. This sets udev->last_busy to the current time. |
407 | field used for idle-delay calculations; updating it will cause any | 426 | udev->last_busy is the field used for idle-delay calculations; |
408 | pending autosuspend to be moved back. The usb_autopm_* routines will | 427 | updating it will cause any pending autosuspend to be moved back. Most |
409 | also set the last_busy field to the current time. | 428 | of the usb_autopm_* routines will also set the last_busy field to the |
410 | 429 | current time. | |
411 | Calling urb_mark_last_busy() from within an URB completion handler is | 430 | |
412 | subject to races: The kernel may have just finished deciding the | 431 | Asynchronous operation is always subject to races. For example, a |
413 | device has been idle for long enough but not yet gotten around to | 432 | driver may call one of the usb_autopm_*_interface_async() routines at |
414 | calling the driver's suspend method. The driver would have to be | 433 | a time when the core has just finished deciding the device has been |
415 | responsible for synchronizing its suspend method with its URB | 434 | idle for long enough but not yet gotten around to calling the driver's |
416 | completion handler and causing the autosuspend to fail with -EBUSY if | 435 | suspend method. The suspend method must be responsible for |
417 | an URB had completed too recently. | 436 | synchronizing with the output request routine and the URB completion |
437 | handler; it should cause autosuspends to fail with -EBUSY if the | ||
438 | driver needs to use the device. | ||
418 | 439 | ||
419 | External suspend calls should never be allowed to fail in this way, | 440 | External suspend calls should never be allowed to fail in this way, |
420 | only autosuspend calls. The driver can tell them apart by checking | 441 | only autosuspend calls. The driver can tell them apart by checking |
@@ -422,75 +443,23 @@ the PM_EVENT_AUTO bit in the message.event argument to the suspend | |||
422 | method; this bit will be set for internal PM events (autosuspend) and | 443 | method; this bit will be set for internal PM events (autosuspend) and |
423 | clear for external PM events. | 444 | clear for external PM events. |
424 | 445 | ||
425 | Many of the ingredients in the autosuspend framework are oriented | ||
426 | towards interfaces: The usb_interface structure contains the | ||
427 | pm_usage_cnt field, and the usb_autopm_* routines take an interface | ||
428 | pointer as their argument. But somewhat confusingly, a few of the | ||
429 | pieces (i.e., usb_mark_last_busy()) use the usb_device structure | ||
430 | instead. Drivers need to keep this straight; they can call | ||
431 | interface_to_usbdev() to find the device structure for a given | ||
432 | interface. | ||
433 | |||
434 | 446 | ||
435 | Locking requirements | 447 | Mutual exclusion |
436 | -------------------- | 448 | ---------------- |
437 | 449 | ||
438 | All three suspend/resume methods are always called while holding the | 450 | For external events -- but not necessarily for autosuspend or |
439 | usb_device's PM mutex. For external events -- but not necessarily for | 451 | autoresume -- the device semaphore (udev->dev.sem) will be held when a |
440 | autosuspend or autoresume -- the device semaphore (udev->dev.sem) will | 452 | suspend or resume method is called. This implies that external |
441 | also be held. This implies that external suspend/resume events are | 453 | suspend/resume events are mutually exclusive with calls to probe, |
442 | mutually exclusive with calls to probe, disconnect, pre_reset, and | 454 | disconnect, pre_reset, and post_reset; the USB core guarantees that |
443 | post_reset; the USB core guarantees that this is true of internal | 455 | this is true of autosuspend/autoresume events as well. |
444 | suspend/resume events as well. | ||
445 | 456 | ||
446 | If a driver wants to block all suspend/resume calls during some | 457 | If a driver wants to block all suspend/resume calls during some |
447 | critical section, it can simply acquire udev->pm_mutex. Note that | 458 | critical section, the best way is to lock the device and call |
448 | calls to resume may be triggered indirectly. Block IO due to memory | 459 | usb_autopm_get_interface() (and do the reverse at the end of the |
449 | allocations can make the vm subsystem resume a device. Thus while | 460 | critical section). Holding the device semaphore will block all |
450 | holding this lock you must not allocate memory with GFP_KERNEL or | 461 | external PM calls, and the usb_autopm_get_interface() will prevent any |
451 | GFP_NOFS. | 462 | internal PM calls, even if it fails. (Exercise: Why?) |
452 | |||
453 | Alternatively, if the critical section might call some of the | ||
454 | usb_autopm_* routines, the driver can avoid deadlock by doing: | ||
455 | |||
456 | down(&udev->dev.sem); | ||
457 | rc = usb_autopm_get_interface(intf); | ||
458 | |||
459 | and at the end of the critical section: | ||
460 | |||
461 | if (!rc) | ||
462 | usb_autopm_put_interface(intf); | ||
463 | up(&udev->dev.sem); | ||
464 | |||
465 | Holding the device semaphore will block all external PM calls, and the | ||
466 | usb_autopm_get_interface() will prevent any internal PM calls, even if | ||
467 | it fails. (Exercise: Why?) | ||
468 | |||
469 | The rules for locking order are: | ||
470 | |||
471 | Never acquire any device semaphore while holding any PM mutex. | ||
472 | |||
473 | Never acquire udev->pm_mutex while holding the PM mutex for | ||
474 | a device that isn't a descendant of udev. | ||
475 | |||
476 | In other words, PM mutexes should only be acquired going up the device | ||
477 | tree, and they should be acquired only after locking all the device | ||
478 | semaphores you need to hold. These rules don't matter to drivers very | ||
479 | much; they usually affect just the USB core. | ||
480 | |||
481 | Still, drivers do need to be careful. For example, many drivers use a | ||
482 | private mutex to synchronize their normal I/O activities with their | ||
483 | disconnect method. Now if the driver supports autosuspend then it | ||
484 | must call usb_autopm_put_interface() from somewhere -- maybe from its | ||
485 | close method. It should make the call while holding the private mutex, | ||
486 | since a driver shouldn't call any of the usb_autopm_* functions for an | ||
487 | interface from which it has been unbound. | ||
488 | |||
489 | But the usb_autpm_* routines always acquire the device's PM mutex, and | ||
490 | consequently the locking order has to be: private mutex first, PM | ||
491 | mutex second. Since the suspend method is always called with the PM | ||
492 | mutex held, it mustn't try to acquire the private mutex. It has to | ||
493 | synchronize with the driver's I/O activities in some other way. | ||
494 | 463 | ||
495 | 464 | ||
496 | Interaction between dynamic PM and system PM | 465 | Interaction between dynamic PM and system PM |
@@ -499,22 +468,11 @@ synchronize with the driver's I/O activities in some other way. | |||
499 | Dynamic power management and system power management can interact in | 468 | Dynamic power management and system power management can interact in |
500 | a couple of ways. | 469 | a couple of ways. |
501 | 470 | ||
502 | Firstly, a device may already be manually suspended or autosuspended | 471 | Firstly, a device may already be autosuspended when a system suspend |
503 | when a system suspend occurs. Since system suspends are supposed to | 472 | occurs. Since system suspends are supposed to be as transparent as |
504 | be as transparent as possible, the device should remain suspended | 473 | possible, the device should remain suspended following the system |
505 | following the system resume. The 2.6.23 kernel obeys this principle | 474 | resume. But this theory may not work out well in practice; over time |
506 | for manually suspended devices but not for autosuspended devices; they | 475 | the kernel's behavior in this regard has changed. |
507 | do get resumed when the system wakes up. (Presumably they will be | ||
508 | autosuspended again after their idle-delay time expires.) In later | ||
509 | kernels this behavior will be fixed. | ||
510 | |||
511 | (There is an exception. If a device would undergo a reset-resume | ||
512 | instead of a normal resume, and the device is enabled for remote | ||
513 | wakeup, then the reset-resume takes place even if the device was | ||
514 | already suspended when the system suspend began. The justification is | ||
515 | that a reset-resume is a kind of remote-wakeup event. Or to put it | ||
516 | another way, a device which needs a reset won't be able to generate | ||
517 | normal remote-wakeup signals, so it ought to be resumed immediately.) | ||
518 | 476 | ||
519 | Secondly, a dynamic power-management event may occur as a system | 477 | Secondly, a dynamic power-management event may occur as a system |
520 | suspend is underway. The window for this is short, since system | 478 | suspend is underway. The window for this is short, since system |
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt index ff2c1ff57ba2..f4d214510259 100644 --- a/Documentation/usb/usb-serial.txt +++ b/Documentation/usb/usb-serial.txt | |||
@@ -194,6 +194,10 @@ FTDI Single Port Serial Driver | |||
194 | 194 | ||
195 | This is a single port DB-25 serial adapter. | 195 | This is a single port DB-25 serial adapter. |
196 | 196 | ||
197 | Devices supported include: | ||
198 | -TripNav TN-200 USB GPS | ||
199 | -Navis Engineering Bureau CH-4711 USB GPS | ||
200 | |||
197 | For any questions or problems with this driver, please contact Bill Ryder. | 201 | For any questions or problems with this driver, please contact Bill Ryder. |
198 | 202 | ||
199 | 203 | ||
@@ -216,7 +220,7 @@ Cypress M8 CY4601 Family Serial Driver | |||
216 | 220 | ||
217 | Devices supported: | 221 | Devices supported: |
218 | 222 | ||
219 | -DeLorme's USB Earthmate (SiRF Star II lp arch) | 223 | -DeLorme's USB Earthmate GPS (SiRF Star II lp arch) |
220 | -Cypress HID->COM RS232 adapter | 224 | -Cypress HID->COM RS232 adapter |
221 | 225 | ||
222 | Note: Cypress Semiconductor claims no affiliation with the | 226 | Note: Cypress Semiconductor claims no affiliation with the |
@@ -392,9 +396,10 @@ REINER SCT cyberJack pinpad/e-com USB chipcard reader | |||
392 | Prolific PL2303 Driver | 396 | Prolific PL2303 Driver |
393 | 397 | ||
394 | This driver supports any device that has the PL2303 chip from Prolific | 398 | This driver supports any device that has the PL2303 chip from Prolific |
395 | in it. This includes a number of single port USB to serial | 399 | in it. This includes a number of single port USB to serial converters, |
396 | converters and USB GPS devices. Devices from Aten (the UC-232) and | 400 | more than 70% of USB GPS devices (in 2010), and some USB UPSes. Devices |
397 | IO-Data work with this driver, as does the DCU-11 mobile-phone cable. | 401 | from Aten (the UC-232) and IO-Data work with this driver, as does |
402 | the DCU-11 mobile-phone cable. | ||
398 | 403 | ||
399 | For any questions or problems with this driver, please contact Greg | 404 | For any questions or problems with this driver, please contact Greg |
400 | Kroah-Hartman at greg@kroah.com | 405 | Kroah-Hartman at greg@kroah.com |
@@ -435,6 +440,22 @@ Winchiphead CH341 Driver | |||
435 | For any questions or problems with this driver, please contact | 440 | For any questions or problems with this driver, please contact |
436 | frank@kingswood-consulting.co.uk. | 441 | frank@kingswood-consulting.co.uk. |
437 | 442 | ||
443 | Moschip MCS7720, MCS7715 driver | ||
444 | |||
445 | These chips are present in devices sold by various manufacturers, such as Syba | ||
446 | and Cables Unlimited. There may be others. The 7720 provides two serial | ||
447 | ports, and the 7715 provides one serial and one standard PC parallel port. | ||
448 | Support for the 7715's parallel port is enabled by a separate option, which | ||
449 | will not appear unless parallel port support is first enabled at the top-level | ||
450 | of the Device Drivers config menu. Currently only compatibility mode is | ||
451 | supported on the parallel port (no ECP/EPP). | ||
452 | |||
453 | TODO: | ||
454 | - Implement ECP/EPP modes for the parallel port. | ||
455 | - Baud rates higher than 115200 are currently broken. | ||
456 | - Devices with a single serial port based on the Moschip MCS7703 may work | ||
457 | with this driver with a simple addition to the usb_device_id table. I | ||
458 | don't have one of these devices, so I can't say for sure. | ||
438 | 459 | ||
439 | Generic Serial driver | 460 | Generic Serial driver |
440 | 461 | ||
diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv index f11c583295e9..4739d5684305 100644 --- a/Documentation/video4linux/CARDLIST.bttv +++ b/Documentation/video4linux/CARDLIST.bttv | |||
@@ -100,7 +100,7 @@ | |||
100 | 99 -> AD-TVK503 | 100 | 99 -> AD-TVK503 |
101 | 100 -> Hercules Smart TV Stereo | 101 | 100 -> Hercules Smart TV Stereo |
102 | 101 -> Pace TV & Radio Card | 102 | 101 -> Pace TV & Radio Card |
103 | 102 -> IVC-200 [0000:a155,0001:a155,0002:a155,0003:a155,0100:a155,0101:a155,0102:a155,0103:a155] | 103 | 102 -> IVC-200 [0000:a155,0001:a155,0002:a155,0003:a155,0100:a155,0101:a155,0102:a155,0103:a155,0800:a155,0801:a155,0802:a155,0803:a155] |
104 | 103 -> Grand X-Guard / Trust 814PCI [0304:0102] | 104 | 103 -> Grand X-Guard / Trust 814PCI [0304:0102] |
105 | 104 -> Nebula Electronics DigiTV [0071:0101] | 105 | 104 -> Nebula Electronics DigiTV [0071:0101] |
106 | 105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433] | 106 | 105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433] |
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885 index 7539e8fa1ffd..16ca030e1185 100644 --- a/Documentation/video4linux/CARDLIST.cx23885 +++ b/Documentation/video4linux/CARDLIST.cx23885 | |||
@@ -26,3 +26,4 @@ | |||
26 | 25 -> Compro VideoMate E800 [1858:e800] | 26 | 25 -> Compro VideoMate E800 [1858:e800] |
27 | 26 -> Hauppauge WinTV-HVR1290 [0070:8551] | 27 | 26 -> Hauppauge WinTV-HVR1290 [0070:8551] |
28 | 27 -> Mygica X8558 PRO DMB-TH [14f1:8578] | 28 | 27 -> Mygica X8558 PRO DMB-TH [14f1:8578] |
29 | 28 -> LEADTEK WinFast PxTV1200 [107d:6f22] | ||
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88 index 7ec3c4e4b60f..f2510541373b 100644 --- a/Documentation/video4linux/CARDLIST.cx88 +++ b/Documentation/video4linux/CARDLIST.cx88 | |||
@@ -82,3 +82,4 @@ | |||
82 | 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654] | 82 | 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654] |
83 | 82 -> WinFast DTV2000 H rev. J [107d:6f2b] | 83 | 82 -> WinFast DTV2000 H rev. J [107d:6f2b] |
84 | 83 -> Prof 7301 DVB-S/S2 [b034:3034] | 84 | 83 -> Prof 7301 DVB-S/S2 [b034:3034] |
85 | 84 -> Samsung SMT 7020 DVB-S [18ac:dc00,18ac:dccd] | ||
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx index 0c166ff003a0..3a623aaeae5f 100644 --- a/Documentation/video4linux/CARDLIST.em28xx +++ b/Documentation/video4linux/CARDLIST.em28xx | |||
@@ -1,5 +1,5 @@ | |||
1 | 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800] | 1 | 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800] |
2 | 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2710,eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2862,eb1a:2870,eb1a:2881,eb1a:2883,eb1a:2868] | 2 | 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2710,eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2862,eb1a:2863,eb1a:2870,eb1a:2881,eb1a:2883,eb1a:2868] |
3 | 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036] | 3 | 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036] |
4 | 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208] | 4 | 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208] |
5 | 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201] | 5 | 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201] |
@@ -27,6 +27,7 @@ | |||
27 | 26 -> Hercules Smart TV USB 2.0 (em2820/em2840) | 27 | 26 -> Hercules Smart TV USB 2.0 (em2820/em2840) |
28 | 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840) | 28 | 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840) |
29 | 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840) | 29 | 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840) |
30 | 29 -> EM2860/TVP5150 Reference Design (em2860) | ||
30 | 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840) | 31 | 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840) |
31 | 31 -> Usbgear VD204v9 (em2821) | 32 | 31 -> Usbgear VD204v9 (em2821) |
32 | 32 -> Supercomp USB 2.0 TV (em2821) | 33 | 32 -> Supercomp USB 2.0 TV (em2821) |
@@ -70,3 +71,4 @@ | |||
70 | 72 -> Gadmei UTV330+ (em2861) | 71 | 72 -> Gadmei UTV330+ (em2861) |
71 | 73 -> Reddo DVB-C USB TV Box (em2870) | 72 | 73 -> Reddo DVB-C USB TV Box (em2870) |
72 | 74 -> Actionmaster/LinXcel/Digitus VC211A (em2800) | 73 | 74 -> Actionmaster/LinXcel/Digitus VC211A (em2800) |
74 | 75 -> Dikom DK300 (em2882) | ||
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134 index fce1e7eb0474..070f2576707e 100644 --- a/Documentation/video4linux/CARDLIST.saa7134 +++ b/Documentation/video4linux/CARDLIST.saa7134 | |||
@@ -174,3 +174,7 @@ | |||
174 | 173 -> Zolid Hybrid TV Tuner PCI [1131:2004] | 174 | 173 -> Zolid Hybrid TV Tuner PCI [1131:2004] |
175 | 174 -> Asus Europa Hybrid OEM [1043:4847] | 175 | 174 -> Asus Europa Hybrid OEM [1043:4847] |
176 | 175 -> Leadtek Winfast DTV1000S [107d:6655] | 176 | 175 -> Leadtek Winfast DTV1000S [107d:6655] |
177 | 176 -> Beholder BeholdTV 505 RDS [0000:5051] | ||
178 | 177 -> Hawell HW-404M7 | ||
179 | 179 -> Beholder BeholdTV H7 [5ace:7190] | ||
180 | 180 -> Beholder BeholdTV A7 [5ace:7090] | ||
diff --git a/Documentation/video4linux/CARDLIST.tuner b/Documentation/video4linux/CARDLIST.tuner index e0d298fe8830..9b2e0dd6017e 100644 --- a/Documentation/video4linux/CARDLIST.tuner +++ b/Documentation/video4linux/CARDLIST.tuner | |||
@@ -81,3 +81,4 @@ tuner=80 - Philips FQ1216LME MK3 PAL/SECAM w/active loopthrough | |||
81 | tuner=81 - Partsnic (Daewoo) PTI-5NF05 | 81 | tuner=81 - Partsnic (Daewoo) PTI-5NF05 |
82 | tuner=82 - Philips CU1216L | 82 | tuner=82 - Philips CU1216L |
83 | tuner=83 - NXP TDA18271 | 83 | tuner=83 - NXP TDA18271 |
84 | tuner=84 - Sony BTF-Pxn01Z | ||
diff --git a/Documentation/video4linux/README.tlg2300 b/Documentation/video4linux/README.tlg2300 new file mode 100644 index 000000000000..416ccb93d8c9 --- /dev/null +++ b/Documentation/video4linux/README.tlg2300 | |||
@@ -0,0 +1,47 @@ | |||
1 | tlg2300 release notes | ||
2 | ==================== | ||
3 | |||
4 | This is a v4l2/dvb device driver for the tlg2300 chip. | ||
5 | |||
6 | |||
7 | current status | ||
8 | ============== | ||
9 | |||
10 | video | ||
11 | - support mmap and read().(no overlay) | ||
12 | |||
13 | audio | ||
14 | - The driver will register a ALSA card for the audio input. | ||
15 | |||
16 | vbi | ||
17 | - Works for almost TV norms. | ||
18 | |||
19 | dvb-t | ||
20 | - works for DVB-T | ||
21 | |||
22 | FM | ||
23 | - Works for radio. | ||
24 | |||
25 | --------------------------------------------------------------------------- | ||
26 | TESTED APPLICATIONS: | ||
27 | |||
28 | -VLC1.0.4 test the video and dvb. The GUI is friendly to use. | ||
29 | |||
30 | -Mplayer test the video. | ||
31 | |||
32 | -Mplayer test the FM. The mplayer should be compiled with --enable-radio and | ||
33 | --enable-radio-capture. | ||
34 | The command runs as this(The alsa audio registers to card 1): | ||
35 | #mplayer radio://103.7/capture/ -radio adevice=hw=1,0:arate=48000 \ | ||
36 | -rawaudio rate=48000:channels=2 | ||
37 | |||
38 | --------------------------------------------------------------------------- | ||
39 | KNOWN PROBLEMS: | ||
40 | about preemphasis: | ||
41 | You can set the preemphasis for radio by the following command: | ||
42 | #v4l2-ctl -d /dev/radio0 --set-ctrl=pre_emphasis_settings=1 | ||
43 | |||
44 | "pre_emphasis_settings=1" means that you select the 50us. If you want | ||
45 | to select the 75us, please use "pre_emphasis_settings=2" | ||
46 | |||
47 | |||
diff --git a/Documentation/video4linux/extract_xc3028.pl b/Documentation/video4linux/extract_xc3028.pl index 2cb816047fc1..47877deae6d7 100644 --- a/Documentation/video4linux/extract_xc3028.pl +++ b/Documentation/video4linux/extract_xc3028.pl | |||
@@ -5,12 +5,18 @@ | |||
5 | # | 5 | # |
6 | # In order to use, you need to: | 6 | # In order to use, you need to: |
7 | # 1) Download the windows driver with something like: | 7 | # 1) Download the windows driver with something like: |
8 | # Version 2.4 | ||
9 | # wget http://www.twinhan.com/files/AW/BDA T/20080303_V1.0.6.7.zip | ||
10 | # or wget http://www.stefanringel.de/pub/20080303_V1.0.6.7.zip | ||
11 | # Version 2.7 | ||
8 | # wget http://www.steventoth.net/linux/xc5000/HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip | 12 | # wget http://www.steventoth.net/linux/xc5000/HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip |
9 | # 2) Extract the file hcw85bda.sys from the zip into the current dir: | 13 | # 2) Extract the files from the zip into the current dir: |
14 | # unzip -j 20080303_V1.0.6.7.zip 20080303_v1.0.6.7/UDXTTM6000.sys | ||
10 | # unzip -j HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip Driver85/hcw85bda.sys | 15 | # unzip -j HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip Driver85/hcw85bda.sys |
11 | # 3) run the script: | 16 | # 3) run the script: |
12 | # ./extract_xc3028.pl | 17 | # ./extract_xc3028.pl |
13 | # 4) copy the generated file: | 18 | # 4) copy the generated files: |
19 | # cp xc3028-v24.fw /lib/firmware | ||
14 | # cp xc3028-v27.fw /lib/firmware | 20 | # cp xc3028-v27.fw /lib/firmware |
15 | 21 | ||
16 | #use strict; | 22 | #use strict; |
@@ -135,7 +141,7 @@ sub write_hunk_fix_endian($$) | |||
135 | } | 141 | } |
136 | } | 142 | } |
137 | 143 | ||
138 | sub main_firmware($$$$) | 144 | sub main_firmware_24($$$$) |
139 | { | 145 | { |
140 | my $out; | 146 | my $out; |
141 | my $j=0; | 147 | my $j=0; |
@@ -146,8 +152,774 @@ sub main_firmware($$$$) | |||
146 | 152 | ||
147 | for ($j = length($name); $j <32; $j++) { | 153 | for ($j = length($name); $j <32; $j++) { |
148 | $name = $name.chr(0); | 154 | $name = $name.chr(0); |
155 | } | ||
156 | |||
157 | open OUTFILE, ">$outfile"; | ||
158 | syswrite(OUTFILE, $name); | ||
159 | write_le16($version); | ||
160 | write_le16($nr_desc); | ||
161 | |||
162 | # | ||
163 | # Firmware 0, type: BASE FW F8MHZ (0x00000003), id: (0000000000000000), size: 6635 | ||
164 | # | ||
165 | |||
166 | write_le32(0x00000003); # Type | ||
167 | write_le64(0x00000000, 0x00000000); # ID | ||
168 | write_le32(6635); # Size | ||
169 | write_hunk_fix_endian(257752, 6635); | ||
170 | |||
171 | # | ||
172 | # Firmware 1, type: BASE FW F8MHZ MTS (0x00000007), id: (0000000000000000), size: 6635 | ||
173 | # | ||
174 | |||
175 | write_le32(0x00000007); # Type | ||
176 | write_le64(0x00000000, 0x00000000); # ID | ||
177 | write_le32(6635); # Size | ||
178 | write_hunk_fix_endian(264392, 6635); | ||
179 | |||
180 | # | ||
181 | # Firmware 2, type: BASE FW FM (0x00000401), id: (0000000000000000), size: 6525 | ||
182 | # | ||
183 | |||
184 | write_le32(0x00000401); # Type | ||
185 | write_le64(0x00000000, 0x00000000); # ID | ||
186 | write_le32(6525); # Size | ||
187 | write_hunk_fix_endian(271040, 6525); | ||
188 | |||
189 | # | ||
190 | # Firmware 3, type: BASE FW FM INPUT1 (0x00000c01), id: (0000000000000000), size: 6539 | ||
191 | # | ||
192 | |||
193 | write_le32(0x00000c01); # Type | ||
194 | write_le64(0x00000000, 0x00000000); # ID | ||
195 | write_le32(6539); # Size | ||
196 | write_hunk_fix_endian(277568, 6539); | ||
197 | |||
198 | # | ||
199 | # Firmware 4, type: BASE FW (0x00000001), id: (0000000000000000), size: 6633 | ||
200 | # | ||
201 | |||
202 | write_le32(0x00000001); # Type | ||
203 | write_le64(0x00000000, 0x00000000); # ID | ||
204 | write_le32(6633); # Size | ||
205 | write_hunk_fix_endian(284120, 6633); | ||
206 | |||
207 | # | ||
208 | # Firmware 5, type: BASE FW MTS (0x00000005), id: (0000000000000000), size: 6617 | ||
209 | # | ||
210 | |||
211 | write_le32(0x00000005); # Type | ||
212 | write_le64(0x00000000, 0x00000000); # ID | ||
213 | write_le32(6617); # Size | ||
214 | write_hunk_fix_endian(290760, 6617); | ||
215 | |||
216 | # | ||
217 | # Firmware 6, type: STD FW (0x00000000), id: PAL/BG A2/A (0000000100000007), size: 161 | ||
218 | # | ||
219 | |||
220 | write_le32(0x00000000); # Type | ||
221 | write_le64(0x00000001, 0x00000007); # ID | ||
222 | write_le32(161); # Size | ||
223 | write_hunk_fix_endian(297384, 161); | ||
224 | |||
225 | # | ||
226 | # Firmware 7, type: STD FW MTS (0x00000004), id: PAL/BG A2/A (0000000100000007), size: 169 | ||
227 | # | ||
228 | |||
229 | write_le32(0x00000004); # Type | ||
230 | write_le64(0x00000001, 0x00000007); # ID | ||
231 | write_le32(169); # Size | ||
232 | write_hunk_fix_endian(297552, 169); | ||
233 | |||
234 | # | ||
235 | # Firmware 8, type: STD FW (0x00000000), id: PAL/BG A2/B (0000000200000007), size: 161 | ||
236 | # | ||
237 | |||
238 | write_le32(0x00000000); # Type | ||
239 | write_le64(0x00000002, 0x00000007); # ID | ||
240 | write_le32(161); # Size | ||
241 | write_hunk_fix_endian(297728, 161); | ||
242 | |||
243 | # | ||
244 | # Firmware 9, type: STD FW MTS (0x00000004), id: PAL/BG A2/B (0000000200000007), size: 169 | ||
245 | # | ||
246 | |||
247 | write_le32(0x00000004); # Type | ||
248 | write_le64(0x00000002, 0x00000007); # ID | ||
249 | write_le32(169); # Size | ||
250 | write_hunk_fix_endian(297896, 169); | ||
251 | |||
252 | # | ||
253 | # Firmware 10, type: STD FW (0x00000000), id: PAL/BG NICAM/A (0000000400000007), size: 161 | ||
254 | # | ||
255 | |||
256 | write_le32(0x00000000); # Type | ||
257 | write_le64(0x00000004, 0x00000007); # ID | ||
258 | write_le32(161); # Size | ||
259 | write_hunk_fix_endian(298072, 161); | ||
260 | |||
261 | # | ||
262 | # Firmware 11, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/A (0000000400000007), size: 169 | ||
263 | # | ||
264 | |||
265 | write_le32(0x00000004); # Type | ||
266 | write_le64(0x00000004, 0x00000007); # ID | ||
267 | write_le32(169); # Size | ||
268 | write_hunk_fix_endian(298240, 169); | ||
269 | |||
270 | # | ||
271 | # Firmware 12, type: STD FW (0x00000000), id: PAL/BG NICAM/B (0000000800000007), size: 161 | ||
272 | # | ||
273 | |||
274 | write_le32(0x00000000); # Type | ||
275 | write_le64(0x00000008, 0x00000007); # ID | ||
276 | write_le32(161); # Size | ||
277 | write_hunk_fix_endian(298416, 161); | ||
278 | |||
279 | # | ||
280 | # Firmware 13, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/B (0000000800000007), size: 169 | ||
281 | # | ||
282 | |||
283 | write_le32(0x00000004); # Type | ||
284 | write_le64(0x00000008, 0x00000007); # ID | ||
285 | write_le32(169); # Size | ||
286 | write_hunk_fix_endian(298584, 169); | ||
287 | |||
288 | # | ||
289 | # Firmware 14, type: STD FW (0x00000000), id: PAL/DK A2 (00000003000000e0), size: 161 | ||
290 | # | ||
291 | |||
292 | write_le32(0x00000000); # Type | ||
293 | write_le64(0x00000003, 0x000000e0); # ID | ||
294 | write_le32(161); # Size | ||
295 | write_hunk_fix_endian(298760, 161); | ||
296 | |||
297 | # | ||
298 | # Firmware 15, type: STD FW MTS (0x00000004), id: PAL/DK A2 (00000003000000e0), size: 169 | ||
299 | # | ||
300 | |||
301 | write_le32(0x00000004); # Type | ||
302 | write_le64(0x00000003, 0x000000e0); # ID | ||
303 | write_le32(169); # Size | ||
304 | write_hunk_fix_endian(298928, 169); | ||
305 | |||
306 | # | ||
307 | # Firmware 16, type: STD FW (0x00000000), id: PAL/DK NICAM (0000000c000000e0), size: 161 | ||
308 | # | ||
309 | |||
310 | write_le32(0x00000000); # Type | ||
311 | write_le64(0x0000000c, 0x000000e0); # ID | ||
312 | write_le32(161); # Size | ||
313 | write_hunk_fix_endian(299104, 161); | ||
314 | |||
315 | # | ||
316 | # Firmware 17, type: STD FW MTS (0x00000004), id: PAL/DK NICAM (0000000c000000e0), size: 169 | ||
317 | # | ||
318 | |||
319 | write_le32(0x00000004); # Type | ||
320 | write_le64(0x0000000c, 0x000000e0); # ID | ||
321 | write_le32(169); # Size | ||
322 | write_hunk_fix_endian(299272, 169); | ||
323 | |||
324 | # | ||
325 | # Firmware 18, type: STD FW (0x00000000), id: SECAM/K1 (0000000000200000), size: 161 | ||
326 | # | ||
327 | |||
328 | write_le32(0x00000000); # Type | ||
329 | write_le64(0x00000000, 0x00200000); # ID | ||
330 | write_le32(161); # Size | ||
331 | write_hunk_fix_endian(299448, 161); | ||
332 | |||
333 | # | ||
334 | # Firmware 19, type: STD FW MTS (0x00000004), id: SECAM/K1 (0000000000200000), size: 169 | ||
335 | # | ||
336 | |||
337 | write_le32(0x00000004); # Type | ||
338 | write_le64(0x00000000, 0x00200000); # ID | ||
339 | write_le32(169); # Size | ||
340 | write_hunk_fix_endian(299616, 169); | ||
341 | |||
342 | # | ||
343 | # Firmware 20, type: STD FW (0x00000000), id: SECAM/K3 (0000000004000000), size: 161 | ||
344 | # | ||
345 | |||
346 | write_le32(0x00000000); # Type | ||
347 | write_le64(0x00000000, 0x04000000); # ID | ||
348 | write_le32(161); # Size | ||
349 | write_hunk_fix_endian(299792, 161); | ||
350 | |||
351 | # | ||
352 | # Firmware 21, type: STD FW MTS (0x00000004), id: SECAM/K3 (0000000004000000), size: 169 | ||
353 | # | ||
354 | |||
355 | write_le32(0x00000004); # Type | ||
356 | write_le64(0x00000000, 0x04000000); # ID | ||
357 | write_le32(169); # Size | ||
358 | write_hunk_fix_endian(299960, 169); | ||
359 | |||
360 | # | ||
361 | # Firmware 22, type: STD FW D2633 DTV6 ATSC (0x00010030), id: (0000000000000000), size: 149 | ||
362 | # | ||
363 | |||
364 | write_le32(0x00010030); # Type | ||
365 | write_le64(0x00000000, 0x00000000); # ID | ||
366 | write_le32(149); # Size | ||
367 | write_hunk_fix_endian(300136, 149); | ||
368 | |||
369 | # | ||
370 | # Firmware 23, type: STD FW D2620 DTV6 QAM (0x00000068), id: (0000000000000000), size: 149 | ||
371 | # | ||
372 | |||
373 | write_le32(0x00000068); # Type | ||
374 | write_le64(0x00000000, 0x00000000); # ID | ||
375 | write_le32(149); # Size | ||
376 | write_hunk_fix_endian(300296, 149); | ||
377 | |||
378 | # | ||
379 | # Firmware 24, type: STD FW D2633 DTV6 QAM (0x00000070), id: (0000000000000000), size: 149 | ||
380 | # | ||
381 | |||
382 | write_le32(0x00000070); # Type | ||
383 | write_le64(0x00000000, 0x00000000); # ID | ||
384 | write_le32(149); # Size | ||
385 | write_hunk_fix_endian(300448, 149); | ||
386 | |||
387 | # | ||
388 | # Firmware 25, type: STD FW D2620 DTV7 (0x00000088), id: (0000000000000000), size: 149 | ||
389 | # | ||
390 | |||
391 | write_le32(0x00000088); # Type | ||
392 | write_le64(0x00000000, 0x00000000); # ID | ||
393 | write_le32(149); # Size | ||
394 | write_hunk_fix_endian(300608, 149); | ||
395 | |||
396 | # | ||
397 | # Firmware 26, type: STD FW D2633 DTV7 (0x00000090), id: (0000000000000000), size: 149 | ||
398 | # | ||
399 | |||
400 | write_le32(0x00000090); # Type | ||
401 | write_le64(0x00000000, 0x00000000); # ID | ||
402 | write_le32(149); # Size | ||
403 | write_hunk_fix_endian(300760, 149); | ||
404 | |||
405 | # | ||
406 | # Firmware 27, type: STD FW D2620 DTV78 (0x00000108), id: (0000000000000000), size: 149 | ||
407 | # | ||
408 | |||
409 | write_le32(0x00000108); # Type | ||
410 | write_le64(0x00000000, 0x00000000); # ID | ||
411 | write_le32(149); # Size | ||
412 | write_hunk_fix_endian(300920, 149); | ||
413 | |||
414 | # | ||
415 | # Firmware 28, type: STD FW D2633 DTV78 (0x00000110), id: (0000000000000000), size: 149 | ||
416 | # | ||
417 | |||
418 | write_le32(0x00000110); # Type | ||
419 | write_le64(0x00000000, 0x00000000); # ID | ||
420 | write_le32(149); # Size | ||
421 | write_hunk_fix_endian(301072, 149); | ||
422 | |||
423 | # | ||
424 | # Firmware 29, type: STD FW D2620 DTV8 (0x00000208), id: (0000000000000000), size: 149 | ||
425 | # | ||
426 | |||
427 | write_le32(0x00000208); # Type | ||
428 | write_le64(0x00000000, 0x00000000); # ID | ||
429 | write_le32(149); # Size | ||
430 | write_hunk_fix_endian(301232, 149); | ||
431 | |||
432 | # | ||
433 | # Firmware 30, type: STD FW D2633 DTV8 (0x00000210), id: (0000000000000000), size: 149 | ||
434 | # | ||
435 | |||
436 | write_le32(0x00000210); # Type | ||
437 | write_le64(0x00000000, 0x00000000); # ID | ||
438 | write_le32(149); # Size | ||
439 | write_hunk_fix_endian(301384, 149); | ||
440 | |||
441 | # | ||
442 | # Firmware 31, type: STD FW FM (0x00000400), id: (0000000000000000), size: 135 | ||
443 | # | ||
444 | |||
445 | write_le32(0x00000400); # Type | ||
446 | write_le64(0x00000000, 0x00000000); # ID | ||
447 | write_le32(135); # Size | ||
448 | write_hunk_fix_endian(301554, 135); | ||
449 | |||
450 | # | ||
451 | # Firmware 32, type: STD FW (0x00000000), id: PAL/I (0000000000000010), size: 161 | ||
452 | # | ||
453 | |||
454 | write_le32(0x00000000); # Type | ||
455 | write_le64(0x00000000, 0x00000010); # ID | ||
456 | write_le32(161); # Size | ||
457 | write_hunk_fix_endian(301688, 161); | ||
458 | |||
459 | # | ||
460 | # Firmware 33, type: STD FW MTS (0x00000004), id: PAL/I (0000000000000010), size: 169 | ||
461 | # | ||
462 | |||
463 | write_le32(0x00000004); # Type | ||
464 | write_le64(0x00000000, 0x00000010); # ID | ||
465 | write_le32(169); # Size | ||
466 | write_hunk_fix_endian(301856, 169); | ||
467 | |||
468 | # | ||
469 | # Firmware 34, type: STD FW (0x00000000), id: SECAM/L AM (0000001000400000), size: 169 | ||
470 | # | ||
471 | |||
472 | # | ||
473 | # Firmware 35, type: STD FW (0x00000000), id: SECAM/L NICAM (0000000c00400000), size: 161 | ||
474 | # | ||
475 | |||
476 | write_le32(0x00000000); # Type | ||
477 | write_le64(0x0000000c, 0x00400000); # ID | ||
478 | write_le32(161); # Size | ||
479 | write_hunk_fix_endian(302032, 161); | ||
480 | |||
481 | # | ||
482 | # Firmware 36, type: STD FW (0x00000000), id: SECAM/Lc (0000000000800000), size: 161 | ||
483 | # | ||
484 | |||
485 | write_le32(0x00000000); # Type | ||
486 | write_le64(0x00000000, 0x00800000); # ID | ||
487 | write_le32(161); # Size | ||
488 | write_hunk_fix_endian(302200, 161); | ||
489 | |||
490 | # | ||
491 | # Firmware 37, type: STD FW (0x00000000), id: NTSC/M Kr (0000000000008000), size: 161 | ||
492 | # | ||
493 | |||
494 | write_le32(0x00000000); # Type | ||
495 | write_le64(0x00000000, 0x00008000); # ID | ||
496 | write_le32(161); # Size | ||
497 | write_hunk_fix_endian(302368, 161); | ||
498 | |||
499 | # | ||
500 | # Firmware 38, type: STD FW LCD (0x00001000), id: NTSC/M Kr (0000000000008000), size: 161 | ||
501 | # | ||
502 | |||
503 | write_le32(0x00001000); # Type | ||
504 | write_le64(0x00000000, 0x00008000); # ID | ||
505 | write_le32(161); # Size | ||
506 | write_hunk_fix_endian(302536, 161); | ||
507 | |||
508 | # | ||
509 | # Firmware 39, type: STD FW LCD NOGD (0x00003000), id: NTSC/M Kr (0000000000008000), size: 161 | ||
510 | # | ||
511 | |||
512 | write_le32(0x00003000); # Type | ||
513 | write_le64(0x00000000, 0x00008000); # ID | ||
514 | write_le32(161); # Size | ||
515 | write_hunk_fix_endian(302704, 161); | ||
516 | |||
517 | # | ||
518 | # Firmware 40, type: STD FW MTS (0x00000004), id: NTSC/M Kr (0000000000008000), size: 169 | ||
519 | # | ||
520 | |||
521 | write_le32(0x00000004); # Type | ||
522 | write_le64(0x00000000, 0x00008000); # ID | ||
523 | write_le32(169); # Size | ||
524 | write_hunk_fix_endian(302872, 169); | ||
525 | |||
526 | # | ||
527 | # Firmware 41, type: STD FW (0x00000000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161 | ||
528 | # | ||
529 | |||
530 | write_le32(0x00000000); # Type | ||
531 | write_le64(0x00000000, 0x0000b700); # ID | ||
532 | write_le32(161); # Size | ||
533 | write_hunk_fix_endian(303048, 161); | ||
534 | |||
535 | # | ||
536 | # Firmware 42, type: STD FW LCD (0x00001000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161 | ||
537 | # | ||
538 | |||
539 | write_le32(0x00001000); # Type | ||
540 | write_le64(0x00000000, 0x0000b700); # ID | ||
541 | write_le32(161); # Size | ||
542 | write_hunk_fix_endian(303216, 161); | ||
543 | |||
544 | # | ||
545 | # Firmware 43, type: STD FW LCD NOGD (0x00003000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161 | ||
546 | # | ||
547 | |||
548 | write_le32(0x00003000); # Type | ||
549 | write_le64(0x00000000, 0x0000b700); # ID | ||
550 | write_le32(161); # Size | ||
551 | write_hunk_fix_endian(303384, 161); | ||
552 | |||
553 | # | ||
554 | # Firmware 44, type: STD FW (0x00000000), id: NTSC/M Jp (0000000000002000), size: 161 | ||
555 | # | ||
556 | |||
557 | write_le32(0x00000000); # Type | ||
558 | write_le64(0x00000000, 0x00002000); # ID | ||
559 | write_le32(161); # Size | ||
560 | write_hunk_fix_endian(303552, 161); | ||
561 | |||
562 | # | ||
563 | # Firmware 45, type: STD FW MTS (0x00000004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169 | ||
564 | # | ||
565 | |||
566 | write_le32(0x00000004); # Type | ||
567 | write_le64(0x00000000, 0x0000b700); # ID | ||
568 | write_le32(169); # Size | ||
569 | write_hunk_fix_endian(303720, 169); | ||
570 | |||
571 | # | ||
572 | # Firmware 46, type: STD FW MTS LCD (0x00001004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169 | ||
573 | # | ||
574 | |||
575 | write_le32(0x00001004); # Type | ||
576 | write_le64(0x00000000, 0x0000b700); # ID | ||
577 | write_le32(169); # Size | ||
578 | write_hunk_fix_endian(303896, 169); | ||
579 | |||
580 | # | ||
581 | # Firmware 47, type: STD FW MTS LCD NOGD (0x00003004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169 | ||
582 | # | ||
583 | |||
584 | write_le32(0x00003004); # Type | ||
585 | write_le64(0x00000000, 0x0000b700); # ID | ||
586 | write_le32(169); # Size | ||
587 | write_hunk_fix_endian(304072, 169); | ||
588 | |||
589 | # | ||
590 | # Firmware 48, type: SCODE FW HAS IF (0x60000000), IF = 3.28 MHz id: (0000000000000000), size: 192 | ||
591 | # | ||
592 | |||
593 | write_le32(0x60000000); # Type | ||
594 | write_le64(0x00000000, 0x00000000); # ID | ||
595 | write_le16(3280); # IF | ||
596 | write_le32(192); # Size | ||
597 | write_hunk(309048, 192); | ||
598 | |||
599 | # | ||
600 | # Firmware 49, type: SCODE FW HAS IF (0x60000000), IF = 3.30 MHz id: (0000000000000000), size: 192 | ||
601 | # | ||
602 | |||
603 | # write_le32(0x60000000); # Type | ||
604 | # write_le64(0x00000000, 0x00000000); # ID | ||
605 | # write_le16(3300); # IF | ||
606 | # write_le32(192); # Size | ||
607 | # write_hunk(304440, 192); | ||
608 | |||
609 | # | ||
610 | # Firmware 50, type: SCODE FW HAS IF (0x60000000), IF = 3.44 MHz id: (0000000000000000), size: 192 | ||
611 | # | ||
612 | |||
613 | write_le32(0x60000000); # Type | ||
614 | write_le64(0x00000000, 0x00000000); # ID | ||
615 | write_le16(3440); # IF | ||
616 | write_le32(192); # Size | ||
617 | write_hunk(309432, 192); | ||
618 | |||
619 | # | ||
620 | # Firmware 51, type: SCODE FW HAS IF (0x60000000), IF = 3.46 MHz id: (0000000000000000), size: 192 | ||
621 | # | ||
622 | |||
623 | write_le32(0x60000000); # Type | ||
624 | write_le64(0x00000000, 0x00000000); # ID | ||
625 | write_le16(3460); # IF | ||
626 | write_le32(192); # Size | ||
627 | write_hunk(309624, 192); | ||
628 | |||
629 | # | ||
630 | # Firmware 52, type: SCODE FW DTV6 ATSC OREN36 HAS IF (0x60210020), IF = 3.80 MHz id: (0000000000000000), size: 192 | ||
631 | # | ||
632 | |||
633 | write_le32(0x60210020); # Type | ||
634 | write_le64(0x00000000, 0x00000000); # ID | ||
635 | write_le16(3800); # IF | ||
636 | write_le32(192); # Size | ||
637 | write_hunk(306936, 192); | ||
638 | |||
639 | # | ||
640 | # Firmware 53, type: SCODE FW HAS IF (0x60000000), IF = 4.00 MHz id: (0000000000000000), size: 192 | ||
641 | # | ||
642 | |||
643 | write_le32(0x60000000); # Type | ||
644 | write_le64(0x00000000, 0x00000000); # ID | ||
645 | write_le16(4000); # IF | ||
646 | write_le32(192); # Size | ||
647 | write_hunk(309240, 192); | ||
648 | |||
649 | # | ||
650 | # Firmware 54, type: SCODE FW DTV6 ATSC TOYOTA388 HAS IF (0x60410020), IF = 4.08 MHz id: (0000000000000000), size: 192 | ||
651 | # | ||
652 | |||
653 | write_le32(0x60410020); # Type | ||
654 | write_le64(0x00000000, 0x00000000); # ID | ||
655 | write_le16(4080); # IF | ||
656 | write_le32(192); # Size | ||
657 | write_hunk(307128, 192); | ||
658 | |||
659 | # | ||
660 | # Firmware 55, type: SCODE FW HAS IF (0x60000000), IF = 4.20 MHz id: (0000000000000000), size: 192 | ||
661 | # | ||
662 | |||
663 | write_le32(0x60000000); # Type | ||
664 | write_le64(0x00000000, 0x00000000); # ID | ||
665 | write_le16(4200); # IF | ||
666 | write_le32(192); # Size | ||
667 | write_hunk(308856, 192); | ||
668 | |||
669 | # | ||
670 | # Firmware 56, type: SCODE FW MONO HAS IF (0x60008000), IF = 4.32 MHz id: NTSC/M Kr (0000000000008000), size: 192 | ||
671 | # | ||
672 | |||
673 | write_le32(0x60008000); # Type | ||
674 | write_le64(0x00000000, 0x00008000); # ID | ||
675 | write_le16(4320); # IF | ||
676 | write_le32(192); # Size | ||
677 | write_hunk(305208, 192); | ||
678 | |||
679 | # | ||
680 | # Firmware 57, type: SCODE FW HAS IF (0x60000000), IF = 4.45 MHz id: (0000000000000000), size: 192 | ||
681 | # | ||
682 | |||
683 | write_le32(0x60000000); # Type | ||
684 | write_le64(0x00000000, 0x00000000); # ID | ||
685 | write_le16(4450); # IF | ||
686 | write_le32(192); # Size | ||
687 | write_hunk(309816, 192); | ||
688 | |||
689 | # | ||
690 | # Firmware 58, type: SCODE FW MTS LCD NOGD MONO IF HAS IF (0x6002b004), IF = 4.50 MHz id: NTSC PAL/M PAL/N (000000000000b700), size: 192 | ||
691 | # | ||
692 | |||
693 | write_le32(0x6002b004); # Type | ||
694 | write_le64(0x00000000, 0x0000b700); # ID | ||
695 | write_le16(4500); # IF | ||
696 | write_le32(192); # Size | ||
697 | write_hunk(304824, 192); | ||
698 | |||
699 | # | ||
700 | # Firmware 59, type: SCODE FW LCD NOGD IF HAS IF (0x60023000), IF = 4.60 MHz id: NTSC/M Kr (0000000000008000), size: 192 | ||
701 | # | ||
702 | |||
703 | write_le32(0x60023000); # Type | ||
704 | write_le64(0x00000000, 0x00008000); # ID | ||
705 | write_le16(4600); # IF | ||
706 | write_le32(192); # Size | ||
707 | write_hunk(305016, 192); | ||
708 | |||
709 | # | ||
710 | # Firmware 60, type: SCODE FW DTV6 QAM DTV7 DTV78 DTV8 ZARLINK456 HAS IF (0x620003e0), IF = 4.76 MHz id: (0000000000000000), size: 192 | ||
711 | # | ||
712 | |||
713 | write_le32(0x620003e0); # Type | ||
714 | write_le64(0x00000000, 0x00000000); # ID | ||
715 | write_le16(4760); # IF | ||
716 | write_le32(192); # Size | ||
717 | write_hunk(304440, 192); | ||
718 | |||
719 | # | ||
720 | # Firmware 61, type: SCODE FW HAS IF (0x60000000), IF = 4.94 MHz id: (0000000000000000), size: 192 | ||
721 | # | ||
722 | |||
723 | write_le32(0x60000000); # Type | ||
724 | write_le64(0x00000000, 0x00000000); # ID | ||
725 | write_le16(4940); # IF | ||
726 | write_le32(192); # Size | ||
727 | write_hunk(308664, 192); | ||
728 | |||
729 | # | ||
730 | # Firmware 62, type: SCODE FW HAS IF (0x60000000), IF = 5.26 MHz id: (0000000000000000), size: 192 | ||
731 | # | ||
732 | |||
733 | write_le32(0x60000000); # Type | ||
734 | write_le64(0x00000000, 0x00000000); # ID | ||
735 | write_le16(5260); # IF | ||
736 | write_le32(192); # Size | ||
737 | write_hunk(307704, 192); | ||
738 | |||
739 | # | ||
740 | # Firmware 63, type: SCODE FW MONO HAS IF (0x60008000), IF = 5.32 MHz id: PAL/BG A2 NICAM (0000000f00000007), size: 192 | ||
741 | # | ||
742 | |||
743 | write_le32(0x60008000); # Type | ||
744 | write_le64(0x0000000f, 0x00000007); # ID | ||
745 | write_le16(5320); # IF | ||
746 | write_le32(192); # Size | ||
747 | write_hunk(307896, 192); | ||
748 | |||
749 | # | ||
750 | # Firmware 64, type: SCODE FW DTV7 DTV78 DTV8 DIBCOM52 CHINA HAS IF (0x65000380), IF = 5.40 MHz id: (0000000000000000), size: 192 | ||
751 | # | ||
752 | |||
753 | write_le32(0x65000380); # Type | ||
754 | write_le64(0x00000000, 0x00000000); # ID | ||
755 | write_le16(5400); # IF | ||
756 | write_le32(192); # Size | ||
757 | write_hunk(304248, 192); | ||
758 | |||
759 | # | ||
760 | # Firmware 65, type: SCODE FW DTV6 ATSC OREN538 HAS IF (0x60110020), IF = 5.58 MHz id: (0000000000000000), size: 192 | ||
761 | # | ||
762 | |||
763 | write_le32(0x60110020); # Type | ||
764 | write_le64(0x00000000, 0x00000000); # ID | ||
765 | write_le16(5580); # IF | ||
766 | write_le32(192); # Size | ||
767 | write_hunk(306744, 192); | ||
768 | |||
769 | # | ||
770 | # Firmware 66, type: SCODE FW HAS IF (0x60000000), IF = 5.64 MHz id: PAL/BG A2 (0000000300000007), size: 192 | ||
771 | # | ||
772 | |||
773 | write_le32(0x60000000); # Type | ||
774 | write_le64(0x00000003, 0x00000007); # ID | ||
775 | write_le16(5640); # IF | ||
776 | write_le32(192); # Size | ||
777 | write_hunk(305592, 192); | ||
778 | |||
779 | # | ||
780 | # Firmware 67, type: SCODE FW HAS IF (0x60000000), IF = 5.74 MHz id: PAL/BG NICAM (0000000c00000007), size: 192 | ||
781 | # | ||
782 | |||
783 | write_le32(0x60000000); # Type | ||
784 | write_le64(0x0000000c, 0x00000007); # ID | ||
785 | write_le16(5740); # IF | ||
786 | write_le32(192); # Size | ||
787 | write_hunk(305784, 192); | ||
788 | |||
789 | # | ||
790 | # Firmware 68, type: SCODE FW HAS IF (0x60000000), IF = 5.90 MHz id: (0000000000000000), size: 192 | ||
791 | # | ||
792 | |||
793 | write_le32(0x60000000); # Type | ||
794 | write_le64(0x00000000, 0x00000000); # ID | ||
795 | write_le16(5900); # IF | ||
796 | write_le32(192); # Size | ||
797 | write_hunk(307512, 192); | ||
798 | |||
799 | # | ||
800 | # Firmware 69, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.00 MHz id: PAL/DK PAL/I SECAM/K3 SECAM/L SECAM/Lc NICAM (0000000c04c000f0), size: 192 | ||
801 | # | ||
802 | |||
803 | write_le32(0x60008000); # Type | ||
804 | write_le64(0x0000000c, 0x04c000f0); # ID | ||
805 | write_le16(6000); # IF | ||
806 | write_le32(192); # Size | ||
807 | write_hunk(305576, 192); | ||
808 | |||
809 | # | ||
810 | # Firmware 70, type: SCODE FW DTV6 QAM ATSC LG60 F6MHZ HAS IF (0x68050060), IF = 6.20 MHz id: (0000000000000000), size: 192 | ||
811 | # | ||
812 | |||
813 | write_le32(0x68050060); # Type | ||
814 | write_le64(0x00000000, 0x00000000); # ID | ||
815 | write_le16(6200); # IF | ||
816 | write_le32(192); # Size | ||
817 | write_hunk(306552, 192); | ||
818 | |||
819 | # | ||
820 | # Firmware 71, type: SCODE FW HAS IF (0x60000000), IF = 6.24 MHz id: PAL/I (0000000000000010), size: 192 | ||
821 | # | ||
822 | |||
823 | write_le32(0x60000000); # Type | ||
824 | write_le64(0x00000000, 0x00000010); # ID | ||
825 | write_le16(6240); # IF | ||
826 | write_le32(192); # Size | ||
827 | write_hunk(305400, 192); | ||
828 | |||
829 | # | ||
830 | # Firmware 72, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.32 MHz id: SECAM/K1 (0000000000200000), size: 192 | ||
831 | # | ||
832 | |||
833 | write_le32(0x60008000); # Type | ||
834 | write_le64(0x00000000, 0x00200000); # ID | ||
835 | write_le16(6320); # IF | ||
836 | write_le32(192); # Size | ||
837 | write_hunk(308472, 192); | ||
838 | |||
839 | # | ||
840 | # Firmware 73, type: SCODE FW HAS IF (0x60000000), IF = 6.34 MHz id: SECAM/K1 (0000000000200000), size: 192 | ||
841 | # | ||
842 | |||
843 | write_le32(0x60000000); # Type | ||
844 | write_le64(0x00000000, 0x00200000); # ID | ||
845 | write_le16(6340); # IF | ||
846 | write_le32(192); # Size | ||
847 | write_hunk(306360, 192); | ||
848 | |||
849 | # | ||
850 | # Firmware 74, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.50 MHz id: PAL/DK SECAM/K3 SECAM/L NICAM (0000000c044000e0), size: 192 | ||
851 | # | ||
852 | |||
853 | write_le32(0x60008000); # Type | ||
854 | write_le64(0x0000000c, 0x044000e0); # ID | ||
855 | write_le16(6500); # IF | ||
856 | write_le32(192); # Size | ||
857 | write_hunk(308280, 192); | ||
858 | |||
859 | # | ||
860 | # Firmware 75, type: SCODE FW DTV6 ATSC ATI638 HAS IF (0x60090020), IF = 6.58 MHz id: (0000000000000000), size: 192 | ||
861 | # | ||
862 | |||
863 | write_le32(0x60090020); # Type | ||
864 | write_le64(0x00000000, 0x00000000); # ID | ||
865 | write_le16(6580); # IF | ||
866 | write_le32(192); # Size | ||
867 | write_hunk(304632, 192); | ||
868 | |||
869 | # | ||
870 | # Firmware 76, type: SCODE FW HAS IF (0x60000000), IF = 6.60 MHz id: PAL/DK A2 (00000003000000e0), size: 192 | ||
871 | # | ||
872 | |||
873 | write_le32(0x60000000); # Type | ||
874 | write_le64(0x00000003, 0x000000e0); # ID | ||
875 | write_le16(6600); # IF | ||
876 | write_le32(192); # Size | ||
877 | write_hunk(306168, 192); | ||
878 | |||
879 | # | ||
880 | # Firmware 77, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.68 MHz id: PAL/DK A2 (00000003000000e0), size: 192 | ||
881 | # | ||
882 | |||
883 | write_le32(0x60008000); # Type | ||
884 | write_le64(0x00000003, 0x000000e0); # ID | ||
885 | write_le16(6680); # IF | ||
886 | write_le32(192); # Size | ||
887 | write_hunk(308088, 192); | ||
888 | |||
889 | # | ||
890 | # Firmware 78, type: SCODE FW DTV6 ATSC TOYOTA794 HAS IF (0x60810020), IF = 8.14 MHz id: (0000000000000000), size: 192 | ||
891 | # | ||
892 | |||
893 | write_le32(0x60810020); # Type | ||
894 | write_le64(0x00000000, 0x00000000); # ID | ||
895 | write_le16(8140); # IF | ||
896 | write_le32(192); # Size | ||
897 | write_hunk(307320, 192); | ||
898 | |||
899 | # | ||
900 | # Firmware 79, type: SCODE FW HAS IF (0x60000000), IF = 8.20 MHz id: (0000000000000000), size: 192 | ||
901 | # | ||
902 | |||
903 | # write_le32(0x60000000); # Type | ||
904 | # write_le64(0x00000000, 0x00000000); # ID | ||
905 | # write_le16(8200); # IF | ||
906 | # write_le32(192); # Size | ||
907 | # write_hunk(308088, 192); | ||
149 | } | 908 | } |
150 | 909 | ||
910 | sub main_firmware_27($$$$) | ||
911 | { | ||
912 | my $out; | ||
913 | my $j=0; | ||
914 | my $outfile = shift; | ||
915 | my $name = shift; | ||
916 | my $version = shift; | ||
917 | my $nr_desc = shift; | ||
918 | |||
919 | for ($j = length($name); $j <32; $j++) { | ||
920 | $name = $name.chr(0); | ||
921 | } | ||
922 | |||
151 | open OUTFILE, ">$outfile"; | 923 | open OUTFILE, ">$outfile"; |
152 | syswrite(OUTFILE, $name); | 924 | syswrite(OUTFILE, $name); |
153 | write_le16($version); | 925 | write_le16($version); |
@@ -906,20 +1678,39 @@ sub main_firmware($$$$) | |||
906 | write_hunk(812856, 192); | 1678 | write_hunk(812856, 192); |
907 | } | 1679 | } |
908 | 1680 | ||
1681 | |||
909 | sub extract_firmware { | 1682 | sub extract_firmware { |
910 | my $sourcefile = "hcw85bda.sys"; | 1683 | my $sourcefile_24 = "UDXTTM6000.sys"; |
911 | my $hash = "0e44dbf63bb0169d57446aec21881ff2"; | 1684 | my $hash_24 = "cb9deb5508a5e150af2880f5b0066d78"; |
912 | my $outfile = "xc3028-v27.fw"; | 1685 | my $outfile_24 = "xc3028-v24.fw"; |
913 | my $name = "xc2028 firmware"; | 1686 | my $name_24 = "xc2028 firmware"; |
914 | my $version = 519; | 1687 | my $version_24 = 516; |
915 | my $nr_desc = 80; | 1688 | my $nr_desc_24 = 77; |
1689 | my $out; | ||
1690 | |||
1691 | my $sourcefile_27 = "hcw85bda.sys"; | ||
1692 | my $hash_27 = "0e44dbf63bb0169d57446aec21881ff2"; | ||
1693 | my $outfile_27 = "xc3028-v27.fw"; | ||
1694 | my $name_27 = "xc2028 firmware"; | ||
1695 | my $version_27 = 519; | ||
1696 | my $nr_desc_27 = 80; | ||
916 | my $out; | 1697 | my $out; |
917 | 1698 | ||
918 | verify($sourcefile, $hash); | 1699 | if (-e $sourcefile_24) { |
1700 | verify($sourcefile_24, $hash_24); | ||
1701 | |||
1702 | open INFILE, "<$sourcefile_24"; | ||
1703 | main_firmware_24($outfile_24, $name_24, $version_24, $nr_desc_24); | ||
1704 | close INFILE; | ||
1705 | } | ||
919 | 1706 | ||
920 | open INFILE, "<$sourcefile"; | 1707 | if (-e $sourcefile_27) { |
921 | main_firmware($outfile, $name, $version, $nr_desc); | 1708 | verify($sourcefile_27, $hash_27); |
922 | close INFILE; | 1709 | |
1710 | open INFILE, "<$sourcefile_27"; | ||
1711 | main_firmware_27($outfile_27, $name_27, $version_27, $nr_desc_27); | ||
1712 | close INFILE; | ||
1713 | } | ||
923 | } | 1714 | } |
924 | 1715 | ||
925 | extract_firmware; | 1716 | extract_firmware; |
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt index 1800a62cf135..8f3f5d33327c 100644 --- a/Documentation/video4linux/gspca.txt +++ b/Documentation/video4linux/gspca.txt | |||
@@ -42,6 +42,7 @@ ov519 041e:4064 Creative Live! VISTA VF0420 | |||
42 | ov519 041e:4067 Creative Live! Cam Video IM (VF0350) | 42 | ov519 041e:4067 Creative Live! Cam Video IM (VF0350) |
43 | ov519 041e:4068 Creative Live! VISTA VF0470 | 43 | ov519 041e:4068 Creative Live! VISTA VF0470 |
44 | spca561 0458:7004 Genius VideoCAM Express V2 | 44 | spca561 0458:7004 Genius VideoCAM Express V2 |
45 | sn9c2028 0458:7005 Genius Smart 300, version 2 | ||
45 | sunplus 0458:7006 Genius Dsc 1.3 Smart | 46 | sunplus 0458:7006 Genius Dsc 1.3 Smart |
46 | zc3xx 0458:7007 Genius VideoCam V2 | 47 | zc3xx 0458:7007 Genius VideoCam V2 |
47 | zc3xx 0458:700c Genius VideoCam V3 | 48 | zc3xx 0458:700c Genius VideoCam V3 |
@@ -49,6 +50,8 @@ zc3xx 0458:700f Genius VideoCam Web V2 | |||
49 | sonixj 0458:7025 Genius Eye 311Q | 50 | sonixj 0458:7025 Genius Eye 311Q |
50 | sn9c20x 0458:7029 Genius Look 320s | 51 | sn9c20x 0458:7029 Genius Look 320s |
51 | sonixj 0458:702e Genius Slim 310 NB | 52 | sonixj 0458:702e Genius Slim 310 NB |
53 | sn9c20x 0458:704a Genius Slim 1320 | ||
54 | sn9c20x 0458:704c Genius i-Look 1321 | ||
52 | sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650) | 55 | sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650) |
53 | sonixj 045e:00f5 MicroSoft VX3000 | 56 | sonixj 045e:00f5 MicroSoft VX3000 |
54 | sonixj 045e:00f7 MicroSoft VX1000 | 57 | sonixj 045e:00f7 MicroSoft VX1000 |
@@ -109,6 +112,7 @@ sunplus 04a5:3003 Benq DC 1300 | |||
109 | sunplus 04a5:3008 Benq DC 1500 | 112 | sunplus 04a5:3008 Benq DC 1500 |
110 | sunplus 04a5:300a Benq DC 3410 | 113 | sunplus 04a5:300a Benq DC 3410 |
111 | spca500 04a5:300c Benq DC 1016 | 114 | spca500 04a5:300c Benq DC 1016 |
115 | benq 04a5:3035 Benq DC E300 | ||
112 | finepix 04cb:0104 Fujifilm FinePix 4800 | 116 | finepix 04cb:0104 Fujifilm FinePix 4800 |
113 | finepix 04cb:0109 Fujifilm FinePix A202 | 117 | finepix 04cb:0109 Fujifilm FinePix A202 |
114 | finepix 04cb:010b Fujifilm FinePix A203 | 118 | finepix 04cb:010b Fujifilm FinePix A203 |
@@ -142,6 +146,7 @@ sunplus 04fc:5360 Sunplus Generic | |||
142 | spca500 04fc:7333 PalmPixDC85 | 146 | spca500 04fc:7333 PalmPixDC85 |
143 | sunplus 04fc:ffff Pure DigitalDakota | 147 | sunplus 04fc:ffff Pure DigitalDakota |
144 | spca501 0506:00df 3Com HomeConnect Lite | 148 | spca501 0506:00df 3Com HomeConnect Lite |
149 | sunplus 052b:1507 Megapixel 5 Pretec DC-1007 | ||
145 | sunplus 052b:1513 Megapix V4 | 150 | sunplus 052b:1513 Megapix V4 |
146 | sunplus 052b:1803 MegaImage VI | 151 | sunplus 052b:1803 MegaImage VI |
147 | tv8532 0545:808b Veo Stingray | 152 | tv8532 0545:808b Veo Stingray |
@@ -151,6 +156,7 @@ sunplus 0546:3191 Polaroid Ion 80 | |||
151 | sunplus 0546:3273 Polaroid PDC2030 | 156 | sunplus 0546:3273 Polaroid PDC2030 |
152 | ov519 054c:0154 Sonny toy4 | 157 | ov519 054c:0154 Sonny toy4 |
153 | ov519 054c:0155 Sonny toy5 | 158 | ov519 054c:0155 Sonny toy5 |
159 | cpia1 0553:0002 CPIA CPiA (version1) based cameras | ||
154 | zc3xx 055f:c005 Mustek Wcam300A | 160 | zc3xx 055f:c005 Mustek Wcam300A |
155 | spca500 055f:c200 Mustek Gsmart 300 | 161 | spca500 055f:c200 Mustek Gsmart 300 |
156 | sunplus 055f:c211 Kowa Bs888e Microcamera | 162 | sunplus 055f:c211 Kowa Bs888e Microcamera |
@@ -188,8 +194,7 @@ spca500 06bd:0404 Agfa CL20 | |||
188 | spca500 06be:0800 Optimedia | 194 | spca500 06be:0800 Optimedia |
189 | sunplus 06d6:0031 Trust 610 LCD PowerC@m Zoom | 195 | sunplus 06d6:0031 Trust 610 LCD PowerC@m Zoom |
190 | spca506 06e1:a190 ADS Instant VCD | 196 | spca506 06e1:a190 ADS Instant VCD |
191 | ov534 06f8:3002 Hercules Blog Webcam | 197 | ov534_9 06f8:3003 Hercules Dualpix HD Weblog |
192 | ov534 06f8:3003 Hercules Dualpix HD Weblog | ||
193 | sonixj 06f8:3004 Hercules Classic Silver | 198 | sonixj 06f8:3004 Hercules Classic Silver |
194 | sonixj 06f8:3008 Hercules Deluxe Optical Glass | 199 | sonixj 06f8:3008 Hercules Deluxe Optical Glass |
195 | pac7302 06f8:3009 Hercules Classic Link | 200 | pac7302 06f8:3009 Hercules Classic Link |
@@ -204,6 +209,7 @@ sunplus 0733:2221 Mercury Digital Pro 3.1p | |||
204 | sunplus 0733:3261 Concord 3045 spca536a | 209 | sunplus 0733:3261 Concord 3045 spca536a |
205 | sunplus 0733:3281 Cyberpix S550V | 210 | sunplus 0733:3281 Cyberpix S550V |
206 | spca506 0734:043b 3DeMon USB Capture aka | 211 | spca506 0734:043b 3DeMon USB Capture aka |
212 | cpia1 0813:0001 QX3 camera | ||
207 | ov519 0813:0002 Dual Mode USB Camera Plus | 213 | ov519 0813:0002 Dual Mode USB Camera Plus |
208 | spca500 084d:0003 D-Link DSC-350 | 214 | spca500 084d:0003 D-Link DSC-350 |
209 | spca500 08ca:0103 Aiptek PocketDV | 215 | spca500 08ca:0103 Aiptek PocketDV |
@@ -225,7 +231,8 @@ sunplus 08ca:2050 Medion MD 41437 | |||
225 | sunplus 08ca:2060 Aiptek PocketDV5300 | 231 | sunplus 08ca:2060 Aiptek PocketDV5300 |
226 | tv8532 0923:010f ICM532 cams | 232 | tv8532 0923:010f ICM532 cams |
227 | mars 093a:050f Mars-Semi Pc-Camera | 233 | mars 093a:050f Mars-Semi Pc-Camera |
228 | mr97310a 093a:010f Sakar Digital no. 77379 | 234 | mr97310a 093a:010e All known CIF cams with this ID |
235 | mr97310a 093a:010f All known VGA cams with this ID | ||
229 | pac207 093a:2460 Qtec Webcam 100 | 236 | pac207 093a:2460 Qtec Webcam 100 |
230 | pac207 093a:2461 HP Webcam | 237 | pac207 093a:2461 HP Webcam |
231 | pac207 093a:2463 Philips SPC 220 NC | 238 | pac207 093a:2463 Philips SPC 220 NC |
@@ -300,11 +307,14 @@ sonixj 0c45:6138 Sn9c120 Mo4000 | |||
300 | sonixj 0c45:613a Microdia Sonix PC Camera | 307 | sonixj 0c45:613a Microdia Sonix PC Camera |
301 | sonixj 0c45:613b Surfer SN-206 | 308 | sonixj 0c45:613b Surfer SN-206 |
302 | sonixj 0c45:613c Sonix Pccam168 | 309 | sonixj 0c45:613c Sonix Pccam168 |
310 | sonixj 0c45:6142 Hama PC-Webcam AC-150 | ||
303 | sonixj 0c45:6143 Sonix Pccam168 | 311 | sonixj 0c45:6143 Sonix Pccam168 |
304 | sonixj 0c45:6148 Digitus DA-70811/ZSMC USB PC Camera ZS211/Microdia | 312 | sonixj 0c45:6148 Digitus DA-70811/ZSMC USB PC Camera ZS211/Microdia |
313 | sonixj 0c45:614a Frontech E-Ccam (JIL-2225) | ||
305 | sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001) | 314 | sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001) |
306 | sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111) | 315 | sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111) |
307 | sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655) | 316 | sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655) |
317 | sn9c20x 0c45:624c PC Camera (SN9C201 + MT9M112) | ||
308 | sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968) | 318 | sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968) |
309 | sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650) | 319 | sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650) |
310 | sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650) | 320 | sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650) |
@@ -317,6 +327,7 @@ sn9c20x 0c45:627f PC Camera (SN9C201 + OV9650) | |||
317 | sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001) | 327 | sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001) |
318 | sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111) | 328 | sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111) |
319 | sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655) | 329 | sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655) |
330 | sn9c20x 0c45:628c PC Camera (SN9C201 + MT9M112) | ||
320 | sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968) | 331 | sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968) |
321 | sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650) | 332 | sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650) |
322 | sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670) | 333 | sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670) |
@@ -324,6 +335,10 @@ sn9c20x 0c45:62b0 PC Camera (SN9C202 + MT9V011/MT9V111/MT9V112) | |||
324 | sn9c20x 0c45:62b3 PC Camera (SN9C202 + OV9655) | 335 | sn9c20x 0c45:62b3 PC Camera (SN9C202 + OV9655) |
325 | sn9c20x 0c45:62bb PC Camera (SN9C202 + OV7660) | 336 | sn9c20x 0c45:62bb PC Camera (SN9C202 + OV7660) |
326 | sn9c20x 0c45:62bc PC Camera (SN9C202 + HV7131R) | 337 | sn9c20x 0c45:62bc PC Camera (SN9C202 + HV7131R) |
338 | sn9c2028 0c45:8001 Wild Planet Digital Spy Camera | ||
339 | sn9c2028 0c45:8003 Sakar #11199, #6637x, #67480 keychain cams | ||
340 | sn9c2028 0c45:8008 Mini-Shotz ms-350 | ||
341 | sn9c2028 0c45:800a Vivitar Vivicam 3350B | ||
327 | sunplus 0d64:0303 Sunplus FashionCam DXG | 342 | sunplus 0d64:0303 Sunplus FashionCam DXG |
328 | ov519 0e96:c001 TRUST 380 USB2 SPACEC@M | 343 | ov519 0e96:c001 TRUST 380 USB2 SPACEC@M |
329 | etoms 102c:6151 Qcam Sangha CIF | 344 | etoms 102c:6151 Qcam Sangha CIF |
@@ -341,10 +356,11 @@ spca501 1776:501c Arowana 300K CMOS Camera | |||
341 | t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops | 356 | t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops |
342 | vc032x 17ef:4802 Lenovo Vc0323+MI1310_SOC | 357 | vc032x 17ef:4802 Lenovo Vc0323+MI1310_SOC |
343 | pac207 2001:f115 D-Link DSB-C120 | 358 | pac207 2001:f115 D-Link DSB-C120 |
344 | sq905c 2770:9050 sq905c | 359 | sq905c 2770:9050 Disney pix micro (CIF) |
345 | sq905c 2770:905c DualCamera | 360 | sq905c 2770:9052 Disney pix micro 2 (VGA) |
346 | sq905 2770:9120 Argus Digital Camera DC1512 | 361 | sq905c 2770:905c All 11 known cameras with this ID |
347 | sq905c 2770:913d sq905c | 362 | sq905 2770:9120 All 24 known cameras with this ID |
363 | sq905c 2770:913d All 4 known cameras with this ID | ||
348 | spca500 2899:012c Toptro Industrial | 364 | spca500 2899:012c Toptro Industrial |
349 | ov519 8020:ef04 ov519 | 365 | ov519 8020:ef04 ov519 |
350 | spca508 8086:0110 Intel Easy PC Camera | 366 | spca508 8086:0110 Intel Easy PC Camera |
diff --git a/Documentation/video4linux/sh_mobile_ceu_camera.txt b/Documentation/video4linux/sh_mobile_ceu_camera.txt index 2ae16349a78d..cb47e723af74 100644 --- a/Documentation/video4linux/sh_mobile_ceu_camera.txt +++ b/Documentation/video4linux/sh_mobile_ceu_camera.txt | |||
@@ -17,18 +17,18 @@ Generic scaling / cropping scheme | |||
17 | -2-- -\ | 17 | -2-- -\ |
18 | | --\ | 18 | | --\ |
19 | | --\ | 19 | | --\ |
20 | +-5-- -\ -- -3-- | 20 | +-5-- . -- -3-- -\ |
21 | | ---\ | 21 | | `... -\ |
22 | | --- -4-- -\ | 22 | | `... -4-- . - -7.. |
23 | | -\ | 23 | | `. |
24 | | - -6-- | 24 | | `. .6-- |
25 | | | 25 | | |
26 | | - -6'- | 26 | | . .6'- |
27 | | -/ | 27 | | .´ |
28 | | --- -4'- -/ | 28 | | ... -4'- .´ |
29 | | ---/ | 29 | | ...´ - -7'. |
30 | +-5'- -/ | 30 | +-5'- .´ -/ |
31 | | -- -3'- | 31 | | -- -3'- -/ |
32 | | --/ | 32 | | --/ |
33 | | --/ | 33 | | --/ |
34 | -2'- -/ | 34 | -2'- -/ |
@@ -36,7 +36,11 @@ Generic scaling / cropping scheme | |||
36 | | | 36 | | |
37 | -1'- | 37 | -1'- |
38 | 38 | ||
39 | Produced by user requests: | 39 | In the above chart minuses and slashes represent "real" data amounts, points and |
40 | accents represent "useful" data, basically, CEU scaled amd cropped output, | ||
41 | mapped back onto the client's source plane. | ||
42 | |||
43 | Such a configuration can be produced by user requests: | ||
40 | 44 | ||
41 | S_CROP(left / top = (5) - (1), width / height = (5') - (5)) | 45 | S_CROP(left / top = (5) - (1), width / height = (5') - (5)) |
42 | S_FMT(width / height = (6') - (6)) | 46 | S_FMT(width / height = (6') - (6)) |
@@ -106,52 +110,30 @@ window: | |||
106 | S_CROP | 110 | S_CROP |
107 | ------ | 111 | ------ |
108 | 112 | ||
109 | If old scale applied to new crop is invalid produce nearest new scale possible | 113 | The API at http://v4l2spec.bytesex.org/spec/x1904.htm says: |
110 | |||
111 | 1. Calculate current combined scales. | ||
112 | |||
113 | scale_comb = (((4') - (4)) / ((6') - (6))) * (((2') - (2)) / ((3') - (3))) | ||
114 | |||
115 | 2. Apply iterative sensor S_CROP for new input window. | ||
116 | |||
117 | 3. If old combined scales applied to new crop produce an impossible user window, | ||
118 | adjust scales to produce nearest possible window. | ||
119 | |||
120 | width_u_out = ((5') - (5)) / scale_comb | ||
121 | 114 | ||
122 | if (width_u_out > max) | 115 | "...specification does not define an origin or units. However by convention |
123 | scale_comb = ((5') - (5)) / max; | 116 | drivers should horizontally count unscaled samples relative to 0H." |
124 | else if (width_u_out < min) | ||
125 | scale_comb = ((5') - (5)) / min; | ||
126 | 117 | ||
127 | 4. Issue G_CROP to retrieve actual input window. | 118 | We choose to follow the advise and interpret cropping units as client input |
119 | pixels. | ||
128 | 120 | ||
129 | 5. Using actual input window and calculated combined scales calculate sensor | 121 | Cropping is performed in the following 6 steps: |
130 | target output window. | ||
131 | |||
132 | width_s_out = ((3') - (3)) = ((2') - (2)) / scale_comb | ||
133 | |||
134 | 6. Apply iterative S_FMT for new sensor target output window. | ||
135 | |||
136 | 7. Issue G_FMT to retrieve the actual sensor output window. | ||
137 | |||
138 | 8. Calculate sensor scales. | ||
139 | |||
140 | scale_s = ((3') - (3)) / ((2') - (2)) | ||
141 | 122 | ||
142 | 9. Calculate sensor output subwindow to be cropped on CEU by applying sensor | 123 | 1. Request exactly user rectangle from the sensor. |
143 | scales to the requested window. | ||
144 | 124 | ||
145 | width_ceu = ((5') - (5)) / scale_s | 125 | 2. If smaller - iterate until a larger one is obtained. Result: sensor cropped |
126 | to 2 : 2', target crop 5 : 5', current output format 6' - 6. | ||
146 | 127 | ||
147 | 10. Use CEU cropping for above calculated window. | 128 | 3. In the previous step the sensor has tried to preserve its output frame as |
129 | good as possible, but it could have changed. Retrieve it again. | ||
148 | 130 | ||
149 | 11. Calculate CEU scales from sensor scales from results of (10) and user window | 131 | 4. Sensor scaled to 3 : 3'. Sensor's scale is (2' - 2) / (3' - 3). Calculate |
150 | from (3) | 132 | intermediate window: 4' - 4 = (5' - 5) * (3' - 3) / (2' - 2) |
151 | 133 | ||
152 | scale_ceu = calc_scale(((5') - (5)), &width_u_out) | 134 | 5. Calculate and apply host scale = (6' - 6) / (4' - 4) |
153 | 135 | ||
154 | 12. Apply CEU scales. | 136 | 6. Calculate and apply host crop: 6 - 7 = (5 - 2) * (6' - 6) / (5' - 5) |
155 | 137 | ||
156 | -- | 138 | -- |
157 | Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de> | 139 | Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de> |
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt index 74d677c8b036..e831aaca66f8 100644 --- a/Documentation/video4linux/v4l2-framework.txt +++ b/Documentation/video4linux/v4l2-framework.txt | |||
@@ -545,12 +545,11 @@ unregister them: | |||
545 | This will remove the device nodes from sysfs (causing udev to remove them | 545 | This will remove the device nodes from sysfs (causing udev to remove them |
546 | from /dev). | 546 | from /dev). |
547 | 547 | ||
548 | After video_unregister_device() returns no new opens can be done. | 548 | After video_unregister_device() returns no new opens can be done. However, |
549 | 549 | in the case of USB devices some application might still have one of these | |
550 | However, in the case of USB devices some application might still have one | 550 | device nodes open. So after the unregister all file operations will return |
551 | of these device nodes open. You should block all new accesses to read, | 551 | an error as well, except for the ioctl and unlocked_ioctl file operations: |
552 | write, poll, etc. except possibly for certain ioctl operations like | 552 | those will still be passed on since some buffer ioctls may still be needed. |
553 | queueing buffers. | ||
554 | 553 | ||
555 | When the last user of the video device node exits, then the vdev->release() | 554 | When the last user of the video device node exits, then the vdev->release() |
556 | callback is called and you can do the final cleanup there. | 555 | callback is called and you can do the final cleanup there. |
@@ -599,99 +598,145 @@ video_device::minor fields. | |||
599 | video buffer helper functions | 598 | video buffer helper functions |
600 | ----------------------------- | 599 | ----------------------------- |
601 | 600 | ||
602 | The v4l2 core API provides a standard method for dealing with video | 601 | The v4l2 core API provides a set of standard methods (called "videobuf") |
603 | buffers. Those methods allow a driver to implement read(), mmap() and | 602 | for dealing with video buffers. Those methods allow a driver to implement |
604 | overlay() on a consistent way. | 603 | read(), mmap() and overlay() in a consistent way. There are currently |
605 | 604 | methods for using video buffers on devices that supports DMA with | |
606 | There are currently methods for using video buffers on devices that | 605 | scatter/gather method (videobuf-dma-sg), DMA with linear access |
607 | supports DMA with scatter/gather method (videobuf-dma-sg), DMA with | 606 | (videobuf-dma-contig), and vmalloced buffers, mostly used on USB drivers |
608 | linear access (videobuf-dma-contig), and vmalloced buffers, mostly | 607 | (videobuf-vmalloc). |
609 | used on USB drivers (videobuf-vmalloc). | 608 | |
610 | 609 | Please see Documentation/video4linux/videobuf for more information on how | |
611 | Any driver using videobuf should provide operations (callbacks) for | 610 | to use the videobuf layer. |
612 | four handlers: | 611 | |
613 | 612 | struct v4l2_fh | |
614 | ops->buf_setup - calculates the size of the video buffers and avoid they | 613 | -------------- |
615 | to waste more than some maximum limit of RAM; | 614 | |
616 | ops->buf_prepare - fills the video buffer structs and calls | 615 | struct v4l2_fh provides a way to easily keep file handle specific data |
617 | videobuf_iolock() to alloc and prepare mmaped memory; | 616 | that is used by the V4L2 framework. Using v4l2_fh is optional for |
618 | ops->buf_queue - advices the driver that another buffer were | 617 | drivers. |
619 | requested (by read() or by QBUF); | 618 | |
620 | ops->buf_release - frees any buffer that were allocated. | 619 | The users of v4l2_fh (in the V4L2 framework, not the driver) know |
621 | 620 | whether a driver uses v4l2_fh as its file->private_data pointer by | |
622 | In order to use it, the driver need to have a code (generally called at | 621 | testing the V4L2_FL_USES_V4L2_FH bit in video_device->flags. |
623 | interrupt context) that will properly handle the buffer request lists, | 622 | |
624 | announcing that a new buffer were filled. | 623 | Useful functions: |
625 | 624 | ||
626 | The irq handling code should handle the videobuf task lists, in order | 625 | - v4l2_fh_init() |
627 | to advice videobuf that a new frame were filled, in order to honor to a | 626 | |
628 | request. The code is generally like this one: | 627 | Initialise the file handle. This *MUST* be performed in the driver's |
629 | if (list_empty(&dma_q->active)) | 628 | v4l2_file_operations->open() handler. |
630 | return; | 629 | |
631 | 630 | - v4l2_fh_add() | |
632 | buf = list_entry(dma_q->active.next, struct vbuffer, vb.queue); | 631 | |
633 | 632 | Add a v4l2_fh to video_device file handle list. May be called after | |
634 | if (!waitqueue_active(&buf->vb.done)) | 633 | initialising the file handle. |
635 | return; | 634 | |
636 | 635 | - v4l2_fh_del() | |
637 | /* Some logic to handle the buf may be needed here */ | 636 | |
638 | 637 | Unassociate the file handle from video_device(). The file handle | |
639 | list_del(&buf->vb.queue); | 638 | exit function may now be called. |
640 | do_gettimeofday(&buf->vb.ts); | 639 | |
641 | wake_up(&buf->vb.done); | 640 | - v4l2_fh_exit() |
642 | 641 | ||
643 | Those are the videobuffer functions used on drivers, implemented on | 642 | Uninitialise the file handle. After uninitialisation the v4l2_fh |
644 | videobuf-core: | 643 | memory can be freed. |
645 | 644 | ||
646 | - Videobuf init functions | 645 | struct v4l2_fh is allocated as a part of the driver's own file handle |
647 | videobuf_queue_sg_init() | 646 | structure and is set to file->private_data in the driver's open |
648 | Initializes the videobuf infrastructure. This function should be | 647 | function by the driver. Drivers can extract their own file handle |
649 | called before any other videobuf function on drivers that uses DMA | 648 | structure by using the container_of macro. Example: |
650 | Scatter/Gather buffers. | 649 | |
651 | 650 | struct my_fh { | |
652 | videobuf_queue_dma_contig_init | 651 | int blah; |
653 | Initializes the videobuf infrastructure. This function should be | 652 | struct v4l2_fh fh; |
654 | called before any other videobuf function on drivers that need DMA | 653 | }; |
655 | contiguous buffers. | 654 | |
656 | 655 | ... | |
657 | videobuf_queue_vmalloc_init() | 656 | |
658 | Initializes the videobuf infrastructure. This function should be | 657 | int my_open(struct file *file) |
659 | called before any other videobuf function on USB (and other drivers) | 658 | { |
660 | that need a vmalloced type of videobuf. | 659 | struct my_fh *my_fh; |
661 | 660 | struct video_device *vfd; | |
662 | - videobuf_iolock() | 661 | int ret; |
663 | Prepares the videobuf memory for the proper method (read, mmap, overlay). | 662 | |
664 | 663 | ... | |
665 | - videobuf_queue_is_busy() | 664 | |
666 | Checks if a videobuf is streaming. | 665 | ret = v4l2_fh_init(&my_fh->fh, vfd); |
667 | 666 | if (ret) | |
668 | - videobuf_queue_cancel() | 667 | return ret; |
669 | Stops video handling. | 668 | |
670 | 669 | v4l2_fh_add(&my_fh->fh); | |
671 | - videobuf_mmap_free() | 670 | |
672 | frees mmap buffers. | 671 | file->private_data = &my_fh->fh; |
673 | 672 | ||
674 | - videobuf_stop() | 673 | ... |
675 | Stops video handling, ends mmap and frees mmap and other buffers. | 674 | } |
676 | 675 | ||
677 | - V4L2 api functions. Those functions correspond to VIDIOC_foo ioctls: | 676 | int my_release(struct file *file) |
678 | videobuf_reqbufs(), videobuf_querybuf(), videobuf_qbuf(), | 677 | { |
679 | videobuf_dqbuf(), videobuf_streamon(), videobuf_streamoff(). | 678 | struct v4l2_fh *fh = file->private_data; |
680 | 679 | struct my_fh *my_fh = container_of(fh, struct my_fh, fh); | |
681 | - V4L1 api function (corresponds to VIDIOCMBUF ioctl): | 680 | |
682 | videobuf_cgmbuf() | 681 | ... |
683 | This function is used to provide backward compatibility with V4L1 | 682 | } |
684 | API. | 683 | |
685 | 684 | V4L2 events | |
686 | - Some help functions for read()/poll() operations: | 685 | ----------- |
687 | videobuf_read_stream() | 686 | |
688 | For continuous stream read() | 687 | The V4L2 events provide a generic way to pass events to user space. |
689 | videobuf_read_one() | 688 | The driver must use v4l2_fh to be able to support V4L2 events. |
690 | For snapshot read() | 689 | |
691 | videobuf_poll_stream() | 690 | Useful functions: |
692 | polling help function | 691 | |
693 | 692 | - v4l2_event_alloc() | |
694 | The better way to understand it is to take a look at vivi driver. One | 693 | |
695 | of the main reasons for vivi is to be a videobuf usage example. the | 694 | To use events, the driver must allocate events for the file handle. By |
696 | vivi_thread_tick() does the task that the IRQ callback would do on PCI | 695 | calling the function more than once, the driver may assure that at least n |
697 | drivers (or the irq callback on USB). | 696 | events in total have been allocated. The function may not be called in |
697 | atomic context. | ||
698 | |||
699 | - v4l2_event_queue() | ||
700 | |||
701 | Queue events to video device. The driver's only responsibility is to fill | ||
702 | in the type and the data fields. The other fields will be filled in by | ||
703 | V4L2. | ||
704 | |||
705 | - v4l2_event_subscribe() | ||
706 | |||
707 | The video_device->ioctl_ops->vidioc_subscribe_event must check the driver | ||
708 | is able to produce events with specified event id. Then it calls | ||
709 | v4l2_event_subscribe() to subscribe the event. | ||
710 | |||
711 | - v4l2_event_unsubscribe() | ||
712 | |||
713 | vidioc_unsubscribe_event in struct v4l2_ioctl_ops. A driver may use | ||
714 | v4l2_event_unsubscribe() directly unless it wants to be involved in | ||
715 | unsubscription process. | ||
716 | |||
717 | The special type V4L2_EVENT_ALL may be used to unsubscribe all events. The | ||
718 | drivers may want to handle this in a special way. | ||
719 | |||
720 | - v4l2_event_pending() | ||
721 | |||
722 | Returns the number of pending events. Useful when implementing poll. | ||
723 | |||
724 | Drivers do not initialise events directly. The events are initialised | ||
725 | through v4l2_fh_init() if video_device->ioctl_ops->vidioc_subscribe_event is | ||
726 | non-NULL. This *MUST* be performed in the driver's | ||
727 | v4l2_file_operations->open() handler. | ||
728 | |||
729 | Events are delivered to user space through the poll system call. The driver | ||
730 | can use v4l2_fh->events->wait wait_queue_head_t as the argument for | ||
731 | poll_wait(). | ||
732 | |||
733 | There are standard and private events. New standard events must use the | ||
734 | smallest available event type. The drivers must allocate their events from | ||
735 | their own class starting from class base. Class base is | ||
736 | V4L2_EVENT_PRIVATE_START + n * 1000 where n is the lowest available number. | ||
737 | The first event type in the class is reserved for future use, so the first | ||
738 | available event type is 'class base + 1'. | ||
739 | |||
740 | An example on how the V4L2 events may be used can be found in the OMAP | ||
741 | 3 ISP driver available at <URL:http://gitorious.org/omap3camera> as of | ||
742 | writing this. | ||
diff --git a/Documentation/video4linux/videobuf b/Documentation/video4linux/videobuf new file mode 100644 index 000000000000..17a1f9abf260 --- /dev/null +++ b/Documentation/video4linux/videobuf | |||
@@ -0,0 +1,360 @@ | |||
1 | An introduction to the videobuf layer | ||
2 | Jonathan Corbet <corbet@lwn.net> | ||
3 | Current as of 2.6.33 | ||
4 | |||
5 | The videobuf layer functions as a sort of glue layer between a V4L2 driver | ||
6 | and user space. It handles the allocation and management of buffers for | ||
7 | the storage of video frames. There is a set of functions which can be used | ||
8 | to implement many of the standard POSIX I/O system calls, including read(), | ||
9 | poll(), and, happily, mmap(). Another set of functions can be used to | ||
10 | implement the bulk of the V4L2 ioctl() calls related to streaming I/O, | ||
11 | including buffer allocation, queueing and dequeueing, and streaming | ||
12 | control. Using videobuf imposes a few design decisions on the driver | ||
13 | author, but the payback comes in the form of reduced code in the driver and | ||
14 | a consistent implementation of the V4L2 user-space API. | ||
15 | |||
16 | Buffer types | ||
17 | |||
18 | Not all video devices use the same kind of buffers. In fact, there are (at | ||
19 | least) three common variations: | ||
20 | |||
21 | - Buffers which are scattered in both the physical and (kernel) virtual | ||
22 | address spaces. (Almost) all user-space buffers are like this, but it | ||
23 | makes great sense to allocate kernel-space buffers this way as well when | ||
24 | it is possible. Unfortunately, it is not always possible; working with | ||
25 | this kind of buffer normally requires hardware which can do | ||
26 | scatter/gather DMA operations. | ||
27 | |||
28 | - Buffers which are physically scattered, but which are virtually | ||
29 | contiguous; buffers allocated with vmalloc(), in other words. These | ||
30 | buffers are just as hard to use for DMA operations, but they can be | ||
31 | useful in situations where DMA is not available but virtually-contiguous | ||
32 | buffers are convenient. | ||
33 | |||
34 | - Buffers which are physically contiguous. Allocation of this kind of | ||
35 | buffer can be unreliable on fragmented systems, but simpler DMA | ||
36 | controllers cannot deal with anything else. | ||
37 | |||
38 | Videobuf can work with all three types of buffers, but the driver author | ||
39 | must pick one at the outset and design the driver around that decision. | ||
40 | |||
41 | [It's worth noting that there's a fourth kind of buffer: "overlay" buffers | ||
42 | which are located within the system's video memory. The overlay | ||
43 | functionality is considered to be deprecated for most use, but it still | ||
44 | shows up occasionally in system-on-chip drivers where the performance | ||
45 | benefits merit the use of this technique. Overlay buffers can be handled | ||
46 | as a form of scattered buffer, but there are very few implementations in | ||
47 | the kernel and a description of this technique is currently beyond the | ||
48 | scope of this document.] | ||
49 | |||
50 | Data structures, callbacks, and initialization | ||
51 | |||
52 | Depending on which type of buffers are being used, the driver should | ||
53 | include one of the following files: | ||
54 | |||
55 | <media/videobuf-dma-sg.h> /* Physically scattered */ | ||
56 | <media/videobuf-vmalloc.h> /* vmalloc() buffers */ | ||
57 | <media/videobuf-dma-contig.h> /* Physically contiguous */ | ||
58 | |||
59 | The driver's data structure describing a V4L2 device should include a | ||
60 | struct videobuf_queue instance for the management of the buffer queue, | ||
61 | along with a list_head for the queue of available buffers. There will also | ||
62 | need to be an interrupt-safe spinlock which is used to protect (at least) | ||
63 | the queue. | ||
64 | |||
65 | The next step is to write four simple callbacks to help videobuf deal with | ||
66 | the management of buffers: | ||
67 | |||
68 | struct videobuf_queue_ops { | ||
69 | int (*buf_setup)(struct videobuf_queue *q, | ||
70 | unsigned int *count, unsigned int *size); | ||
71 | int (*buf_prepare)(struct videobuf_queue *q, | ||
72 | struct videobuf_buffer *vb, | ||
73 | enum v4l2_field field); | ||
74 | void (*buf_queue)(struct videobuf_queue *q, | ||
75 | struct videobuf_buffer *vb); | ||
76 | void (*buf_release)(struct videobuf_queue *q, | ||
77 | struct videobuf_buffer *vb); | ||
78 | }; | ||
79 | |||
80 | buf_setup() is called early in the I/O process, when streaming is being | ||
81 | initiated; its purpose is to tell videobuf about the I/O stream. The count | ||
82 | parameter will be a suggested number of buffers to use; the driver should | ||
83 | check it for rationality and adjust it if need be. As a practical rule, a | ||
84 | minimum of two buffers are needed for proper streaming, and there is | ||
85 | usually a maximum (which cannot exceed 32) which makes sense for each | ||
86 | device. The size parameter should be set to the expected (maximum) size | ||
87 | for each frame of data. | ||
88 | |||
89 | Each buffer (in the form of a struct videobuf_buffer pointer) will be | ||
90 | passed to buf_prepare(), which should set the buffer's size, width, height, | ||
91 | and field fields properly. If the buffer's state field is | ||
92 | VIDEOBUF_NEEDS_INIT, the driver should pass it to: | ||
93 | |||
94 | int videobuf_iolock(struct videobuf_queue* q, struct videobuf_buffer *vb, | ||
95 | struct v4l2_framebuffer *fbuf); | ||
96 | |||
97 | Among other things, this call will usually allocate memory for the buffer. | ||
98 | Finally, the buf_prepare() function should set the buffer's state to | ||
99 | VIDEOBUF_PREPARED. | ||
100 | |||
101 | When a buffer is queued for I/O, it is passed to buf_queue(), which should | ||
102 | put it onto the driver's list of available buffers and set its state to | ||
103 | VIDEOBUF_QUEUED. Note that this function is called with the queue spinlock | ||
104 | held; if it tries to acquire it as well things will come to a screeching | ||
105 | halt. Yes, this is the voice of experience. Note also that videobuf may | ||
106 | wait on the first buffer in the queue; placing other buffers in front of it | ||
107 | could again gum up the works. So use list_add_tail() to enqueue buffers. | ||
108 | |||
109 | Finally, buf_release() is called when a buffer is no longer intended to be | ||
110 | used. The driver should ensure that there is no I/O active on the buffer, | ||
111 | then pass it to the appropriate free routine(s): | ||
112 | |||
113 | /* Scatter/gather drivers */ | ||
114 | int videobuf_dma_unmap(struct videobuf_queue *q, | ||
115 | struct videobuf_dmabuf *dma); | ||
116 | int videobuf_dma_free(struct videobuf_dmabuf *dma); | ||
117 | |||
118 | /* vmalloc drivers */ | ||
119 | void videobuf_vmalloc_free (struct videobuf_buffer *buf); | ||
120 | |||
121 | /* Contiguous drivers */ | ||
122 | void videobuf_dma_contig_free(struct videobuf_queue *q, | ||
123 | struct videobuf_buffer *buf); | ||
124 | |||
125 | One way to ensure that a buffer is no longer under I/O is to pass it to: | ||
126 | |||
127 | int videobuf_waiton(struct videobuf_buffer *vb, int non_blocking, int intr); | ||
128 | |||
129 | Here, vb is the buffer, non_blocking indicates whether non-blocking I/O | ||
130 | should be used (it should be zero in the buf_release() case), and intr | ||
131 | controls whether an interruptible wait is used. | ||
132 | |||
133 | File operations | ||
134 | |||
135 | At this point, much of the work is done; much of the rest is slipping | ||
136 | videobuf calls into the implementation of the other driver callbacks. The | ||
137 | first step is in the open() function, which must initialize the | ||
138 | videobuf queue. The function to use depends on the type of buffer used: | ||
139 | |||
140 | void videobuf_queue_sg_init(struct videobuf_queue *q, | ||
141 | struct videobuf_queue_ops *ops, | ||
142 | struct device *dev, | ||
143 | spinlock_t *irqlock, | ||
144 | enum v4l2_buf_type type, | ||
145 | enum v4l2_field field, | ||
146 | unsigned int msize, | ||
147 | void *priv); | ||
148 | |||
149 | void videobuf_queue_vmalloc_init(struct videobuf_queue *q, | ||
150 | struct videobuf_queue_ops *ops, | ||
151 | struct device *dev, | ||
152 | spinlock_t *irqlock, | ||
153 | enum v4l2_buf_type type, | ||
154 | enum v4l2_field field, | ||
155 | unsigned int msize, | ||
156 | void *priv); | ||
157 | |||
158 | void videobuf_queue_dma_contig_init(struct videobuf_queue *q, | ||
159 | struct videobuf_queue_ops *ops, | ||
160 | struct device *dev, | ||
161 | spinlock_t *irqlock, | ||
162 | enum v4l2_buf_type type, | ||
163 | enum v4l2_field field, | ||
164 | unsigned int msize, | ||
165 | void *priv); | ||
166 | |||
167 | In each case, the parameters are the same: q is the queue structure for the | ||
168 | device, ops is the set of callbacks as described above, dev is the device | ||
169 | structure for this video device, irqlock is an interrupt-safe spinlock to | ||
170 | protect access to the data structures, type is the buffer type used by the | ||
171 | device (cameras will use V4L2_BUF_TYPE_VIDEO_CAPTURE, for example), field | ||
172 | describes which field is being captured (often V4L2_FIELD_NONE for | ||
173 | progressive devices), msize is the size of any containing structure used | ||
174 | around struct videobuf_buffer, and priv is a private data pointer which | ||
175 | shows up in the priv_data field of struct videobuf_queue. Note that these | ||
176 | are void functions which, evidently, are immune to failure. | ||
177 | |||
178 | V4L2 capture drivers can be written to support either of two APIs: the | ||
179 | read() system call and the rather more complicated streaming mechanism. As | ||
180 | a general rule, it is necessary to support both to ensure that all | ||
181 | applications have a chance of working with the device. Videobuf makes it | ||
182 | easy to do that with the same code. To implement read(), the driver need | ||
183 | only make a call to one of: | ||
184 | |||
185 | ssize_t videobuf_read_one(struct videobuf_queue *q, | ||
186 | char __user *data, size_t count, | ||
187 | loff_t *ppos, int nonblocking); | ||
188 | |||
189 | ssize_t videobuf_read_stream(struct videobuf_queue *q, | ||
190 | char __user *data, size_t count, | ||
191 | loff_t *ppos, int vbihack, int nonblocking); | ||
192 | |||
193 | Either one of these functions will read frame data into data, returning the | ||
194 | amount actually read; the difference is that videobuf_read_one() will only | ||
195 | read a single frame, while videobuf_read_stream() will read multiple frames | ||
196 | if they are needed to satisfy the count requested by the application. A | ||
197 | typical driver read() implementation will start the capture engine, call | ||
198 | one of the above functions, then stop the engine before returning (though a | ||
199 | smarter implementation might leave the engine running for a little while in | ||
200 | anticipation of another read() call happening in the near future). | ||
201 | |||
202 | The poll() function can usually be implemented with a direct call to: | ||
203 | |||
204 | unsigned int videobuf_poll_stream(struct file *file, | ||
205 | struct videobuf_queue *q, | ||
206 | poll_table *wait); | ||
207 | |||
208 | Note that the actual wait queue eventually used will be the one associated | ||
209 | with the first available buffer. | ||
210 | |||
211 | When streaming I/O is done to kernel-space buffers, the driver must support | ||
212 | the mmap() system call to enable user space to access the data. In many | ||
213 | V4L2 drivers, the often-complex mmap() implementation simplifies to a | ||
214 | single call to: | ||
215 | |||
216 | int videobuf_mmap_mapper(struct videobuf_queue *q, | ||
217 | struct vm_area_struct *vma); | ||
218 | |||
219 | Everything else is handled by the videobuf code. | ||
220 | |||
221 | The release() function requires two separate videobuf calls: | ||
222 | |||
223 | void videobuf_stop(struct videobuf_queue *q); | ||
224 | int videobuf_mmap_free(struct videobuf_queue *q); | ||
225 | |||
226 | The call to videobuf_stop() terminates any I/O in progress - though it is | ||
227 | still up to the driver to stop the capture engine. The call to | ||
228 | videobuf_mmap_free() will ensure that all buffers have been unmapped; if | ||
229 | so, they will all be passed to the buf_release() callback. If buffers | ||
230 | remain mapped, videobuf_mmap_free() returns an error code instead. The | ||
231 | purpose is clearly to cause the closing of the file descriptor to fail if | ||
232 | buffers are still mapped, but every driver in the 2.6.32 kernel cheerfully | ||
233 | ignores its return value. | ||
234 | |||
235 | ioctl() operations | ||
236 | |||
237 | The V4L2 API includes a very long list of driver callbacks to respond to | ||
238 | the many ioctl() commands made available to user space. A number of these | ||
239 | - those associated with streaming I/O - turn almost directly into videobuf | ||
240 | calls. The relevant helper functions are: | ||
241 | |||
242 | int videobuf_reqbufs(struct videobuf_queue *q, | ||
243 | struct v4l2_requestbuffers *req); | ||
244 | int videobuf_querybuf(struct videobuf_queue *q, struct v4l2_buffer *b); | ||
245 | int videobuf_qbuf(struct videobuf_queue *q, struct v4l2_buffer *b); | ||
246 | int videobuf_dqbuf(struct videobuf_queue *q, struct v4l2_buffer *b, | ||
247 | int nonblocking); | ||
248 | int videobuf_streamon(struct videobuf_queue *q); | ||
249 | int videobuf_streamoff(struct videobuf_queue *q); | ||
250 | int videobuf_cgmbuf(struct videobuf_queue *q, struct video_mbuf *mbuf, | ||
251 | int count); | ||
252 | |||
253 | So, for example, a VIDIOC_REQBUFS call turns into a call to the driver's | ||
254 | vidioc_reqbufs() callback which, in turn, usually only needs to locate the | ||
255 | proper struct videobuf_queue pointer and pass it to videobuf_reqbufs(). | ||
256 | These support functions can replace a great deal of buffer management | ||
257 | boilerplate in a lot of V4L2 drivers. | ||
258 | |||
259 | The vidioc_streamon() and vidioc_streamoff() functions will be a bit more | ||
260 | complex, of course, since they will also need to deal with starting and | ||
261 | stopping the capture engine. videobuf_cgmbuf(), called from the driver's | ||
262 | vidiocgmbuf() function, only exists if the V4L1 compatibility module has | ||
263 | been selected with CONFIG_VIDEO_V4L1_COMPAT, so its use must be surrounded | ||
264 | with #ifdef directives. | ||
265 | |||
266 | Buffer allocation | ||
267 | |||
268 | Thus far, we have talked about buffers, but have not looked at how they are | ||
269 | allocated. The scatter/gather case is the most complex on this front. For | ||
270 | allocation, the driver can leave buffer allocation entirely up to the | ||
271 | videobuf layer; in this case, buffers will be allocated as anonymous | ||
272 | user-space pages and will be very scattered indeed. If the application is | ||
273 | using user-space buffers, no allocation is needed; the videobuf layer will | ||
274 | take care of calling get_user_pages() and filling in the scatterlist array. | ||
275 | |||
276 | If the driver needs to do its own memory allocation, it should be done in | ||
277 | the vidioc_reqbufs() function, *after* calling videobuf_reqbufs(). The | ||
278 | first step is a call to: | ||
279 | |||
280 | struct videobuf_dmabuf *videobuf_to_dma(struct videobuf_buffer *buf); | ||
281 | |||
282 | The returned videobuf_dmabuf structure (defined in | ||
283 | <media/videobuf-dma-sg.h>) includes a couple of relevant fields: | ||
284 | |||
285 | struct scatterlist *sglist; | ||
286 | int sglen; | ||
287 | |||
288 | The driver must allocate an appropriately-sized scatterlist array and | ||
289 | populate it with pointers to the pieces of the allocated buffer; sglen | ||
290 | should be set to the length of the array. | ||
291 | |||
292 | Drivers using the vmalloc() method need not (and cannot) concern themselves | ||
293 | with buffer allocation at all; videobuf will handle those details. The | ||
294 | same is normally true of contiguous-DMA drivers as well; videobuf will | ||
295 | allocate the buffers (with dma_alloc_coherent()) when it sees fit. That | ||
296 | means that these drivers may be trying to do high-order allocations at any | ||
297 | time, an operation which is not always guaranteed to work. Some drivers | ||
298 | play tricks by allocating DMA space at system boot time; videobuf does not | ||
299 | currently play well with those drivers. | ||
300 | |||
301 | As of 2.6.31, contiguous-DMA drivers can work with a user-supplied buffer, | ||
302 | as long as that buffer is physically contiguous. Normal user-space | ||
303 | allocations will not meet that criterion, but buffers obtained from other | ||
304 | kernel drivers, or those contained within huge pages, will work with these | ||
305 | drivers. | ||
306 | |||
307 | Filling the buffers | ||
308 | |||
309 | The final part of a videobuf implementation has no direct callback - it's | ||
310 | the portion of the code which actually puts frame data into the buffers, | ||
311 | usually in response to interrupts from the device. For all types of | ||
312 | drivers, this process works approximately as follows: | ||
313 | |||
314 | - Obtain the next available buffer and make sure that somebody is actually | ||
315 | waiting for it. | ||
316 | |||
317 | - Get a pointer to the memory and put video data there. | ||
318 | |||
319 | - Mark the buffer as done and wake up the process waiting for it. | ||
320 | |||
321 | Step (1) above is done by looking at the driver-managed list_head structure | ||
322 | - the one which is filled in the buf_queue() callback. Because starting | ||
323 | the engine and enqueueing buffers are done in separate steps, it's possible | ||
324 | for the engine to be running without any buffers available - in the | ||
325 | vmalloc() case especially. So the driver should be prepared for the list | ||
326 | to be empty. It is equally possible that nobody is yet interested in the | ||
327 | buffer; the driver should not remove it from the list or fill it until a | ||
328 | process is waiting on it. That test can be done by examining the buffer's | ||
329 | done field (a wait_queue_head_t structure) with waitqueue_active(). | ||
330 | |||
331 | A buffer's state should be set to VIDEOBUF_ACTIVE before being mapped for | ||
332 | DMA; that ensures that the videobuf layer will not try to do anything with | ||
333 | it while the device is transferring data. | ||
334 | |||
335 | For scatter/gather drivers, the needed memory pointers will be found in the | ||
336 | scatterlist structure described above. Drivers using the vmalloc() method | ||
337 | can get a memory pointer with: | ||
338 | |||
339 | void *videobuf_to_vmalloc(struct videobuf_buffer *buf); | ||
340 | |||
341 | For contiguous DMA drivers, the function to use is: | ||
342 | |||
343 | dma_addr_t videobuf_to_dma_contig(struct videobuf_buffer *buf); | ||
344 | |||
345 | The contiguous DMA API goes out of its way to hide the kernel-space address | ||
346 | of the DMA buffer from drivers. | ||
347 | |||
348 | The final step is to set the size field of the relevant videobuf_buffer | ||
349 | structure to the actual size of the captured image, set state to | ||
350 | VIDEOBUF_DONE, then call wake_up() on the done queue. At this point, the | ||
351 | buffer is owned by the videobuf layer and the driver should not touch it | ||
352 | again. | ||
353 | |||
354 | Developers who are interested in more information can go into the relevant | ||
355 | header files; there are a few low-level functions declared there which have | ||
356 | not been talked about here. Also worthwhile is the vivi driver | ||
357 | (drivers/media/video/vivi.c), which is maintained as an example of how V4L2 | ||
358 | drivers should be written. Vivi only uses the vmalloc() API, but it's good | ||
359 | enough to get started with. Note also that all of these calls are exported | ||
360 | GPL-only, so they will not be available to non-GPL kernel modules. | ||
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index e57d6a9dd32b..dca82d7c83d8 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX | |||
@@ -4,23 +4,35 @@ active_mm.txt | |||
4 | - An explanation from Linus about tsk->active_mm vs tsk->mm. | 4 | - An explanation from Linus about tsk->active_mm vs tsk->mm. |
5 | balance | 5 | balance |
6 | - various information on memory balancing. | 6 | - various information on memory balancing. |
7 | hugepage-mmap.c | ||
8 | - Example app using huge page memory with the mmap system call. | ||
9 | hugepage-shm.c | ||
10 | - Example app using huge page memory with Sys V shared memory system calls. | ||
7 | hugetlbpage.txt | 11 | hugetlbpage.txt |
8 | - a brief summary of hugetlbpage support in the Linux kernel. | 12 | - a brief summary of hugetlbpage support in the Linux kernel. |
13 | hwpoison.txt | ||
14 | - explains what hwpoison is | ||
9 | ksm.txt | 15 | ksm.txt |
10 | - how to use the Kernel Samepage Merging feature. | 16 | - how to use the Kernel Samepage Merging feature. |
11 | locking | 17 | locking |
12 | - info on how locking and synchronization is done in the Linux vm code. | 18 | - info on how locking and synchronization is done in the Linux vm code. |
19 | map_hugetlb.c | ||
20 | - an example program that uses the MAP_HUGETLB mmap flag. | ||
13 | numa | 21 | numa |
14 | - information about NUMA specific code in the Linux vm. | 22 | - information about NUMA specific code in the Linux vm. |
15 | numa_memory_policy.txt | 23 | numa_memory_policy.txt |
16 | - documentation of concepts and APIs of the 2.6 memory policy support. | 24 | - documentation of concepts and APIs of the 2.6 memory policy support. |
17 | overcommit-accounting | 25 | overcommit-accounting |
18 | - description of the Linux kernels overcommit handling modes. | 26 | - description of the Linux kernels overcommit handling modes. |
27 | page-types.c | ||
28 | - Tool for querying page flags | ||
19 | page_migration | 29 | page_migration |
20 | - description of page migration in NUMA systems. | 30 | - description of page migration in NUMA systems. |
31 | pagemap.txt | ||
32 | - pagemap, from the userspace perspective | ||
21 | slabinfo.c | 33 | slabinfo.c |
22 | - source code for a tool to get reports about slabs. | 34 | - source code for a tool to get reports about slabs. |
23 | slub.txt | 35 | slub.txt |
24 | - a short users guide for SLUB. | 36 | - a short users guide for SLUB. |
25 | map_hugetlb.c | 37 | unevictable-lru.txt |
26 | - an example program that uses the MAP_HUGETLB mmap flag. | 38 | - Unevictable LRU infrastructure |
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile index 5bd269b3731a..9dcff328b964 100644 --- a/Documentation/vm/Makefile +++ b/Documentation/vm/Makefile | |||
@@ -2,7 +2,7 @@ | |||
2 | obj- := dummy.o | 2 | obj- := dummy.o |
3 | 3 | ||
4 | # List of programs to build | 4 | # List of programs to build |
5 | hostprogs-y := slabinfo page-types | 5 | hostprogs-y := slabinfo page-types hugepage-mmap hugepage-shm map_hugetlb |
6 | 6 | ||
7 | # Tell kbuild to always build the programs | 7 | # Tell kbuild to always build the programs |
8 | always := $(hostprogs-y) | 8 | always := $(hostprogs-y) |
diff --git a/Documentation/vm/hugepage-mmap.c b/Documentation/vm/hugepage-mmap.c new file mode 100644 index 000000000000..db0dd9a33d54 --- /dev/null +++ b/Documentation/vm/hugepage-mmap.c | |||
@@ -0,0 +1,91 @@ | |||
1 | /* | ||
2 | * hugepage-mmap: | ||
3 | * | ||
4 | * Example of using huge page memory in a user application using the mmap | ||
5 | * system call. Before running this application, make sure that the | ||
6 | * administrator has mounted the hugetlbfs filesystem (on some directory | ||
7 | * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this | ||
8 | * example, the app is requesting memory of size 256MB that is backed by | ||
9 | * huge pages. | ||
10 | * | ||
11 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | ||
12 | * huge pages. That means that if one requires a fixed address, a huge page | ||
13 | * aligned address starting with 0x800000... will be required. If a fixed | ||
14 | * address is not required, the kernel will select an address in the proper | ||
15 | * range. | ||
16 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
17 | */ | ||
18 | |||
19 | #include <stdlib.h> | ||
20 | #include <stdio.h> | ||
21 | #include <unistd.h> | ||
22 | #include <sys/mman.h> | ||
23 | #include <fcntl.h> | ||
24 | |||
25 | #define FILE_NAME "/mnt/hugepagefile" | ||
26 | #define LENGTH (256UL*1024*1024) | ||
27 | #define PROTECTION (PROT_READ | PROT_WRITE) | ||
28 | |||
29 | /* Only ia64 requires this */ | ||
30 | #ifdef __ia64__ | ||
31 | #define ADDR (void *)(0x8000000000000000UL) | ||
32 | #define FLAGS (MAP_SHARED | MAP_FIXED) | ||
33 | #else | ||
34 | #define ADDR (void *)(0x0UL) | ||
35 | #define FLAGS (MAP_SHARED) | ||
36 | #endif | ||
37 | |||
38 | static void check_bytes(char *addr) | ||
39 | { | ||
40 | printf("First hex is %x\n", *((unsigned int *)addr)); | ||
41 | } | ||
42 | |||
43 | static void write_bytes(char *addr) | ||
44 | { | ||
45 | unsigned long i; | ||
46 | |||
47 | for (i = 0; i < LENGTH; i++) | ||
48 | *(addr + i) = (char)i; | ||
49 | } | ||
50 | |||
51 | static void read_bytes(char *addr) | ||
52 | { | ||
53 | unsigned long i; | ||
54 | |||
55 | check_bytes(addr); | ||
56 | for (i = 0; i < LENGTH; i++) | ||
57 | if (*(addr + i) != (char)i) { | ||
58 | printf("Mismatch at %lu\n", i); | ||
59 | break; | ||
60 | } | ||
61 | } | ||
62 | |||
63 | int main(void) | ||
64 | { | ||
65 | void *addr; | ||
66 | int fd; | ||
67 | |||
68 | fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); | ||
69 | if (fd < 0) { | ||
70 | perror("Open failed"); | ||
71 | exit(1); | ||
72 | } | ||
73 | |||
74 | addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); | ||
75 | if (addr == MAP_FAILED) { | ||
76 | perror("mmap"); | ||
77 | unlink(FILE_NAME); | ||
78 | exit(1); | ||
79 | } | ||
80 | |||
81 | printf("Returned address is %p\n", addr); | ||
82 | check_bytes(addr); | ||
83 | write_bytes(addr); | ||
84 | read_bytes(addr); | ||
85 | |||
86 | munmap(addr, LENGTH); | ||
87 | close(fd); | ||
88 | unlink(FILE_NAME); | ||
89 | |||
90 | return 0; | ||
91 | } | ||
diff --git a/Documentation/vm/hugepage-shm.c b/Documentation/vm/hugepage-shm.c new file mode 100644 index 000000000000..07956d8592c9 --- /dev/null +++ b/Documentation/vm/hugepage-shm.c | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * hugepage-shm: | ||
3 | * | ||
4 | * Example of using huge page memory in a user application using Sys V shared | ||
5 | * memory system calls. In this example the app is requesting 256MB of | ||
6 | * memory that is backed by huge pages. The application uses the flag | ||
7 | * SHM_HUGETLB in the shmget system call to inform the kernel that it is | ||
8 | * requesting huge pages. | ||
9 | * | ||
10 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | ||
11 | * huge pages. That means that if one requires a fixed address, a huge page | ||
12 | * aligned address starting with 0x800000... will be required. If a fixed | ||
13 | * address is not required, the kernel will select an address in the proper | ||
14 | * range. | ||
15 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
16 | * | ||
17 | * Note: The default shared memory limit is quite low on many kernels, | ||
18 | * you may need to increase it via: | ||
19 | * | ||
20 | * echo 268435456 > /proc/sys/kernel/shmmax | ||
21 | * | ||
22 | * This will increase the maximum size per shared memory segment to 256MB. | ||
23 | * The other limit that you will hit eventually is shmall which is the | ||
24 | * total amount of shared memory in pages. To set it to 16GB on a system | ||
25 | * with a 4kB pagesize do: | ||
26 | * | ||
27 | * echo 4194304 > /proc/sys/kernel/shmall | ||
28 | */ | ||
29 | |||
30 | #include <stdlib.h> | ||
31 | #include <stdio.h> | ||
32 | #include <sys/types.h> | ||
33 | #include <sys/ipc.h> | ||
34 | #include <sys/shm.h> | ||
35 | #include <sys/mman.h> | ||
36 | |||
37 | #ifndef SHM_HUGETLB | ||
38 | #define SHM_HUGETLB 04000 | ||
39 | #endif | ||
40 | |||
41 | #define LENGTH (256UL*1024*1024) | ||
42 | |||
43 | #define dprintf(x) printf(x) | ||
44 | |||
45 | /* Only ia64 requires this */ | ||
46 | #ifdef __ia64__ | ||
47 | #define ADDR (void *)(0x8000000000000000UL) | ||
48 | #define SHMAT_FLAGS (SHM_RND) | ||
49 | #else | ||
50 | #define ADDR (void *)(0x0UL) | ||
51 | #define SHMAT_FLAGS (0) | ||
52 | #endif | ||
53 | |||
54 | int main(void) | ||
55 | { | ||
56 | int shmid; | ||
57 | unsigned long i; | ||
58 | char *shmaddr; | ||
59 | |||
60 | if ((shmid = shmget(2, LENGTH, | ||
61 | SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { | ||
62 | perror("shmget"); | ||
63 | exit(1); | ||
64 | } | ||
65 | printf("shmid: 0x%x\n", shmid); | ||
66 | |||
67 | shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); | ||
68 | if (shmaddr == (char *)-1) { | ||
69 | perror("Shared memory attach failure"); | ||
70 | shmctl(shmid, IPC_RMID, NULL); | ||
71 | exit(2); | ||
72 | } | ||
73 | printf("shmaddr: %p\n", shmaddr); | ||
74 | |||
75 | dprintf("Starting the writes:\n"); | ||
76 | for (i = 0; i < LENGTH; i++) { | ||
77 | shmaddr[i] = (char)(i); | ||
78 | if (!(i % (1024 * 1024))) | ||
79 | dprintf("."); | ||
80 | } | ||
81 | dprintf("\n"); | ||
82 | |||
83 | dprintf("Starting the Check..."); | ||
84 | for (i = 0; i < LENGTH; i++) | ||
85 | if (shmaddr[i] != (char)i) | ||
86 | printf("\nIndex %lu mismatched\n", i); | ||
87 | dprintf("Done.\n"); | ||
88 | |||
89 | if (shmdt((const void *)shmaddr) != 0) { | ||
90 | perror("Detach failure"); | ||
91 | shmctl(shmid, IPC_RMID, NULL); | ||
92 | exit(3); | ||
93 | } | ||
94 | |||
95 | shmctl(shmid, IPC_RMID, NULL); | ||
96 | |||
97 | return 0; | ||
98 | } | ||
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index bc31636973e3..457634c1e03e 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt | |||
@@ -299,176 +299,11 @@ map_hugetlb.c. | |||
299 | ******************************************************************* | 299 | ******************************************************************* |
300 | 300 | ||
301 | /* | 301 | /* |
302 | * Example of using huge page memory in a user application using Sys V shared | 302 | * hugepage-shm: see Documentation/vm/hugepage-shm.c |
303 | * memory system calls. In this example the app is requesting 256MB of | ||
304 | * memory that is backed by huge pages. The application uses the flag | ||
305 | * SHM_HUGETLB in the shmget system call to inform the kernel that it is | ||
306 | * requesting huge pages. | ||
307 | * | ||
308 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | ||
309 | * huge pages. That means that if one requires a fixed address, a huge page | ||
310 | * aligned address starting with 0x800000... will be required. If a fixed | ||
311 | * address is not required, the kernel will select an address in the proper | ||
312 | * range. | ||
313 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
314 | * | ||
315 | * Note: The default shared memory limit is quite low on many kernels, | ||
316 | * you may need to increase it via: | ||
317 | * | ||
318 | * echo 268435456 > /proc/sys/kernel/shmmax | ||
319 | * | ||
320 | * This will increase the maximum size per shared memory segment to 256MB. | ||
321 | * The other limit that you will hit eventually is shmall which is the | ||
322 | * total amount of shared memory in pages. To set it to 16GB on a system | ||
323 | * with a 4kB pagesize do: | ||
324 | * | ||
325 | * echo 4194304 > /proc/sys/kernel/shmall | ||
326 | */ | 303 | */ |
327 | #include <stdlib.h> | ||
328 | #include <stdio.h> | ||
329 | #include <sys/types.h> | ||
330 | #include <sys/ipc.h> | ||
331 | #include <sys/shm.h> | ||
332 | #include <sys/mman.h> | ||
333 | |||
334 | #ifndef SHM_HUGETLB | ||
335 | #define SHM_HUGETLB 04000 | ||
336 | #endif | ||
337 | |||
338 | #define LENGTH (256UL*1024*1024) | ||
339 | |||
340 | #define dprintf(x) printf(x) | ||
341 | |||
342 | #define ADDR (void *)(0x0UL) /* let kernel choose address */ | ||
343 | #define SHMAT_FLAGS (0) | ||
344 | |||
345 | int main(void) | ||
346 | { | ||
347 | int shmid; | ||
348 | unsigned long i; | ||
349 | char *shmaddr; | ||
350 | |||
351 | if ((shmid = shmget(2, LENGTH, | ||
352 | SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { | ||
353 | perror("shmget"); | ||
354 | exit(1); | ||
355 | } | ||
356 | printf("shmid: 0x%x\n", shmid); | ||
357 | |||
358 | shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); | ||
359 | if (shmaddr == (char *)-1) { | ||
360 | perror("Shared memory attach failure"); | ||
361 | shmctl(shmid, IPC_RMID, NULL); | ||
362 | exit(2); | ||
363 | } | ||
364 | printf("shmaddr: %p\n", shmaddr); | ||
365 | |||
366 | dprintf("Starting the writes:\n"); | ||
367 | for (i = 0; i < LENGTH; i++) { | ||
368 | shmaddr[i] = (char)(i); | ||
369 | if (!(i % (1024 * 1024))) | ||
370 | dprintf("."); | ||
371 | } | ||
372 | dprintf("\n"); | ||
373 | |||
374 | dprintf("Starting the Check..."); | ||
375 | for (i = 0; i < LENGTH; i++) | ||
376 | if (shmaddr[i] != (char)i) | ||
377 | printf("\nIndex %lu mismatched\n", i); | ||
378 | dprintf("Done.\n"); | ||
379 | |||
380 | if (shmdt((const void *)shmaddr) != 0) { | ||
381 | perror("Detach failure"); | ||
382 | shmctl(shmid, IPC_RMID, NULL); | ||
383 | exit(3); | ||
384 | } | ||
385 | |||
386 | shmctl(shmid, IPC_RMID, NULL); | ||
387 | |||
388 | return 0; | ||
389 | } | ||
390 | 304 | ||
391 | ******************************************************************* | 305 | ******************************************************************* |
392 | 306 | ||
393 | /* | 307 | /* |
394 | * Example of using huge page memory in a user application using the mmap | 308 | * hugepage-mmap: see Documentation/vm/hugepage-mmap.c |
395 | * system call. Before running this application, make sure that the | ||
396 | * administrator has mounted the hugetlbfs filesystem (on some directory | ||
397 | * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this | ||
398 | * example, the app is requesting memory of size 256MB that is backed by | ||
399 | * huge pages. | ||
400 | * | ||
401 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | ||
402 | * huge pages. That means that if one requires a fixed address, a huge page | ||
403 | * aligned address starting with 0x800000... will be required. If a fixed | ||
404 | * address is not required, the kernel will select an address in the proper | ||
405 | * range. | ||
406 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
407 | */ | 309 | */ |
408 | #include <stdlib.h> | ||
409 | #include <stdio.h> | ||
410 | #include <unistd.h> | ||
411 | #include <sys/mman.h> | ||
412 | #include <fcntl.h> | ||
413 | |||
414 | #define FILE_NAME "/mnt/hugepagefile" | ||
415 | #define LENGTH (256UL*1024*1024) | ||
416 | #define PROTECTION (PROT_READ | PROT_WRITE) | ||
417 | |||
418 | #define ADDR (void *)(0x0UL) /* let kernel choose address */ | ||
419 | #define FLAGS (MAP_SHARED) | ||
420 | |||
421 | void check_bytes(char *addr) | ||
422 | { | ||
423 | printf("First hex is %x\n", *((unsigned int *)addr)); | ||
424 | } | ||
425 | |||
426 | void write_bytes(char *addr) | ||
427 | { | ||
428 | unsigned long i; | ||
429 | |||
430 | for (i = 0; i < LENGTH; i++) | ||
431 | *(addr + i) = (char)i; | ||
432 | } | ||
433 | |||
434 | void read_bytes(char *addr) | ||
435 | { | ||
436 | unsigned long i; | ||
437 | |||
438 | check_bytes(addr); | ||
439 | for (i = 0; i < LENGTH; i++) | ||
440 | if (*(addr + i) != (char)i) { | ||
441 | printf("Mismatch at %lu\n", i); | ||
442 | break; | ||
443 | } | ||
444 | } | ||
445 | |||
446 | int main(void) | ||
447 | { | ||
448 | void *addr; | ||
449 | int fd; | ||
450 | |||
451 | fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); | ||
452 | if (fd < 0) { | ||
453 | perror("Open failed"); | ||
454 | exit(1); | ||
455 | } | ||
456 | |||
457 | addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); | ||
458 | if (addr == MAP_FAILED) { | ||
459 | perror("mmap"); | ||
460 | unlink(FILE_NAME); | ||
461 | exit(1); | ||
462 | } | ||
463 | |||
464 | printf("Returned address is %p\n", addr); | ||
465 | check_bytes(addr); | ||
466 | write_bytes(addr); | ||
467 | read_bytes(addr); | ||
468 | |||
469 | munmap(addr, LENGTH); | ||
470 | close(fd); | ||
471 | unlink(FILE_NAME); | ||
472 | |||
473 | return 0; | ||
474 | } | ||
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c index e2bdae37f499..eda1a6d3578a 100644 --- a/Documentation/vm/map_hugetlb.c +++ b/Documentation/vm/map_hugetlb.c | |||
@@ -19,7 +19,7 @@ | |||
19 | #define PROTECTION (PROT_READ | PROT_WRITE) | 19 | #define PROTECTION (PROT_READ | PROT_WRITE) |
20 | 20 | ||
21 | #ifndef MAP_HUGETLB | 21 | #ifndef MAP_HUGETLB |
22 | #define MAP_HUGETLB 0x40 | 22 | #define MAP_HUGETLB 0x40000 /* arch specific */ |
23 | #endif | 23 | #endif |
24 | 24 | ||
25 | /* Only ia64 requires this */ | 25 | /* Only ia64 requires this */ |
@@ -31,12 +31,12 @@ | |||
31 | #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) | 31 | #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) |
32 | #endif | 32 | #endif |
33 | 33 | ||
34 | void check_bytes(char *addr) | 34 | static void check_bytes(char *addr) |
35 | { | 35 | { |
36 | printf("First hex is %x\n", *((unsigned int *)addr)); | 36 | printf("First hex is %x\n", *((unsigned int *)addr)); |
37 | } | 37 | } |
38 | 38 | ||
39 | void write_bytes(char *addr) | 39 | static void write_bytes(char *addr) |
40 | { | 40 | { |
41 | unsigned long i; | 41 | unsigned long i; |
42 | 42 | ||
@@ -44,7 +44,7 @@ void write_bytes(char *addr) | |||
44 | *(addr + i) = (char)i; | 44 | *(addr + i) = (char)i; |
45 | } | 45 | } |
46 | 46 | ||
47 | void read_bytes(char *addr) | 47 | static void read_bytes(char *addr) |
48 | { | 48 | { |
49 | unsigned long i; | 49 | unsigned long i; |
50 | 50 | ||
diff --git a/Documentation/vm/numa b/Documentation/vm/numa index e93ad9425e2a..a200a386429d 100644 --- a/Documentation/vm/numa +++ b/Documentation/vm/numa | |||
@@ -1,41 +1,149 @@ | |||
1 | Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> | 1 | Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> |
2 | 2 | ||
3 | The intent of this file is to have an uptodate, running commentary | 3 | What is NUMA? |
4 | from different people about NUMA specific code in the Linux vm. | 4 | |
5 | 5 | This question can be answered from a couple of perspectives: the | |
6 | What is NUMA? It is an architecture where the memory access times | 6 | hardware view and the Linux software view. |
7 | for different regions of memory from a given processor varies | 7 | |
8 | according to the "distance" of the memory region from the processor. | 8 | From the hardware perspective, a NUMA system is a computer platform that |
9 | Each region of memory to which access times are the same from any | 9 | comprises multiple components or assemblies each of which may contain 0 |
10 | cpu, is called a node. On such architectures, it is beneficial if | 10 | or more CPUs, local memory, and/or IO buses. For brevity and to |
11 | the kernel tries to minimize inter node communications. Schemes | 11 | disambiguate the hardware view of these physical components/assemblies |
12 | for this range from kernel text and read-only data replication | 12 | from the software abstraction thereof, we'll call the components/assemblies |
13 | across nodes, and trying to house all the data structures that | 13 | 'cells' in this document. |
14 | key components of the kernel need on memory on that node. | 14 | |
15 | 15 | Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset | |
16 | Currently, all the numa support is to provide efficient handling | 16 | of the system--although some components necessary for a stand-alone SMP system |
17 | of widely discontiguous physical memory, so architectures which | 17 | may not be populated on any given cell. The cells of the NUMA system are |
18 | are not NUMA but can have huge holes in the physical address space | 18 | connected together with some sort of system interconnect--e.g., a crossbar or |
19 | can use the same code. All this code is bracketed by CONFIG_DISCONTIGMEM. | 19 | point-to-point link are common types of NUMA system interconnects. Both of |
20 | 20 | these types of interconnects can be aggregated to create NUMA platforms with | |
21 | The initial port includes NUMAizing the bootmem allocator code by | 21 | cells at multiple distances from other cells. |
22 | encapsulating all the pieces of information into a bootmem_data_t | 22 | |
23 | structure. Node specific calls have been added to the allocator. | 23 | For Linux, the NUMA platforms of interest are primarily what is known as Cache |
24 | In theory, any platform which uses the bootmem allocator should | 24 | Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible |
25 | be able to put the bootmem and mem_map data structures anywhere | 25 | to and accessible from any CPU attached to any cell and cache coherency |
26 | it deems best. | 26 | is handled in hardware by the processor caches and/or the system interconnect. |
27 | 27 | ||
28 | Each node's page allocation data structures have also been encapsulated | 28 | Memory access time and effective memory bandwidth varies depending on how far |
29 | into a pg_data_t. The bootmem_data_t is just one part of this. To | 29 | away the cell containing the CPU or IO bus making the memory access is from the |
30 | make the code look uniform between NUMA and regular UMA platforms, | 30 | cell containing the target memory. For example, access to memory by CPUs |
31 | UMA platforms have a statically allocated pg_data_t too (contig_page_data). | 31 | attached to the same cell will experience faster access times and higher |
32 | For the sake of uniformity, the function num_online_nodes() is also defined | 32 | bandwidths than accesses to memory on other, remote cells. NUMA platforms |
33 | for all platforms. As we run benchmarks, we might decide to NUMAize | 33 | can have cells at multiple remote distances from any given cell. |
34 | more variables like low_on_memory, nr_free_pages etc into the pg_data_t. | 34 | |
35 | 35 | Platform vendors don't build NUMA systems just to make software developers' | |
36 | The NUMA aware page allocation code currently tries to allocate pages | 36 | lives interesting. Rather, this architecture is a means to provide scalable |
37 | from different nodes in a round robin manner. This will be changed to | 37 | memory bandwidth. However, to achieve scalable memory bandwidth, system and |
38 | do concentratic circle search, starting from current node, once the | 38 | application software must arrange for a large majority of the memory references |
39 | NUMA port achieves more maturity. The call alloc_pages_node has been | 39 | [cache misses] to be to "local" memory--memory on the same cell, if any--or |
40 | added, so that drivers can make the call and not worry about whether | 40 | to the closest cell with memory. |
41 | it is running on a NUMA or UMA platform. | 41 | |
42 | This leads to the Linux software view of a NUMA system: | ||
43 | |||
44 | Linux divides the system's hardware resources into multiple software | ||
45 | abstractions called "nodes". Linux maps the nodes onto the physical cells | ||
46 | of the hardware platform, abstracting away some of the details for some | ||
47 | architectures. As with physical cells, software nodes may contain 0 or more | ||
48 | CPUs, memory and/or IO buses. And, again, memory accesses to memory on | ||
49 | "closer" nodes--nodes that map to closer cells--will generally experience | ||
50 | faster access times and higher effective bandwidth than accesses to more | ||
51 | remote cells. | ||
52 | |||
53 | For some architectures, such as x86, Linux will "hide" any node representing a | ||
54 | physical cell that has no memory attached, and reassign any CPUs attached to | ||
55 | that cell to a node representing a cell that does have memory. Thus, on | ||
56 | these architectures, one cannot assume that all CPUs that Linux associates with | ||
57 | a given node will see the same local memory access times and bandwidth. | ||
58 | |||
59 | In addition, for some architectures, again x86 is an example, Linux supports | ||
60 | the emulation of additional nodes. For NUMA emulation, linux will carve up | ||
61 | the existing nodes--or the system memory for non-NUMA platforms--into multiple | ||
62 | nodes. Each emulated node will manage a fraction of the underlying cells' | ||
63 | physical memory. NUMA emluation is useful for testing NUMA kernel and | ||
64 | application features on non-NUMA platforms, and as a sort of memory resource | ||
65 | management mechanism when used together with cpusets. | ||
66 | [see Documentation/cgroups/cpusets.txt] | ||
67 | |||
68 | For each node with memory, Linux constructs an independent memory management | ||
69 | subsystem, complete with its own free page lists, in-use page lists, usage | ||
70 | statistics and locks to mediate access. In addition, Linux constructs for | ||
71 | each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], | ||
72 | an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a | ||
73 | selected zone/node cannot satisfy the allocation request. This situation, | ||
74 | when a zone has no available memory to satisfy a request, is called | ||
75 | "overflow" or "fallback". | ||
76 | |||
77 | Because some nodes contain multiple zones containing different types of | ||
78 | memory, Linux must decide whether to order the zonelists such that allocations | ||
79 | fall back to the same zone type on a different node, or to a different zone | ||
80 | type on the same node. This is an important consideration because some zones, | ||
81 | such as DMA or DMA32, represent relatively scarce resources. Linux chooses | ||
82 | a default zonelist order based on the sizes of the various zone types relative | ||
83 | to the total memory of the node and the total memory of the system. The | ||
84 | default zonelist order may be overridden using the numa_zonelist_order kernel | ||
85 | boot parameter or sysctl. [see Documentation/kernel-parameters.txt and | ||
86 | Documentation/sysctl/vm.txt] | ||
87 | |||
88 | By default, Linux will attempt to satisfy memory allocation requests from the | ||
89 | node to which the CPU that executes the request is assigned. Specifically, | ||
90 | Linux will attempt to allocate from the first node in the appropriate zonelist | ||
91 | for the node where the request originates. This is called "local allocation." | ||
92 | If the "local" node cannot satisfy the request, the kernel will examine other | ||
93 | nodes' zones in the selected zonelist looking for the first zone in the list | ||
94 | that can satisfy the request. | ||
95 | |||
96 | Local allocation will tend to keep subsequent access to the allocated memory | ||
97 | "local" to the underlying physical resources and off the system interconnect-- | ||
98 | as long as the task on whose behalf the kernel allocated some memory does not | ||
99 | later migrate away from that memory. The Linux scheduler is aware of the | ||
100 | NUMA topology of the platform--embodied in the "scheduling domains" data | ||
101 | structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler | ||
102 | attempts to minimize task migration to distant scheduling domains. However, | ||
103 | the scheduler does not take a task's NUMA footprint into account directly. | ||
104 | Thus, under sufficient imbalance, tasks can migrate between nodes, remote | ||
105 | from their initial node and kernel data structures. | ||
106 | |||
107 | System administrators and application designers can restrict a task's migration | ||
108 | to improve NUMA locality using various CPU affinity command line interfaces, | ||
109 | such as taskset(1) and numactl(1), and program interfaces such as | ||
110 | sched_setaffinity(2). Further, one can modify the kernel's default local | ||
111 | allocation behavior using Linux NUMA memory policy. | ||
112 | [see Documentation/vm/numa_memory_policy.] | ||
113 | |||
114 | System administrators can restrict the CPUs and nodes' memories that a non- | ||
115 | privileged user can specify in the scheduling or NUMA commands and functions | ||
116 | using control groups and CPUsets. [see Documentation/cgroups/CPUsets.txt] | ||
117 | |||
118 | On architectures that do not hide memoryless nodes, Linux will include only | ||
119 | zones [nodes] with memory in the zonelists. This means that for a memoryless | ||
120 | node the "local memory node"--the node of the first zone in CPU's node's | ||
121 | zonelist--will not be the node itself. Rather, it will be the node that the | ||
122 | kernel selected as the nearest node with memory when it built the zonelists. | ||
123 | So, default, local allocations will succeed with the kernel supplying the | ||
124 | closest available memory. This is a consequence of the same mechanism that | ||
125 | allows such allocations to fallback to other nearby nodes when a node that | ||
126 | does contain memory overflows. | ||
127 | |||
128 | Some kernel allocations do not want or cannot tolerate this allocation fallback | ||
129 | behavior. Rather they want to be sure they get memory from the specified node | ||
130 | or get notified that the node has no free memory. This is usually the case when | ||
131 | a subsystem allocates per CPU memory resources, for example. | ||
132 | |||
133 | A typical model for making such an allocation is to obtain the node id of the | ||
134 | node to which the "current CPU" is attached using one of the kernel's | ||
135 | numa_node_id() or CPU_to_node() functions and then request memory from only | ||
136 | the node id returned. When such an allocation fails, the requesting subsystem | ||
137 | may revert to its own fallback path. The slab kernel memory allocator is an | ||
138 | example of this. Or, the subsystem may choose to disable or not to enable | ||
139 | itself on allocation failure. The kernel profiling subsystem is an example of | ||
140 | this. | ||
141 | |||
142 | If the architecture supports--does not hide--memoryless nodes, then CPUs | ||
143 | attached to memoryless nodes would always incur the fallback path overhead | ||
144 | or some subsystems would fail to initialize if they attempted to allocated | ||
145 | memory exclusively from a node without memory. To support such | ||
146 | architectures transparently, kernel subsystems can use the numa_mem_id() | ||
147 | or cpu_to_mem() function to locate the "local memory node" for the calling or | ||
148 | specified CPU. Again, this is the same node from which default, local page | ||
149 | allocations will be attempted. | ||
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index be45dbb9d7f2..6690fc34ef6d 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt | |||
@@ -45,7 +45,7 @@ most general to most specific: | |||
45 | to establish the task policy for a child task exec()'d from an | 45 | to establish the task policy for a child task exec()'d from an |
46 | executable image that has no awareness of memory policy. See the | 46 | executable image that has no awareness of memory policy. See the |
47 | MEMORY POLICY APIS section, below, for an overview of the system call | 47 | MEMORY POLICY APIS section, below, for an overview of the system call |
48 | that a task may use to set/change it's task/process policy. | 48 | that a task may use to set/change its task/process policy. |
49 | 49 | ||
50 | In a multi-threaded task, task policies apply only to the thread | 50 | In a multi-threaded task, task policies apply only to the thread |
51 | [Linux kernel task] that installs the policy and any threads | 51 | [Linux kernel task] that installs the policy and any threads |
@@ -301,7 +301,7 @@ decrement this reference count, respectively. mpol_put() will only free | |||
301 | the structure back to the mempolicy kmem cache when the reference count | 301 | the structure back to the mempolicy kmem cache when the reference count |
302 | goes to zero. | 302 | goes to zero. |
303 | 303 | ||
304 | When a new memory policy is allocated, it's reference count is initialized | 304 | When a new memory policy is allocated, its reference count is initialized |
305 | to '1', representing the reference held by the task that is installing the | 305 | to '1', representing the reference held by the task that is installing the |
306 | new policy. When a pointer to a memory policy structure is stored in another | 306 | new policy. When a pointer to a memory policy structure is stored in another |
307 | structure, another reference is added, as the task's reference will be dropped | 307 | structure, another reference is added, as the task's reference will be dropped |
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index b37300edf27c..07375e73981a 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt | |||
@@ -41,6 +41,7 @@ Possible debug options are | |||
41 | P Poisoning (object and padding) | 41 | P Poisoning (object and padding) |
42 | U User tracking (free and alloc) | 42 | U User tracking (free and alloc) |
43 | T Trace (please only use on single slabs) | 43 | T Trace (please only use on single slabs) |
44 | A Toggle failslab filter mark for the cache | ||
44 | O Switch debugging off for caches that would have | 45 | O Switch debugging off for caches that would have |
45 | caused higher minimum slab orders | 46 | caused higher minimum slab orders |
46 | - Switch all debugging off (useful if the kernel is | 47 | - Switch all debugging off (useful if the kernel is |
diff --git a/Documentation/volatile-considered-harmful.txt b/Documentation/volatile-considered-harmful.txt index 991c26a6ef64..db0cb228d64a 100644 --- a/Documentation/volatile-considered-harmful.txt +++ b/Documentation/volatile-considered-harmful.txt | |||
@@ -63,9 +63,9 @@ way to perform a busy wait is: | |||
63 | cpu_relax(); | 63 | cpu_relax(); |
64 | 64 | ||
65 | The cpu_relax() call can lower CPU power consumption or yield to a | 65 | The cpu_relax() call can lower CPU power consumption or yield to a |
66 | hyperthreaded twin processor; it also happens to serve as a memory barrier, | 66 | hyperthreaded twin processor; it also happens to serve as a compiler |
67 | so, once again, volatile is unnecessary. Of course, busy-waiting is | 67 | barrier, so, once again, volatile is unnecessary. Of course, busy- |
68 | generally an anti-social act to begin with. | 68 | waiting is generally an anti-social act to begin with. |
69 | 69 | ||
70 | There are still a few rare situations where volatile makes sense in the | 70 | There are still a few rare situations where volatile makes sense in the |
71 | kernel: | 71 | kernel: |
diff --git a/Documentation/voyager.txt b/Documentation/voyager.txt deleted file mode 100644 index 2749af552cdf..000000000000 --- a/Documentation/voyager.txt +++ /dev/null | |||
@@ -1,95 +0,0 @@ | |||
1 | Running Linux on the Voyager Architecture | ||
2 | ========================================= | ||
3 | |||
4 | For full details and current project status, see | ||
5 | |||
6 | http://www.hansenpartnership.com/voyager | ||
7 | |||
8 | The voyager architecture was designed by NCR in the mid 80s to be a | ||
9 | fully SMP capable RAS computing architecture built around intel's 486 | ||
10 | chip set. The voyager came in three levels of architectural | ||
11 | sophistication: 3,4 and 5 --- 1 and 2 never made it out of prototype. | ||
12 | The linux patches support only the Level 5 voyager architecture (any | ||
13 | machine class 3435 and above). | ||
14 | |||
15 | The Voyager Architecture | ||
16 | ------------------------ | ||
17 | |||
18 | Voyager machines consist of a Baseboard with a 386 diagnostic | ||
19 | processor, a Power Supply Interface (PSI) a Primary and possibly | ||
20 | Secondary Microchannel bus and between 2 and 20 voyager slots. The | ||
21 | voyager slots can be populated with memory and cpu cards (up to 4GB | ||
22 | memory and from 1 486 to 32 Pentium Pro processors). Internally, the | ||
23 | voyager has a dual arbitrated system bus and a configuration and test | ||
24 | bus (CAT). The voyager bus speed is 40MHz. Therefore (since all | ||
25 | voyager cards are dual ported for each system bus) the maximum | ||
26 | transfer rate is 320Mb/s but only if you have your slot configuration | ||
27 | tuned (only memory cards can communicate with both busses at once, CPU | ||
28 | cards utilise them one at a time). | ||
29 | |||
30 | Voyager SMP | ||
31 | ----------- | ||
32 | |||
33 | Since voyager was the first intel based SMP system, it is slightly | ||
34 | more primitive than the Intel IO-APIC approach to SMP. Voyager allows | ||
35 | arbitrary interrupt routing (including processor affinity routing) of | ||
36 | all 16 PC type interrupts. However it does this by using a modified | ||
37 | 5259 master/slave chip set instead of an APIC bus. Additionally, | ||
38 | voyager supports Cross Processor Interrupts (CPI) equivalent to the | ||
39 | APIC IPIs. There are two routed voyager interrupt lines provided to | ||
40 | each slot. | ||
41 | |||
42 | Processor Cards | ||
43 | --------------- | ||
44 | |||
45 | These come in single, dyadic and quad configurations (the quads are | ||
46 | problematic--see later). The maximum configuration is 8 quad cards | ||
47 | for 32 way SMP. | ||
48 | |||
49 | Quad Processors | ||
50 | --------------- | ||
51 | |||
52 | Because voyager only supplies two interrupt lines to each Processor | ||
53 | card, the Quad processors have to be configured (and Bootstrapped) in | ||
54 | as a pair of Master/Slave processors. | ||
55 | |||
56 | In fact, most Quad cards only accept one VIC interrupt line, so they | ||
57 | have one interrupt handling processor (called the VIC extended | ||
58 | processor) and three non-interrupt handling processors. | ||
59 | |||
60 | Current Status | ||
61 | -------------- | ||
62 | |||
63 | The System will boot on Mono, Dyad and Quad cards. There was | ||
64 | originally a Quad boot problem which has been fixed by proper gdt | ||
65 | alignment in the initial boot loader. If you still cannot get your | ||
66 | voyager system to boot, email me at: | ||
67 | |||
68 | <J.E.J.Bottomley@HansenPartnership.com> | ||
69 | |||
70 | |||
71 | The Quad cards now support using the separate Quad CPI vectors instead | ||
72 | of going through the VIC mailbox system. | ||
73 | |||
74 | The Level 4 architecture (3430 and 3360 Machines) should also work | ||
75 | fine. | ||
76 | |||
77 | Dump Switch | ||
78 | ----------- | ||
79 | |||
80 | The voyager dump switch sends out a broadcast NMI which the voyager | ||
81 | code intercepts and does a task dump. | ||
82 | |||
83 | Power Switch | ||
84 | ------------ | ||
85 | |||
86 | The front panel power switch is intercepted by the kernel and should | ||
87 | cause a system shutdown and power off. | ||
88 | |||
89 | A Note About Mixed CPU Systems | ||
90 | ------------------------------ | ||
91 | |||
92 | Linux isn't designed to handle mixed CPU systems very well. In order | ||
93 | to get everything going you *must* make sure that your lowest | ||
94 | capability CPU is used for booting. Also, mixing CPU classes | ||
95 | (e.g. 486 and 586) is really not going to work very well at all. | ||
diff --git a/Documentation/w1/w1.generic b/Documentation/w1/w1.generic index e3333eec4320..212f4ac31c01 100644 --- a/Documentation/w1/w1.generic +++ b/Documentation/w1/w1.generic | |||
@@ -25,7 +25,7 @@ When a w1 master driver registers with the w1 subsystem, the following occurs: | |||
25 | - sysfs entries for that w1 master are created | 25 | - sysfs entries for that w1 master are created |
26 | - the w1 bus is periodically searched for new slave devices | 26 | - the w1 bus is periodically searched for new slave devices |
27 | 27 | ||
28 | When a device is found on the bus, w1 core checks if driver for it's family is | 28 | When a device is found on the bus, w1 core checks if driver for its family is |
29 | loaded. If so, the family driver is attached to the slave. | 29 | loaded. If so, the family driver is attached to the slave. |
30 | If there is no driver for the family, default one is assigned, which allows to perform | 30 | If there is no driver for the family, default one is assigned, which allows to perform |
31 | almost any kind of operations. Each logical operation is a transaction | 31 | almost any kind of operations. Each logical operation is a transaction |
diff --git a/Documentation/watchdog/00-INDEX b/Documentation/watchdog/00-INDEX index c3ea47e507fe..ee994513a9b1 100644 --- a/Documentation/watchdog/00-INDEX +++ b/Documentation/watchdog/00-INDEX | |||
@@ -1,10 +1,15 @@ | |||
1 | 00-INDEX | 1 | 00-INDEX |
2 | - this file. | 2 | - this file. |
3 | hpwdt.txt | ||
4 | - information on the HP iLO2 NMI watchdog | ||
3 | pcwd-watchdog.txt | 5 | pcwd-watchdog.txt |
4 | - documentation for Berkshire Products PC Watchdog ISA cards. | 6 | - documentation for Berkshire Products PC Watchdog ISA cards. |
5 | src/ | 7 | src/ |
6 | - directory holding watchdog related example programs. | 8 | - directory holding watchdog related example programs. |
7 | watchdog-api.txt | 9 | watchdog-api.txt |
8 | - description of the Linux Watchdog driver API. | 10 | - description of the Linux Watchdog driver API. |
11 | watchdog-parameters.txt | ||
12 | - information on driver parameters (for drivers other than | ||
13 | the ones that have driver-specific files here) | ||
9 | wdt.txt | 14 | wdt.txt |
10 | - description of the Watchdog Timer Interfaces for Linux. | 15 | - description of the Watchdog Timer Interfaces for Linux. |
diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c index 4cf72f3fa8e9..ba45803a2216 100644 --- a/Documentation/watchdog/src/watchdog-simple.c +++ b/Documentation/watchdog/src/watchdog-simple.c | |||
@@ -17,9 +17,6 @@ int main(void) | |||
17 | ret = -1; | 17 | ret = -1; |
18 | break; | 18 | break; |
19 | } | 19 | } |
20 | ret = fsync(fd); | ||
21 | if (ret) | ||
22 | break; | ||
23 | sleep(10); | 20 | sleep(10); |
24 | } | 21 | } |
25 | close(fd); | 22 | close(fd); |
diff --git a/Documentation/watchdog/src/watchdog-test.c b/Documentation/watchdog/src/watchdog-test.c index a750532ffcf8..63fdc34ceb98 100644 --- a/Documentation/watchdog/src/watchdog-test.c +++ b/Documentation/watchdog/src/watchdog-test.c | |||
@@ -31,6 +31,8 @@ static void keep_alive(void) | |||
31 | */ | 31 | */ |
32 | int main(int argc, char *argv[]) | 32 | int main(int argc, char *argv[]) |
33 | { | 33 | { |
34 | int flags; | ||
35 | |||
34 | fd = open("/dev/watchdog", O_WRONLY); | 36 | fd = open("/dev/watchdog", O_WRONLY); |
35 | 37 | ||
36 | if (fd == -1) { | 38 | if (fd == -1) { |
@@ -41,12 +43,14 @@ int main(int argc, char *argv[]) | |||
41 | 43 | ||
42 | if (argc > 1) { | 44 | if (argc > 1) { |
43 | if (!strncasecmp(argv[1], "-d", 2)) { | 45 | if (!strncasecmp(argv[1], "-d", 2)) { |
44 | ioctl(fd, WDIOC_SETOPTIONS, WDIOS_DISABLECARD); | 46 | flags = WDIOS_DISABLECARD; |
47 | ioctl(fd, WDIOC_SETOPTIONS, &flags); | ||
45 | fprintf(stderr, "Watchdog card disabled.\n"); | 48 | fprintf(stderr, "Watchdog card disabled.\n"); |
46 | fflush(stderr); | 49 | fflush(stderr); |
47 | exit(0); | 50 | exit(0); |
48 | } else if (!strncasecmp(argv[1], "-e", 2)) { | 51 | } else if (!strncasecmp(argv[1], "-e", 2)) { |
49 | ioctl(fd, WDIOC_SETOPTIONS, WDIOS_ENABLECARD); | 52 | flags = WDIOS_ENABLECARD; |
53 | ioctl(fd, WDIOC_SETOPTIONS, &flags); | ||
50 | fprintf(stderr, "Watchdog card enabled.\n"); | 54 | fprintf(stderr, "Watchdog card enabled.\n"); |
51 | fflush(stderr); | 55 | fflush(stderr); |
52 | exit(0); | 56 | exit(0); |
diff --git a/Documentation/watchdog/watchdog-api.txt b/Documentation/watchdog/watchdog-api.txt index 4cc4ba9d7150..eb7132ed8bbc 100644 --- a/Documentation/watchdog/watchdog-api.txt +++ b/Documentation/watchdog/watchdog-api.txt | |||
@@ -222,11 +222,10 @@ returned value is the temperature in degrees fahrenheit. | |||
222 | ioctl(fd, WDIOC_GETTEMP, &temperature); | 222 | ioctl(fd, WDIOC_GETTEMP, &temperature); |
223 | 223 | ||
224 | Finally the SETOPTIONS ioctl can be used to control some aspects of | 224 | Finally the SETOPTIONS ioctl can be used to control some aspects of |
225 | the cards operation; right now the pcwd driver is the only one | 225 | the cards operation. |
226 | supporting this ioctl. | ||
227 | 226 | ||
228 | int options = 0; | 227 | int options = 0; |
229 | ioctl(fd, WDIOC_SETOPTIONS, options); | 228 | ioctl(fd, WDIOC_SETOPTIONS, &options); |
230 | 229 | ||
231 | The following options are available: | 230 | The following options are available: |
232 | 231 | ||
diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt new file mode 100644 index 000000000000..41c95cc1dc1f --- /dev/null +++ b/Documentation/watchdog/watchdog-parameters.txt | |||
@@ -0,0 +1,390 @@ | |||
1 | This file provides information on the module parameters of many of | ||
2 | the Linux watchdog drivers. Watchdog driver parameter specs should | ||
3 | be listed here unless the driver has its own driver-specific information | ||
4 | file. | ||
5 | |||
6 | |||
7 | See Documentation/kernel-parameters.txt for information on | ||
8 | providing kernel parameters for builtin drivers versus loadable | ||
9 | modules. | ||
10 | |||
11 | |||
12 | ------------------------------------------------- | ||
13 | acquirewdt: | ||
14 | wdt_stop: Acquire WDT 'stop' io port (default 0x43) | ||
15 | wdt_start: Acquire WDT 'start' io port (default 0x443) | ||
16 | nowayout: Watchdog cannot be stopped once started | ||
17 | (default=kernel config parameter) | ||
18 | ------------------------------------------------- | ||
19 | advantechwdt: | ||
20 | wdt_stop: Advantech WDT 'stop' io port (default 0x443) | ||
21 | wdt_start: Advantech WDT 'start' io port (default 0x443) | ||
22 | timeout: Watchdog timeout in seconds. 1<= timeout <=63, default=60. | ||
23 | nowayout: Watchdog cannot be stopped once started | ||
24 | (default=kernel config parameter) | ||
25 | ------------------------------------------------- | ||
26 | alim1535_wdt: | ||
27 | timeout: Watchdog timeout in seconds. (0 < timeout < 18000, default=60 | ||
28 | nowayout: Watchdog cannot be stopped once started | ||
29 | (default=kernel config parameter) | ||
30 | ------------------------------------------------- | ||
31 | alim7101_wdt: | ||
32 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30 | ||
33 | use_gpio: Use the gpio watchdog (required by old cobalt boards). | ||
34 | default=0/off/no | ||
35 | nowayout: Watchdog cannot be stopped once started | ||
36 | (default=kernel config parameter) | ||
37 | ------------------------------------------------- | ||
38 | ar7_wdt: | ||
39 | margin: Watchdog margin in seconds (default=60) | ||
40 | nowayout: Disable watchdog shutdown on close | ||
41 | (default=kernel config parameter) | ||
42 | ------------------------------------------------- | ||
43 | at32ap700x_wdt: | ||
44 | timeout: Timeout value. Limited to be 1 or 2 seconds. (default=2) | ||
45 | nowayout: Watchdog cannot be stopped once started | ||
46 | (default=kernel config parameter) | ||
47 | ------------------------------------------------- | ||
48 | at91rm9200_wdt: | ||
49 | wdt_time: Watchdog time in seconds. (default=5) | ||
50 | nowayout: Watchdog cannot be stopped once started | ||
51 | (default=kernel config parameter) | ||
52 | ------------------------------------------------- | ||
53 | at91sam9_wdt: | ||
54 | heartbeat: Watchdog heartbeats in seconds. (default = 15) | ||
55 | nowayout: Watchdog cannot be stopped once started | ||
56 | (default=kernel config parameter) | ||
57 | ------------------------------------------------- | ||
58 | bcm47xx_wdt: | ||
59 | wdt_time: Watchdog time in seconds. (default=30) | ||
60 | nowayout: Watchdog cannot be stopped once started | ||
61 | (default=kernel config parameter) | ||
62 | ------------------------------------------------- | ||
63 | bfin_wdt: | ||
64 | timeout: Watchdog timeout in seconds. (1<=timeout<=((2^32)/SCLK), default=20) | ||
65 | nowayout: Watchdog cannot be stopped once started | ||
66 | (default=kernel config parameter) | ||
67 | ------------------------------------------------- | ||
68 | coh901327_wdt: | ||
69 | margin: Watchdog margin in seconds (default 60s) | ||
70 | ------------------------------------------------- | ||
71 | cpu5wdt: | ||
72 | port: base address of watchdog card, default is 0x91 | ||
73 | verbose: be verbose, default is 0 (no) | ||
74 | ticks: count down ticks, default is 10000 | ||
75 | ------------------------------------------------- | ||
76 | cpwd: | ||
77 | wd0_timeout: Default watchdog0 timeout in 1/10secs | ||
78 | wd1_timeout: Default watchdog1 timeout in 1/10secs | ||
79 | wd2_timeout: Default watchdog2 timeout in 1/10secs | ||
80 | ------------------------------------------------- | ||
81 | davinci_wdt: | ||
82 | heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60 | ||
83 | ------------------------------------------------- | ||
84 | ep93xx_wdt: | ||
85 | nowayout: Watchdog cannot be stopped once started | ||
86 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD) | ||
87 | ------------------------------------------------- | ||
88 | eurotechwdt: | ||
89 | nowayout: Watchdog cannot be stopped once started | ||
90 | (default=kernel config parameter) | ||
91 | io: Eurotech WDT io port (default=0x3f0) | ||
92 | irq: Eurotech WDT irq (default=10) | ||
93 | ev: Eurotech WDT event type (default is `int') | ||
94 | ------------------------------------------------- | ||
95 | gef_wdt: | ||
96 | nowayout: Watchdog cannot be stopped once started | ||
97 | (default=kernel config parameter) | ||
98 | ------------------------------------------------- | ||
99 | geodewdt: | ||
100 | timeout: Watchdog timeout in seconds. 1<= timeout <=131, default=60. | ||
101 | nowayout: Watchdog cannot be stopped once started | ||
102 | (default=kernel config parameter) | ||
103 | ------------------------------------------------- | ||
104 | i6300esb: | ||
105 | heartbeat: Watchdog heartbeat in seconds. (1<heartbeat<2046, default=30) | ||
106 | nowayout: Watchdog cannot be stopped once started | ||
107 | (default=kernel config parameter) | ||
108 | ------------------------------------------------- | ||
109 | iTCO_wdt: | ||
110 | heartbeat: Watchdog heartbeat in seconds. | ||
111 | (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30) | ||
112 | nowayout: Watchdog cannot be stopped once started | ||
113 | (default=kernel config parameter) | ||
114 | ------------------------------------------------- | ||
115 | iTCO_vendor_support: | ||
116 | vendorsupport: iTCO vendor specific support mode, default=0 (none), | ||
117 | 1=SuperMicro Pent3, 2=SuperMicro Pent4+, 911=Broken SMI BIOS | ||
118 | ------------------------------------------------- | ||
119 | ib700wdt: | ||
120 | timeout: Watchdog timeout in seconds. 0<= timeout <=30, default=30. | ||
121 | nowayout: Watchdog cannot be stopped once started | ||
122 | (default=kernel config parameter) | ||
123 | ------------------------------------------------- | ||
124 | ibmasr: | ||
125 | nowayout: Watchdog cannot be stopped once started | ||
126 | (default=kernel config parameter) | ||
127 | ------------------------------------------------- | ||
128 | indydog: | ||
129 | nowayout: Watchdog cannot be stopped once started | ||
130 | (default=kernel config parameter) | ||
131 | ------------------------------------------------- | ||
132 | iop_wdt: | ||
133 | nowayout: Watchdog cannot be stopped once started | ||
134 | (default=kernel config parameter) | ||
135 | ------------------------------------------------- | ||
136 | it8712f_wdt: | ||
137 | margin: Watchdog margin in seconds (default 60) | ||
138 | nowayout: Disable watchdog shutdown on close | ||
139 | (default=kernel config parameter) | ||
140 | ------------------------------------------------- | ||
141 | it87_wdt: | ||
142 | nogameport: Forbid the activation of game port, default=0 | ||
143 | exclusive: Watchdog exclusive device open, default=1 | ||
144 | timeout: Watchdog timeout in seconds, default=60 | ||
145 | testmode: Watchdog test mode (1 = no reboot), default=0 | ||
146 | nowayout: Watchdog cannot be stopped once started | ||
147 | (default=kernel config parameter) | ||
148 | ------------------------------------------------- | ||
149 | ixp2000_wdt: | ||
150 | heartbeat: Watchdog heartbeat in seconds (default 60s) | ||
151 | nowayout: Watchdog cannot be stopped once started | ||
152 | (default=kernel config parameter) | ||
153 | ------------------------------------------------- | ||
154 | ixp4xx_wdt: | ||
155 | heartbeat: Watchdog heartbeat in seconds (default 60s) | ||
156 | nowayout: Watchdog cannot be stopped once started | ||
157 | (default=kernel config parameter) | ||
158 | ------------------------------------------------- | ||
159 | ks8695_wdt: | ||
160 | wdt_time: Watchdog time in seconds. (default=5) | ||
161 | nowayout: Watchdog cannot be stopped once started | ||
162 | (default=kernel config parameter) | ||
163 | ------------------------------------------------- | ||
164 | machzwd: | ||
165 | nowayout: Watchdog cannot be stopped once started | ||
166 | (default=kernel config parameter) | ||
167 | action: after watchdog resets, generate: | ||
168 | 0 = RESET(*) 1 = SMI 2 = NMI 3 = SCI | ||
169 | ------------------------------------------------- | ||
170 | max63xx_wdt: | ||
171 | heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 60 | ||
172 | nowayout: Watchdog cannot be stopped once started | ||
173 | (default=kernel config parameter) | ||
174 | nodelay: Force selection of a timeout setting without initial delay | ||
175 | (max6373/74 only, default=0) | ||
176 | ------------------------------------------------- | ||
177 | mixcomwd: | ||
178 | nowayout: Watchdog cannot be stopped once started | ||
179 | (default=kernel config parameter) | ||
180 | ------------------------------------------------- | ||
181 | mpc8xxx_wdt: | ||
182 | timeout: Watchdog timeout in ticks. (0<timeout<65536, default=65535) | ||
183 | reset: Watchdog Interrupt/Reset Mode. 0 = interrupt, 1 = reset | ||
184 | nowayout: Watchdog cannot be stopped once started | ||
185 | (default=kernel config parameter) | ||
186 | ------------------------------------------------- | ||
187 | mpcore_wdt: | ||
188 | mpcore_margin: MPcore timer margin in seconds. | ||
189 | (0 < mpcore_margin < 65536, default=60) | ||
190 | nowayout: Watchdog cannot be stopped once started | ||
191 | (default=kernel config parameter) | ||
192 | mpcore_noboot: MPcore watchdog action, set to 1 to ignore reboots, | ||
193 | 0 to reboot (default=0 | ||
194 | ------------------------------------------------- | ||
195 | mv64x60_wdt: | ||
196 | nowayout: Watchdog cannot be stopped once started | ||
197 | (default=kernel config parameter) | ||
198 | ------------------------------------------------- | ||
199 | nuc900_wdt: | ||
200 | heartbeat: Watchdog heartbeats in seconds. | ||
201 | (default = 15) | ||
202 | nowayout: Watchdog cannot be stopped once started | ||
203 | (default=kernel config parameter) | ||
204 | ------------------------------------------------- | ||
205 | omap_wdt: | ||
206 | timer_margin: initial watchdog timeout (in seconds) | ||
207 | ------------------------------------------------- | ||
208 | orion_wdt: | ||
209 | heartbeat: Initial watchdog heartbeat in seconds | ||
210 | nowayout: Watchdog cannot be stopped once started | ||
211 | (default=kernel config parameter) | ||
212 | ------------------------------------------------- | ||
213 | pc87413_wdt: | ||
214 | io: pc87413 WDT I/O port (default: io). | ||
215 | timeout: Watchdog timeout in minutes (default=timeout). | ||
216 | nowayout: Watchdog cannot be stopped once started | ||
217 | (default=kernel config parameter) | ||
218 | ------------------------------------------------- | ||
219 | pika_wdt: | ||
220 | heartbeat: Watchdog heartbeats in seconds. (default = 15) | ||
221 | nowayout: Watchdog cannot be stopped once started | ||
222 | (default=kernel config parameter) | ||
223 | ------------------------------------------------- | ||
224 | pnx4008_wdt: | ||
225 | heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 19 | ||
226 | nowayout: Set to 1 to keep watchdog running after device release | ||
227 | ------------------------------------------------- | ||
228 | pnx833x_wdt: | ||
229 | timeout: Watchdog timeout in Mhz. (68Mhz clock), default=2040000000 (30 seconds) | ||
230 | nowayout: Watchdog cannot be stopped once started | ||
231 | (default=kernel config parameter) | ||
232 | start_enabled: Watchdog is started on module insertion (default=1) | ||
233 | ------------------------------------------------- | ||
234 | rc32434_wdt: | ||
235 | timeout: Watchdog timeout value, in seconds (default=20) | ||
236 | nowayout: Watchdog cannot be stopped once started | ||
237 | (default=kernel config parameter) | ||
238 | ------------------------------------------------- | ||
239 | riowd: | ||
240 | riowd_timeout: Watchdog timeout in minutes (default=1) | ||
241 | ------------------------------------------------- | ||
242 | s3c2410_wdt: | ||
243 | tmr_margin: Watchdog tmr_margin in seconds. (default=15) | ||
244 | tmr_atboot: Watchdog is started at boot time if set to 1, default=0 | ||
245 | nowayout: Watchdog cannot be stopped once started | ||
246 | (default=kernel config parameter) | ||
247 | soft_noboot: Watchdog action, set to 1 to ignore reboots, 0 to reboot | ||
248 | debug: Watchdog debug, set to >1 for debug, (default 0) | ||
249 | ------------------------------------------------- | ||
250 | sa1100_wdt: | ||
251 | margin: Watchdog margin in seconds (default 60s) | ||
252 | ------------------------------------------------- | ||
253 | sb_wdog: | ||
254 | timeout: Watchdog timeout in microseconds (max/default 8388607 or 8.3ish secs) | ||
255 | ------------------------------------------------- | ||
256 | sbc60xxwdt: | ||
257 | wdt_stop: SBC60xx WDT 'stop' io port (default 0x45) | ||
258 | wdt_start: SBC60xx WDT 'start' io port (default 0x443) | ||
259 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30) | ||
260 | nowayout: Watchdog cannot be stopped once started | ||
261 | (default=kernel config parameter) | ||
262 | ------------------------------------------------- | ||
263 | sbc7240_wdt: | ||
264 | timeout: Watchdog timeout in seconds. (1<=timeout<=255, default=30) | ||
265 | nowayout: Disable watchdog when closing device file | ||
266 | ------------------------------------------------- | ||
267 | sbc8360: | ||
268 | timeout: Index into timeout table (0-63) (default=27 (60s)) | ||
269 | nowayout: Watchdog cannot be stopped once started | ||
270 | (default=kernel config parameter) | ||
271 | ------------------------------------------------- | ||
272 | sbc_epx_c3: | ||
273 | nowayout: Watchdog cannot be stopped once started | ||
274 | (default=kernel config parameter) | ||
275 | ------------------------------------------------- | ||
276 | sbc_fitpc2_wdt: | ||
277 | margin: Watchdog margin in seconds (default 60s) | ||
278 | nowayout: Watchdog cannot be stopped once started | ||
279 | ------------------------------------------------- | ||
280 | sc1200wdt: | ||
281 | isapnp: When set to 0 driver ISA PnP support will be disabled (default=1) | ||
282 | io: io port | ||
283 | timeout: range is 0-255 minutes, default is 1 | ||
284 | nowayout: Watchdog cannot be stopped once started | ||
285 | (default=kernel config parameter) | ||
286 | ------------------------------------------------- | ||
287 | sc520_wdt: | ||
288 | timeout: Watchdog timeout in seconds. (1 <= timeout <= 3600, default=30) | ||
289 | nowayout: Watchdog cannot be stopped once started | ||
290 | (default=kernel config parameter) | ||
291 | ------------------------------------------------- | ||
292 | sch311x_wdt: | ||
293 | force_id: Override the detected device ID | ||
294 | therm_trip: Should a ThermTrip trigger the reset generator | ||
295 | timeout: Watchdog timeout in seconds. 1<= timeout <=15300, default=60 | ||
296 | nowayout: Watchdog cannot be stopped once started | ||
297 | (default=kernel config parameter) | ||
298 | ------------------------------------------------- | ||
299 | scx200_wdt: | ||
300 | margin: Watchdog margin in seconds | ||
301 | nowayout: Disable watchdog shutdown on close | ||
302 | ------------------------------------------------- | ||
303 | shwdt: | ||
304 | clock_division_ratio: Clock division ratio. Valid ranges are from 0x5 (1.31ms) | ||
305 | to 0x7 (5.25ms). (default=7) | ||
306 | heartbeat: Watchdog heartbeat in seconds. (1 <= heartbeat <= 3600, default=30 | ||
307 | nowayout: Watchdog cannot be stopped once started | ||
308 | (default=kernel config parameter) | ||
309 | ------------------------------------------------- | ||
310 | smsc37b787_wdt: | ||
311 | timeout: range is 1-255 units, default is 60 | ||
312 | nowayout: Watchdog cannot be stopped once started | ||
313 | (default=kernel config parameter) | ||
314 | ------------------------------------------------- | ||
315 | softdog: | ||
316 | soft_margin: Watchdog soft_margin in seconds. | ||
317 | (0 < soft_margin < 65536, default=60) | ||
318 | nowayout: Watchdog cannot be stopped once started | ||
319 | (default=kernel config parameter) | ||
320 | soft_noboot: Softdog action, set to 1 to ignore reboots, 0 to reboot | ||
321 | (default=0) | ||
322 | ------------------------------------------------- | ||
323 | stmp3xxx_wdt: | ||
324 | heartbeat: Watchdog heartbeat period in seconds from 1 to 4194304, default 19 | ||
325 | ------------------------------------------------- | ||
326 | ts72xx_wdt: | ||
327 | timeout: Watchdog timeout in seconds. (1 <= timeout <= 8, default=8) | ||
328 | nowayout: Disable watchdog shutdown on close | ||
329 | ------------------------------------------------- | ||
330 | twl4030_wdt: | ||
331 | nowayout: Watchdog cannot be stopped once started | ||
332 | (default=kernel config parameter) | ||
333 | ------------------------------------------------- | ||
334 | txx9wdt: | ||
335 | timeout: Watchdog timeout in seconds. (0<timeout<N, default=60) | ||
336 | nowayout: Watchdog cannot be stopped once started | ||
337 | (default=kernel config parameter) | ||
338 | ------------------------------------------------- | ||
339 | w83627hf_wdt: | ||
340 | wdt_io: w83627hf/thf WDT io port (default 0x2E) | ||
341 | timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60. | ||
342 | nowayout: Watchdog cannot be stopped once started | ||
343 | (default=kernel config parameter) | ||
344 | ------------------------------------------------- | ||
345 | w83697hf_wdt: | ||
346 | wdt_io: w83697hf/hg WDT io port (default 0x2e, 0 = autodetect) | ||
347 | timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60) | ||
348 | nowayout: Watchdog cannot be stopped once started | ||
349 | (default=kernel config parameter) | ||
350 | early_disable: Watchdog gets disabled at boot time (default=1) | ||
351 | ------------------------------------------------- | ||
352 | w83697ug_wdt: | ||
353 | wdt_io: w83697ug/uf WDT io port (default 0x2e) | ||
354 | timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60) | ||
355 | nowayout: Watchdog cannot be stopped once started | ||
356 | (default=kernel config parameter) | ||
357 | ------------------------------------------------- | ||
358 | w83877f_wdt: | ||
359 | timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30) | ||
360 | nowayout: Watchdog cannot be stopped once started | ||
361 | (default=kernel config parameter) | ||
362 | ------------------------------------------------- | ||
363 | w83977f_wdt: | ||
364 | timeout: Watchdog timeout in seconds (15..7635), default=45) | ||
365 | testmode: Watchdog testmode (1 = no reboot), default=0 | ||
366 | nowayout: Watchdog cannot be stopped once started | ||
367 | (default=kernel config parameter) | ||
368 | ------------------------------------------------- | ||
369 | wafer5823wdt: | ||
370 | timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60. | ||
371 | nowayout: Watchdog cannot be stopped once started | ||
372 | (default=kernel config parameter) | ||
373 | ------------------------------------------------- | ||
374 | wdt285: | ||
375 | soft_margin: Watchdog timeout in seconds (default=60) | ||
376 | ------------------------------------------------- | ||
377 | wdt977: | ||
378 | timeout: Watchdog timeout in seconds (60..15300, default=60) | ||
379 | testmode: Watchdog testmode (1 = no reboot), default=0 | ||
380 | nowayout: Watchdog cannot be stopped once started | ||
381 | (default=kernel config parameter) | ||
382 | ------------------------------------------------- | ||
383 | wm831x_wdt: | ||
384 | nowayout: Watchdog cannot be stopped once started | ||
385 | (default=kernel config parameter) | ||
386 | ------------------------------------------------- | ||
387 | wm8350_wdt: | ||
388 | nowayout: Watchdog cannot be stopped once started | ||
389 | (default=kernel config parameter) | ||
390 | ------------------------------------------------- | ||
diff --git a/Documentation/watchdog/wdt.txt b/Documentation/watchdog/wdt.txt index 03fd756d976d..061c2e35384f 100644 --- a/Documentation/watchdog/wdt.txt +++ b/Documentation/watchdog/wdt.txt | |||
@@ -14,14 +14,22 @@ reboot will depend on the state of the machines and interrupts. The hardware | |||
14 | boards physically pull the machine down off their own onboard timers and | 14 | boards physically pull the machine down off their own onboard timers and |
15 | will reboot from almost anything. | 15 | will reboot from almost anything. |
16 | 16 | ||
17 | A second temperature monitoring interface is available on the WDT501P cards | 17 | A second temperature monitoring interface is available on the WDT501P cards. |
18 | This provides /dev/temperature. This is the machine internal temperature in | 18 | This provides /dev/temperature. This is the machine internal temperature in |
19 | degrees Fahrenheit. Each read returns a single byte giving the temperature. | 19 | degrees Fahrenheit. Each read returns a single byte giving the temperature. |
20 | 20 | ||
21 | The third interface logs kernel messages on additional alert events. | 21 | The third interface logs kernel messages on additional alert events. |
22 | 22 | ||
23 | The wdt card cannot be safely probed for. Instead you need to pass | 23 | The ICS ISA-bus wdt card cannot be safely probed for. Instead you need to |
24 | wdt=ioaddr,irq as a boot parameter - eg "wdt=0x240,11". | 24 | pass IO address and IRQ boot parameters. E.g.: |
25 | wdt.io=0x240 wdt.irq=11 | ||
26 | |||
27 | Other "wdt" driver parameters are: | ||
28 | heartbeat Watchdog heartbeat in seconds (default 60) | ||
29 | nowayout Watchdog cannot be stopped once started (kernel | ||
30 | build parameter) | ||
31 | tachometer WDT501-P Fan Tachometer support (0=disable, default=0) | ||
32 | type WDT501-P Card type (500 or 501, default=500) | ||
25 | 33 | ||
26 | Features | 34 | Features |
27 | -------- | 35 | -------- |
@@ -40,4 +48,3 @@ Minor numbers are however allocated for it. | |||
40 | 48 | ||
41 | 49 | ||
42 | Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c | 50 | Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c |
43 | |||
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 29a6ff8bc7d3..7fbbaf85f5b7 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt | |||
@@ -166,19 +166,13 @@ NUMA | |||
166 | 166 | ||
167 | numa=noacpi Don't parse the SRAT table for NUMA setup | 167 | numa=noacpi Don't parse the SRAT table for NUMA setup |
168 | 168 | ||
169 | numa=fake=CMDLINE | 169 | numa=fake=<size>[MG] |
170 | If a number, fakes CMDLINE nodes and ignores NUMA setup of the | 170 | If given as a memory unit, fills all system RAM with nodes of |
171 | actual machine. Otherwise, system memory is configured | 171 | size interleaved over physical nodes. |
172 | depending on the sizes and coefficients listed. For example: | 172 | |
173 | numa=fake=2*512,1024,4*256,*128 | 173 | numa=fake=<N> |
174 | gives two 512M nodes, a 1024M node, four 256M nodes, and the | 174 | If given as an integer, fills all system RAM with N fake nodes |
175 | rest split into 128M chunks. If the last character of CMDLINE | 175 | interleaved over physical nodes. |
176 | is a *, the remaining memory is divided up equally among its | ||
177 | coefficient: | ||
178 | numa=fake=2*512,2* | ||
179 | gives two 512M nodes and the rest split into two nodes. | ||
180 | Otherwise, the remaining system RAM is allocated to an | ||
181 | additional node. | ||
182 | 176 | ||
183 | ACPI | 177 | ACPI |
184 | 178 | ||